Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--  lib/Transforms/Vectorize/CMakeLists.txt                |    4
-rw-r--r--  lib/Transforms/Vectorize/LoadStoreVectorizer.cpp       |  298
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 1072
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorizationPlanner.h    |  282
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp             | 2806
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp             | 1926
-rw-r--r--  lib/Transforms/Vectorize/VPRecipeBuilder.h             |  131
-rw-r--r--  lib/Transforms/Vectorize/VPlan.cpp                     |   37
-rw-r--r--  lib/Transforms/Vectorize/VPlan.h                       |  201
-rw-r--r--  lib/Transforms/Vectorize/VPlanBuilder.h                |   61
-rw-r--r--  lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp          |  336
-rw-r--r--  lib/Transforms/Vectorize/VPlanHCFGBuilder.h            |   55
-rw-r--r--  lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp       |   73
-rw-r--r--  lib/Transforms/Vectorize/VPlanHCFGTransforms.h         |   36
-rw-r--r--  lib/Transforms/Vectorize/VPlanValue.h                  |   35
-rw-r--r--  lib/Transforms/Vectorize/VPlanVerifier.cpp             |  133
-rw-r--r--  lib/Transforms/Vectorize/VPlanVerifier.h               |   44
-rw-r--r--  lib/Transforms/Vectorize/Vectorize.cpp                 |    4
18 files changed, 4618 insertions(+), 2916 deletions(-)
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 7622ed6d194f..27a4d241b320 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,9 +1,13 @@
add_llvm_library(LLVMVectorize
LoadStoreVectorizer.cpp
+ LoopVectorizationLegality.cpp
LoopVectorize.cpp
SLPVectorizer.cpp
Vectorize.cpp
VPlan.cpp
+ VPlanHCFGBuilder.cpp
+ VPlanHCFGTransforms.cpp
+ VPlanVerifier.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index dc83b6d4d292..5f3d127202ad 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -6,6 +6,38 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
+//
+// This pass merges loads/stores to/from sequential memory addresses into vector
+// loads/stores. Although there's nothing GPU-specific in here, this pass is
+// motivated by the microarchitectural quirks of NVIDIA and AMD GPUs.
+//
+// (For simplicity below we talk about loads only, but everything also applies
+// to stores.)
+//
+// This pass is intended to be run late in the pipeline, after other
+// vectorization opportunities have been exploited. So the assumption here is
+// that immediately following our new vector load we'll need to extract out the
+// individual elements of the load, so we can operate on them individually.
+//
+// On CPUs this transformation is usually not beneficial, because extracting the
+// elements of a vector register is expensive on most architectures. It's
+// usually better just to load each element individually into its own scalar
+// register.
+//
+// However, NVIDIA and AMD GPUs don't have proper vector registers. Instead, a
+// "vector load" loads directly into a series of scalar registers. In effect,
+// extracting the elements of the vector is free. It's therefore always
+// beneficial to vectorize a sequence of loads on these architectures.
+//
+// Vectorizing (perhaps a better name might be "coalescing") loads can have
+// large performance impacts on GPU kernels, and opportunities for vectorizing
+// are common in GPU code. This pass tries very hard to find such
+// opportunities; its runtime is quadratic in the number of loads in a BB.
+//
+// Some CPU architectures, such as ARM, have instructions that load into
+// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
+// could use this pass (with some modifications), but currently it implements
+// its own pass to do something similar to what we do here.
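+//
+// To make the transformation concrete, here is a minimal C++ sketch of the
+// rewrite (not part of this patch; `float4` stands in for a CUDA-style
+// 4-wide type, and 16-byte alignment of &data[i] is assumed):
+//
+//   // Before: four scalar loads from consecutive addresses.
+//   float a = data[i + 0];
+//   float b = data[i + 1];
+//   float c = data[i + 2];
+//   float d = data[i + 3];
+//
+//   // After (conceptually): one coalesced vector load. On these GPUs the
+//   // per-element "extracts" are free, since the vector load already lands
+//   // in a series of scalar registers.
+//   float4 v = *reinterpret_cast<const float4 *>(&data[i]);
+//   float a = v.x, b = v.y, c = v.z, d = v.w;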
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
@@ -21,6 +53,7 @@
#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
@@ -45,7 +78,6 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
@@ -65,8 +97,16 @@ static const unsigned StackAdjustedAlignment = 4;
namespace {
+/// ChainID is an arbitrary token that is allowed to be different only for the
+/// accesses that are guaranteed to be considered non-consecutive by
+/// Vectorizer::isConsecutiveAccess. It's used to group instructions and to
+/// reduce the number of instructions the main search operates on at a time,
+/// i.e., it exists purely to reduce compile time, since the main search has
+/// O(n^2) time complexity. The underlying type of ChainID should not be
+/// relied upon.
+using ChainID = const Value *;
using InstrList = SmallVector<Instruction *, 8>;
-using InstrListMap = MapVector<Value *, InstrList>;
+using InstrListMap = MapVector<ChainID, InstrList>;
class Vectorizer {
Function &F;
@@ -86,10 +126,6 @@ public:
bool run();
private:
- Value *getPointerOperand(Value *I) const;
-
- GetElementPtrInst *getSourceGEP(Value *Src) const;
-
unsigned getPointerAddressSpace(Value *I);
unsigned getAlignment(LoadInst *LI) const {
@@ -108,7 +144,15 @@ private:
return DL.getABITypeAlignment(SI->getValueOperand()->getType());
}
+ static const unsigned MaxDepth = 3;
+
bool isConsecutiveAccess(Value *A, Value *B);
+ bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
+ unsigned Depth = 0) const;
+ bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
+ unsigned Depth) const;
+ bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
+ unsigned Depth) const;
/// After vectorization, reorder the instructions that I depends on
/// (the instructions defining its operands), to ensure they dominate I.
@@ -239,14 +283,6 @@ bool Vectorizer::run() {
return Changed;
}
-Value *Vectorizer::getPointerOperand(Value *I) const {
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->getPointerOperand();
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->getPointerOperand();
- return nullptr;
-}
-
unsigned Vectorizer::getPointerAddressSpace(Value *I) {
if (LoadInst *L = dyn_cast<LoadInst>(I))
return L->getPointerAddressSpace();
@@ -255,23 +291,10 @@ unsigned Vectorizer::getPointerAddressSpace(Value *I) {
return -1;
}
-GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const {
- // First strip pointer bitcasts. Make sure pointee size is the same with
- // and without casts.
- // TODO: a stride set by the add instruction below can match the difference
- // in pointee type size here. Currently it will not be vectorized.
- Value *SrcPtr = getPointerOperand(Src);
- Value *SrcBase = SrcPtr->stripPointerCasts();
- if (DL.getTypeStoreSize(SrcPtr->getType()->getPointerElementType()) ==
- DL.getTypeStoreSize(SrcBase->getType()->getPointerElementType()))
- SrcPtr = SrcBase;
- return dyn_cast<GetElementPtrInst>(SrcPtr);
-}
-
// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
- Value *PtrA = getPointerOperand(A);
- Value *PtrB = getPointerOperand(B);
+ Value *PtrA = getLoadStorePointerOperand(A);
+ Value *PtrB = getLoadStorePointerOperand(B);
unsigned ASA = getPointerAddressSpace(A);
unsigned ASB = getPointerAddressSpace(B);
@@ -280,18 +303,27 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
return false;
// Make sure that A and B are different pointers of the same size type.
- unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
Type *PtrATy = PtrA->getType()->getPointerElementType();
Type *PtrBTy = PtrB->getType()->getPointerElementType();
if (PtrA == PtrB ||
+ PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
DL.getTypeStoreSize(PtrATy->getScalarType()) !=
DL.getTypeStoreSize(PtrBTy->getScalarType()))
return false;
+ unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
- APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+ return areConsecutivePointers(PtrA, PtrB, Size);
+}
+
+bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
+ const APInt &PtrDelta,
+ unsigned Depth) const {
+ unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
+ APInt OffsetA(PtrBitWidth, 0);
+ APInt OffsetB(PtrBitWidth, 0);
PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
@@ -300,11 +332,11 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
// Check if they are based on the same pointer. That makes the offsets
// sufficient.
if (PtrA == PtrB)
- return OffsetDelta == Size;
+ return OffsetDelta == PtrDelta;
// Compute the necessary base pointer delta to have the necessary final delta
- // equal to the size.
- APInt BaseDelta = Size - OffsetDelta;
+ // equal to the pointer delta requested.
+ APInt BaseDelta = PtrDelta - OffsetDelta;
// Compute the distance with SCEV between the base pointers.
const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
@@ -314,71 +346,127 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
if (X == PtrSCEVB)
return true;
+ // The above check will not catch the cases where one of the pointers is
+ // factorized but the other one is not, such as (C + (S * (A + B))) vs
+// (AS + BS). Get the minus SCEV. That will allow re-combining the expressions
+ // and getting the simplified difference.
+ const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
+ if (C == Dist)
+ return true;
+
// Sometimes even this doesn't work, because SCEV can't always see through
// patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
// things the hard way.
+ return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
+}
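+
+// Illustrative C++ example of the factorization case the getMinusSCEV check
+// in areConsecutivePointers above catches (names hypothetical; s, a, b are
+// loop-varying values):
+//
+//   float *p = base + s * (a + b);        // factorized addressing
+//   float *q = base + s * a + s * b + 1;  // expanded form, one element past p
+//
+// The SCEVs of p and q don't match syntactically after adding PtrDelta to p,
+// but their difference simplifies to the requested constant delta.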
+
+bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
+ APInt PtrDelta,
+ unsigned Depth) const {
+ auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
+ auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
+ if (!GEPA || !GEPB)
+ return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
// Look through GEPs after checking they're the same except for the last
// index.
- GetElementPtrInst *GEPA = getSourceGEP(A);
- GetElementPtrInst *GEPB = getSourceGEP(B);
- if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
+ if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
+ GEPA->getPointerOperand() != GEPB->getPointerOperand())
return false;
- unsigned FinalIndex = GEPA->getNumOperands() - 1;
- for (unsigned i = 0; i < FinalIndex; i++)
- if (GEPA->getOperand(i) != GEPB->getOperand(i))
+ gep_type_iterator GTIA = gep_type_begin(GEPA);
+ gep_type_iterator GTIB = gep_type_begin(GEPB);
+ for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
+ if (GTIA.getOperand() != GTIB.getOperand())
return false;
+ ++GTIA;
+ ++GTIB;
+ }
- Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
- Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
+ Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
+ Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
OpA->getType() != OpB->getType())
return false;
+ if (PtrDelta.isNegative()) {
+ if (PtrDelta.isMinSignedValue())
+ return false;
+ PtrDelta.negate();
+ std::swap(OpA, OpB);
+ }
+ uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
+ if (PtrDelta.urem(Stride) != 0)
+ return false;
+ unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
+ APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
+
// Only look through a ZExt/SExt.
if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
return false;
bool Signed = isa<SExtInst>(OpA);
- OpA = dyn_cast<Instruction>(OpA->getOperand(0));
+ // At this point A could be a function parameter, i.e., not an instruction.
+ Value *ValA = OpA->getOperand(0);
OpB = dyn_cast<Instruction>(OpB->getOperand(0));
- if (!OpA || !OpB || OpA->getType() != OpB->getType())
+ if (!OpB || ValA->getType() != OpB->getType())
return false;
- // Now we need to prove that adding 1 to OpA won't overflow.
+ // Now we need to prove that adding IdxDiff to ValA won't overflow.
bool Safe = false;
- // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
- // we're okay.
+ // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
+ // ValA, we're okay.
if (OpB->getOpcode() == Instruction::Add &&
isa<ConstantInt>(OpB->getOperand(1)) &&
- cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
+ IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
if (Signed)
Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
else
Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
}
- unsigned BitWidth = OpA->getType()->getScalarSizeInBits();
+ unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
// Second attempt:
- // If any bits are known to be zero other than the sign bit in OpA, we can
- // add 1 to it while guaranteeing no overflow of any sort.
+ // If all set bits of IdxDiff or any higher order bit other than the sign bit
+ // are known to be zero in ValA, we can add Diff to it while guaranteeing no
+ // overflow of any sort.
if (!Safe) {
+ OpA = dyn_cast<Instruction>(ValA);
+ if (!OpA)
+ return false;
KnownBits Known(BitWidth);
computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
- if (Known.countMaxTrailingOnes() < (BitWidth - 1))
- Safe = true;
+ APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
+ if (Signed)
+ BitsAllowedToBeSet.clearBit(BitWidth - 1);
+ if (BitsAllowedToBeSet.ult(IdxDiff))
+ return false;
}
- if (!Safe)
+ const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
+ const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
+ const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
+ const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
+ return X == OffsetSCEVB;
+}
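+
+// A sketch of the source pattern this function handles (hypothetical code):
+// a narrow index widened inside the addressing computation, which SCEV alone
+// can't always see through:
+//
+//   float sum2(float *p, unsigned i) {
+//     float x = p[i];      // gep p, zext(i)
+//     float y = p[i + 1];  // gep p, zext(i + 1): consecutive with p[i]
+//     return x + y;        // only if we can prove i + 1 does not wrap
+//   }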
+
+bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
+ const APInt &PtrDelta,
+ unsigned Depth) const {
+ if (Depth++ == MaxDepth)
return false;
- const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
- const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
- const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
- const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
- return X2 == OffsetSCEVB;
+ if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
+ if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
+ return SelectA->getCondition() == SelectB->getCondition() &&
+ areConsecutivePointers(SelectA->getTrueValue(),
+ SelectB->getTrueValue(), PtrDelta, Depth) &&
+ areConsecutivePointers(SelectA->getFalseValue(),
+ SelectB->getFalseValue(), PtrDelta, Depth);
+ }
+ }
+ return false;
}
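+
+// Illustrative example of the select case handled above (hypothetical names):
+//
+//   float *p = cond ? a : b;
+//   float *q = cond ? a + 1 : b + 1;
+//
+// p and q are consecutive under either value of cond, so we recurse into the
+// matching true and false arms, bounded by MaxDepth.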
void Vectorizer::reorder(Instruction *I) {
@@ -448,7 +536,7 @@ Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
SmallVector<Instruction *, 16> Instrs;
for (Instruction *I : Chain) {
- Value *PtrOperand = getPointerOperand(I);
+ Value *PtrOperand = getLoadStorePointerOperand(I);
assert(PtrOperand && "Instruction must have a pointer operand.");
Instrs.push_back(I);
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
@@ -484,7 +572,7 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
SmallVector<Instruction *, 16> ChainInstrs;
bool IsLoadChain = isa<LoadInst>(Chain[0]);
- DEBUG({
+ LLVM_DEBUG({
for (Instruction *I : Chain) {
if (IsLoadChain)
assert(isa<LoadInst>(I) &&
@@ -506,11 +594,12 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
Intrinsic::sideeffect) {
// Ignore llvm.sideeffect calls.
} else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
- DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n');
+ LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
+ << '\n');
break;
} else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
- DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
- << '\n');
+ LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
+ << '\n');
break;
}
}
@@ -536,32 +625,40 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
break;
- if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr))
+ auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
+ auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
+ if (MemLoad && ChainLoad)
continue;
+ // We can ignore the alias if we have a load/store pair and the load
+ // is known to be invariant. The load cannot be clobbered by the store.
+ auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
+ return LI->getMetadata(LLVMContext::MD_invariant_load);
+ };
+
// We can ignore the alias as long as the load comes before the store,
// because that means we won't be moving the load past the store to
// vectorize it (the vectorized load is inserted at the location of the
// first load in the chain).
- if (isa<StoreInst>(MemInstr) && isa<LoadInst>(ChainInstr) &&
- OBB.dominates(ChainInstr, MemInstr))
+ if (isa<StoreInst>(MemInstr) && ChainLoad &&
+ (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
continue;
// Same case, but in reverse.
- if (isa<LoadInst>(MemInstr) && isa<StoreInst>(ChainInstr) &&
- OBB.dominates(MemInstr, ChainInstr))
+ if (MemLoad && isa<StoreInst>(ChainInstr) &&
+ (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
continue;
if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
MemoryLocation::get(ChainInstr))) {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "LSV: Found alias:\n"
" Aliasing instruction and pointer:\n"
<< " " << *MemInstr << '\n'
- << " " << *getPointerOperand(MemInstr) << '\n'
+ << " " << *getLoadStorePointerOperand(MemInstr) << '\n'
<< " Aliased instruction and pointer:\n"
<< " " << *ChainInstr << '\n'
- << " " << *getPointerOperand(ChainInstr) << '\n';
+ << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
});
// Save this aliasing memory instruction as a barrier, but allow other
// instructions that precede the barrier to be vectorized with this one.
@@ -594,6 +691,20 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
return Chain.slice(0, ChainIdx);
}
+static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
+ const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
+ if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
+ // The selects themselves are distinct instructions even if they share the
+ // same condition and evaluate to consecutive pointers for the true and false
+ // values of the condition. Therefore, using the selects themselves for
+ // grouping instructions would put consecutive accesses into different lists,
+ // and they would never even be checked for being consecutive, and so would
+ // never be vectorized.
+ return Sel->getCondition();
+ }
+ return ObjPtr;
+}
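+
+// Continuing the select example above: GetUnderlyingObject returns the two
+// distinct select instructions for p and q, so keying the chains on them
+// would separate the accesses. Keying on the shared condition instead puts
+// them in one InstrList (sketch, hypothetical values):
+//
+//   ChainID IDP = getChainID(P, DL); // == cond
+//   ChainID IDQ = getChainID(Q, DL); // == cond, same list as P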
+
std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
InstrListMap LoadRefs;
@@ -632,8 +743,12 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
unsigned AS = Ptr->getType()->getPointerAddressSpace();
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / TySize;
+ VectorType *VecTy = dyn_cast<VectorType>(Ty);
+
// No point in looking at these if they're too big to vectorize.
- if (TySize > VecRegSize / 2)
+ if (TySize > VecRegSize / 2 ||
+ (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
// Make sure all the users of a vector are constant-index extracts.
@@ -644,8 +759,8 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
continue;
// Save the load locations.
- Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
- LoadRefs[ObjPtr].push_back(LI);
+ const ChainID ID = getChainID(Ptr, DL);
+ LoadRefs[ID].push_back(LI);
} else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
if (!SI->isSimple())
continue;
@@ -675,8 +790,12 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
unsigned AS = Ptr->getType()->getPointerAddressSpace();
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / TySize;
+ VectorType *VecTy = dyn_cast<VectorType>(Ty);
+
// No point in looking at these if they're too big to vectorize.
- if (TySize > VecRegSize / 2)
+ if (TySize > VecRegSize / 2 ||
+ (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
@@ -686,8 +805,8 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
continue;
// Save store location.
- Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
- StoreRefs[ObjPtr].push_back(SI);
+ const ChainID ID = getChainID(Ptr, DL);
+ StoreRefs[ID].push_back(SI);
}
}
@@ -697,12 +816,12 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
bool Vectorizer::vectorizeChains(InstrListMap &Map) {
bool Changed = false;
- for (const std::pair<Value *, InstrList> &Chain : Map) {
+ for (const std::pair<ChainID, InstrList> &Chain : Map) {
unsigned Size = Chain.second.size();
if (Size < 2)
continue;
- DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
+ LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
// Process the stores in chunks of 64.
for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
@@ -716,7 +835,8 @@ bool Vectorizer::vectorizeChains(InstrListMap &Map) {
}
bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
- DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n");
+ LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
+ << " instructions.\n");
SmallVector<int, 16> Heads, Tails;
int ConsecutiveChain[64];
@@ -852,14 +972,14 @@ bool Vectorizer::vectorizeStoreChain(
// vector factor, break it into two pieces.
unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
- DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
- " Creating two separate arrays.\n");
+ LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
+ " Creating two separate arrays.\n");
return vectorizeStoreChain(Chain.slice(0, TargetVF),
InstructionsProcessed) |
vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "LSV: Stores to vectorize:\n";
for (Instruction *I : Chain)
dbgs() << " " << *I << "\n";
@@ -1000,8 +1120,8 @@ bool Vectorizer::vectorizeLoadChain(
// vector factor, break it into two pieces.
unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
- DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
- " Creating two separate arrays.\n");
+ LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
+ " Creating two separate arrays.\n");
return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
}
@@ -1024,7 +1144,7 @@ bool Vectorizer::vectorizeLoadChain(
Alignment = NewAlign;
}
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "LSV: Loads to vectorize:\n";
for (Instruction *I : Chain)
I->dump();
@@ -1107,7 +1227,7 @@ bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
SzInBytes * 8, AddressSpace,
Alignment, &Fast);
- DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
- << " and fast? " << Fast << "\n";);
+ LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
+ << " and fast? " << Fast << "\n";);
return !Allows || !Fast;
}
diff --git a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
new file mode 100644
index 000000000000..697bc1b448d7
--- /dev/null
+++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -0,0 +1,1072 @@
+//===- LoopVectorizationLegality.cpp --------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides loop vectorization legality analysis. Original code
+// resided in LoopVectorize.cpp for a long time.
+//
+// At this point, it is implemented as a utility class, not as an analysis
+// pass. It should be easy to create an analysis pass around it if there
+// is a need (but D45420 needs to happen first).
+//
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/IntrinsicInst.h"
+
+using namespace llvm;
+
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
+static cl::opt<bool>
+ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
+ cl::desc("Enable if-conversion during vectorization."));
+
+static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
+ "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum allowed number of runtime memory checks with a "
+ "vectorize(enable) pragma."));
+
+static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
+ "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed."));
+
+static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
+ "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed with a "
+ "vectorize(enable) pragma"));
+
+/// Maximum vectorization interleave count.
+static const unsigned MaxInterleaveFactor = 16;
+
+namespace llvm {
+
+OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName,
+ StringRef RemarkName,
+ Loop *TheLoop,
+ Instruction *I) {
+ Value *CodeRegion = TheLoop->getHeader();
+ DebugLoc DL = TheLoop->getStartLoc();
+
+ if (I) {
+ CodeRegion = I->getParent();
+ // If there is no debug location attached to the instruction, fall back to
+ // using the loop's.
+ if (I->getDebugLoc())
+ DL = I->getDebugLoc();
+ }
+
+ OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
+ R << "loop not vectorized: ";
+ return R;
+}
+
+bool LoopVectorizeHints::Hint::validate(unsigned Val) {
+ switch (Kind) {
+ case HK_WIDTH:
+ return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
+ case HK_UNROLL:
+ return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
+ case HK_FORCE:
+ return (Val <= 1);
+ case HK_ISVECTORIZED:
+ return (Val == 0 || Val == 1);
+ }
+ return false;
+}
+
+LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
+ OptimizationRemarkEmitter &ORE)
+ : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
+ Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
+ Force("vectorize.enable", FK_Undefined, HK_FORCE),
+ IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
+ // Populate values with existing loop metadata.
+ getHintsFromMetadata();
+
+ // force-vector-interleave overrides DisableInterleaving.
+ if (VectorizerParams::isInterleaveForced())
+ Interleave.Value = VectorizerParams::VectorizationInterleave;
+
+ if (IsVectorized.Value != 1)
+ // If the vectorization width and interleaving count are both 1 then
+ // consider the loop to have been already vectorized because there's
+ // nothing more that we can do.
+ IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
+ LLVM_DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
+ << "LV: Interleaving disabled by the pass manager\n");
+}
+
+bool LoopVectorizeHints::allowVectorization(Function *F, Loop *L,
+ bool AlwaysVectorize) const {
+ if (getForce() == LoopVectorizeHints::FK_Disabled) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+ emitRemarkWithHints();
+ return false;
+ }
+
+ if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+ emitRemarkWithHints();
+ return false;
+ }
+
+ if (getIsVectorized() == 1) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+ // FIXME: Add interleave.disable metadata. This will allow
+ // vectorize.disable to be used without disabling the pass and errors
+ // to differentiate between disabled vectorization and a width of 1.
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
+ "AllDisabled", L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: vectorization and interleaving are "
+ "explicitly disabled, or the loop has already been "
+ "vectorized";
+ });
+ return false;
+ }
+
+ return true;
+}
+
+void LoopVectorizeHints::emitRemarkWithHints() const {
+ using namespace ore;
+
+ ORE.emit([&]() {
+ if (Force.Value == LoopVectorizeHints::FK_Disabled)
+ return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "loop not vectorized: vectorization is explicitly disabled";
+ else {
+ OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
+ TheLoop->getStartLoc(), TheLoop->getHeader());
+ R << "loop not vectorized";
+ if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+ R << " (Force=" << NV("Force", true);
+ if (Width.Value != 0)
+ R << ", Vector Width=" << NV("VectorWidth", Width.Value);
+ if (Interleave.Value != 0)
+ R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
+ R << ")";
+ }
+ return R;
+ }
+ });
+}
+
+const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
+ if (getWidth() == 1)
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Disabled)
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
+ return LV_NAME;
+ return OptimizationRemarkAnalysis::AlwaysPrint;
+}
+
+void LoopVectorizeHints::getHintsFromMetadata() {
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (!LoopID)
+ return;
+
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ const MDString *S = nullptr;
+ SmallVector<Metadata *, 4> Args;
+
+ // The expected hint is either a MDString or a MDNode with the first
+ // operand a MDString.
+ if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
+ if (!MD || MD->getNumOperands() == 0)
+ continue;
+ S = dyn_cast<MDString>(MD->getOperand(0));
+ for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
+ Args.push_back(MD->getOperand(i));
+ } else {
+ S = dyn_cast<MDString>(LoopID->getOperand(i));
+ assert(Args.size() == 0 && "too many arguments for MDString");
+ }
+
+ if (!S)
+ continue;
+
+ // Check if the hint starts with the loop metadata prefix.
+ StringRef Name = S->getString();
+ if (Args.size() == 1)
+ setHint(Name, Args[0]);
+ }
+}
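+
+// For reference, these hints normally originate from source-level pragmas.
+// A minimal C++ example that clang lowers to "llvm.loop.vectorize.*" and
+// "llvm.loop.interleave.*" loop metadata (a sketch, not from this patch):
+//
+//   #pragma clang loop vectorize(enable) vectorize_width(4) interleave_count(2)
+//   for (int i = 0; i < n; ++i)
+//     a[i] = b[i] + c[i];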
+
+void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
+ if (!Name.startswith(Prefix()))
+ return;
+ Name = Name.substr(Prefix().size(), StringRef::npos);
+
+ const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
+ if (!C)
+ return;
+ unsigned Val = C->getZExtValue();
+
+ Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
+ for (auto H : Hints) {
+ if (Name == H->Name) {
+ if (H->validate(Val))
+ H->Value = Val;
+ else
+ LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
+ break;
+ }
+ }
+}
+
+MDNode *LoopVectorizeHints::createHintMetadata(StringRef Name,
+ unsigned V) const {
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ Metadata *MDs[] = {
+ MDString::get(Context, Name),
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
+ return MDNode::get(Context, MDs);
+}
+
+bool LoopVectorizeHints::matchesHintMetadataName(MDNode *Node,
+ ArrayRef<Hint> HintTypes) {
+ MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
+ if (!Name)
+ return false;
+
+ for (auto H : HintTypes)
+ if (Name->getString().endswith(H.Name))
+ return true;
+ return false;
+}
+
+void LoopVectorizeHints::writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
+ if (HintTypes.empty())
+ return;
+
+ // Reserve the first element to LoopID (see below).
+ SmallVector<Metadata *, 4> MDs(1);
+ // If the loop already has metadata, then ignore the existing operands.
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (LoopID) {
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
+ // If node in update list, ignore old value.
+ if (!matchesHintMetadataName(Node, HintTypes))
+ MDs.push_back(Node);
+ }
+ }
+
+ // Now, add the missing hints.
+ for (auto H : HintTypes)
+ MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
+
+ // Replace current metadata node with new one.
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+
+ TheLoop->setLoopID(NewLoopID);
+}
+
+bool LoopVectorizationRequirements::doesNotMeet(
+ Function *F, Loop *L, const LoopVectorizeHints &Hints) {
+ const char *PassName = Hints.vectorizeAnalysisPassName();
+ bool Failed = false;
+ if (UnsafeAlgebraInst && !Hints.allowReordering()) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisFPCommute(
+ PassName, "CantReorderFPOps", UnsafeAlgebraInst->getDebugLoc(),
+ UnsafeAlgebraInst->getParent())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "floating-point operations";
+ });
+ Failed = true;
+ }
+
+ // Test if runtime memcheck thresholds are exceeded.
+ bool PragmaThresholdReached =
+ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+ bool ThresholdReached =
+ NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
+ if ((ThresholdReached && !Hints.allowReordering()) ||
+ PragmaThresholdReached) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
+ L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "memory operations";
+ });
+ LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ Failed = true;
+ }
+
+ return Failed;
+}
+
+// Return true if the inner loop \p Lp is uniform with regard to the outer loop
+// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
+// executing the inner loop will execute the same iterations). This check is
+// very constrained for now but it will be relaxed in the future. \p Lp is
+// considered uniform if it meets all the following conditions:
+// 1) it has a canonical IV (starting from 0 and with stride 1),
+// 2) its latch terminator is a conditional branch and,
+// 3) its latch condition is a compare instruction whose operands are the
+// canonical IV and an OuterLp invariant.
+// This check doesn't take into account the uniformity of other conditions not
+// related to the loop latch because they don't affect the loop uniformity.
+//
+// NOTE: We decided to keep all these checks and their associated documentation
+// together so that we can easily have a picture of the current supported loop
+// nests. However, some of the current checks don't depend on \p OuterLp and
+// would be redundantly executed for each \p Lp if we invoked this function for
+// different candidate outer loops. This is not the case for now because we
+// don't currently have the infrastructure to evaluate multiple candidate outer
+// loops and \p OuterLp will be a fixed parameter while we only support explicit
+// outer loop vectorization. It's also very likely that these checks go away
+// before introducing the aforementioned infrastructure. However, if this is not
+// the case, we should move the \p OuterLp independent checks to a separate
+// function that is only executed once for each \p Lp.
+static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
+ assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
+
+ // If Lp is the outer loop, it's uniform by definition.
+ if (Lp == OuterLp)
+ return true;
+ assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
+
+ // 1.
+ PHINode *IV = Lp->getCanonicalInductionVariable();
+ if (!IV) {
+ LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
+ return false;
+ }
+
+ // 2.
+ BasicBlock *Latch = Lp->getLoopLatch();
+ auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!LatchBr || LatchBr->isUnconditional()) {
+ LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
+ return false;
+ }
+
+ // 3.
+ auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
+ if (!LatchCmp) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
+ return false;
+ }
+
+ Value *CondOp0 = LatchCmp->getOperand(0);
+ Value *CondOp1 = LatchCmp->getOperand(1);
+ Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
+ if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
+ !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
+ LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
+ return false;
+ }
+
+ return true;
+}
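+
+// A sketch of a loop nest that passes this check when vectorizing the outer
+// loop (hypothetical code; m is invariant in the outer loop):
+//
+//   for (int i = 0; i < n; ++i)     // OuterLp, the vectorization candidate
+//     for (int j = 0; j < m; ++j)   // Lp: canonical IV j; the latch compares
+//       a[i] += b[i][j];            // the IV against the invariant m
+//
+// Every vector lane of the outer loop runs the same inner iterations, so the
+// inner loop is uniform.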
+
+// Return true if \p Lp and all its nested loops are uniform with regard to \p
+// OuterLp.
+static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
+ if (!isUniformLoop(Lp, OuterLp))
+ return false;
+
+ // Check if nested loops are uniform.
+ for (Loop *SubLp : *Lp)
+ if (!isUniformLoopNest(SubLp, OuterLp))
+ return false;
+
+ return true;
+}
+
+/// Check whether it is safe to if-convert this phi node.
+///
+/// Phi nodes with constant expressions that can trap are not safe to if
+/// convert.
+static bool canIfConvertPHINodes(BasicBlock *BB) {
+ for (PHINode &Phi : BB->phis()) {
+ for (Value *V : Phi.incoming_values())
+ if (auto *C = dyn_cast<Constant>(V))
+ if (C->canTrap())
+ return false;
+ }
+ return true;
+}
+
+static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
+ if (Ty->isPointerTy())
+ return DL.getIntPtrType(Ty);
+
+ // It is possible that char's or short's overflow when we ask for the loop's
+ // trip count, work around this by changing the type size.
+ if (Ty->getScalarSizeInBits() < 32)
+ return Type::getInt32Ty(Ty->getContext());
+
+ return Ty;
+}
+
+static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
+ Ty0 = convertPointerToIntegerType(DL, Ty0);
+ Ty1 = convertPointerToIntegerType(DL, Ty1);
+ if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
+ return Ty0;
+ return Ty1;
+}
+
+/// Check that the instruction has outside loop users and is not an
+/// identified reduction variable.
+static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ // Reduction and Induction instructions are allowed to have exit users. All
+ // other instructions must not have external users.
+ if (!AllowedExit.count(Inst))
+ // Check that all of the users of the loop are inside the BB.
+ for (User *U : Inst->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ // This user may be a reduction exit value.
+ if (!TheLoop->contains(UI)) {
+ LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
+ return true;
+ }
+ }
+ return false;
+}
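+
+// Example of the distinction this makes (illustrative): the reduction result
+// below has a user outside the loop, which is allowed only because the
+// loop-exit value is recorded in AllowedExit once the phi is identified as a
+// reduction:
+//
+//   int s = 0;
+//   for (int i = 0; i < n; ++i)
+//     s += a[i];      // reduction; the exit value of s may be used...
+//   use(s);           // ...here, outside the loop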
+
+int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+ const ValueToValueMap &Strides =
+ getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
+
+ int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
+ if (Stride == 1 || Stride == -1)
+ return Stride;
+ return 0;
+}
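+
+// Strides this accepts, in source terms (illustrative):
+//
+//   for (int i = 0; i < n; ++i) x += a[i];          // stride  1 (forward)
+//   for (int i = 0; i < n; ++i) y += a[n - 1 - i];  // stride -1 (reverse)
+//   for (int i = 0; i < n; ++i) z += a[2 * i];      // stride  2 -> returns 0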
+
+bool LoopVectorizationLegality::isUniform(Value *V) {
+ return LAI->isUniform(V);
+}
+
+bool LoopVectorizationLegality::canVectorizeOuterLoop() {
+ assert(!TheLoop->empty() && "We are not vectorizing an outer loop.");
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Check whether the BB terminator is a BranchInst. Any other terminator is
+ // not supported yet.
+ auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br) {
+ LLVM_DEBUG(dbgs() << "LV: Unsupported basic block terminator.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check whether the BranchInst is a supported one. Only unconditional
+ // branches, conditional branches with an outer loop invariant condition or
+ // backedges are supported.
+ if (Br && Br->isConditional() &&
+ !TheLoop->isLoopInvariant(Br->getCondition()) &&
+ !LI->isLoopHeader(Br->getSuccessor(0)) &&
+ !LI->isLoopHeader(Br->getSuccessor(1))) {
+ LLVM_DEBUG(dbgs() << "LV: Unsupported conditional branch.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+ }
+
+ // Check whether inner loops are uniform. At this point, we only support
+ // simple outer loops scenarios with uniform nested loops.
+ if (!isUniformLoopNest(TheLoop /*loop nest*/,
+ TheLoop /*context outer loop*/)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Not vectorizing: Outer loop contains divergent loops.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
+
+void LoopVectorizationLegality::addInductionPhi(
+ PHINode *Phi, const InductionDescriptor &ID,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ Inductions[Phi] = ID;
+
+ // In case this induction also comes with casts that we know we can ignore
+ // in the vectorized loop body, record them here. All casts could be recorded
+ // here for ignoring, but it suffices to record only the first (as it is the
+ // only one that may be used outside the cast sequence).
+ const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
+ if (!Casts.empty())
+ InductionCastsToIgnore.insert(*Casts.begin());
+
+ Type *PhiTy = Phi->getType();
+ const DataLayout &DL = Phi->getModule()->getDataLayout();
+
+ // Get the widest type.
+ if (!PhiTy->isFloatingPointTy()) {
+ if (!WidestIndTy)
+ WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
+ else
+ WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
+ }
+
+ // Int inductions are special because we only allow one IV.
+ if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
+ ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
+ isa<Constant>(ID.getStartValue()) &&
+ cast<Constant>(ID.getStartValue())->isNullValue()) {
+
+ // Use the phi node with the widest type as induction. Use the last
+ // one if there are multiple (no good reason for doing this other
+ // than it is expedient). We've checked that it begins at zero and
+ // steps by one, so this is a canonical induction variable.
+ if (!PrimaryInduction || PhiTy == WidestIndTy)
+ PrimaryInduction = Phi;
+ }
+
+ // Both the PHI node itself, and the "post-increment" value feeding
+ // back into the PHI node may have external users.
+ // We can allow those uses, except if the SCEVs we have for them rely
+ // on predicates that only hold within the loop, since allowing the exit
+ // currently means re-using this SCEV outside the loop.
+ if (PSE.getUnionPredicate().isAlwaysTrue()) {
+ AllowedExit.insert(Phi);
+ AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
+}
+
+bool LoopVectorizationLegality::canVectorizeInstrs() {
+ BasicBlock *Header = TheLoop->getHeader();
+
+ // Look for the attribute signaling the absence of NaNs.
+ Function &F = *Header->getParent();
+ HasFunNoNaNAttr =
+ F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+
+ // For each block in the loop.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Scan the instructions in the block and look for hazards.
+ for (Instruction &I : *BB) {
+ if (auto *Phi = dyn_cast<PHINode>(&I)) {
+ Type *PhiTy = Phi->getType();
+ // Check that this PHI type is allowed.
+ if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy()) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
+ << "loop control flow is not understood by vectorizer");
+ LLVM_DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
+ return false;
+ }
+
+ // If this PHINode is not in the header block, then we know that we
+ // can convert it to select during if-conversion. No need to check if
+ // the PHIs in this block are induction or reduction variables.
+ if (BB != Header) {
+ // Check that this instruction has no outside users or is an
+ // identified reduction value with an outside user.
+ if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
+ continue;
+ ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi)
+ << "value could not be identified as "
+ "an induction or reduction variable");
+ return false;
+ }
+
+ // We only allow if-converted PHIs with exactly two incoming values.
+ if (Phi->getNumIncomingValues() != 2) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
+ << "control flow not understood by vectorizer");
+ LLVM_DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+ return false;
+ }
+
+ RecurrenceDescriptor RedDes;
+ if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
+ DT)) {
+ if (RedDes.hasUnsafeAlgebra())
+ Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
+ AllowedExit.insert(RedDes.getLoopExitInstr());
+ Reductions[Phi] = RedDes;
+ continue;
+ }
+
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
+ Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
+ continue;
+ }
+
+ if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
+ SinkAfter, DT)) {
+ FirstOrderRecurrences.insert(Phi);
+ continue;
+ }
+
+ // As a last resort, coerce the PHI to an AddRec expression
+ // and retry classifying it as an induction PHI.
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ continue;
+ }
+
+ ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
+ << "value that could not be identified as "
+ "reduction is used outside the loop");
+ LLVM_DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
+ return false;
+ } // end of PHI handling
+
+ // We handle calls that:
+ // * Are debug info intrinsics.
+ // * Have a mapping to an IR intrinsic.
+ // * Have a vector version available.
+ auto *CI = dyn_cast<CallInst>(&I);
+ if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
+ !isa<DbgInfoIntrinsic>(CI) &&
+ !(CI->getCalledFunction() && TLI &&
+ TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
+ ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
+ << "call instruction cannot be vectorized");
+ LLVM_DEBUG(
+ dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
+ return false;
+ }
+
+ // Intrinsics such as powi, cttz, and ctlz are legal to vectorize if the
+ // second argument is the same, i.e., loop invariant.
+ if (CI && hasVectorInstrinsicScalarOpd(
+ getVectorIntrinsicIDForCall(CI, TLI), 1)) {
+ auto *SE = PSE.getSE();
+ if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
+ ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
+ << "intrinsic instruction cannot be vectorized");
+ LLVM_DEBUG(dbgs()
+ << "LV: Found unvectorizable intrinsic " << *CI << "\n");
+ return false;
+ }
+ }
+
+ // Check that the instruction return type is vectorizable.
+ // Also, we can't vectorize extractelement instructions.
+ if ((!VectorType::isValidElementType(I.getType()) &&
+ !I.getType()->isVoidTy()) ||
+ isa<ExtractElementInst>(I)) {
+ ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
+ << "instruction return type cannot be vectorized");
+ LLVM_DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
+ return false;
+ }
+
+ // Check that the stored type is vectorizable.
+ if (auto *ST = dyn_cast<StoreInst>(&I)) {
+ Type *T = ST->getValueOperand()->getType();
+ if (!VectorType::isValidElementType(T)) {
+ ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
+ << "store instruction cannot be vectorized");
+ return false;
+ }
+
+ // FP instructions can allow unsafe algebra, thus vectorizable by
+ // non-IEEE-754 compliant SIMD units.
+ // This applies to floating-point math operations and calls, not memory
+ // operations, shuffles, or casts, as they don't change precision or
+ // semantics.
+ } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
+ !I.isFast()) {
+ LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
+ Hints->setPotentiallyUnsafe();
+ }
+
+ // Reduction instructions are allowed to have exit users.
+ // All other instructions must not have external users.
+ if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
+ ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
+ << "value cannot be used outside the loop");
+ return false;
+ }
+ } // next instr.
+ }
+
+ if (!PrimaryInduction) {
+ LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+ if (Inductions.empty()) {
+ ORE->emit(createMissedAnalysis("NoInductionVariable")
+ << "loop induction variable could not be identified");
+ return false;
+ }
+ }
+
+ // Now we know the widest induction type, check if our found induction
+ // is the same size. If it's not, unset it here and InnerLoopVectorizer
+ // will create another.
+ if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
+ PrimaryInduction = nullptr;
+
+ return true;
+}
+
+bool LoopVectorizationLegality::canVectorizeMemory() {
+ LAI = &(*GetLAA)(*TheLoop);
+ const OptimizationRemarkAnalysis *LAR = LAI->getReport();
+ if (LAR) {
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
+ "loop not vectorized: ", *LAR);
+ });
+ }
+ if (!LAI->canVectorizeMemory())
+ return false;
+
+ if (LAI->hasStoreToLoopInvariantAddress()) {
+ ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
+ << "write to a loop invariant address could not be vectorized");
+ LLVM_DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+ return false;
+ }
+
+ Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
+ PSE.addPredicate(LAI->getPSE().getUnionPredicate());
+
+ return true;
+}
+
+bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
+ Value *In0 = const_cast<Value *>(V);
+ PHINode *PN = dyn_cast_or_null<PHINode>(In0);
+ if (!PN)
+ return false;
+
+ return Inductions.count(PN);
+}
+
+bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ return (Inst && InductionCastsToIgnore.count(Inst));
+}
+
+bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+ return isInductionPhi(V) || isCastedInductionVariable(V);
+}
+
+bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
+ return FirstOrderRecurrences.count(Phi);
+}
+
+bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
+ return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+}
+
+bool LoopVectorizationLegality::blockCanBePredicated(
+ BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
+ const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+
+ for (Instruction &I : *BB) {
+ // Check that we don't have a constant expression that can trap as operand.
+ for (Value *Operand : I.operands()) {
+ if (auto *C = dyn_cast<Constant>(Operand))
+ if (C->canTrap())
+ return false;
+ }
+ // We might be able to hoist the load.
+ if (I.mayReadFromMemory()) {
+ auto *LI = dyn_cast<LoadInst>(&I);
+ if (!LI)
+ return false;
+ if (!SafePtrs.count(LI->getPointerOperand())) {
+ // !llvm.mem.parallel_loop_access implies if-conversion safety.
+ // Otherwise, record that the load needs (real or emulated) masking
+ // and let the cost model decide.
+ if (!IsAnnotatedParallel)
+ MaskedOp.insert(LI);
+ continue;
+ }
+ }
+
+ if (I.mayWriteToMemory()) {
+ auto *SI = dyn_cast<StoreInst>(&I);
+ if (!SI)
+ return false;
+ // Predicated store requires some form of masking:
+ // 1) masked store HW instruction,
+ // 2) emulation via load-blend-store (only if safe and legal to do so;
+ // beware of race conditions), or
+ // 3) element-by-element predicate check and scalar store.
+ MaskedOp.insert(SI);
+ continue;
+ }
+ if (I.mayThrow())
+ return false;
+ }
+
+ return true;
+}
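+
+// Sketch of a block that needs predication and how it is classified
+// (hypothetical code):
+//
+//   for (int i = 0; i < n; ++i)
+//     if (c[i])       // conditional block: blockNeedsPredication() is true
+//       a[i] = x;     // store recorded in MaskedOp; the cost model later
+//                     // picks a masked-store instruction, load-blend-store,
+//                     // or element-by-element scalar stores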
+
+bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
+ if (!EnableIfConversion) {
+ ORE->emit(createMissedAnalysis("IfConversionDisabled")
+ << "if-conversion is disabled");
+ return false;
+ }
+
+ assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
+
+ // A list of pointers that we can safely read and write to.
+ SmallPtrSet<Value *, 8> SafePointes;
+
+ // Collect safe addresses.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ if (blockNeedsPredication(BB))
+ continue;
+
+ for (Instruction &I : *BB)
+ if (auto *Ptr = getLoadStorePointerOperand(&I))
+ SafePointes.insert(Ptr);
+ }
+
+ // Collect the blocks that need predication.
+ BasicBlock *Header = TheLoop->getHeader();
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // We don't support switch statements inside loops.
+ if (!isa<BranchInst>(BB->getTerminator())) {
+ ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
+ << "loop contains a switch statement");
+ return false;
+ }
+
+ // We must be able to predicate all blocks that need to be predicated.
+ if (blockNeedsPredication(BB)) {
+ if (!blockCanBePredicated(BB, SafePointes)) {
+ ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
+ << "control flow cannot be substituted for a select");
+ return false;
+ }
+ } else if (BB != Header && !canIfConvertPHINodes(BB)) {
+ ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
+ << "control flow cannot be substituted for a select");
+ return false;
+ }
+ }
+
+ // We can if-convert this loop.
+ return true;
+}
+
+// Helper function to canVectorizeLoopNestCFG.
+bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
+ bool UseVPlanNativePath) {
+ assert((UseVPlanNativePath || Lp->empty()) &&
+ "VPlan-native path is not enabled.");
+
+ // TODO: ORE should be improved to show more accurate information when an
+ // outer loop can't be vectorized because a nested loop is not understood or
+ // legal. Something like: "outer_loop_location: loop not vectorized:
+ // (inner_loop_location) loop control flow is not understood by vectorizer".
+
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
+ // We must have a loop in canonical form. Loops with indirectbr in them cannot
+ // be canonicalized.
+ if (!Lp->getLoopPreheader()) {
+ LLVM_DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We must have a single backedge.
+ if (Lp->getNumBackEdges() != 1) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We must have a single exiting block.
+ if (!Lp->getExitingBlock()) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We only handle bottom-tested loops, i.e., loops in which the condition is
+ // checked at the end of each iteration. With that we can assume that all
+ // instructions in the loop are executed the same number of times.
+ if (Lp->getExitingBlock() != Lp->getLoopLatch()) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
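
// Illustrative sketch, not part of this patch: after loop rotation this
// compiles to the canonical shape required above: one preheader, a single
// backedge, and a latch that is also the only exiting block (bottom-tested),
// so every block executes the same number of times.
void scale(float *A, int N) {
  for (int I = 0; I < N; ++I)
    A[I] *= 2.0f;
}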
+
+bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
+ Loop *Lp, bool UseVPlanNativePath) {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Recursively check whether the loop control flow of nested loops is
+ // understood.
+ for (Loop *SubLp : *Lp)
+ if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
+
+bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+  // Check whether the loop-related control flow in the loop nest is understood
+  // by the vectorizer.
+ if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We need to have a loop header.
+ LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
+ << '\n');
+
+ // Specific checks for outer loops. We skip the remaining legal checks at this
+ // point because they don't support outer loops.
+ if (!TheLoop->empty()) {
+ assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
+
+ if (!canVectorizeOuterLoop()) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Unsupported outer loop.\n");
+ // TODO: Implement DoExtraAnalysis when subsequent legal checks support
+ // outer loops.
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
+ return Result;
+ }
+
+ assert(TheLoop->empty() && "Inner loop expected.");
+ // Check if we can if-convert non-single-bb loops.
+ unsigned NumBlocks = TheLoop->getNumBlocks();
+ if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check if we can vectorize the instructions and CFG in this loop.
+ if (!canVectorizeInstrs()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Go over each instruction and look at memory deps.
+ if (!canVectorizeMemory()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
+ << (LAI->getRuntimePointerChecking()->Need
+ ? " (with a runtime bound check)"
+ : "")
+ << "!\n");
+
+ unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
+ if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
+ SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
+
+ if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
+ ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
+ << "Too many SCEV assumptions need to be made and checked "
+ << "at runtime");
+ LLVM_DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Okay! We've done all the tests. If any have failed, return false. Otherwise
+ // we can vectorize, and at this point we don't have any other mem analysis
+ // which may limit our maximum vectorization factor, so just return true with
+ // no restrictions.
+ return Result;
+}
+
+} // namespace llvm
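
A sketch of how a client is expected to drive this analysis; only canVectorize() itself is declared by this patch, the surrounding driver is illustrative:

// Sketch only: names other than canVectorize() are assumed for illustration.
bool tryVectorizeLoop(LoopVectorizationLegality &LVL, bool VPlanNativePath) {
  // With extra analysis enabled (e.g. remarks requested via
  // -pass-remarks-analysis=loop-vectorize), one remark is emitted per failed
  // check; otherwise the first failure returns immediately.
  if (!LVL.canVectorize(VPlanNativePath))
    return false;
  // ... cost modeling and planning follow ...
  return true;
}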
diff --git a/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
new file mode 100644
index 000000000000..2aa219064299
--- /dev/null
+++ b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -0,0 +1,282 @@
+//===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a LoopVectorizationPlanner class.
+/// LoopVectorizationPlanner drives the vectorization process after the
+/// Legality checks have passed.
+/// The planner builds and optimizes the Vectorization Plans, which record the
+/// decisions on how to vectorize the given loop. In particular, they represent
+/// the control flow of the vectorized version, the replication of instructions
+/// that are to be scalarized, and the interleaved access groups.
+///
+/// Also provides a VPlan-based builder utility analogous to IRBuilder.
+/// It provides an instruction-level API for generating VPInstructions while
+/// abstracting away the Recipe manipulation details.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+
+#include "VPlan.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+
+namespace llvm {
+
+/// VPlan-based builder utility analogous to IRBuilder.
+class VPBuilder {
+private:
+ VPBasicBlock *BB = nullptr;
+ VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
+
+ VPInstruction *createInstruction(unsigned Opcode,
+ ArrayRef<VPValue *> Operands) {
+ VPInstruction *Instr = new VPInstruction(Opcode, Operands);
+ if (BB)
+ BB->insert(Instr, InsertPt);
+ return Instr;
+ }
+
+ VPInstruction *createInstruction(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands) {
+ return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
+ }
+
+public:
+ VPBuilder() {}
+
+ /// Clear the insertion point: created instructions will not be inserted into
+ /// a block.
+ void clearInsertionPoint() {
+ BB = nullptr;
+ InsertPt = VPBasicBlock::iterator();
+ }
+
+ VPBasicBlock *getInsertBlock() const { return BB; }
+ VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }
+
+ /// InsertPoint - A saved insertion point.
+ class VPInsertPoint {
+ VPBasicBlock *Block = nullptr;
+ VPBasicBlock::iterator Point;
+
+ public:
+ /// Creates a new insertion point which doesn't point to anything.
+ VPInsertPoint() = default;
+
+ /// Creates a new insertion point at the given location.
+ VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint)
+ : Block(InsertBlock), Point(InsertPoint) {}
+
+ /// Returns true if this insert point is set.
+ bool isSet() const { return Block != nullptr; }
+
+ VPBasicBlock *getBlock() const { return Block; }
+ VPBasicBlock::iterator getPoint() const { return Point; }
+ };
+
+ /// Sets the current insert point to a previously-saved location.
+ void restoreIP(VPInsertPoint IP) {
+ if (IP.isSet())
+ setInsertPoint(IP.getBlock(), IP.getPoint());
+ else
+ clearInsertionPoint();
+ }
+
+ /// This specifies that created VPInstructions should be appended to the end
+ /// of the specified block.
+ void setInsertPoint(VPBasicBlock *TheBB) {
+ assert(TheBB && "Attempting to set a null insert point");
+ BB = TheBB;
+ InsertPt = BB->end();
+ }
+
+ /// This specifies that created instructions should be inserted at the
+ /// specified point.
+ void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) {
+ BB = TheBB;
+ InsertPt = IP;
+ }
+
+ /// Insert and return the specified instruction.
+ VPInstruction *insert(VPInstruction *I) const {
+ BB->insert(I, InsertPt);
+ return I;
+ }
+
+ /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
+ /// its underlying Instruction.
+ VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
+ Instruction *Inst = nullptr) {
+ VPInstruction *NewVPInst = createInstruction(Opcode, Operands);
+ NewVPInst->setUnderlyingValue(Inst);
+ return NewVPInst;
+ }
+ VPValue *createNaryOp(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands,
+ Instruction *Inst = nullptr) {
+ return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst);
+ }
+
+ VPValue *createNot(VPValue *Operand) {
+ return createInstruction(VPInstruction::Not, {Operand});
+ }
+
+ VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
+ return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
+ }
+
+ VPValue *createOr(VPValue *LHS, VPValue *RHS) {
+ return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
+ }
+
+ //===--------------------------------------------------------------------===//
+ // RAII helpers.
+ //===--------------------------------------------------------------------===//
+
+ /// RAII object that stores the current insertion point and restores it when
+ /// the object is destroyed.
+ class InsertPointGuard {
+ VPBuilder &Builder;
+ VPBasicBlock *Block;
+ VPBasicBlock::iterator Point;
+
+ public:
+ InsertPointGuard(VPBuilder &B)
+ : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {}
+
+ InsertPointGuard(const InsertPointGuard &) = delete;
+ InsertPointGuard &operator=(const InsertPointGuard &) = delete;
+
+ ~InsertPointGuard() { Builder.restoreIP(VPInsertPoint(Block, Point)); }
+ };
+};
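
// Illustrative sketch, not part of this patch: the IRBuilder-style usage the
// class above is designed for. The blocks and mask values are assumed to
// exist already.
static void emitMasks(VPBuilder &Builder, VPBasicBlock *VPBB0,
                      VPBasicBlock *VPBB1, VPValue *EdgeMask,
                      VPValue *BlockIn) {
  Builder.setInsertPoint(VPBB0); // append new VPInstructions to VPBB0
  VPValue *NotTaken = Builder.createNot(EdgeMask);
  VPValue *Mask0 = Builder.createAnd(BlockIn, NotTaken);
  {
    VPBuilder::InsertPointGuard Guard(Builder); // saves the insertion point
    Builder.setInsertPoint(VPBB1);
    Builder.createOr(Mask0, EdgeMask);
  } // Guard's destructor restores the VPBB0 insertion point here
}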
+
+/// TODO: The following VectorizationFactor was pulled out of
+/// LoopVectorizationCostModel class. LV also deals with
+/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
+/// We need to streamline them.
+
+/// Information about vectorization costs
+struct VectorizationFactor {
+ // Vector width with best cost
+ unsigned Width;
+ // Cost of the loop with that width
+ unsigned Cost;
+};
+
+/// Planner drives the vectorization process after having passed
+/// Legality checks.
+class LoopVectorizationPlanner {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+  /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ using VPlanPtr = std::unique_ptr<VPlan>;
+
+ SmallVector<VPlanPtr, 4> VPlans;
+
+ /// This class is used to enable the VPlan to invoke a method of ILV. This is
+ /// needed until the method is refactored out of ILV and becomes reusable.
+ struct VPCallbackILV : public VPCallback {
+ InnerLoopVectorizer &ILV;
+
+ VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
+
+ Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
+ };
+
+ /// A builder used to construct the current plan.
+ VPBuilder Builder;
+
+ unsigned BestVF = 0;
+ unsigned BestUF = 0;
+
+public:
+ LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM)
+ : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
+
+ /// Plan how to best vectorize, return the best VF and its cost.
+ VectorizationFactor plan(bool OptForSize, unsigned UserVF);
+
+ /// Use the VPlan-native path to plan how to best vectorize, return the best
+ /// VF and its cost.
+ VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF);
+
+ /// Finalize the best decision and dispose of all other VPlans.
+ void setBestPlan(unsigned VF, unsigned UF);
+
+ /// Generate the IR code for the body of the vectorized loop according to the
+ /// best selected VPlan.
+ void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+
+ void printPlans(raw_ostream &O) {
+ for (const auto &Plan : VPlans)
+ O << *Plan;
+ }
+
+ /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
+ /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
+ /// returned value holds for the entire \p Range.
+ static bool
+ getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
+ VFRange &Range);
+
+protected:
+ /// Collect the instructions from the original loop that would be trivially
+ /// dead in the vectorized loop if generated.
+ void collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop.
+ void buildVPlans(unsigned MinVF, unsigned MaxVF);
+
+private:
+ /// Build a VPlan according to the information gathered by Legal. \return a
+ /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
+ /// exclusive, possibly decreasing \p Range.End.
+ VPlanPtr buildVPlan(VFRange &Range);
+
+  /// Build a VPlan using VPRecipes according to the information gathered by
+  /// Legal. This method is only used for the legacy inner loop vectorizer.
+ VPlanPtr
+ buildVPlanWithVPRecipes(VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
+ void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
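
A sketch of the call sequence this interface anticipates; the real call sites are in the LoopVectorize.cpp diff below, and the argument setup here is abbreviated:

// Sketch only: assumes Legality, the cost model, and an InnerLoopVectorizer
// have been constructed by the pass, as in LoopVectorize.cpp.
void runPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
                const TargetTransformInfo *TTI,
                LoopVectorizationLegality *Legal,
                LoopVectorizationCostModel &CM, InnerLoopVectorizer &ILV,
                DominatorTree *DT, bool OptForSize, unsigned UserVF,
                unsigned IC) {
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, Legal, CM);
  VectorizationFactor VF = LVP.plan(OptForSize, UserVF); // build + cost VPlans
  LVP.setBestPlan(VF.Width, IC); // keep the chosen VPlan, discard the rest
  LVP.executePlan(ILV, DT);      // generate IR from the chosen VPlan
}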
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 52f32cda2609..3c693f5d5ee0 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -26,6 +26,14 @@
// of vectorization. It decides on the optimal vector width, which
// can be one, if vectorization is not profitable.
//
+// There is a development effort going on to migrate the loop vectorizer to
+// the VPlan infrastructure and to introduce outer loop vectorization support
+// (see docs/Proposal/VectorizationPlan.rst and
+// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For
+// this purpose, we temporarily introduced the VPlan-native vectorization
+// path: an alternative vectorization path that is natively implemented on
+// top of the VPlan infrastructure. See EnableVPlanNativePath for enabling it.
+//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
@@ -47,8 +55,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
-#include "VPlan.h"
-#include "VPlanBuilder.h"
+#include "LoopVectorizationPlanner.h"
+#include "VPRecipeBuilder.h"
+#include "VPlanHCFGBuilder.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -57,11 +66,9 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
@@ -70,6 +77,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
@@ -124,6 +132,7 @@
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -145,10 +154,6 @@ using namespace llvm;
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
-static cl::opt<bool>
- EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
- cl::desc("Enable if-conversion during vectorization."));
-
/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
@@ -184,9 +189,6 @@ static cl::opt<unsigned> ForceTargetNumVectorRegs(
"force-target-num-vector-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of vector registers."));
-/// Maximum vectorization interleave count.
-static const unsigned MaxInterleaveFactor = 16;
-
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
"force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's max interleave factor for "
@@ -209,7 +211,7 @@ static cl::opt<unsigned> SmallLoopCost(
"The cost of a loop that is considered 'small' by the interleaver."));
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
- "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
+ "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
cl::desc("Enable the use of the block frequency analysis to access PGO "
"heuristics minimizing code growth in cold regions and being more "
"aggressive in hot regions."));
@@ -238,71 +240,21 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC(
cl::desc("The maximum interleave count to use when interleaving a scalar "
"reduction in a nested loop."));
-static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
- "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
- cl::desc("The maximum allowed number of runtime memory checks with a "
- "vectorize(enable) pragma."));
-
-static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
- "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
- cl::desc("The maximum number of SCEV checks allowed."));
-
-static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
- "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
- cl::desc("The maximum number of SCEV checks allowed with a "
- "vectorize(enable) pragma"));
-
-/// Create an analysis remark that explains why vectorization failed
-///
-/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
-/// RemarkName is the identifier for the remark. If \p I is passed it is an
-/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
-/// the location of the remark. \return the remark object that can be
-/// streamed to.
-static OptimizationRemarkAnalysis
-createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
- Instruction *I = nullptr) {
- Value *CodeRegion = TheLoop->getHeader();
- DebugLoc DL = TheLoop->getStartLoc();
-
- if (I) {
- CodeRegion = I->getParent();
- // If there is no debug location attached to the instruction, revert back to
- // using the loop's.
- if (I->getDebugLoc())
- DL = I->getDebugLoc();
- }
-
- OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
- R << "loop not vectorized: ";
- return R;
-}
-
-namespace {
-
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
-class LoopVectorizationRequirements;
-
-} // end anonymous namespace
-
-/// Returns true if the given loop body has a cycle, excluding the loop
-/// itself.
-static bool hasCyclesInLoopBody(const Loop &L) {
- if (!L.empty())
- return true;
-
- for (const auto &SCC :
- make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
- scc_iterator<Loop, LoopBodyTraits>::end(L))) {
- if (SCC.size() > 1) {
- DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
- DEBUG(L.dump());
- return true;
- }
- }
- return false;
-}
+static cl::opt<bool> EnableVPlanNativePath(
+ "enable-vplan-native-path", cl::init(false), cl::Hidden,
+ cl::desc("Enable VPlan-native vectorization path with "
+ "support for outer loop vectorization."));
+
+// This flag enables stress testing of the VPlan H-CFG construction in the
+// VPlan-native vectorization path. It must be used in conjunction with
+// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable
+// verification of the H-CFGs built.
+static cl::opt<bool> VPlanBuildStressTest(
+ "vplan-build-stress-test", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Build VPlan for every supported loop nest in the function and bail "
+ "out right after the build (stress test the VPlan H-CFG construction "
+ "in the VPlan-native vectorization path)."));
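
// Illustrative usage, not part of this patch: both flags are opt-in, so a
// stress-test run combines them, e.g.
//   opt -S -loop-vectorize -enable-vplan-native-path \
//       -vplan-build-stress-test -vplan-verify-hcfg in.ll
// which builds (and optionally verifies) an H-CFG for every supported loop
// nest without generating vector code.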
/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
@@ -317,16 +269,6 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) {
// in the project. They can be effectively organized in a common Load/Store
// utilities unit.
-/// A helper function that returns the pointer operand of a load or store
-/// instruction.
-static Value *getPointerOperand(Value *I) {
- if (auto *LI = dyn_cast<LoadInst>(I))
- return LI->getPointerOperand();
- if (auto *SI = dyn_cast<StoreInst>(I))
- return SI->getPointerOperand();
- return nullptr;
-}
-
/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
@@ -373,7 +315,7 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
-/// will execute once for for every X iterations of the loop header.
+/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
@@ -502,7 +444,7 @@ public:
void vectorizeMemoryInstruction(Instruction *Instr,
VectorParts *BlockInMask = nullptr);
- /// \brief Set the debug location in the builder using the debug location in
+ /// Set the debug location in the builder using the debug location in
/// the instruction.
void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
@@ -538,7 +480,7 @@ protected:
/// vectorizing this phi node.
void fixReduction(PHINode *Phi);
- /// \brief The Loop exit block may have single value PHI nodes with some
+ /// The Loop exit block may have single value PHI nodes with some
/// incoming value. While vectorizing we only handled real values
/// that were defined inside the loop and we should have one value for
/// each predecessor of its parent basic block. See PR14725.
@@ -573,9 +515,9 @@ protected:
/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, \p Step is the size of the step, and
/// \p EntryVal is the value from the original loop that maps to the steps.
- /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
- /// can be a truncate instruction).
- void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
+ /// Note that \p EntryVal doesn't have to be an induction variable - it
+ /// can also be a truncate instruction.
+ void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
const InductionDescriptor &ID);
/// Create a vector induction phi node based on an existing scalar one. \p
@@ -602,10 +544,20 @@ protected:
/// vector loop for both the Phi and the cast.
/// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
/// Otherwise, \p VectorLoopValue is a widened/vectorized value.
- void recordVectorLoopValueForInductionCast (const InductionDescriptor &ID,
- Value *VectorLoopValue,
- unsigned Part,
- unsigned Lane = UINT_MAX);
+ ///
+  /// \p EntryVal is the value from the original loop that maps to the vector
+  /// phi node and is used to distinguish which IV is currently being
+  /// processed: the original one (if \p EntryVal is a phi corresponding to the
+  /// original IV) or the "newly-created" one based on the proof mentioned
+  /// above (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()).
+  /// In the latter case \p EntryVal is a TruncInst and we must not record
+  /// anything for that IV, but it's error-prone to expect callers of this
+  /// routine to care about that, hence this explicit parameter.
+ void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
+ const Instruction *EntryVal,
+ Value *VectorLoopValue,
+ unsigned Part,
+ unsigned Lane = UINT_MAX);
/// Generate a shuffle sequence that will reverse the vector Vec.
virtual Value *reverseVector(Value *Vec);
@@ -646,7 +598,7 @@ protected:
/// loop.
void addMetadata(Instruction *To, Instruction *From);
- /// \brief Similar to the previous function but it adds the metadata to a
+ /// Similar to the previous function but it adds the metadata to a
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
@@ -679,7 +631,7 @@ protected:
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
- /// \brief LoopVersioning. It's only set up (non-null) if memchecks were
+ /// LoopVersioning. It's only set up (non-null) if memchecks were
/// used.
///
/// This is currently only used to add no-alias metadata based on the
@@ -777,7 +729,7 @@ private:
} // end namespace llvm
-/// \brief Look for a meaningful debug location on the instruction or it's
+/// Look for a meaningful debug location on the instruction or it's
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
if (!I)
@@ -849,7 +801,7 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
namespace llvm {
-/// \brief The group of interleaved loads/stores sharing the same stride and
+/// The group of interleaved loads/stores sharing the same stride and
/// close to each other.
///
/// Each member in this group has an index starting from 0, and the largest
@@ -893,7 +845,7 @@ public:
unsigned getAlignment() const { return Align; }
unsigned getNumMembers() const { return Members.size(); }
- /// \brief Try to insert a new member \p Instr with index \p Index and
+ /// Try to insert a new member \p Instr with index \p Index and
/// alignment \p NewAlign. The index is related to the leader and it could be
/// negative if it is the new leader.
///
@@ -927,7 +879,7 @@ public:
return true;
}
- /// \brief Get the member with the given index \p Index
+ /// Get the member with the given index \p Index
///
/// \returns nullptr if contains no such member.
Instruction *getMember(unsigned Index) const {
@@ -938,7 +890,7 @@ public:
return Members.find(Key)->second;
}
- /// \brief Get the index for the given member. Unlike the key in the member
+ /// Get the index for the given member. Unlike the key in the member
/// map, the index starts from 0.
unsigned getIndex(Instruction *Instr) const {
for (auto I : Members)
@@ -989,7 +941,7 @@ private:
namespace {
-/// \brief Drive the analysis of interleaved memory accesses in the loop.
+/// Drive the analysis of interleaved memory accesses in the loop.
///
/// Use this class to analyze interleaved accesses only when we can vectorize
/// a loop. Otherwise it's meaningless to do analysis as the vectorization
@@ -1000,11 +952,12 @@ namespace {
class InterleavedAccessInfo {
public:
InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
- DominatorTree *DT, LoopInfo *LI)
- : PSE(PSE), TheLoop(L), DT(DT), LI(LI) {}
+ DominatorTree *DT, LoopInfo *LI,
+ const LoopAccessInfo *LAI)
+ : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
~InterleavedAccessInfo() {
- SmallSet<InterleaveGroup *, 4> DelSet;
+ SmallPtrSet<InterleaveGroup *, 4> DelSet;
// Avoid releasing a pointer twice.
for (auto &I : InterleaveGroupMap)
DelSet.insert(I.second);
@@ -1012,16 +965,16 @@ public:
delete Ptr;
}
- /// \brief Analyze the interleaved accesses and collect them in interleave
+ /// Analyze the interleaved accesses and collect them in interleave
/// groups. Substitute symbolic strides using \p Strides.
- void analyzeInterleaving(const ValueToValueMap &Strides);
+ void analyzeInterleaving();
- /// \brief Check if \p Instr belongs to any interleave group.
+ /// Check if \p Instr belongs to any interleave group.
bool isInterleaved(Instruction *Instr) const {
return InterleaveGroupMap.count(Instr);
}
- /// \brief Get the interleave group that \p Instr belongs to.
+ /// Get the interleave group that \p Instr belongs to.
///
/// \returns nullptr if doesn't have such group.
InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
@@ -1030,13 +983,10 @@ public:
return nullptr;
}
- /// \brief Returns true if an interleaved group that may access memory
+ /// Returns true if an interleaved group that may access memory
/// out-of-bounds requires a scalar epilogue iteration for correctness.
bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
- /// \brief Initialize the LoopAccessInfo used for dependence checking.
- void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
-
private:
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
/// Simplifies SCEV expressions in the context of existing SCEV assumptions.
@@ -1047,7 +997,7 @@ private:
Loop *TheLoop;
DominatorTree *DT;
LoopInfo *LI;
- const LoopAccessInfo *LAI = nullptr;
+ const LoopAccessInfo *LAI;
/// True if the loop may contain non-reversed interleaved groups with
/// out-of-bounds accesses. We ensure we don't speculatively access memory
@@ -1061,7 +1011,7 @@ private:
/// access to a set of dependent sink accesses.
DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
- /// \brief The descriptor for a strided memory access.
+ /// The descriptor for a strided memory access.
struct StrideDescriptor {
StrideDescriptor() = default;
StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
@@ -1081,10 +1031,10 @@ private:
unsigned Align = 0;
};
- /// \brief A type for holding instructions and their stride descriptors.
+ /// A type for holding instructions and their stride descriptors.
using StrideEntry = std::pair<Instruction *, StrideDescriptor>;
- /// \brief Create a new interleave group with the given instruction \p Instr,
+ /// Create a new interleave group with the given instruction \p Instr,
/// stride \p Stride and alignment \p Align.
///
/// \returns the newly created interleave group.
@@ -1096,7 +1046,7 @@ private:
return InterleaveGroupMap[Instr];
}
- /// \brief Release the group and remove all the relationships.
+ /// Release the group and remove all the relationships.
void releaseGroup(InterleaveGroup *Group) {
for (unsigned i = 0; i < Group->getFactor(); i++)
if (Instruction *Member = Group->getMember(i))
@@ -1105,28 +1055,28 @@ private:
delete Group;
}
- /// \brief Collect all the accesses with a constant stride in program order.
+ /// Collect all the accesses with a constant stride in program order.
void collectConstStrideAccesses(
MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
const ValueToValueMap &Strides);
- /// \brief Returns true if \p Stride is allowed in an interleaved group.
+ /// Returns true if \p Stride is allowed in an interleaved group.
static bool isStrided(int Stride) {
unsigned Factor = std::abs(Stride);
return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
}
- /// \brief Returns true if \p BB is a predicated block.
+ /// Returns true if \p BB is a predicated block.
bool isPredicated(BasicBlock *BB) const {
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
- /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
+ /// Returns true if LoopAccessInfo can be used for dependence queries.
bool areDependencesValid() const {
return LAI && LAI->getDepChecker().getDependences();
}
- /// \brief Returns true if memory accesses \p A and \p B can be reordered, if
+ /// Returns true if memory accesses \p A and \p B can be reordered, if
/// necessary, when constructing interleaved groups.
///
/// \p A must precede \p B in program order. We return false if reordering is
@@ -1174,7 +1124,7 @@ private:
return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
}
- /// \brief Collect the dependences from LoopAccessInfo.
+ /// Collect the dependences from LoopAccessInfo.
///
/// We process the dependences once during the interleaved access analysis to
/// enable constant-time dependence queries.
@@ -1187,315 +1137,6 @@ private:
}
};
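
// Illustrative sketch, not part of this patch: a factor-2 interleaved access
// pattern. The analysis above would place the two loads into one
// InterleaveGroup (member indices 0 and 1), allowing a single wide load plus
// shuffles instead of two strided loads.
void sumPairs(float *Out, const float *In, int N) {
  for (int I = 0; I < N; ++I)
    Out[I] = In[2 * I] + In[2 * I + 1];
}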
-/// Utility class for getting and setting loop vectorizer hints in the form
-/// of loop metadata.
-/// This class keeps a number of loop annotations locally (as member variables)
-/// and can, upon request, write them back as metadata on the loop. It will
-/// initially scan the loop for existing metadata, and will update the local
-/// values based on information in the loop.
-/// We cannot write all values to metadata, as the mere presence of some info,
-/// for example 'force', means a decision has been made. So, we need to be
-/// careful NOT to add them if the user hasn't specifically asked so.
-class LoopVectorizeHints {
- enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED };
-
- /// Hint - associates name and validation with the hint value.
- struct Hint {
- const char *Name;
- unsigned Value; // This may have to change for non-numeric values.
- HintKind Kind;
-
- Hint(const char *Name, unsigned Value, HintKind Kind)
- : Name(Name), Value(Value), Kind(Kind) {}
-
- bool validate(unsigned Val) {
- switch (Kind) {
- case HK_WIDTH:
- return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
- case HK_UNROLL:
- return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
- case HK_FORCE:
- return (Val <= 1);
- case HK_ISVECTORIZED:
- return (Val==0 || Val==1);
- }
- return false;
- }
- };
-
- /// Vectorization width.
- Hint Width;
-
- /// Vectorization interleave factor.
- Hint Interleave;
-
- /// Vectorization forced
- Hint Force;
-
- /// Already Vectorized
- Hint IsVectorized;
-
- /// Return the loop metadata prefix.
- static StringRef Prefix() { return "llvm.loop."; }
-
- /// True if there is any unsafe math in the loop.
- bool PotentiallyUnsafe = false;
-
-public:
- enum ForceKind {
- FK_Undefined = -1, ///< Not selected.
- FK_Disabled = 0, ///< Forcing disabled.
- FK_Enabled = 1, ///< Forcing enabled.
- };
-
- LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
- OptimizationRemarkEmitter &ORE)
- : Width("vectorize.width", VectorizerParams::VectorizationFactor,
- HK_WIDTH),
- Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
- Force("vectorize.enable", FK_Undefined, HK_FORCE),
- IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
- // Populate values with existing loop metadata.
- getHintsFromMetadata();
-
- // force-vector-interleave overrides DisableInterleaving.
- if (VectorizerParams::isInterleaveForced())
- Interleave.Value = VectorizerParams::VectorizationInterleave;
-
- if (IsVectorized.Value != 1)
- // If the vectorization width and interleaving count are both 1 then
- // consider the loop to have been already vectorized because there's
- // nothing more that we can do.
- IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
- DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
- << "LV: Interleaving disabled by the pass manager\n");
- }
-
- /// Mark the loop L as already vectorized by setting the width to 1.
- void setAlreadyVectorized() {
- IsVectorized.Value = 1;
- Hint Hints[] = {IsVectorized};
- writeHintsToMetadata(Hints);
- }
-
- bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
- if (getForce() == LoopVectorizeHints::FK_Disabled) {
- DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
- emitRemarkWithHints();
- return false;
- }
-
- if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
- DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
- emitRemarkWithHints();
- return false;
- }
-
- if (getIsVectorized() == 1) {
- DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
- // FIXME: Add interleave.disable metadata. This will allow
- // vectorize.disable to be used without disabling the pass and errors
- // to differentiate between disabled vectorization and a width of 1.
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
- "AllDisabled", L->getStartLoc(),
- L->getHeader())
- << "loop not vectorized: vectorization and interleaving are "
- "explicitly disabled, or the loop has already been "
- "vectorized";
- });
- return false;
- }
-
- return true;
- }
-
- /// Dumps all the hint information.
- void emitRemarkWithHints() const {
- using namespace ore;
-
- ORE.emit([&]() {
- if (Force.Value == LoopVectorizeHints::FK_Disabled)
- return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
- TheLoop->getStartLoc(),
- TheLoop->getHeader())
- << "loop not vectorized: vectorization is explicitly disabled";
- else {
- OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
- TheLoop->getStartLoc(),
- TheLoop->getHeader());
- R << "loop not vectorized";
- if (Force.Value == LoopVectorizeHints::FK_Enabled) {
- R << " (Force=" << NV("Force", true);
- if (Width.Value != 0)
- R << ", Vector Width=" << NV("VectorWidth", Width.Value);
- if (Interleave.Value != 0)
- R << ", Interleave Count="
- << NV("InterleaveCount", Interleave.Value);
- R << ")";
- }
- return R;
- }
- });
- }
-
- unsigned getWidth() const { return Width.Value; }
- unsigned getInterleave() const { return Interleave.Value; }
- unsigned getIsVectorized() const { return IsVectorized.Value; }
- enum ForceKind getForce() const { return (ForceKind)Force.Value; }
-
- /// \brief If hints are provided that force vectorization, use the AlwaysPrint
- /// pass name to force the frontend to print the diagnostic.
- const char *vectorizeAnalysisPassName() const {
- if (getWidth() == 1)
- return LV_NAME;
- if (getForce() == LoopVectorizeHints::FK_Disabled)
- return LV_NAME;
- if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
- return LV_NAME;
- return OptimizationRemarkAnalysis::AlwaysPrint;
- }
-
- bool allowReordering() const {
- // When enabling loop hints are provided we allow the vectorizer to change
- // the order of operations that is given by the scalar loop. This is not
- // enabled by default because can be unsafe or inefficient. For example,
- // reordering floating-point operations will change the way round-off
- // error accumulates in the loop.
- return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
- }
-
- bool isPotentiallyUnsafe() const {
- // Avoid FP vectorization if the target is unsure about proper support.
- // This may be related to the SIMD unit in the target not handling
- // IEEE 754 FP ops properly, or bad single-to-double promotions.
- // Otherwise, a sequence of vectorized loops, even without reduction,
- // could lead to different end results on the destination vectors.
- return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
- }
-
- void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }
-
-private:
- /// Find hints specified in the loop metadata and update local values.
- void getHintsFromMetadata() {
- MDNode *LoopID = TheLoop->getLoopID();
- if (!LoopID)
- return;
-
- // First operand should refer to the loop id itself.
- assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
- assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
-
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- const MDString *S = nullptr;
- SmallVector<Metadata *, 4> Args;
-
- // The expected hint is either a MDString or a MDNode with the first
- // operand a MDString.
- if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
- if (!MD || MD->getNumOperands() == 0)
- continue;
- S = dyn_cast<MDString>(MD->getOperand(0));
- for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
- Args.push_back(MD->getOperand(i));
- } else {
- S = dyn_cast<MDString>(LoopID->getOperand(i));
- assert(Args.size() == 0 && "too many arguments for MDString");
- }
-
- if (!S)
- continue;
-
- // Check if the hint starts with the loop metadata prefix.
- StringRef Name = S->getString();
- if (Args.size() == 1)
- setHint(Name, Args[0]);
- }
- }
-
- /// Checks string hint with one operand and set value if valid.
- void setHint(StringRef Name, Metadata *Arg) {
- if (!Name.startswith(Prefix()))
- return;
- Name = Name.substr(Prefix().size(), StringRef::npos);
-
- const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
- if (!C)
- return;
- unsigned Val = C->getZExtValue();
-
- Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
- for (auto H : Hints) {
- if (Name == H->Name) {
- if (H->validate(Val))
- H->Value = Val;
- else
- DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
- break;
- }
- }
- }
-
- /// Create a new hint from name / value pair.
- MDNode *createHintMetadata(StringRef Name, unsigned V) const {
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- Metadata *MDs[] = {MDString::get(Context, Name),
- ConstantAsMetadata::get(
- ConstantInt::get(Type::getInt32Ty(Context), V))};
- return MDNode::get(Context, MDs);
- }
-
- /// Matches metadata with hint name.
- bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
- MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
- if (!Name)
- return false;
-
- for (auto H : HintTypes)
- if (Name->getString().endswith(H.Name))
- return true;
- return false;
- }
-
- /// Sets current hints into loop metadata, keeping other values intact.
- void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
- if (HintTypes.empty())
- return;
-
- // Reserve the first element to LoopID (see below).
- SmallVector<Metadata *, 4> MDs(1);
- // If the loop already has metadata, then ignore the existing operands.
- MDNode *LoopID = TheLoop->getLoopID();
- if (LoopID) {
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
- // If node in update list, ignore old value.
- if (!matchesHintMetadataName(Node, HintTypes))
- MDs.push_back(Node);
- }
- }
-
- // Now, add the missing hints.
- for (auto H : HintTypes)
- MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
-
- // Replace current metadata node with new one.
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
-
- TheLoop->setLoopID(NewLoopID);
- }
-
- /// The loop these hints belong to.
- const Loop *TheLoop;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter &ORE;
-};
-
} // end anonymous namespace
static void emitMissedWarning(Function *F, Loop *L,
@@ -1519,324 +1160,7 @@ static void emitMissedWarning(Function *F, Loop *L,
}
}
-namespace {
-
-/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
-/// to what vectorization factor.
-/// This class does not look at the profitability of vectorization, only the
-/// legality. This class has two main kinds of checks:
-/// * Memory checks - The code in canVectorizeMemory checks if vectorization
-/// will change the order of memory accesses in a way that will change the
-/// correctness of the program.
-/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
-/// checks for a number of different conditions, such as the availability of a
-/// single induction variable, that all types are supported and vectorize-able,
-/// etc. This code reflects the capabilities of InnerLoopVectorizer.
-/// This class is also used by InnerLoopVectorizer for identifying
-/// induction variable and the different reduction variables.
-class LoopVectorizationLegality {
-public:
- LoopVectorizationLegality(
- Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
- TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
- const TargetTransformInfo *TTI,
- std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
- OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
- LoopVectorizeHints *H)
- : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA),
- ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H) {}
-
- /// ReductionList contains the reduction descriptors for all
- /// of the reductions that were found in the loop.
- using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>;
-
- /// InductionList saves induction variables and maps them to the
- /// induction descriptor.
- using InductionList = MapVector<PHINode *, InductionDescriptor>;
-
- /// RecurrenceSet contains the phi nodes that are recurrences other than
- /// inductions and reductions.
- using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
-
- /// Returns true if it is legal to vectorize this loop.
- /// This does not mean that it is profitable to vectorize this
- /// loop, only that it is legal to do so.
- bool canVectorize();
-
- /// Returns the primary induction variable.
- PHINode *getPrimaryInduction() { return PrimaryInduction; }
-
- /// Returns the reduction variables found in the loop.
- ReductionList *getReductionVars() { return &Reductions; }
-
- /// Returns the induction variables found in the loop.
- InductionList *getInductionVars() { return &Inductions; }
-
- /// Return the first-order recurrences found in the loop.
- RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
-
- /// Return the set of instructions to sink to handle first-order recurrences.
- DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
-
- /// Returns the widest induction type.
- Type *getWidestInductionType() { return WidestIndTy; }
-
- /// Returns True if V is a Phi node of an induction variable in this loop.
- bool isInductionPhi(const Value *V);
-
- /// Returns True if V is a cast that is part of an induction def-use chain,
- /// and had been proven to be redundant under a runtime guard (in other
- /// words, the cast has the same SCEV expression as the induction phi).
- bool isCastedInductionVariable(const Value *V);
-
- /// Returns True if V can be considered as an induction variable in this
- /// loop. V can be the induction phi, or some redundant cast in the def-use
- /// chain of the inducion phi.
- bool isInductionVariable(const Value *V);
-
- /// Returns True if PN is a reduction variable in this loop.
- bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
-
- /// Returns True if Phi is a first-order recurrence in this loop.
- bool isFirstOrderRecurrence(const PHINode *Phi);
-
- /// Return true if the block BB needs to be predicated in order for the loop
- /// to be vectorized.
- bool blockNeedsPredication(BasicBlock *BB);
-
- /// Check if this pointer is consecutive when vectorizing. This happens
- /// when the last index of the GEP is the induction variable, or that the
- /// pointer itself is an induction variable.
- /// This check allows us to vectorize A[idx] into a wide load/store.
- /// Returns:
- /// 0 - Stride is unknown or non-consecutive.
- /// 1 - Address is consecutive.
- /// -1 - Address is consecutive, and decreasing.
- /// NOTE: This method must only be used before modifying the original scalar
- /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
- int isConsecutivePtr(Value *Ptr);
-
- /// Returns true if the value V is uniform within the loop.
- bool isUniform(Value *V);
-
- /// Returns the information that we collected about runtime memory check.
- const RuntimePointerChecking *getRuntimePointerChecking() const {
- return LAI->getRuntimePointerChecking();
- }
-
- const LoopAccessInfo *getLAI() const { return LAI; }
-
- /// \brief Check if \p Instr belongs to any interleaved access group.
- bool isAccessInterleaved(Instruction *Instr) {
- return InterleaveInfo.isInterleaved(Instr);
- }
-
- /// \brief Get the interleaved access group that \p Instr belongs to.
- const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
- return InterleaveInfo.getInterleaveGroup(Instr);
- }
-
- /// \brief Returns true if an interleaved group requires a scalar iteration
- /// to handle accesses with gaps.
- bool requiresScalarEpilogue() const {
- return InterleaveInfo.requiresScalarEpilogue();
- }
-
- unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
-
- uint64_t getMaxSafeRegisterWidth() const {
- return LAI->getDepChecker().getMaxSafeRegisterWidth();
- }
-
- bool hasStride(Value *V) { return LAI->hasStride(V); }
-
- /// Returns true if the target machine supports masked store operation
- /// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
- return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
- }
-
- /// Returns true if the target machine supports masked load operation
- /// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
- return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
- }
-
- /// Returns true if the target machine supports masked scatter operation
- /// for the given \p DataType.
- bool isLegalMaskedScatter(Type *DataType) {
- return TTI->isLegalMaskedScatter(DataType);
- }
-
- /// Returns true if the target machine supports masked gather operation
- /// for the given \p DataType.
- bool isLegalMaskedGather(Type *DataType) {
- return TTI->isLegalMaskedGather(DataType);
- }
-
- /// Returns true if the target machine can represent \p V as a masked gather
- /// or scatter operation.
- bool isLegalGatherOrScatter(Value *V) {
- auto *LI = dyn_cast<LoadInst>(V);
- auto *SI = dyn_cast<StoreInst>(V);
- if (!LI && !SI)
- return false;
- auto *Ptr = getPointerOperand(V);
- auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
- return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
- }
-
- /// Returns true if vector representation of the instruction \p I
- /// requires mask.
- bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
-
- unsigned getNumStores() const { return LAI->getNumStores(); }
- unsigned getNumLoads() const { return LAI->getNumLoads(); }
- unsigned getNumPredStores() const { return NumPredStores; }
-
- /// Returns true if \p I is an instruction that will be scalarized with
- /// predication. Such instructions include conditional stores and
- /// instructions that may divide by zero.
- bool isScalarWithPredication(Instruction *I);
-
- /// Returns true if \p I is a memory instruction with consecutive memory
- /// access that can be widened.
- bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
-
- // Returns true if the NoNaN attribute is set on the function.
- bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
-
-private:
- /// Check if a single basic block loop is vectorizable.
- /// At this point we know that this is a loop with a constant trip count
- /// and we only need to check individual instructions.
- bool canVectorizeInstrs();
-
- /// When we vectorize loops we may change the order in which
- /// we read and write from memory. This method checks if it is
- /// legal to vectorize the code, considering only memory constrains.
- /// Returns true if the loop is vectorizable
- bool canVectorizeMemory();
-
- /// Return true if we can vectorize this loop using the IF-conversion
- /// transformation.
- bool canVectorizeWithIfConvert();
-
- /// Return true if all of the instructions in the block can be speculatively
- /// executed. \p SafePtrs is a list of addresses that are known to be legal
- /// and we know that we can read from them without segfault.
- bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
-
- /// Updates the vectorization state by adding \p Phi to the inductions list.
- /// This can set \p Phi as the main induction of the loop if \p Phi is a
- /// better choice for the main induction than the existing one.
- void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
- SmallPtrSetImpl<Value *> &AllowedExit);
-
- /// Create an analysis remark that explains why vectorization failed
- ///
- /// \p RemarkName is the identifier for the remark. If \p I is passed it is
- /// an instruction that prevents vectorization. Otherwise the loop is used
- /// for the location of the remark. \return the remark object that can be
- /// streamed to.
- OptimizationRemarkAnalysis
- createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const {
- return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
- RemarkName, TheLoop, I);
- }
-
- /// \brief If an access has a symbolic strides, this maps the pointer value to
- /// the stride symbol.
- const ValueToValueMap *getSymbolicStrides() {
- // FIXME: Currently, the set of symbolic strides is sometimes queried before
- // it's collected. This happens from canVectorizeWithIfConvert, when the
- // pointer is checked to reference consecutive elements suitable for a
- // masked access.
- return LAI ? &LAI->getSymbolicStrides() : nullptr;
- }
-
- unsigned NumPredStores = 0;
-
- /// The loop that we evaluate.
- Loop *TheLoop;
-
- /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
- /// Applies dynamic knowledge to simplify SCEV expressions in the context
- /// of existing SCEV assumptions. The analysis will also add a minimal set
- /// of new predicates if this is required to enable vectorization and
- /// unrolling.
- PredicatedScalarEvolution &PSE;
-
- /// Target Library Info.
- TargetLibraryInfo *TLI;
-
- /// Target Transform Info
- const TargetTransformInfo *TTI;
-
- /// Dominator Tree.
- DominatorTree *DT;
-
- // LoopAccess analysis.
- std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
-
- // And the loop-accesses info corresponding to this loop. This pointer is
- // null until canVectorizeMemory sets it up.
- const LoopAccessInfo *LAI = nullptr;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter *ORE;
-
- /// The interleave access information contains groups of interleaved accesses
- /// with the same stride and close to each other.
- InterleavedAccessInfo InterleaveInfo;
-
- // --- vectorization state --- //
-
- /// Holds the primary induction variable. This is the counter of the
- /// loop.
- PHINode *PrimaryInduction = nullptr;
-
- /// Holds the reduction variables.
- ReductionList Reductions;
-
- /// Holds all of the induction variables that we found in the loop.
- /// Notice that inductions don't need to start at zero and that induction
- /// variables can be pointers.
- InductionList Inductions;
-
- /// Holds all the casts that participate in the update chain of the induction
- /// variables, and that have been proven to be redundant (possibly under a
- /// runtime guard). These casts can be ignored when creating the vectorized
- /// loop body.
- SmallPtrSet<Instruction *, 4> InductionCastsToIgnore;
-
- /// Holds the phi nodes that are first-order recurrences.
- RecurrenceSet FirstOrderRecurrences;
-
- /// Holds instructions that need to sink past other instructions to handle
- /// first-order recurrences.
- DenseMap<Instruction *, Instruction *> SinkAfter;
-
- /// Holds the widest induction type encountered.
- Type *WidestIndTy = nullptr;
-
- /// Allowed outside users. This holds the induction and reduction
- /// vars which can be accessed from outside the loop.
- SmallPtrSet<Value *, 4> AllowedExit;
-
- /// Can we assume the absence of NaNs.
- bool HasFunNoNaNAttr = false;
-
- /// Vectorization requirements that will go through late-evaluation.
- LoopVectorizationRequirements *Requirements;
-
- /// Used to emit an analysis of any legality issues.
- LoopVectorizeHints *Hints;
-
- /// While vectorizing these instructions we have to generate a
- /// call to the appropriate masked intrinsic
- SmallPtrSet<const Instruction *, 8> MaskedOp;
-};
+namespace llvm {
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
@@ -1853,23 +1177,15 @@ public:
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, const Function *F,
- const LoopVectorizeHints *Hints)
+ const LoopVectorizeHints *Hints,
+ InterleavedAccessInfo &IAI)
: TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
- AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
+ AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
/// \return An upper bound for the vectorization factor, or None if
/// vectorization should be avoided up front.
Optional<unsigned> computeMaxVF(bool OptForSize);
- /// Information about vectorization costs
- struct VectorizationFactor {
- // Vector width with best cost
- unsigned Width;
-
- // Cost of the loop with that width
- unsigned Cost;
- };
-
/// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
@@ -1903,7 +1219,7 @@ public:
/// avoid redundant calculations.
void setCostBasedWideningDecision(unsigned VF);
- /// \brief A struct that represents some properties of the register usage
+ /// A struct that represents some properties of the register usage
/// of a loop.
struct RegisterUsage {
/// Holds the number of loop invariant values that are used in the loop.
@@ -1911,9 +1227,6 @@ public:
/// Holds the maximum number of concurrent live intervals in the loop.
unsigned MaxLocalUsers;
-
- /// Holds the number of instructions in the loop.
- unsigned NumInstructions;
};
/// \return Returns information about the register usages of the loop for the
@@ -2063,7 +1376,69 @@ public:
collectLoopScalars(VF);
}
+ /// Returns true if the target machine supports masked store operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
+ return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
+ }
+
+ /// Returns true if the target machine supports masked load operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
+ return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
+ }
+
+ /// Returns true if the target machine supports masked scatter operation
+ /// for the given \p DataType.
+ bool isLegalMaskedScatter(Type *DataType) {
+ return TTI.isLegalMaskedScatter(DataType);
+ }
+
+ /// Returns true if the target machine supports masked gather operation
+ /// for the given \p DataType.
+ bool isLegalMaskedGather(Type *DataType) {
+ return TTI.isLegalMaskedGather(DataType);
+ }
+
+ /// Returns true if the target machine can represent \p V as a masked gather
+ /// or scatter operation.
+ bool isLegalGatherOrScatter(Value *V) {
+ bool LI = isa<LoadInst>(V);
+ bool SI = isa<StoreInst>(V);
+ if (!LI && !SI)
+ return false;
+ auto *Ty = getMemInstValueType(V);
+ return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
+ }
+
+ /// Returns true if \p I is an instruction that will be scalarized with
+ /// predication. Such instructions include conditional stores and
+ /// instructions that may divide by zero.
+ bool isScalarWithPredication(Instruction *I);
+
+ /// Returns true if \p I is a memory instruction with consecutive memory
+ /// access that can be widened.
+ bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+
+ /// Check if \p Instr belongs to any interleaved access group.
+ bool isAccessInterleaved(Instruction *Instr) {
+ return InterleaveInfo.isInterleaved(Instr);
+ }
+
+ /// Get the interleaved access group that \p Instr belongs to.
+ const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
+ return InterleaveInfo.getInterleaveGroup(Instr);
+ }
+
+ /// Returns true if an interleaved group requires a scalar iteration
+ /// to handle accesses with gaps.
+ bool requiresScalarEpilogue() const {
+ return InterleaveInfo.requiresScalarEpilogue();
+ }
+
private:
+ unsigned NumPredStores = 0;
+
/// \return An upper bound for the vectorization factor, larger than zero.
/// One is returned if vectorization should best be avoided due to cost.
unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
@@ -2115,12 +1490,16 @@ private:
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);
+ /// Returns true if an artificially high cost for emulated masked memrefs
+ /// should be used.
+ bool useEmulatedMaskMemRefHack(Instruction *I);
+
/// Create an analysis remark that explains why vectorization failed
///
/// \p RemarkName is the identifier for the remark. \return the remark object
/// that can be streamed to.
OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
- return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
+ return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
RemarkName, TheLoop);
}
@@ -2222,6 +1601,10 @@ public:
/// Loop Vectorize Hint.
const LoopVectorizeHints *Hints;
+ /// The interleave access information contains groups of interleaved accesses
+ /// with the same stride and close to each other.
+ InterleavedAccessInfo &InterleaveInfo;
+
/// Values to ignore in the cost model.
SmallPtrSet<const Value *, 16> ValuesToIgnore;
@@ -2229,271 +1612,78 @@ public:
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};
-} // end anonymous namespace
-
-namespace llvm {
-
-/// LoopVectorizationPlanner - drives the vectorization process after having
-/// passed Legality checks.
-/// The planner builds and optimizes the Vectorization Plans which record the
-/// decisions on how to vectorize the given loop. In particular, they represent
-/// the control-flow of the vectorized version, the replication of instructions
-/// that are to be scalarized, and the interleaved access groups.
-class LoopVectorizationPlanner {
- /// The loop that we evaluate.
- Loop *OrigLoop;
-
- /// Loop Info analysis.
- LoopInfo *LI;
-
- /// Target Library Info.
- const TargetLibraryInfo *TLI;
-
- /// Target Transform Info.
- const TargetTransformInfo *TTI;
-
- /// The legality analysis.
- LoopVectorizationLegality *Legal;
-
- /// The profitablity analysis.
- LoopVectorizationCostModel &CM;
-
- using VPlanPtr = std::unique_ptr<VPlan>;
-
- SmallVector<VPlanPtr, 4> VPlans;
-
- /// This class is used to enable the VPlan to invoke a method of ILV. This is
- /// needed until the method is refactored out of ILV and becomes reusable.
- struct VPCallbackILV : public VPCallback {
- InnerLoopVectorizer &ILV;
-
- VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
-
- Value *getOrCreateVectorValues(Value *V, unsigned Part) override {
- return ILV.getOrCreateVectorValue(V, Part);
- }
- };
-
- /// A builder used to construct the current plan.
- VPBuilder Builder;
-
- /// When we if-convert we need to create edge masks. We have to cache values
- /// so that we don't end up with exponential recursion/IR. Note that
- /// if-conversion currently takes place during VPlan-construction, so these
- /// caches are only used at that stage.
- using EdgeMaskCacheTy =
- DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
- using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
- EdgeMaskCacheTy EdgeMaskCache;
- BlockMaskCacheTy BlockMaskCache;
-
- unsigned BestVF = 0;
- unsigned BestUF = 0;
-
-public:
- LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI,
- LoopVectorizationLegality *Legal,
- LoopVectorizationCostModel &CM)
- : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
-
- /// Plan how to best vectorize, return the best VF and its cost.
- LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
- unsigned UserVF);
-
- /// Finalize the best decision and dispose of all other VPlans.
- void setBestPlan(unsigned VF, unsigned UF);
-
- /// Generate the IR code for the body of the vectorized loop according to the
- /// best selected VPlan.
- void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
-
- void printPlans(raw_ostream &O) {
- for (const auto &Plan : VPlans)
- O << *Plan;
- }
-
-protected:
- /// Collect the instructions from the original loop that would be trivially
- /// dead in the vectorized loop if generated.
- void collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions);
-
- /// A range of powers-of-2 vectorization factors with fixed start and
- /// adjustable end. The range includes start and excludes end, e.g.,:
- /// [1, 9) = {1, 2, 4, 8}
- struct VFRange {
- // A power of 2.
- const unsigned Start;
-
- // Need not be a power of 2. If End <= Start range is empty.
- unsigned End;
- };
-
- /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
- /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
- /// returned value holds for the entire \p Range.
- bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
- VFRange &Range);
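As an aside for readers of this patch, here is a minimal sketch of how such a predicate clamps a VFRange, assuming the VFRange struct above and <functional>; the helper name is hypothetical and not part of the diff:

    // Evaluate Predicate at Range.Start, then shrink Range.End so that every
    // VF remaining in [Start, End) yields the same decision as Start.
    static bool decideAndClampSketch(
        const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
      bool DecisionAtStart = Predicate(Range.Start);
      for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
        if (Predicate(VF) != DecisionAtStart) {
          Range.End = VF; // Clamp: [Start, VF) all agree with Start.
          break;
        }
      return DecisionAtStart;
    }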
-
- /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
- /// according to the information gathered by Legal when it checked if it is
- /// legal to vectorize the loop.
- void buildVPlans(unsigned MinVF, unsigned MaxVF);
-
-private:
- /// A helper function that computes the predicate of the block BB, assuming
- /// that the header block of the loop is set to True. It returns the *entry*
- /// mask for the block BB.
- VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
-
- /// A helper function that computes the predicate of the edge between SRC
- /// and DST.
- VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
-
- /// Check if \p I belongs to an Interleave Group within the given VF \p Range,
- /// \return true in the first returned value if so and false otherwise.
- /// Build a new VPInterleaveGroup Recipe if \p I is the primary member of an IG
- /// for \p Range.Start, and provide it as the second returned value.
- /// Note that if \p I is an adjunct member of an IG for \p Range.Start, the
- /// \return value is <true, nullptr>, as it is handled by another recipe.
- /// \p Range.End may be decreased to ensure same decision from \p Range.Start
- /// to \p Range.End.
- VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
-
- // Check if \p I is a memory instruction to be widened for \p Range.Start and
- // potentially masked. Such instructions are handled by a recipe that takes an
- // additional VPInstruction for the mask.
- VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I,
- VFRange &Range,
- VPlanPtr &Plan);
-
- /// Check if an induction recipe should be constructed for \p I within the given
- /// VF \p Range. If so build and return it. If not, return null. \p Range.End
- /// may be decreased to ensure same decision from \p Range.Start to
- /// \p Range.End.
- VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
- VFRange &Range);
-
- /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
- /// a sequence of select instructions as the vectorizer currently performs
- /// full if-conversion.
- VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
-
- /// Check if \p I can be widened within the given VF \p Range. If \p I can be
- /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
- /// extended to include \p I or else build a new VPWidenRecipe for it and
- /// append it to \p VPBB. Return true if \p I can be widened for Range.Start,
- /// false otherwise. Range.End may be decreased to ensure same decision from
- /// \p Range.Start to \p Range.End.
- bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
-
- /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it
- /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
- /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
- /// Region. Update the packing decision of predicated instructions if they
- /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
- /// \p Range.Start to \p Range.End.
- VPBasicBlock *handleReplication(
- Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
- DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
- VPlanPtr &Plan);
-
- /// Create a replicating region for instruction \p I that requires
- /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
- VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
- VPlanPtr &Plan);
-
- /// Build a VPlan according to the information gathered by Legal. \return a
- /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
- /// exclusive, possibly decreasing \p Range.End.
- VPlanPtr buildVPlan(VFRange &Range,
- const SmallPtrSetImpl<Value *> &NeedDef);
-};
-
} // end namespace llvm
-namespace {
-
-/// \brief This holds vectorization requirements that must be verified late in
-/// the process. The requirements are set by the legality and cost-model analyses. Once
-/// vectorization has been determined to be possible and profitable the
-/// requirements can be verified by looking for metadata or compiler options.
-/// For example, some loops require FP commutativity which is only allowed if
-/// vectorization is explicitly specified or if the fast-math compiler option
-/// has been provided.
-/// Late evaluation of these requirements allows helpful diagnostics to be
-/// composed that tell the user what needs to be done to vectorize the loop,
-/// for example, by specifying #pragma clang loop vectorize or -ffast-math. Late
-/// evaluation should be used only when diagnostics can be generated that can be
-/// followed by a non-expert user.
-class LoopVectorizationRequirements {
-public:
- LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {}
-
- void addUnsafeAlgebraInst(Instruction *I) {
- // First unsafe algebra instruction.
- if (!UnsafeAlgebraInst)
- UnsafeAlgebraInst = I;
- }
-
- void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
-
- bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
- const char *PassName = Hints.vectorizeAnalysisPassName();
- bool Failed = false;
- if (UnsafeAlgebraInst && !Hints.allowReordering()) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysisFPCommute(
- PassName, "CantReorderFPOps",
- UnsafeAlgebraInst->getDebugLoc(),
- UnsafeAlgebraInst->getParent())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "floating-point operations";
- });
- Failed = true;
- }
-
- // Test if runtime memcheck thresholds are exceeded.
- bool PragmaThresholdReached =
- NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
- bool ThresholdReached =
- NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
- if ((ThresholdReached && !Hints.allowReordering()) ||
- PragmaThresholdReached) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
- L->getStartLoc(),
- L->getHeader())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "memory operations";
- });
- DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
- Failed = true;
- }
+// Return true if \p OuterLp is an outer loop annotated with hints for explicit
+// vectorization. The loop needs to be annotated with #pragma omp simd
+// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
+// vector length information is not provided, vectorization is not considered
+// explicit. Interleave hints are not allowed either. These limitations will be
+// relaxed in the future.
+// Please note that we are currently forced to abuse the pragma 'clang
+// vectorize' semantics. This pragma provides *auto-vectorization hints*
+// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
+// provides *explicit vectorization hints* (LV can bypass legal checks and
+// assume that vectorization is legal). However, both hints are implemented
+// using the same metadata (llvm.loop.vectorize, processed by
+// LoopVectorizeHints). This will be fixed in the future when the native IR
+// representation for pragma 'omp simd' is introduced.
+static bool isExplicitVecOuterLoop(Loop *OuterLp,
+ OptimizationRemarkEmitter *ORE) {
+ assert(!OuterLp->empty() && "This is not an outer loop");
+ LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
+
+ // Only outer loops with an explicit vectorization hint are supported.
+ // Unannotated outer loops are ignored.
+ if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
+ return false;
- return Failed;
+ Function *Fn = OuterLp->getHeader()->getParent();
+ if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) {
+ LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
+ return false;
}
-private:
- unsigned NumRuntimePointerChecks = 0;
- Instruction *UnsafeAlgebraInst = nullptr;
+ if (!Hints.getWidth()) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
+ emitMissedWarning(Fn, OuterLp, Hints, ORE);
+ return false;
+ }
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter &ORE;
-};
+ if (Hints.getInterleave() > 1) {
+ // TODO: Interleave support is future work.
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
+ "outer loops.\n");
+ emitMissedWarning(Fn, OuterLp, Hints, ORE);
+ return false;
+ }
-} // end anonymous namespace
+ return true;
+}
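For illustration only (not part of the patch), an outer loop that passes these checks could be annotated like this; the explicit vector length comes from vectorize_width:

    // Hypothetical source-level example. The outer (i) loop carries an
    // explicit width, so isExplicitVecOuterLoop accepts it; without
    // vectorize_width(4) it would be rejected for lack of a user VF.
    #pragma clang loop vectorize(enable) vectorize_width(4)
    for (int i = 0; i < N; ++i)    // annotated outer loop
      for (int j = 0; j < M; ++j)  // inner loop
        A[i][j] += B[i][j];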
-static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
- if (L.empty()) {
- if (!hasCyclesInLoopBody(L))
+static void collectSupportedLoops(Loop &L, LoopInfo *LI,
+ OptimizationRemarkEmitter *ORE,
+ SmallVectorImpl<Loop *> &V) {
+ // Collect inner loops and outer loops without irreducible control flow. For
+ // now, only collect outer loops that have explicit vectorization hints. If we
+ // are stress testing the VPlan H-CFG construction, we collect the outermost
+ // loop of every loop nest.
+ if (L.empty() || VPlanBuildStressTest ||
+ (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(LI);
+ if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
V.push_back(&L);
- return;
+ // TODO: Collect inner loops inside marked outer loops in case
+ // vectorization fails for the outer loop. Do not invoke
+ // 'containsIrreducibleCFG' again for inner loops when the outer loop is
+ // already known to be reducible. We can use an inherited attribute for
+ // that.
+ return;
+ }
}
for (Loop *InnerL : L)
- addAcyclicInnerLoop(*InnerL, V);
+ collectSupportedLoops(*InnerL, LI, ORE, V);
}
namespace {
@@ -2562,14 +1752,16 @@ struct LoopVectorize : public FunctionPass {
//===----------------------------------------------------------------------===//
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
- // We need to place the broadcast of invariant variables outside the loop.
+ // We need to place the broadcast of invariant variables outside the loop,
+ // but only if it's proven safe to do so. Otherwise, the broadcast will be
+ // placed inside the vector loop body.
Instruction *Instr = dyn_cast<Instruction>(V);
- bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
- bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
-
+ bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
+ (!Instr ||
+ DT->dominates(Instr->getParent(), LoopVectorPreHeader));
// Place the code for broadcasting invariant variables in the new preheader.
IRBuilder<>::InsertPointGuard Guard(Builder);
- if (Invariant)
+ if (SafeToHoist)
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
// Broadcast the scalar into all locations in the vector.
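The splat itself is the usual IRBuilder broadcast; as a sketch (assuming an IRBuilder positioned as selected above; the helper name is hypothetical), the function ultimately materializes:

    // Broadcast scalar V into all VF lanes of a vector value.
    llvm::Value *broadcastSketch(llvm::IRBuilder<> &Builder, llvm::Value *V,
                                 unsigned VF) {
      return Builder.CreateVectorSplat(VF, V, "broadcast");
    }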
@@ -2580,6 +1772,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
Value *Start = II.getStartValue();
// Construct the initial value of the vector IV in the vector loop preheader
@@ -2627,14 +1821,18 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// factor. The last of those goes into the PHI.
PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
&*LoopVectorBody->getFirstInsertionPt());
+ VecInd->setDebugLoc(EntryVal->getDebugLoc());
Instruction *LastInduction = VecInd;
for (unsigned Part = 0; Part < UF; ++Part) {
VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
- recordVectorLoopValueForInductionCast(II, LastInduction, Part);
+
if (isa<TruncInst>(EntryVal))
addMetadata(LastInduction, EntryVal);
+ recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
+
LastInduction = cast<Instruction>(addFastMathFlag(
Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
+ LastInduction->setDebugLoc(EntryVal->getDebugLoc());
}
// Move the last step to the end of the latch block. This ensures consistent
@@ -2665,8 +1863,20 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
}
void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
- const InductionDescriptor &ID, Value *VectorLoopVal, unsigned Part,
- unsigned Lane) {
+ const InductionDescriptor &ID, const Instruction *EntryVal,
+ Value *VectorLoopVal, unsigned Part, unsigned Lane) {
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
+
+ // This induction variable is not the phi from the original loop but the
+ // newly-created IV based on the proof that the casted Phi is equal to the
+ // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
+ // reuses the same InductionDescriptor that the original IV uses, but we
+ // don't have to do any recording in this case - that is done when the
+ // original IV is processed.
+ if (isa<TruncInst>(EntryVal))
+ return;
+
const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
if (Casts.empty())
return;
@@ -2754,15 +1964,16 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
// If we haven't yet vectorized the induction variable, splat the scalar
// induction variable, and build the necessary step vectors.
+ // TODO: Don't do it unless the vectorized IV is really required.
if (!VectorizedIV) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part) {
Value *EntryPart =
getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
- recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
if (Trunc)
addMetadata(EntryPart, Trunc);
+ recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
}
}
@@ -2833,7 +2044,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
}
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
- Value *EntryVal,
+ Instruction *EntryVal,
const InductionDescriptor &ID) {
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF > 1 && "VF should be greater than one");
@@ -2868,25 +2079,11 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
- recordVectorLoopValueForInductionCast(ID, Add, Part, Lane);
+ recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
}
}
}
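To make the step computation concrete: assuming VF = 4, UF = 2, and an integer IV (illustrative values only), each scalar lane receives

    // Part 0 lanes: ScalarIV + {0,1,2,3} * Step
    // Part 1 lanes: ScalarIV + {4,5,6,7} * Step
    // i.e. lane (Part, Lane) gets ScalarIV + (VF * Part + Lane) * Step.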
-int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
- const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :
- ValueToValueMap();
-
- int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
- if (Stride == 1 || Stride == -1)
- return Stride;
- return 0;
-}
-
-bool LoopVectorizationLegality::isUniform(Value *V) {
- return LAI->isUniform(V);
-}
-
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
assert(V != Induction && "The new induction variable should not be used.");
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
@@ -3046,7 +2243,7 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
- const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
+ const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr);
assert(Group && "Failed to get an interleaved access group.");
// Skip if current instruction is not the insert position.
@@ -3054,7 +2251,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
return;
const DataLayout &DL = Instr->getModule()->getDataLayout();
- Value *Ptr = getPointerOperand(Instr);
+ Value *Ptr = getLoadStorePointerOperand(Instr);
// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = getMemInstValueType(Instr);
@@ -3076,6 +2273,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
if (Group->isReverse())
Index += (VF - 1) * Group->getFactor();
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+
for (unsigned Part = 0; Part < UF; Part++) {
Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
@@ -3091,6 +2292,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
// A[i+2] = c; // Member of index 2 (Current instruction)
// Current pointer is pointed to A[i+2], adjust it to A[i].
NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
+ if (InBounds)
+ cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
// Cast to the vector pointer type.
NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
@@ -3196,7 +2399,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
Type *ScalarDataTy = getMemInstValueType(Instr);
Type *DataTy = VectorType::get(ScalarDataTy, VF);
- Value *Ptr = getPointerOperand(Instr);
+ Value *Ptr = getLoadStorePointerOperand(Instr);
unsigned Alignment = getMemInstAlignment(Instr);
// An alignment of 0 means target abi alignment. We need to use the scalar's
// target abi alignment in such a case.
@@ -3227,10 +2430,37 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
if (isMaskRequired)
Mask = *BlockInMask;
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(
+ getLoadStorePointerOperand(Instr)->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+
+ const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
+ // Calculate the pointer for the specific unroll-part.
+ GetElementPtrInst *PartPtr = nullptr;
+
+ if (Reverse) {
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)));
+ PartPtr->setIsInBounds(InBounds);
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)));
+ PartPtr->setIsInBounds(InBounds);
+ if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
+ Mask[Part] = reverseVector(Mask[Part]);
+ } else {
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)));
+ PartPtr->setIsInBounds(InBounds);
+ }
+
+ return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+ };
+
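The reverse branch is easiest to see with a worked example. Assuming VF = 4 and a consecutive-but-reversed access (illustrative only):

    // Part 0: GEP(Ptr, -0*4) then GEP(.., 1-4) -> Ptr - 3
    // Part 1: GEP(Ptr, -1*4) then GEP(.., 1-4) -> Ptr - 7
    // Each wide access therefore starts at the last element it covers, and
    // reverseVector() restores the original lane order (and mask).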
// Handle Stores:
if (SI) {
- assert(!Legal->isUniform(SI->getPointerOperand()) &&
- "We do not allow storing to uniform addresses");
setDebugLocFromInst(Builder, SI);
for (unsigned Part = 0; Part < UF; ++Part) {
@@ -3242,30 +2472,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
MaskPart);
} else {
- // Calculate the pointer for the specific unroll-part.
- Value *PartPtr =
- Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
-
if (Reverse) {
// If we store to reverse consecutive memory locations, then we need
// to reverse the order of elements in the stored value.
StoredVal = reverseVector(StoredVal);
// We don't want to update the value in the map as it might be used in
// another expression. So don't call resetVectorValue(StoredVal).
-
- // If the address is consecutive but reversed, then the
- // wide store needs to start at the last vector element.
- PartPtr =
- Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
- PartPtr =
- Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
- if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
- Mask[Part] = reverseVector(Mask[Part]);
}
-
- Value *VecPtr =
- Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
-
+ auto *VecPtr = CreateVecPtr(Part, Ptr);
if (isMaskRequired)
NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
Mask[Part]);
@@ -3289,21 +2503,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
nullptr, "wide.masked.gather");
addMetadata(NewLI, LI);
} else {
- // Calculate the pointer for the specific unroll-part.
- Value *PartPtr =
- Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
-
- if (Reverse) {
- // If the address is consecutive but reversed, then the
- // wide load needs to start at the last vector element.
- PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
- PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
- if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
- Mask[Part] = reverseVector(Mask[Part]);
- }
-
- Value *VecPtr =
- Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+ auto *VecPtr = CreateVecPtr(Part, Ptr);
if (isMaskRequired)
NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
UndefValue::get(DataTy),
@@ -3457,7 +2657,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
// does not evenly divide the trip count, no adjustment is necessary since
// there will already be scalar iterations. Note that the minimum iterations
// check ensures that N >= Step.
- if (VF > 1 && Legal->requiresScalarEpilogue()) {
+ if (VF > 1 && Cost->requiresScalarEpilogue()) {
auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
R = Builder.CreateSelect(IsZero, Step, R);
}
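A quick sanity check of this arithmetic, assuming Step = VF * UF (illustrative values):

    // N = 10, VF = 4, UF = 1: R = 10 urem 4 = 2 -> vector trip count 8, and
    //   the remaining 2 iterations run in the scalar loop.
    // N = 8,  VF = 4, UF = 1: R = 0; when a scalar epilogue is required we
    //   select R = Step = 4 instead, so the vector loop runs 4 iterations
    //   and the last 4 are guaranteed to execute in the scalar loop.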
@@ -3508,8 +2708,8 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
// vector trip count is zero. This check also covers the case where adding one
// to the backedge-taken count overflowed leading to an incorrect trip count
// of zero. In this case we will also jump to the scalar loop.
- auto P = Legal->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
- : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
Value *CheckMinIters = Builder.CreateICmp(
P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
@@ -3714,6 +2914,8 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// Create phi nodes to merge from the backedge-taken check block.
PHINode *BCResumeVal = PHINode::Create(
OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
+ // Copy original phi DL over to the new one.
+ BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
Value *&EndValue = IVEndValues[OrigPhi];
if (OrigPhi == OldInduction) {
// We know what the end value is.
@@ -3871,7 +3073,7 @@ struct CSEDenseMapInfo {
} // end anonymous namespace
-///\brief Perform cse of induction variable instructions.
+/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
// Perform simple cse.
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
@@ -3893,7 +3095,7 @@ static void cse(BasicBlock *BB) {
}
}
-/// \brief Estimate the overhead of scalarizing an instruction. This is a
+/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
const TargetTransformInfo &TTI) {
@@ -4074,7 +3276,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
- } else if (isa<LoadInst>(I)) {
+ } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
// Don't do anything with the operands, just extend the result.
continue;
} else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
@@ -4089,7 +3291,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
NewI = B.CreateExtractElement(O0, EE->getOperand(2));
} else {
- llvm_unreachable("Unhandled instruction type!");
+ // If we don't know what to do, be conservative and don't do anything.
+ continue;
}
// Lastly, extend the result.
@@ -4164,15 +3367,12 @@ void InnerLoopVectorizer::fixCrossIterationPHIs() {
// the currently empty PHI nodes. At this point every instruction in the
// original loop is widened to a vector form so we can use them to construct
// the incoming edges.
- for (Instruction &I : *OrigLoop->getHeader()) {
- PHINode *Phi = dyn_cast<PHINode>(&I);
- if (!Phi)
- break;
+ for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
// Handle first-order recurrences and reductions that need to be fixed.
- if (Legal->isFirstOrderRecurrence(Phi))
- fixFirstOrderRecurrence(Phi);
- else if (Legal->isReductionVariable(Phi))
- fixReduction(Phi);
+ if (Legal->isFirstOrderRecurrence(&Phi))
+ fixFirstOrderRecurrence(&Phi);
+ else if (Legal->isReductionVariable(&Phi))
+ fixReduction(&Phi);
}
}
@@ -4335,15 +3535,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Finally, fix users of the recurrence outside the loop. The users will need
// either the last value of the scalar recurrence or the last value of the
// vector recurrence we extracted in the middle block. Since the loop is in
- // LCSSA form, we just need to find the phi node for the original scalar
+ // LCSSA form, we just need to find all the phi nodes for the original scalar
// recurrence in the exit block, and then add an edge for the middle block.
- for (auto &I : *LoopExitBlock) {
- auto *LCSSAPhi = dyn_cast<PHINode>(&I);
- if (!LCSSAPhi)
- break;
- if (LCSSAPhi->getIncomingValue(0) == Phi) {
- LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
- break;
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
+ if (LCSSAPhi.getIncomingValue(0) == Phi) {
+ LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}
}
}
@@ -4499,21 +3695,15 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// inside and outside of the scalar remainder loop.
// We know that the loop is in LCSSA form. We need to update the
// PHI nodes in the exit blocks.
- for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
- LEE = LoopExitBlock->end();
- LEI != LEE; ++LEI) {
- PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
- if (!LCSSAPhi)
- break;
-
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
// All PHINodes need to have a single entry edge, or two if
// we already fixed them.
- assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
+ assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
// We found a reduction value exit-PHI. Update it with the
// incoming bypass edge.
- if (LCSSAPhi->getIncomingValue(0) == LoopExitInst)
- LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+ if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
+ LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
} // end of the LCSSA phi scan.
// Fix the scalar loop reduction variable with the incoming reduction sum
@@ -4528,14 +3718,11 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
}
void InnerLoopVectorizer::fixLCSSAPHIs() {
- for (Instruction &LEI : *LoopExitBlock) {
- auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
- if (!LCSSAPhi)
- break;
- if (LCSSAPhi->getNumIncomingValues() == 1) {
- assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) &&
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
+ if (LCSSAPhi.getNumIncomingValues() == 1) {
+ assert(OrigLoop->isLoopInvariant(LCSSAPhi.getIncomingValue(0)) &&
"Incoming value isn't loop invariant");
- LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock);
+ LCSSAPhi.addIncoming(LCSSAPhi.getIncomingValue(0), LoopMiddleBlock);
}
}
}
@@ -4955,7 +4142,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
default:
// This instruction is not vectorized by simple widening.
- DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+ LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
llvm_unreachable("Unhandled instruction!");
} // end of switch.
}
@@ -4973,467 +4160,7 @@ void InnerLoopVectorizer::updateAnalysis() {
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
- DEBUG(DT->verifyDomTree());
-}
-
-/// \brief Check whether it is safe to if-convert this phi node.
-///
-/// Phi nodes with constant expressions that can trap are not safe to
-/// if-convert.
-static bool canIfConvertPHINodes(BasicBlock *BB) {
- for (Instruction &I : *BB) {
- auto *Phi = dyn_cast<PHINode>(&I);
- if (!Phi)
- return true;
- for (Value *V : Phi->incoming_values())
- if (auto *C = dyn_cast<Constant>(V))
- if (C->canTrap())
- return false;
- }
- return true;
-}
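The hazard this guards against is speculation of a trapping constant expression; a hypothetical IR-level illustration (not taken from this patch):

    // %p = phi i32 [ sdiv (i32 1, i32 0), %then ], [ %x, %else ]
    // If-converting the block would evaluate the sdiv constant expression
    // unconditionally, introducing a division trap that the original program
    // only reached along the %then edge, if at all.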
-
-bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
- if (!EnableIfConversion) {
- ORE->emit(createMissedAnalysis("IfConversionDisabled")
- << "if-conversion is disabled");
- return false;
- }
-
- assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
-
- // A list of pointers that we can safely read and write to.
- SmallPtrSet<Value *, 8> SafePointes;
-
- // Collect safe addresses.
- for (BasicBlock *BB : TheLoop->blocks()) {
- if (blockNeedsPredication(BB))
- continue;
-
- for (Instruction &I : *BB)
- if (auto *Ptr = getPointerOperand(&I))
- SafePointes.insert(Ptr);
- }
-
- // Collect the blocks that need predication.
- BasicBlock *Header = TheLoop->getHeader();
- for (BasicBlock *BB : TheLoop->blocks()) {
- // We don't support switch statements inside loops.
- if (!isa<BranchInst>(BB->getTerminator())) {
- ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
- << "loop contains a switch statement");
- return false;
- }
-
- // We must be able to predicate all blocks that need to be predicated.
- if (blockNeedsPredication(BB)) {
- if (!blockCanBePredicated(BB, SafePointes)) {
- ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
- << "control flow cannot be substituted for a select");
- return false;
- }
- } else if (BB != Header && !canIfConvertPHINodes(BB)) {
- ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
- << "control flow cannot be substituted for a select");
- return false;
- }
- }
-
- // We can if-convert this loop.
- return true;
-}
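For reference, a sketch of a loop shape this routine accepts, assuming unit-stride arrays (illustrative, not from the patch):

    for (int i = 0; i < n; ++i) {
      a[i] = 0;       // unconditional store: &a[i] becomes a safe pointer
      if (b[i] > 0)
        a[i] = b[i];  // predicated store to a known-safe address, so the
    }                 // block can be turned into a select/masked store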
-
-bool LoopVectorizationLegality::canVectorize() {
- // Store the result and return it at the end instead of exiting early, in case
- // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
- bool Result = true;
-
- bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
- // We must have a loop in canonical form. Loops with indirectbr in them cannot
- // be canonicalized.
- if (!TheLoop->getLoopPreheader()) {
- DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");
- ORE->emit(createMissedAnalysis("CFGNotUnderstood")
- << "loop control flow is not understood by vectorizer");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // FIXME: The code is currently dead, since the loop that gets sent to
- // LoopVectorizationLegality is already an innermost loop.
- //
- // We can only vectorize innermost loops.
- if (!TheLoop->empty()) {
- ORE->emit(createMissedAnalysis("NotInnermostLoop")
- << "loop is not the innermost loop");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We must have a single backedge.
- if (TheLoop->getNumBackEdges() != 1) {
- ORE->emit(createMissedAnalysis("CFGNotUnderstood")
- << "loop control flow is not understood by vectorizer");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We must have a single exiting block.
- if (!TheLoop->getExitingBlock()) {
- ORE->emit(createMissedAnalysis("CFGNotUnderstood")
- << "loop control flow is not understood by vectorizer");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We only handle bottom-tested loops, i.e. loops in which the condition is
- // checked at the end of each iteration. With that we can assume that all
- // instructions in the loop are executed the same number of times.
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- ORE->emit(createMissedAnalysis("CFGNotUnderstood")
- << "loop control flow is not understood by vectorizer");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We need to have a loop header.
- DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
- << '\n');
-
- // Check if we can if-convert non-single-bb loops.
- unsigned NumBlocks = TheLoop->getNumBlocks();
- if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
- DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Check if we can vectorize the instructions and CFG in this loop.
- if (!canVectorizeInstrs()) {
- DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Go over each instruction and look at memory deps.
- if (!canVectorizeMemory()) {
- DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- DEBUG(dbgs() << "LV: We can vectorize this loop"
- << (LAI->getRuntimePointerChecking()->Need
- ? " (with a runtime bound check)"
- : "")
- << "!\n");
-
- bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
-
- // If an override option has been passed in for interleaved accesses, use it.
- if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
- UseInterleaved = EnableInterleavedMemAccesses;
-
- // Analyze interleaved memory accesses.
- if (UseInterleaved)
- InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
-
- unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
- if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
- SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
-
- if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
- ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
- << "Too many SCEV assumptions need to be made and checked "
- << "at runtime");
- DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Okay! We've done all the tests. If any have failed, return false. Otherwise
- // we can vectorize, and at this point we don't have any other mem analysis
- // which may limit our maximum vectorization factor, so just return true with
- // no restrictions.
- return Result;
-}
-
-static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
- if (Ty->isPointerTy())
- return DL.getIntPtrType(Ty);
-
- // It is possible that chars or shorts overflow when we ask for the loop's
- // trip count; work around this by changing the type size.
- if (Ty->getScalarSizeInBits() < 32)
- return Type::getInt32Ty(Ty->getContext());
-
- return Ty;
-}
-
-static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
- Ty0 = convertPointerToIntegerType(DL, Ty0);
- Ty1 = convertPointerToIntegerType(DL, Ty1);
- if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
- return Ty0;
- return Ty1;
-}
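Concretely, assuming a 64-bit DataLayout (illustrative values):

    // convertPointerToIntegerType(DL, i8*) -> i64 (the target's intptr type)
    // convertPointerToIntegerType(DL, i16) -> i32 (widened so trip counts of
    //                                              char/short IVs can't overflow)
    // getWiderType(DL, i32, i64)           -> i64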
-
-/// \brief Check that the instruction has outside loop users and is not an
-/// identified reduction variable.
-static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
- SmallPtrSetImpl<Value *> &AllowedExit) {
- // Reduction and Induction instructions are allowed to have exit users. All
- // other instructions must not have external users.
- if (!AllowedExit.count(Inst))
- // Check that all of the users of the instruction are inside the loop.
- for (User *U : Inst->users()) {
- Instruction *UI = cast<Instruction>(U);
- // This user may be a reduction exit value.
- if (!TheLoop->contains(UI)) {
- DEBUG(dbgs() << "LV: Found an outside user for: " << *UI << '\n');
- return true;
- }
- }
- return false;
-}
-
-void LoopVectorizationLegality::addInductionPhi(
- PHINode *Phi, const InductionDescriptor &ID,
- SmallPtrSetImpl<Value *> &AllowedExit) {
- Inductions[Phi] = ID;
-
- // In case this induction also comes with casts that we know we can ignore
- // in the vectorized loop body, record them here. All casts could be recorded
- // here for ignoring, but it suffices to record only the first (as it is the
- // only one that may be used outside the cast sequence).
- const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
- if (!Casts.empty())
- InductionCastsToIgnore.insert(*Casts.begin());
-
- Type *PhiTy = Phi->getType();
- const DataLayout &DL = Phi->getModule()->getDataLayout();
-
- // Get the widest type.
- if (!PhiTy->isFloatingPointTy()) {
- if (!WidestIndTy)
- WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
- else
- WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
- }
-
- // Int inductions are special because we only allow one IV.
- if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
- ID.getConstIntStepValue() &&
- ID.getConstIntStepValue()->isOne() &&
- isa<Constant>(ID.getStartValue()) &&
- cast<Constant>(ID.getStartValue())->isNullValue()) {
-
- // Use the phi node with the widest type as induction. Use the last
- // one if there are multiple (no good reason for doing this other
- // than it is expedient). We've checked that it begins at zero and
- // steps by one, so this is a canonical induction variable.
- if (!PrimaryInduction || PhiTy == WidestIndTy)
- PrimaryInduction = Phi;
- }
-
- // Both the PHI node itself, and the "post-increment" value feeding
- // back into the PHI node may have external users.
- // We can allow those uses, except if the SCEVs we have for them rely
- // on predicates that only hold within the loop, since allowing the exit
- // currently means re-using this SCEV outside the loop.
- if (PSE.getUnionPredicate().isAlwaysTrue()) {
- AllowedExit.insert(Phi);
- AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
- }
-
- DEBUG(dbgs() << "LV: Found an induction variable.\n");
-}
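The special case above keys off canonical induction variables; a minimal source-level sketch of a PHI eligible for PrimaryInduction (illustrative):

    for (int64_t i = 0; i < n; ++i)  // starts at 0, steps by 1 -> canonical
      sum += a[i];                   // 'i' can serve as PrimaryInduction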
-
-bool LoopVectorizationLegality::canVectorizeInstrs() {
- BasicBlock *Header = TheLoop->getHeader();
-
- // Look for the attribute signaling the absence of NaNs.
- Function &F = *Header->getParent();
- HasFunNoNaNAttr =
- F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
-
- // For each block in the loop.
- for (BasicBlock *BB : TheLoop->blocks()) {
- // Scan the instructions in the block and look for hazards.
- for (Instruction &I : *BB) {
- if (auto *Phi = dyn_cast<PHINode>(&I)) {
- Type *PhiTy = Phi->getType();
- // Check that this PHI type is allowed.
- if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
- !PhiTy->isPointerTy()) {
- ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
- << "loop control flow is not understood by vectorizer");
- DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
- return false;
- }
-
- // If this PHINode is not in the header block, then we know that we
- // can convert it to a select during if-conversion. No need to check if
- // the PHIs in this block are induction or reduction variables.
- if (BB != Header) {
- // Check that this instruction has no outside users or is an
- // identified reduction value with an outside user.
- if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
- continue;
- ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi)
- << "value could not be identified as "
- "an induction or reduction variable");
- return false;
- }
-
- // We only allow if-converted PHIs with exactly two incoming values.
- if (Phi->getNumIncomingValues() != 2) {
- ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
- << "control flow not understood by vectorizer");
- DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
- return false;
- }
-
- RecurrenceDescriptor RedDes;
- if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) {
- if (RedDes.hasUnsafeAlgebra())
- Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
- AllowedExit.insert(RedDes.getLoopExitInstr());
- Reductions[Phi] = RedDes;
- continue;
- }
-
- InductionDescriptor ID;
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
- addInductionPhi(Phi, ID, AllowedExit);
- if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
- Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
- continue;
- }
-
- if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
- SinkAfter, DT)) {
- FirstOrderRecurrences.insert(Phi);
- continue;
- }
-
- // As a last resort, coerce the PHI to an AddRec expression
- // and re-try classifying it as an induction PHI.
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
- addInductionPhi(Phi, ID, AllowedExit);
- continue;
- }
-
- ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
- << "value that could not be identified as "
- "reduction is used outside the loop");
- DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
- return false;
- } // end of PHI handling
-
- // We handle calls that:
- // * Are debug info intrinsics.
- // * Have a mapping to an IR intrinsic.
- // * Have a vector version available.
- auto *CI = dyn_cast<CallInst>(&I);
- if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
- !isa<DbgInfoIntrinsic>(CI) &&
- !(CI->getCalledFunction() && TLI &&
- TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
- ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
- << "call instruction cannot be vectorized");
- DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
- return false;
- }
-
- // Intrinsics such as powi, cttz, and ctlz are legal to vectorize if the
- // second argument is the same (i.e. loop invariant).
- if (CI && hasVectorInstrinsicScalarOpd(
- getVectorIntrinsicIDForCall(CI, TLI), 1)) {
- auto *SE = PSE.getSE();
- if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
- ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
- << "intrinsic instruction cannot be vectorized");
- DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
- return false;
- }
- }
-
- // Check that the instruction return type is vectorizable.
- // Also, we can't vectorize extractelement instructions.
- if ((!VectorType::isValidElementType(I.getType()) &&
- !I.getType()->isVoidTy()) ||
- isa<ExtractElementInst>(I)) {
- ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
- << "instruction return type cannot be vectorized");
- DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
- return false;
- }
-
- // Check that the stored type is vectorizable.
- if (auto *ST = dyn_cast<StoreInst>(&I)) {
- Type *T = ST->getValueOperand()->getType();
- if (!VectorType::isValidElementType(T)) {
- ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
- << "store instruction cannot be vectorized");
- return false;
- }
-
- // FP instructions can allow unsafe algebra and are thus vectorizable by
- // non-IEEE-754 compliant SIMD units.
- // This applies to floating-point math operations and calls, not memory
- // operations, shuffles, or casts, as they don't change precision or
- // semantics.
- } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
- !I.isFast()) {
- DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
- Hints->setPotentiallyUnsafe();
- }
-
- // Reduction instructions are allowed to have exit users.
- // All other instructions must not have external users.
- if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
- ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
- << "value cannot be used outside the loop");
- return false;
- }
- } // next instr.
- }
-
- if (!PrimaryInduction) {
- DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
- if (Inductions.empty()) {
- ORE->emit(createMissedAnalysis("NoInductionVariable")
- << "loop induction variable could not be identified");
- return false;
- }
- }
-
- // Now we know the widest induction type, check if our found induction
- // is the same size. If it's not, unset it here and InnerLoopVectorizer
- // will create another.
- if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
- PrimaryInduction = nullptr;
-
- return true;
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
}
void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
@@ -5461,7 +4188,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
if (auto *Store = dyn_cast<StoreInst>(MemAccess))
if (Ptr == Store->getValueOperand())
return WideningDecision == CM_Scalarize;
- assert(Ptr == getPointerOperand(MemAccess) &&
+ assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
"Ptr is neither a value or pointer operand");
return WideningDecision != CM_GatherScatter;
};
@@ -5527,7 +4254,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
}
for (auto *I : ScalarPtrs)
if (!PossibleNonScalarPtrs.count(I)) {
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
Worklist.insert(I);
}
@@ -5544,8 +4271,9 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
continue;
Worklist.insert(Ind);
Worklist.insert(IndUpdate);
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
+ << "\n");
}
// Insert the forced scalars.
@@ -5572,7 +4300,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
isScalarUse(J, Src));
})) {
Worklist.insert(Src);
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
}
}
@@ -5612,21 +4340,30 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// The induction variable and its update instruction will remain scalar.
Worklist.insert(Ind);
Worklist.insert(IndUpdate);
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
- DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
+ << "\n");
}
Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
-bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
- if (!blockNeedsPredication(I->getParent()))
+bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) {
+ if (!Legal->blockNeedsPredication(I->getParent()))
return false;
switch(I->getOpcode()) {
default:
break;
- case Instruction::Store:
- return !isMaskRequired(I);
+ case Instruction::Load:
+ case Instruction::Store: {
+ if (!Legal->isMaskRequired(I))
+ return false;
+ auto *Ptr = getLoadStorePointerOperand(I);
+ auto *Ty = getMemInstValueType(I);
+ return isa<LoadInst>(I) ?
+ !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
+ : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
+ }
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::SRem:
@@ -5636,17 +4373,17 @@ bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
return false;
}
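A typical source pattern that reaches the UDiv/SDiv/URem/SRem cases, shown as a sketch (not from the patch); the divide is guarded, so widening it unconditionally could fault, and it must be scalarized under its predicate instead:

    for (int i = 0; i < n; ++i)
      if (b[i] != 0)      // predicate guards the potentially trapping op
        s += a[i] / b[i]; // scalarized with predication when vectorized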
-bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
- unsigned VF) {
+bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
+ unsigned VF) {
// Get and ensure we have a valid memory instruction.
LoadInst *LI = dyn_cast<LoadInst>(I);
StoreInst *SI = dyn_cast<StoreInst>(I);
assert((LI || SI) && "Invalid memory instruction");
- auto *Ptr = getPointerOperand(I);
+ auto *Ptr = getLoadStorePointerOperand(I);
// First of all, in order to be widened, the pointer has to be consecutive.
- if (!isConsecutivePtr(Ptr))
+ if (!Legal->isConsecutivePtr(Ptr))
return false;
// If the instruction is a store located in a predicated block, it will be
@@ -5697,7 +4434,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
Worklist.insert(Cmp);
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
}
// Holds consecutive and consecutive-like pointers. Consecutive-like pointers
@@ -5729,7 +4466,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
// If there's no pointer operand, there's nothing to do.
- auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
+ auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
if (!Ptr)
continue;
@@ -5737,7 +4474,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// pointer operand.
auto UsersAreMemAccesses =
llvm::all_of(Ptr->users(), [&](User *U) -> bool {
- return getPointerOperand(U) == Ptr;
+ return getLoadStorePointerOperand(U) == Ptr;
});
// Ensure the memory instruction will not be scalarized or used by
@@ -5758,7 +4495,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// aren't also identified as possibly non-uniform.
for (auto *V : ConsecutiveLikePtrs)
if (!PossibleNonUniformPtrs.count(V)) {
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
Worklist.insert(V);
}
@@ -5777,10 +4514,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
if (llvm::all_of(OI->users(), [&](User *U) -> bool {
auto *J = cast<Instruction>(U);
return !TheLoop->contains(J) || Worklist.count(J) ||
- (OI == getPointerOperand(J) && isUniformDecision(J, VF));
+ (OI == getLoadStorePointerOperand(J) &&
+ isUniformDecision(J, VF));
})) {
Worklist.insert(OI);
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
}
}
}
@@ -5788,7 +4526,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// Returns true if Ptr is the pointer operand of a memory access instruction
// I, and I is known to not require scalarization.
auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
- return getPointerOperand(I) == Ptr && isUniformDecision(I, VF);
+ return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
};
// For an instruction to be added into Worklist above, all its users inside
@@ -5825,123 +4563,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// The induction variable and its update instruction will remain uniform.
Worklist.insert(Ind);
Worklist.insert(IndUpdate);
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
+ << "\n");
}
Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
-bool LoopVectorizationLegality::canVectorizeMemory() {
- LAI = &(*GetLAA)(*TheLoop);
- InterleaveInfo.setLAI(LAI);
- const OptimizationRemarkAnalysis *LAR = LAI->getReport();
- if (LAR) {
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
- "loop not vectorized: ", *LAR);
- });
- }
- if (!LAI->canVectorizeMemory())
- return false;
-
- if (LAI->hasStoreToLoopInvariantAddress()) {
- ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
- << "write to a loop invariant address could not be vectorized");
- DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
- return false;
- }
-
- Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
- PSE.addPredicate(LAI->getPSE().getUnionPredicate());
-
- return true;
-}
-
-bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
- Value *In0 = const_cast<Value *>(V);
- PHINode *PN = dyn_cast_or_null<PHINode>(In0);
- if (!PN)
- return false;
-
- return Inductions.count(PN);
-}
-
-bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
- auto *Inst = dyn_cast<Instruction>(V);
- return (Inst && InductionCastsToIgnore.count(Inst));
-}
-
-bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
- return isInductionPhi(V) || isCastedInductionVariable(V);
-}
-
-bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
- return FirstOrderRecurrences.count(Phi);
-}
-
-bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
- return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
-}
-
-bool LoopVectorizationLegality::blockCanBePredicated(
- BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
- const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
-
- for (Instruction &I : *BB) {
- // Check that we don't have a constant expression that can trap as operand.
- for (Value *Operand : I.operands()) {
- if (auto *C = dyn_cast<Constant>(Operand))
- if (C->canTrap())
- return false;
- }
- // We might be able to hoist the load.
- if (I.mayReadFromMemory()) {
- auto *LI = dyn_cast<LoadInst>(&I);
- if (!LI)
- return false;
- if (!SafePtrs.count(LI->getPointerOperand())) {
- if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
- isLegalMaskedGather(LI->getType())) {
- MaskedOp.insert(LI);
- continue;
- }
- // !llvm.mem.parallel_loop_access implies if-conversion safety.
- if (IsAnnotatedParallel)
- continue;
- return false;
- }
- }
-
- if (I.mayWriteToMemory()) {
- auto *SI = dyn_cast<StoreInst>(&I);
- // We only support predication of stores in basic blocks with one
- // predecessor.
- if (!SI)
- return false;
-
- // Build a masked store if it is legal for the target.
- if (isLegalMaskedStore(SI->getValueOperand()->getType(),
- SI->getPointerOperand()) ||
- isLegalMaskedScatter(SI->getValueOperand()->getType())) {
- MaskedOp.insert(SI);
- continue;
- }
-
- bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
- bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
-
- if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
- !isSinglePredecessor)
- return false;
- }
- if (I.mayThrow())
- return false;
- }
-
- return true;
-}
-
void InterleavedAccessInfo::collectConstStrideAccesses(
MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
const ValueToValueMap &Strides) {
@@ -5962,7 +4591,7 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
if (!LI && !SI)
continue;
- Value *Ptr = getPointerOperand(&I);
+ Value *Ptr = getLoadStorePointerOperand(&I);
// We don't check wrapping here because we don't know yet if Ptr will be
// part of a full group or a group with gaps. Checking wrapping for all
// pointers (even those that end up in groups with no gaps) will be overly
@@ -6022,9 +4651,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
// this group because it and (2) are dependent. However, (1) can be grouped
// with other accesses that may precede it in program order. Note that a
// bottom-up order does not imply that WAW dependences should not be checked.
-void InterleavedAccessInfo::analyzeInterleaving(
- const ValueToValueMap &Strides) {
- DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
+void InterleavedAccessInfo::analyzeInterleaving() {
+ LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
+ const ValueToValueMap &Strides = LAI->getSymbolicStrides();
// Holds all accesses with a constant stride.
MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
@@ -6065,7 +4694,8 @@ void InterleavedAccessInfo::analyzeInterleaving(
if (isStrided(DesB.Stride)) {
Group = getInterleaveGroup(B);
if (!Group) {
- DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
+ << '\n');
Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
}
if (B->mayWriteToMemory())
@@ -6124,7 +4754,12 @@ void InterleavedAccessInfo::analyzeInterleaving(
// Ignore A if it's already in a group or isn't the same kind of memory
// operation as B.
- if (isInterleaved(A) || A->mayReadFromMemory() != B->mayReadFromMemory())
+ // Note that mayReadFromMemory() and mayWriteToMemory() are not mutually
+ // exclusive in the case of atomic loads. We shouldn't see those here;
+ // canVectorizeMemory() should have returned false - except for the case
+ // when we asked for optimization remarks.
+ if (isInterleaved(A) ||
+     (A->mayReadFromMemory() != B->mayReadFromMemory()) ||
+     (A->mayWriteToMemory() != B->mayWriteToMemory()))
continue;
// Check rules 1 and 2. Ignore A if its stride or size is different from
@@ -6163,8 +4798,9 @@ void InterleavedAccessInfo::analyzeInterleaving(
// Try to insert A into B's group.
if (Group->insertMember(A, IndexA, DesA.Align)) {
- DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
- << " into the interleave group with" << *B << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
+ << " into the interleave group with" << *B
+ << '\n');
InterleaveGroupMap[A] = Group;
// Set the first load in program order as the insert position.
@@ -6177,8 +4813,9 @@ void InterleavedAccessInfo::analyzeInterleaving(
// Remove interleaved store groups with gaps.
for (InterleaveGroup *Group : StoreGroups)
if (Group->getNumMembers() != Group->getFactor()) {
- DEBUG(dbgs() << "LV: Invalidate candidate interleaved store group due "
- "to gaps.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Invalidate candidate interleaved store group due "
+ "to gaps.\n");
releaseGroup(Group);
}
// Remove interleaved groups with gaps (currently only loads) whose memory
@@ -6207,21 +4844,23 @@ void InterleavedAccessInfo::analyzeInterleaving(
// So we check only group member 0 (which is always guaranteed to exist),
// and group member Factor - 1; if the latter doesn't exist we rely on
// peeling (if it is a non-reversed access -- see Case 3).
- Value *FirstMemberPtr = getPointerOperand(Group->getMember(0));
+ Value *FirstMemberPtr = getLoadStorePointerOperand(Group->getMember(0));
if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
/*ShouldCheckWrap=*/true)) {
- DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
- "first group member potentially pointer-wrapping.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Invalidate candidate interleaved group due to "
+ "first group member potentially pointer-wrapping.\n");
releaseGroup(Group);
continue;
}
Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
if (LastMember) {
- Value *LastMemberPtr = getPointerOperand(LastMember);
+ Value *LastMemberPtr = getLoadStorePointerOperand(LastMember);
if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
/*ShouldCheckWrap=*/true)) {
- DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
- "last group member potentially pointer-wrapping.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Invalidate candidate interleaved group due to "
+ "last group member potentially pointer-wrapping.\n");
releaseGroup(Group);
}
} else {
@@ -6231,29 +4870,25 @@ void InterleavedAccessInfo::analyzeInterleaving(
// to look for a member at index factor - 1, since every group must have
// a member at index zero.
if (Group->isReverse()) {
- DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
- "a reverse access with gaps.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Invalidate candidate interleaved group due to "
+ "a reverse access with gaps.\n");
releaseGroup(Group);
continue;
}
- DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
RequiresScalarEpilogue = true;
}
}
}
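Most of the grouping logic above reduces to one indexing rule: two accesses with the same constant stride and size belong to the same interleave group when their address distance is an element-size multiple that lands inside the group's factor. A simplified sketch under those assumptions (positive stride, flat byte addresses; all names hypothetical):

#include <cstdint>
#include <optional>

struct Access { int64_t Addr; int64_t Stride; int64_t Size; };

// Returns the index at which A would join B's interleave group, if any.
std::optional<int64_t> memberIndexInGroup(const Access &A, const Access &B) {
  if (A.Stride != B.Stride || A.Size != B.Size)
    return std::nullopt;              // Rules 1 and 2: same stride and size.
  if (A.Stride <= 0 || A.Stride % A.Size != 0)
    return std::nullopt;              // Simplification: positive, even stride.
  int64_t Factor = A.Stride / A.Size; // Group members per iteration.
  int64_t Dist = A.Addr - B.Addr;
  if (Dist % A.Size != 0)
    return std::nullopt;              // Not element-aligned with B.
  int64_t Index = Dist / A.Size;
  if (Index < 0 || Index >= Factor)
    return std::nullopt;              // Falls outside B's group.
  return Index;
}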
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
- if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
- ORE->emit(createMissedAnalysis("ConditionalStore")
- << "store that is conditionally executed prevents vectorization");
- DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
- return None;
- }
-
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
// TODO: It may be useful to do this, since the check is still likely to be
// dynamically uniform if the target can skip it.
- DEBUG(dbgs() << "LV: Not inserting runtime ptr check for divergent target");
+ LLVM_DEBUG(
+ dbgs() << "LV: Not inserting runtime ptr check for divergent target");
ORE->emit(
createMissedAnalysis("CantVersionLoopWithDivergentTarget")
@@ -6271,20 +4906,22 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
<< "runtime pointer checks needed. Enable vectorization of this "
"loop with '#pragma clang loop vectorize(enable)' when "
"compiling with -Os/-Oz");
- DEBUG(dbgs()
- << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
return None;
}
// If we optimize the program for size, avoid creating the tail loop.
- DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
// If we don't know the precise trip count, don't try to vectorize.
if (TC < 2) {
ORE->emit(
createMissedAnalysis("UnknownLoopCountComplexCFG")
<< "unable to calculate the loop count due to complex control flow");
- DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
return None;
}
@@ -6302,7 +4939,8 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
"same time. Enable vectorization of this loop "
"with '#pragma clang loop vectorize(enable)' "
"when compiling with -Os/-Oz");
- DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
return None;
}
@@ -6327,29 +4965,30 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
unsigned MaxVectorSize = WidestRegister / WidestType;
- DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
- << WidestType << " bits.\n");
- DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister
- << " bits.\n");
+ LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
+ << " / " << WidestType << " bits.\n");
+ LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
+ << WidestRegister << " bits.\n");
- assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
- " into one vector!");
+ assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
+ " into one vector!");
if (MaxVectorSize == 0) {
- DEBUG(dbgs() << "LV: The target has no vector registers.\n");
+ LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
MaxVectorSize = 1;
return MaxVectorSize;
} else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
isPowerOf2_32(ConstTripCount)) {
// We need to clamp the VF to be the ConstTripCount. There is no point in
// choosing a higher viable VF as done in the loop below.
- DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
- << ConstTripCount << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
+ << ConstTripCount << "\n");
MaxVectorSize = ConstTripCount;
return MaxVectorSize;
}
unsigned MaxVF = MaxVectorSize;
- if (MaximizeBandwidth && !OptForSize) {
+ if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
+ (MaximizeBandwidth && !OptForSize)) {
// Collect all viable vectorization factors larger than the default MaxVF
// (i.e. MaxVectorSize).
SmallVector<unsigned, 8> VFs;
@@ -6369,24 +5008,30 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
break;
}
}
+ if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
+ if (MaxVF < MinVF) {
+ LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
+ << ") with target's minimum: " << MinVF << '\n');
+ MaxVF = MinVF;
+ }
+ }
}
return MaxVF;
}
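Stripped of the TTI plumbing, the computation above is: divide the widest safe register width by the widest element type to get the vector size, clamp down to a small power-of-two constant trip count when one is known, and respect a target-imposed minimum VF. A sketch of that arithmetic under those assumptions (the signature is illustrative, not the pass's API):

static bool isPowerOf2(unsigned X) { return X && !(X & (X - 1)); }

unsigned feasibleMaxVF(unsigned WidestRegisterBits, unsigned WidestTypeBits,
                       unsigned ConstTripCount, unsigned TargetMinVF) {
  unsigned MaxVectorSize = WidestRegisterBits / WidestTypeBits;
  if (MaxVectorSize == 0)
    return 1; // The target has no vector registers: stay scalar.
  if (ConstTripCount && ConstTripCount < MaxVectorSize &&
      isPowerOf2(ConstTripCount))
    return ConstTripCount; // No point in a VF beyond the trip count.
  unsigned MaxVF = MaxVectorSize;
  if (TargetMinVF && MaxVF < TargetMinVF)
    MaxVF = TargetMinVF; // Target overrides with a wider minimum.
  return MaxVF;
}

// For example, 256-bit registers with i32 elements give MaxVF = 8, while a
// known constant trip count of 4 clamps the answer to 4.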
-LoopVectorizationCostModel::VectorizationFactor
+VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
float Cost = expectedCost(1).first;
-#ifndef NDEBUG
const float ScalarCost = Cost;
-#endif /* NDEBUG */
unsigned Width = 1;
- DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
+ LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
- // Ignore scalar width, because the user explicitly wants vectorization.
if (ForceVectorization && MaxVF > 1) {
- Width = 2;
- Cost = expectedCost(Width).first / (float)Width;
+ // Ignore scalar width, because the user explicitly wants vectorization.
+ // Initialize cost to max so that at least VF = 2 is chosen during cost
+ // evaluation.
+ Cost = std::numeric_limits<float>::max();
}
for (unsigned i = 2; i <= MaxVF; i *= 2) {
@@ -6395,10 +5040,10 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
// the vector elements.
VectorizationCostTy C = expectedCost(i);
float VectorCost = C.first / (float)i;
- DEBUG(dbgs() << "LV: Vector loop of width " << i
- << " costs: " << (int)VectorCost << ".\n");
+ LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
+ << " costs: " << (int)VectorCost << ".\n");
if (!C.second && !ForceVectorization) {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << "LV: Not considering vector loop of width " << i
<< " because it will not generate any vector instructions.\n");
continue;
@@ -6409,10 +5054,19 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
}
}
- DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
- << "LV: Vectorization seems to be not beneficial, "
- << "but was forced by a user.\n");
- DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
+ if (!EnableCondStoresVectorization && NumPredStores) {
+ ORE->emit(createMissedAnalysis("ConditionalStore")
+ << "store that is conditionally executed prevents vectorization");
+ LLVM_DEBUG(
+ dbgs() << "LV: No vectorization. There are conditional stores.\n");
+ Width = 1;
+ Cost = ScalarCost;
+ }
+
+ LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
+ << "LV: Vectorization seems to be not beneficial, "
+ << "but was forced by a user.\n");
+ LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
return Factor;
}
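The selection above compares vector loops by cost per scalar iteration, i.e. expectedCost(VF) / VF, against the scalar baseline; forcing vectorization simply replaces that baseline with an effectively infinite cost so that some VF > 1 wins. A reduced sketch, with ExpectedCost as an opaque stand-in for the cost model:

#include <functional>
#include <limits>

struct VFChoice { unsigned Width; float CostPerLane; };

VFChoice selectVF(unsigned MaxVF, bool ForceVectorization,
                  const std::function<float(unsigned)> &ExpectedCost) {
  float ScalarCost = ExpectedCost(1);
  float Best = (ForceVectorization && MaxVF > 1)
                   ? std::numeric_limits<float>::max() // Ignore scalar width.
                   : ScalarCost;
  unsigned Width = 1;
  for (unsigned VF = 2; VF <= MaxVF; VF *= 2) {
    float PerLane = ExpectedCost(VF) / static_cast<float>(VF);
    if (PerLane < Best) { // Cheaper per scalar iteration wins.
      Best = PerLane;
      Width = VF;
    }
  }
  return {Width, Best};
}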
@@ -6460,7 +5114,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
// optimization to non-pointer types.
//
if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
- !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I))
+ !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
continue;
MinWidth = std::min(MinWidth,
@@ -6504,8 +5158,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
return 1;
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
- DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
- << " registers\n");
+ LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
+ << " registers\n");
if (VF == 1) {
if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
@@ -6519,7 +5173,6 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// We divide by these constants so assume that we have at least one
// instruction that uses at least one register.
R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
- R.NumInstructions = std::max(R.NumInstructions, 1U);
// We calculate the interleave count using the following formula.
// Subtract the number of loop invariants from the number of available
@@ -6564,7 +5217,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// Interleave if we vectorized this loop and there is a reduction that could
// benefit from interleaving.
if (VF > 1 && !Legal->getReductionVars()->empty()) {
- DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
+ LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
return IC;
}
@@ -6575,7 +5228,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// We want to interleave small loops in order to reduce the loop overhead and
// potentially expose ILP opportunities.
- DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
// We assume that the cost overhead is 1 and we use the cost model
// to estimate the cost of the loop and interleave until the cost of the
@@ -6603,11 +5256,12 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
if (EnableLoadStoreRuntimeInterleave &&
std::max(StoresIC, LoadsIC) > SmallIC) {
- DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Interleaving to saturate store or load ports.\n");
return std::max(StoresIC, LoadsIC);
}
- DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+ LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
return SmallIC;
}
@@ -6615,11 +5269,11 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// this point) that could benefit from interleaving.
bool HasReductions = !Legal->getReductionVars()->empty();
if (TTI.enableAggressiveInterleaving(HasReductions)) {
- DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+ LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
return IC;
}
- DEBUG(dbgs() << "LV: Not Interleaving.\n");
+ LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
return 1;
}
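The register-pressure heuristic referenced above amounts to: spare registers (available minus loop-invariant) divided by the maximum number of simultaneously live values, rounded down to a power of two. A sketch of that formula with illustrative names:

#include <algorithm>

static unsigned floorPowerOf2(unsigned X) {
  unsigned P = 1;
  while (P * 2 <= X)
    P *= 2;
  return P;
}

unsigned interleaveCount(unsigned TargetNumRegisters,
                         unsigned LoopInvariantRegs, unsigned MaxLocalUsers) {
  // Assume at least one instruction uses at least one register.
  MaxLocalUsers = std::max(MaxLocalUsers, 1U);
  unsigned Spare = TargetNumRegisters > LoopInvariantRegs
                       ? TargetNumRegisters - LoopInvariantRegs
                       : 1;
  return floorPowerOf2(std::max(1U, Spare / MaxLocalUsers));
}

// E.g. 16 registers, 2 held by loop invariants, 3 live values at the widest
// point: (16 - 2) / 3 = 4, already a power of two, so interleave by 4.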
@@ -6646,7 +5300,6 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
DFS.perform(LI);
RegisterUsage RU;
- RU.NumInstructions = 0;
// Each 'key' in the map opens a new interval. The values
// of the map are the index of the 'last seen' usage of the
@@ -6658,14 +5311,13 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
// Marks the end of each interval.
IntervalMap EndPoint;
// Saves the list of instruction indices that are used in the loop.
- SmallSet<Instruction *, 8> Ends;
+ SmallPtrSet<Instruction *, 8> Ends;
// Saves the list of values that are used in the loop but are
// defined outside the loop, such as arguments and constants.
SmallPtrSet<Value *, 8> LoopInvariants;
unsigned Index = 0;
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
- RU.NumInstructions += BB->size();
for (Instruction &I : *BB) {
IdxToInstr[Index++] = &I;
@@ -6698,7 +5350,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
for (auto &Interval : EndPoint)
TransposeEnds[Interval.second].push_back(Interval.first);
- SmallSet<Instruction *, 8> OpenIntervals;
+ SmallPtrSet<Instruction *, 8> OpenIntervals;
// Get the size of the widest register.
unsigned MaxSafeDepDist = -1U;
@@ -6711,7 +5363,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
SmallVector<RegisterUsage, 8> RUs(VFs.size());
SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
- DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+ LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
// A lambda that gets the register usage for the given type and VF.
auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
@@ -6756,8 +5408,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
}
- DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
- << OpenIntervals.size() << '\n');
+ LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
+ << OpenIntervals.size() << '\n');
// Add the current instruction to the list of open intervals.
OpenIntervals.insert(I);
@@ -6772,10 +5424,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
Invariant += GetRegUsage(Inst->getType(), VFs[i]);
}
- DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
- DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
- DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
- DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
+ LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
+ LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
+ LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
+ << '\n');
RU.LoopInvariantRegs = Invariant;
RU.MaxLocalUsers = MaxUsages[i];
@@ -6785,6 +5437,22 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
return RUs;
}
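The interval bookkeeping above can be pictured with a toy model: each value is live from the index of its definition to the index of its last use, and peak register pressure is the maximum number of intervals open at once. A quadratic but self-contained sketch:

#include <algorithm>
#include <vector>

struct LiveInterval { unsigned Def; unsigned LastUse; };

// Peak pressure can only increase where an interval opens, so it suffices to
// count, at each definition point, how many intervals are live there.
unsigned maxOpenIntervals(const std::vector<LiveInterval> &Intervals) {
  unsigned MaxOpen = 0;
  for (const LiveInterval &P : Intervals) {
    unsigned Open = 0;
    for (const LiveInterval &Q : Intervals)
      if (Q.Def <= P.Def && P.Def <= Q.LastUse)
        ++Open; // Q is live at the point where P is defined.
    MaxOpen = std::max(MaxOpen, Open);
  }
  return MaxOpen;
}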
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
+ // TODO: The cost model for emulated masked load/store is completely
+ // broken. This hack guides the cost model to use an artificially high
+ // enough value to practically disable vectorization with such operations,
+ // except where the previously deployed legality hack allowed using very
+ // low cost values. This avoids regressions coming simply from moving the
+ // "masked load/store" check from legality to the cost model. Masked
+ // load/gather emulation was previously never allowed; only a limited
+ // amount of masked store/scatter emulation was allowed.
+ assert(isScalarWithPredication(I) &&
+ "Expecting a scalar emulated instruction");
+ return isa<LoadInst>(I) ||
+ (isa<StoreInst>(I) &&
+ NumPredStores > NumberOfStoresToPredicate);
+}
+
void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
// If we aren't vectorizing the loop, or if we've already collected the
// instructions to scalarize, there's nothing to do. Collection may already
@@ -6805,11 +5473,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
if (!Legal->blockNeedsPredication(BB))
continue;
for (Instruction &I : *BB)
- if (Legal->isScalarWithPredication(&I)) {
+ if (isScalarWithPredication(&I)) {
ScalarCostsTy ScalarCosts;
- if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+ // Do not apply the discount logic if the hacked cost is needed
+ // for emulated masked memrefs.
+ if (!useEmulatedMaskMemRefHack(&I) &&
+ computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
-
// Remember that BB will remain after vectorization.
PredicatedBBsAfterVectorization.insert(BB);
}
@@ -6844,7 +5514,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// If the instruction is scalar with predication, it will be analyzed
// separately. We ignore it within the context of PredInst.
- if (Legal->isScalarWithPredication(I))
+ if (isScalarWithPredication(I))
return false;
// If any of the instruction's operands are uniform after vectorization,
@@ -6898,7 +5568,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
- if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+ if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
true, false);
ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
@@ -6940,11 +5610,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
VectorizationCostTy BlockCost;
// For each instruction in the old loop.
- for (Instruction &I : *BB) {
- // Skip dbg intrinsics.
- if (isa<DbgInfoIntrinsic>(I))
- continue;
-
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
// Skip ignored values.
if (ValuesToIgnore.count(&I) ||
(VF > 1 && VecValuesToIgnore.count(&I)))
@@ -6958,8 +5624,9 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
BlockCost.first += C.first;
BlockCost.second |= C.second;
- DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first << " for VF "
- << VF << " For instruction: " << I << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
+ << " for VF " << VF << " For instruction: " << I
+ << '\n');
}
// If we are vectorizing a predicated block, it will have been
@@ -6978,7 +5645,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
return Cost;
}
-/// \brief Gets Address Access SCEV after verifying that the access pattern
+/// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
@@ -7020,7 +5687,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
unsigned Alignment = getMemInstAlignment(I);
unsigned AS = getMemInstAddressSpace(I);
- Value *Ptr = getPointerOperand(I);
+ Value *Ptr = getLoadStorePointerOperand(I);
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
// Figure out whether the access is strided and get the stride value
@@ -7041,9 +5708,15 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// If we have a predicated store, it may not be executed for each vector
// lane. Scale the cost by the probability of executing the predicated
// block.
- if (Legal->isScalarWithPredication(I))
+ if (isScalarWithPredication(I)) {
Cost /= getReciprocalPredBlockProb();
+ if (useEmulatedMaskMemRefHack(I))
+ // Artificially set the cost to a value high enough to practically
+ // disable vectorization with such operations.
+ Cost = 3000000;
+ }
+
return Cost;
}
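Two adjustments are applied above: the scalarization cost is divided by the reciprocal probability of the predicated block executing (2 in this cost model), and emulated masked memrefs are then overridden with a prohibitive constant. A sketch of that flow, assuming the factor of 2:

unsigned predicatedMemOpCost(unsigned RawScalarizationCost,
                             bool IsScalarWithPredication,
                             bool EmulatedMaskMemRef) {
  unsigned Cost = RawScalarizationCost;
  if (IsScalarWithPredication) {
    // The block is assumed to run every other iteration on average.
    Cost /= 2;
    if (EmulatedMaskMemRef)
      Cost = 3000000; // Artificially high: practically forbids this VF.
  }
  return Cost;
}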
@@ -7052,7 +5725,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
unsigned Alignment = getMemInstAlignment(I);
- Value *Ptr = getPointerOperand(I);
+ Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getMemInstAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
@@ -7088,7 +5761,7 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
unsigned Alignment = getMemInstAlignment(I);
- Value *Ptr = getPointerOperand(I);
+ Value *Ptr = getLoadStorePointerOperand(I);
return TTI.getAddressComputationCost(VectorTy) +
TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
@@ -7101,7 +5774,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
Type *VectorTy = ToVectorTy(ValTy, VF);
unsigned AS = getMemInstAddressSpace(I);
- auto Group = Legal->getInterleavedAccessGroup(I);
+ auto Group = getInterleavedAccessGroup(I);
assert(Group && "Fail to get an interleaved access group.");
unsigned InterleaveFactor = Group->getFactor();
@@ -7168,13 +5841,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
if (VF == 1)
return;
+ NumPredStores = 0;
for (BasicBlock *BB : TheLoop->blocks()) {
// For each instruction in the old loop.
for (Instruction &I : *BB) {
- Value *Ptr = getPointerOperand(&I);
+ Value *Ptr = getLoadStorePointerOperand(&I);
if (!Ptr)
continue;
+ if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+ NumPredStores++;
if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
// Scalar load + broadcast
unsigned Cost = getUniformMemOpCost(&I, VF);
@@ -7183,9 +5859,10 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
}
// We assume that widening is the best solution when possible.
- if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
+ if (memoryInstructionCanBeWidened(&I, VF)) {
unsigned Cost = getConsecutiveMemOpCost(&I, VF);
- int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I));
+ int ConsecutiveStride =
+ Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Expected consecutive stride.");
InstWidening Decision =
@@ -7197,8 +5874,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
// Choose between Interleaving, Gather/Scatter or Scalarization.
unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
unsigned NumAccesses = 1;
- if (Legal->isAccessInterleaved(&I)) {
- auto Group = Legal->getInterleavedAccessGroup(&I);
+ if (isAccessInterleaved(&I)) {
+ auto Group = getInterleavedAccessGroup(&I);
assert(Group && "Fail to get an interleaved access group.");
// Make one decision for the whole group.
@@ -7210,7 +5887,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
}
unsigned GatherScatterCost =
- Legal->isLegalGatherOrScatter(&I)
+ isLegalGatherOrScatter(&I)
? getGatherScatterCost(&I, VF) * NumAccesses
: std::numeric_limits<unsigned>::max();
@@ -7235,7 +5912,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
// If the instructions belongs to an interleave group, the whole group
// receives the same decision. The whole group receives the cost, but
// the cost will actually be assigned to one instruction.
- if (auto Group = Legal->getInterleavedAccessGroup(&I))
+ if (auto Group = getInterleavedAccessGroup(&I))
setWideningDecision(Group, VF, Decision, Cost);
else
setWideningDecision(&I, VF, Decision, Cost);
@@ -7255,7 +5932,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
for (BasicBlock *BB : TheLoop->blocks())
for (Instruction &I : *BB) {
Instruction *PtrDef =
- dyn_cast_or_null<Instruction>(getPointerOperand(&I));
+ dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
if (PtrDef && TheLoop->contains(PtrDef) &&
getWideningDecision(&I, VF) != CM_GatherScatter)
AddrDefs.insert(PtrDef);
@@ -7285,7 +5962,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
// Scalarize a widened load of address.
setWideningDecision(I, VF, CM_Scalarize,
(VF * getMemoryInstructionCost(I, 1)));
- else if (auto Group = Legal->getInterleavedAccessGroup(I)) {
+ else if (auto Group = getInterleavedAccessGroup(I)) {
// Scalarize an interleave group of address loads.
for (unsigned I = 0; I < Group->getFactor(); ++I) {
if (Instruction *Member = Group->getMember(I))
@@ -7371,7 +6048,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// vector lane. Get the scalarization cost and scale this amount by the
// probability of executing the predicated block. If the instruction is not
// predicated, we fall through to the next case.
- if (VF > 1 && Legal->isScalarWithPredication(I)) {
+ if (VF > 1 && isScalarWithPredication(I)) {
unsigned Cost = 0;
// These instructions have a non-void type, so account for the phi nodes
@@ -7569,7 +6246,7 @@ Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
// Check if the pointer operand of a load or store instruction is
// consecutive.
- if (auto *Ptr = getPointerOperand(Inst))
+ if (auto *Ptr = getLoadStorePointerOperand(Inst))
return Legal->isConsecutivePtr(Ptr);
return false;
}
@@ -7594,23 +6271,59 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
}
}
-LoopVectorizationCostModel::VectorizationFactor
+VectorizationFactor
+LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
+ unsigned UserVF) {
+ // Width 1 means no vectorization, cost 0 means uncomputed cost.
+ const VectorizationFactor NoVectorization = {1U, 0U};
+
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
+ if (!OrigLoop->empty()) {
+ // TODO: If UserVF is not provided, we set UserVF to 4 for stress testing.
+ // This won't be necessary when UserVF is not required in the VPlan-native
+ // path.
+ if (VPlanBuildStressTest && !UserVF)
+ UserVF = 4;
+
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+ assert(UserVF && "Expected UserVF for outer loop vectorization.");
+ assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+ LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+ buildVPlans(UserVF, UserVF);
+
+ // For VPlan build stress testing, we bail out after VPlan construction.
+ if (VPlanBuildStressTest)
+ return NoVectorization;
+
+ return {UserVF, 0};
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
+ "VPlan-native path.\n");
+ return NoVectorization;
+}
+
+VectorizationFactor
LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
- // Width 1 means no vectorize, cost 0 means uncomputed cost.
- const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,
- 0U};
+ assert(OrigLoop->empty() && "Inner loop expected.");
+ // Width 1 means no vectorization, cost 0 means uncomputed cost.
+ const VectorizationFactor NoVectorization = {1U, 0U};
Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
return NoVectorization;
if (UserVF) {
- DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+ LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
CM.selectUserVectorizationFactor(UserVF);
- buildVPlans(UserVF, UserVF);
- DEBUG(printPlans(dbgs()));
+ buildVPlansWithVPRecipes(UserVF, UserVF);
+ LLVM_DEBUG(printPlans(dbgs()));
return {UserVF, 0};
}
@@ -7627,8 +6340,8 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
CM.collectInstsToScalarize(VF);
}
- buildVPlans(1, MaxVF);
- DEBUG(printPlans(dbgs()));
+ buildVPlansWithVPRecipes(1, MaxVF);
+ LLVM_DEBUG(printPlans(dbgs()));
if (MaxVF == 1)
return NoVectorization;
@@ -7637,7 +6350,8 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
}
void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
- DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n');
+ LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
+ << '\n');
BestVF = VF;
BestUF = UF;
@@ -7787,30 +6501,15 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange(
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
-
- // Collect conditions feeding internal conditional branches; they need to be
- // represented in VPlan for it to model masking.
- SmallPtrSet<Value *, 1> NeedDef;
-
- auto *Latch = OrigLoop->getLoopLatch();
- for (BasicBlock *BB : OrigLoop->blocks()) {
- if (BB == Latch)
- continue;
- BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
- if (Branch && Branch->isConditional())
- NeedDef.insert(Branch->getCondition());
- }
-
for (unsigned VF = MinVF; VF < MaxVF + 1;) {
VFRange SubRange = {VF, MaxVF + 1};
- VPlans.push_back(buildVPlan(SubRange, NeedDef));
+ VPlans.push_back(buildVPlan(SubRange));
VF = SubRange.End;
}
}
-VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src,
- BasicBlock *Dst,
- VPlanPtr &Plan) {
+VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
+ VPlanPtr &Plan) {
assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
// Look for cached value.
@@ -7840,8 +6539,7 @@ VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src,
return EdgeMaskCache[Edge] = EdgeMask;
}
-VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB,
- VPlanPtr &Plan) {
+VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
// Look for cached value.
@@ -7874,10 +6572,9 @@ VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB,
return BlockMaskCache[BB] = BlockMask;
}
-VPInterleaveRecipe *
-LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
- VFRange &Range) {
- const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(I);
+VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
+ VFRange &Range) {
+ const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I);
if (!IG)
return nullptr;
@@ -7889,7 +6586,7 @@ LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
LoopVectorizationCostModel::CM_Interleave);
};
};
- if (!getDecisionAndClampRange(isIGMember(I), Range))
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
return nullptr;
// I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
@@ -7902,8 +6599,8 @@ LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
}
VPWidenMemoryInstructionRecipe *
-LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range,
- VPlanPtr &Plan) {
+VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
+ VPlanPtr &Plan) {
if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
return nullptr;
@@ -7922,7 +6619,7 @@ LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range,
return Decision != LoopVectorizationCostModel::CM_Scalarize;
};
- if (!getDecisionAndClampRange(willWiden, Range))
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
return nullptr;
VPValue *Mask = nullptr;
@@ -7933,8 +6630,7 @@ LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range,
}
VPWidenIntOrFpInductionRecipe *
-LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
- VFRange &Range) {
+VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
if (PHINode *Phi = dyn_cast<PHINode>(I)) {
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
@@ -7959,15 +6655,14 @@ LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
[=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
};
- if (isa<TruncInst>(I) &&
- getDecisionAndClampRange(isOptimizableIVTruncate(I), Range))
+ if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
+ isOptimizableIVTruncate(I), Range))
return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
cast<TruncInst>(I));
return nullptr;
}
-VPBlendRecipe *
-LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) {
+VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
PHINode *Phi = dyn_cast<PHINode>(I);
if (!Phi || Phi->getParent() == OrigLoop->getHeader())
return nullptr;
@@ -7991,9 +6686,9 @@ LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) {
return new VPBlendRecipe(Phi, Masks);
}
-bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
- VFRange &Range) {
- if (Legal->isScalarWithPredication(I))
+bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
+ VFRange &Range) {
+ if (CM.isScalarWithPredication(I))
return false;
auto IsVectorizableOpcode = [](unsigned Opcode) {
@@ -8077,7 +6772,7 @@ bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
return true;
};
- if (!getDecisionAndClampRange(willWiden, Range))
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
return false;
// Success: widen this instruction. We optimize the common case where
@@ -8092,15 +6787,15 @@ bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
return true;
}
-VPBasicBlock *LoopVectorizationPlanner::handleReplication(
+VPBasicBlock *VPRecipeBuilder::handleReplication(
Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
VPlanPtr &Plan) {
- bool IsUniform = getDecisionAndClampRange(
+ bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
- bool IsPredicated = Legal->isScalarWithPredication(I);
+ bool IsPredicated = CM.isScalarWithPredication(I);
auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
// Find if I uses a predicated instruction. If so, it will use its scalar
@@ -8113,24 +6808,25 @@ VPBasicBlock *LoopVectorizationPlanner::handleReplication(
// Finalize the recipe for Instr, first if it is not predicated.
if (!IsPredicated) {
- DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
VPBB->appendRecipe(Recipe);
return VPBB;
}
- DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
assert(VPBB->getSuccessors().empty() &&
"VPBB has successors when handling predicated replication.");
// Record predicated instructions for above packing optimizations.
PredInst2Recipe[I] = Recipe;
- VPBlockBase *Region =
- VPBB->setOneSuccessor(createReplicateRegion(I, Recipe, Plan));
- return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock()));
+ VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
+ VPBlockUtils::insertBlockAfter(Region, VPBB);
+ auto *RegSucc = new VPBasicBlock();
+ VPBlockUtils::insertBlockAfter(RegSucc, Region);
+ return RegSucc;
}
-VPRegionBlock *
-LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr,
- VPRecipeBase *PredRecipe,
- VPlanPtr &Plan) {
+VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
+ VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan) {
// Instructions marked for predication are replicated and placed under an
// if-then construct to prevent side-effects.
@@ -8150,19 +6846,67 @@ LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr,
// Note: first set Entry as region entry and then connect successors starting
// from it in order, to propagate the "parent" of each VPBasicBlock.
- Entry->setTwoSuccessors(Pred, Exit);
- Pred->setOneSuccessor(Exit);
+ VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
+ VPBlockUtils::connectBlocks(Pred, Exit);
return Region;
}
-LoopVectorizationPlanner::VPlanPtr
-LoopVectorizationPlanner::buildVPlan(VFRange &Range,
- const SmallPtrSetImpl<Value *> &NeedDef) {
- EdgeMaskCache.clear();
- BlockMaskCache.clear();
- DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
- DenseMap<Instruction *, Instruction *> SinkAfterInverse;
+bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
+ VPlanPtr &Plan, VPBasicBlock *VPBB) {
+ VPRecipeBase *Recipe = nullptr;
+ // Check if Instr should belong to an interleave memory recipe, or already
+ // does. In the latter case Instr is irrelevant.
+ if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+
+ // Check if Instr is a memory operation that should be widened.
+ if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+
+ // Check if Instr should form some PHI recipe.
+ if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+ if ((Recipe = tryToBlend(Instr, Plan))) {
+ VPBB->appendRecipe(Recipe);
+ return true;
+ }
+ if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
+ VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
+ return true;
+ }
+
+ // Check if Instr is to be widened by a general VPWidenRecipe, after
+ // having first checked for specific widening recipes that deal with
+ // Interleave Groups, Inductions and Phi nodes.
+ if (tryToWiden(Instr, VPBB, Range))
+ return true;
+
+ return false;
+}
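tryToCreateRecipe is a fixed-priority dispatch: each specialized builder gets a chance in order, the first recipe produced wins, and failure of all of them tells the caller to fall back to replication. The shape of that pattern, with hypothetical template stand-ins for the recipe types:

#include <functional>
#include <vector>

template <typename Instr, typename Recipe>
Recipe *buildFirstMatching(
    Instr *I, const std::vector<std::function<Recipe *(Instr *)>> &Builders) {
  // Order matters: interleave groups, then widened memory ops, then
  // inductions, then blends, then generic widening.
  for (const auto &TryBuild : Builders)
    if (Recipe *R = TryBuild(I))
      return R;
  return nullptr; // No widening recipe applies; replicate per lane instead.
}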
+
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
+ unsigned MaxVF) {
+ assert(OrigLoop->empty() && "Inner loop expected.");
+
+ // Collect conditions feeding internal conditional branches; they need to be
+ // represented in VPlan for it to model masking.
+ SmallPtrSet<Value *, 1> NeedDef;
+
+ auto *Latch = OrigLoop->getLoopLatch();
+ for (BasicBlock *BB : OrigLoop->blocks()) {
+ if (BB == Latch)
+ continue;
+ BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
+ if (Branch && Branch->isConditional())
+ NeedDef.insert(Branch->getCondition());
+ }
// Collect instructions from the original loop that will become trivially dead
// in the vectorized loop. We don't need to vectorize these instructions. For
@@ -8173,15 +6917,31 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
SmallPtrSet<Instruction *, 4> DeadInstructions;
collectTriviallyDeadInstructions(DeadInstructions);
+ for (unsigned VF = MinVF; VF < MaxVF + 1;) {
+ VFRange SubRange = {VF, MaxVF + 1};
+ VPlans.push_back(
+ buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
+ VF = SubRange.End;
+ }
+}
+
+LoopVectorizationPlanner::VPlanPtr
+LoopVectorizationPlanner::buildVPlanWithVPRecipes(
+ VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
// Hold a mapping from predicated instructions to their recipes, in order to
// fix their AlsoPack behavior if a user is determined to replicate and use a
// scalar instead of vector value.
DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
+ DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+ DenseMap<Instruction *, Instruction *> SinkAfterInverse;
+
// Create a dummy pre-entry VPBasicBlock to start building the VPlan.
VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
auto Plan = llvm::make_unique<VPlan>(VPBB);
+ VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder);
// Represent values that will have defs inside VPlan.
for (Value *V : NeedDef)
Plan->addVPValue(V);
@@ -8196,7 +6956,7 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
// ingredients and fill a new VPBasicBlock.
unsigned VPBBsForBB = 0;
auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
- VPBB->setOneSuccessor(FirstVPBBForBB);
+ VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
VPBB = FirstVPBBForBB;
Builder.setInsertPoint(VPBB);
@@ -8204,18 +6964,17 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
// Organize the ingredients to vectorize from the current basic block in the
// right order.
- for (Instruction &I : *BB) {
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
Instruction *Instr = &I;
// First filter out irrelevant instructions, to ensure no recipes are
// built for them.
- if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) ||
- DeadInstructions.count(Instr))
+ if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
continue;
// I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
// member of the IG, do not construct any Recipe for it.
- const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr);
+ const InterleaveGroup *IG = CM.getInterleavedAccessGroup(Instr);
if (IG && Instr != IG->getInsertPos() &&
Range.Start >= 2 && // Query is illegal for VF == 1
CM.getWideningDecision(Instr, Range.Start) ==
@@ -8230,8 +6989,9 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
// should follow.
auto SAIt = SinkAfter.find(Instr);
if (SAIt != SinkAfter.end()) {
- DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second
- << " to vectorize a 1st order recurrence.\n");
+ LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
+ << *SAIt->second
+ << " to vectorize a 1st order recurrence.\n");
SinkAfterInverse[SAIt->second] = Instr;
continue;
}
@@ -8247,45 +7007,13 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
// Introduce each ingredient into VPlan.
for (Instruction *Instr : Ingredients) {
- VPRecipeBase *Recipe = nullptr;
-
- // Check if Instr should belong to an interleave memory recipe, or already
- // does. In the latter case Instr is irrelevant.
- if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
- VPBB->appendRecipe(Recipe);
- continue;
- }
-
- // Check if Instr is a memory operation that should be widened.
- if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
- VPBB->appendRecipe(Recipe);
- continue;
- }
-
- // Check if Instr should form some PHI recipe.
- if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
- VPBB->appendRecipe(Recipe);
- continue;
- }
- if ((Recipe = tryToBlend(Instr, Plan))) {
- VPBB->appendRecipe(Recipe);
- continue;
- }
- if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
- VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
- continue;
- }
-
- // Check if Instr is to be widened by a general VPWidenRecipe, after
- // having first checked for specific widening recipes that deal with
- // Interleave Groups, Inductions and Phi nodes.
- if (tryToWiden(Instr, VPBB, Range))
+ if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
continue;
// Otherwise, if all widening options failed, Instruction is to be
// replicated. This may create a successor for VPBB.
- VPBasicBlock *NextVPBB =
- handleReplication(Instr, Range, VPBB, PredInst2Recipe, Plan);
+ VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
+ Instr, Range, VPBB, PredInst2Recipe, Plan);
if (NextVPBB != VPBB) {
VPBB = NextVPBB;
VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
@@ -8300,7 +7028,7 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
assert(PreEntry->empty() && "Expecting empty pre-entry block.");
VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
- PreEntry->disconnectSuccessor(Entry);
+ VPBlockUtils::disconnectBlocks(PreEntry, Entry);
delete PreEntry;
std::string PlanName;
@@ -8319,6 +7047,30 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range,
return Plan;
}
+LoopVectorizationPlanner::VPlanPtr
+LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
+ assert(!OrigLoop->empty());
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+
+ // Create new empty VPlan
+ auto Plan = llvm::make_unique<VPlan>();
+
+ // Build hierarchical CFG
+ VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI);
+ HCFGBuilder.buildHierarchicalCFG(*Plan.get());
+
+ return Plan;
+}
+
+Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
+    Value *V, unsigned Part) {
+ return ILV.getOrCreateVectorValue(V, Part);
+}
+
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
O << " +\n"
<< Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
@@ -8483,28 +7235,66 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}
+// Process the loop in the VPlan-native vectorization path. This path builds
+// VPlan upfront in the vectorization pipeline, which allows applying
+// VPlan-to-VPlan transformations from the very beginning without modifying the
+// input LLVM IR.
+static bool processLoopInVPlanNativePath(
+ Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
+ LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
+ TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) {
+
+ assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
+ Function *F = L->getHeader()->getParent();
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
+ LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
+ &Hints, IAI);
+ // Use the planner for outer loop vectorization.
+ // TODO: CM is not used at this point inside the planner. Turn CM into an
+ // optional argument if we don't need it in the future.
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
+
+ // Get user vectorization factor.
+ unsigned UserVF = Hints.getWidth();
+
+ // Check the function attributes to find out if this function should be
+ // optimized for size.
+ bool OptForSize =
+ Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ LVP.planInVPlanNativePath(OptForSize, UserVF);
+
+ // Return false: we are not currently generating vector code in the
+ // VPlan-native path.
+ return false;
+}
+
bool LoopVectorizePass::processLoop(Loop *L) {
- assert(L->empty() && "Only process inner loops.");
+ assert((EnableVPlanNativePath || L->empty()) &&
+ "VPlan-native path is not enabled. Only process inner loops.");
#ifndef NDEBUG
const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */
- DEBUG(dbgs() << "\nLV: Checking a loop in \""
- << L->getHeader()->getParent()->getName() << "\" from "
- << DebugLocStr << "\n");
+ LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
+ << L->getHeader()->getParent()->getName() << "\" from "
+ << DebugLocStr << "\n");
LoopVectorizeHints Hints(L, DisableUnrolling, *ORE);
- DEBUG(dbgs() << "LV: Loop hints:"
- << " force="
- << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
- ? "disabled"
- : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
- ? "enabled"
- : "?"))
- << " width=" << Hints.getWidth()
- << " unroll=" << Hints.getInterleave() << "\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop hints:"
+ << " force="
+ << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
+ ? "disabled"
+ : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
+ ? "enabled"
+ : "?"))
+ << " width=" << Hints.getWidth()
+ << " unroll=" << Hints.getInterleave() << "\n");
// Function containing loop
Function *F = L->getHeader()->getParent();
@@ -8518,7 +7308,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// benefit from vectorization, respectively.
if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
- DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
+ LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
return false;
}
@@ -8526,10 +7316,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements(*ORE);
- LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE,
- &Requirements, &Hints);
- if (!LVL.canVectorize()) {
- DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+ LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
+ &Requirements, &Hints, DB, AC);
+ if (!LVL.canVectorize(EnableVPlanNativePath)) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
emitMissedWarning(F, L, Hints, ORE);
return false;
}
@@ -8539,11 +7329,33 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool OptForSize =
Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+  // Entry point for the VPlan-native vectorization path. Outer loops are
+  // processed here. They may require CFG and instruction-level transformations
+  // before we can even evaluate whether vectorization is profitable. Since we
+  // cannot modify the incoming IR, we need to build VPlan upfront in the
+  // vectorization pipeline.
+ if (!L->empty())
+ return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
+ ORE, Hints);
+
+ assert(L->empty() && "Inner loop expected.");
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
- unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
- bool HasExpectedTC = (ExpectedTC > 0);
-
+  // Prefer a constant trip count over profile data, and profile data over the
+  // upper-bound estimate.
+ unsigned ExpectedTC = 0;
+ bool HasExpectedTC = false;
+ if (const SCEVConstant *ConstExits =
+ dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
+ const APInt &ExitsCount = ConstExits->getAPInt();
+    // We are interested in small values for ExpectedTC. Skip over those that
+    // don't fit in an unsigned.
+ if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
+ ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
+ HasExpectedTC = true;
+ }
+ }
+  // The backedge-taken count may be variable or too large to fit in an
+  // unsigned. In that case, check profiling information to decide whether we
+  // should vectorize.
if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
auto EstimatedTC = getLoopEstimatedTripCount(L);
if (EstimatedTC) {
@@ -8551,15 +7363,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
HasExpectedTC = true;
}
}
+ if (!HasExpectedTC) {
+ ExpectedTC = SE->getSmallConstantMaxTripCount(L);
+ HasExpectedTC = (ExpectedTC > 0);
+ }
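+  // Worked example (illustrative values, not from the patch): a backedge-taken
+  // count that is SCEVConstant 7 gives ExpectedTC = 8; with no usable
+  // constant, the profile-based estimate is tried next; only then does the
+  // SCEV small-constant-max bound serve as a fallback.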
if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
- DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
- << "This loop is worth vectorizing only if no scalar "
- << "iteration overheads are incurred.");
+ LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
+ << "This loop is worth vectorizing only if no scalar "
+ << "iteration overheads are incurred.");
if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
- DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
+ LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
else {
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
// Loops with a very small trip count are considered for vectorization
// under OptForSize, thereby making sure the cost of their loop body is
// dominant, free of runtime guards and scalar iteration overheads.
@@ -8572,10 +7388,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// an integer loop and the vector instructions selected are purely integer
// vector instructions?
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
- DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
- "attribute is used.\n");
- ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
- "NoImplicitFloat", L)
+ LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
+ "attribute is used.\n");
+ ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
+ "NoImplicitFloat", L)
<< "loop not vectorized due to NoImplicitFloat attribute");
emitMissedWarning(F, L, Hints, ORE);
return false;
@@ -8587,17 +7403,30 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// additional fp-math flags can help.
if (Hints.isPotentiallyUnsafe() &&
TTI->isFPVectorizationPotentiallyUnsafe()) {
- DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
ORE->emit(
- createMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
+ createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
<< "loop not vectorized due to unsafe FP support.");
emitMissedWarning(F, L, Hints, ORE);
return false;
}
+ bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
+
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
+ UseInterleaved = EnableInterleavedMemAccesses;
+
+ // Analyze interleaved memory accesses.
+  if (UseInterleaved)
+    IAI.analyzeInterleaving();
+
// Use the cost model.
LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
- &Hints);
+ &Hints, IAI);
CM.collectValuesToIgnore();
// Use the planner for vectorization.
@@ -8607,8 +7436,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
unsigned UserVF = Hints.getWidth();
// Plan how to best vectorize, return the best VF and its cost.
- LoopVectorizationCostModel::VectorizationFactor VF =
- LVP.plan(OptForSize, UserVF);
+ VectorizationFactor VF = LVP.plan(OptForSize, UserVF);
// Select the interleave count.
unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
@@ -8620,14 +7448,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
bool VectorizeLoop = true, InterleaveLoop = true;
if (Requirements.doesNotMeet(F, L, Hints)) {
- DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
- "requirements.\n");
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
+ "requirements.\n");
emitMissedWarning(F, L, Hints, ORE);
return false;
}
if (VF.Width == 1) {
- DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+ LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
VecDiagMsg = std::make_pair(
"VectorizationNotBeneficial",
"the cost-model indicates that vectorization is not beneficial");
@@ -8636,7 +7464,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (IC == 1 && UserIC <= 1) {
// Tell the user interleaving is not beneficial.
- DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
+ LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
IntDiagMsg = std::make_pair(
"InterleavingNotBeneficial",
"the cost-model indicates that interleaving is not beneficial");
@@ -8648,8 +7476,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
}
} else if (IC > 1 && UserIC == 1) {
// Tell the user interleaving is beneficial, but it was explicitly disabled.
- DEBUG(dbgs()
- << "LV: Interleaving is beneficial but is explicitly disabled.");
+ LLVM_DEBUG(
+ dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
IntDiagMsg = std::make_pair(
"InterleavingBeneficialButDisabled",
"the cost-model indicates that interleaving is beneficial "
@@ -8676,24 +7504,24 @@ bool LoopVectorizePass::processLoop(Loop *L) {
});
return false;
} else if (!VectorizeLoop && InterleaveLoop) {
- DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
ORE->emit([&]() {
return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
L->getStartLoc(), L->getHeader())
<< VecDiagMsg.second;
});
} else if (VectorizeLoop && !InterleaveLoop) {
- DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
- << DebugLocStr << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
+ << ") in " << DebugLocStr << '\n');
ORE->emit([&]() {
return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
L->getStartLoc(), L->getHeader())
<< IntDiagMsg.second;
});
} else if (VectorizeLoop && InterleaveLoop) {
- DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
- << DebugLocStr << '\n');
- DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
+ << ") in " << DebugLocStr << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
LVP.setBestPlan(VF.Width, IC);
@@ -8740,7 +7568,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized();
- DEBUG(verifyFunction(*L->getHeader()->getParent()));
+ LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
return true;
}
@@ -8788,7 +7616,7 @@ bool LoopVectorizePass::runImpl(
SmallVector<Loop *, 8> Worklist;
for (Loop *L : *LI)
- addAcyclicInnerLoop(*L, Worklist);
+ collectSupportedLoops(*L, LI, ORE, Worklist);
LoopsAnalyzed += Worklist.size();
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a7ccd3faec44..ac8c4f046c6f 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -161,7 +161,7 @@ static const unsigned MaxMemDepDistance = 160;
/// regions to be handled.
static const int MinScheduleRegionSize = 16;
-/// \brief Predicate for the element types that the SLP vectorizer supports.
+/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important things to filter here are types that are invalid in
/// LLVM vectors. We also filter target-specific types which have absolutely no
@@ -246,13 +246,15 @@ static bool isSplat(ArrayRef<Value *> VL) {
/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
/// ret <4 x i8> %ins4
/// InstCombiner transforms this into a shuffle and vector mul
+/// TODO: Can we split off and reuse the shuffle mask detection from
+/// TargetTransformInfo::getInstructionThroughput?
static Optional<TargetTransformInfo::ShuffleKind>
isShuffle(ArrayRef<Value *> VL) {
auto *EI0 = cast<ExtractElementInst>(VL[0]);
unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
- enum ShuffleMode {Unknown, FirstAlternate, SecondAlternate, Permute};
+ enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
auto *EI = cast<ExtractElementInst>(VL[I]);
@@ -272,7 +274,11 @@ isShuffle(ArrayRef<Value *> VL) {
continue;
// For correct shuffling we can have at most two different vector operands
// across all extractelement instructions.
- if (Vec1 && Vec2 && Vec != Vec1 && Vec != Vec2)
+ if (!Vec1 || Vec1 == Vec)
+ Vec1 = Vec;
+ else if (!Vec2 || Vec2 == Vec)
+ Vec2 = Vec;
+ else
return None;
if (CommonShuffleMode == Permute)
continue;
@@ -282,119 +288,17 @@ isShuffle(ArrayRef<Value *> VL) {
CommonShuffleMode = Permute;
continue;
}
- // Check the shuffle mode for the current operation.
- if (!Vec1)
- Vec1 = Vec;
- else if (Vec != Vec1)
- Vec2 = Vec;
- // Example: shufflevector A, B, <0,5,2,7>
- // I is odd and IntIdx for A == I - FirstAlternate shuffle.
- // I is even and IntIdx for B == I - FirstAlternate shuffle.
- // Example: shufflevector A, B, <4,1,6,3>
- // I is even and IntIdx for A == I - SecondAlternate shuffle.
- // I is odd and IntIdx for B == I - SecondAlternate shuffle.
- const bool IIsEven = I & 1;
- const bool CurrVecIsA = Vec == Vec1;
- const bool IIsOdd = !IIsEven;
- const bool CurrVecIsB = !CurrVecIsA;
- ShuffleMode CurrentShuffleMode =
- ((IIsOdd && CurrVecIsA) || (IIsEven && CurrVecIsB)) ? FirstAlternate
- : SecondAlternate;
- // Common mode is not set or the same as the shuffle mode of the current
- // operation - alternate.
- if (CommonShuffleMode == Unknown)
- CommonShuffleMode = CurrentShuffleMode;
- // Common shuffle mode is not the same as the shuffle mode of the current
- // operation - permutation.
- if (CommonShuffleMode != CurrentShuffleMode)
- CommonShuffleMode = Permute;
+ CommonShuffleMode = Select;
}
// If we're not crossing lanes in different vectors, consider it as blending.
- if ((CommonShuffleMode == FirstAlternate ||
- CommonShuffleMode == SecondAlternate) &&
- Vec2)
- return TargetTransformInfo::SK_Alternate;
+ if (CommonShuffleMode == Select && Vec2)
+ return TargetTransformInfo::SK_Select;
// If Vec2 was never used, we have a permutation of a single vector; otherwise
// we have a permutation of two vectors.
return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
: TargetTransformInfo::SK_PermuteSingleSrc;
}
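
The simplified classifier above is easier to follow on plain data. Here is a standalone sketch of the same rule, with a hypothetical Extract record standing in for extractelement instructions (undef handling and LLVM types omitted): at most two distinct source vectors are allowed, and if every element stays in its own lane the bundle is a select/blend, otherwise a permute of one or two sources.

#include <cstdio>
#include <optional>
#include <vector>

enum class ShuffleKind { Select, PermuteSingleSrc, PermuteTwoSrc };

// One extractelement: which source vector it reads, and which lane.
struct Extract {
  int Vec;      // source vector id
  unsigned Idx; // lane index within that source
};

// At most two distinct sources are allowed. If every element stays in its
// own lane (Idx == position in the bundle), the bundle is a select/blend;
// otherwise it is a permute of one or two sources.
std::optional<ShuffleKind> classify(const std::vector<Extract> &VL) {
  int Vec1 = -1, Vec2 = -1;
  bool Permute = false;
  for (unsigned I = 0; I < VL.size(); ++I) {
    if (Vec1 < 0 || Vec1 == VL[I].Vec)
      Vec1 = VL[I].Vec;
    else if (Vec2 < 0 || Vec2 == VL[I].Vec)
      Vec2 = VL[I].Vec;
    else
      return std::nullopt; // a third source vector: not a shuffle
    if (VL[I].Idx != I)
      Permute = true;      // the element crosses lanes
  }
  if (!Permute && Vec2 >= 0)
    return ShuffleKind::Select;
  return Vec2 >= 0 ? ShuffleKind::PermuteTwoSrc
                   : ShuffleKind::PermuteSingleSrc;
}

int main() {
  // Mask <0,5,2,7> over two 4-wide vectors A and B: every element keeps its
  // lane, so this is a select.
  std::vector<Extract> Sel = {{0, 0}, {1, 1}, {0, 2}, {1, 3}};
  std::printf("select? %d\n", classify(Sel) == ShuffleKind::Select);
}
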
-///\returns Opcode that can be clubbed with \p Op to create an alternate
-/// sequence which can later be merged as a ShuffleVector instruction.
-static unsigned getAltOpcode(unsigned Op) {
- switch (Op) {
- case Instruction::FAdd:
- return Instruction::FSub;
- case Instruction::FSub:
- return Instruction::FAdd;
- case Instruction::Add:
- return Instruction::Sub;
- case Instruction::Sub:
- return Instruction::Add;
- default:
- return 0;
- }
-}
-
-static bool isOdd(unsigned Value) {
- return Value & 1;
-}
-
-static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
- unsigned CheckedOpcode) {
- return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
-}
-
-/// Chooses the correct key for scheduling data. If \p Op has the same (or
-/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
-/// OpValue.
-static Value *isOneOf(Value *OpValue, Value *Op) {
- auto *I = dyn_cast<Instruction>(Op);
- if (!I)
- return OpValue;
- auto *OpInst = cast<Instruction>(OpValue);
- unsigned OpInstOpcode = OpInst->getOpcode();
- unsigned IOpcode = I->getOpcode();
- if (sameOpcodeOrAlt(OpInstOpcode, getAltOpcode(OpInstOpcode), IOpcode))
- return Op;
- return OpValue;
-}
-
-namespace {
-
-/// Contains data for the instructions going to be vectorized.
-struct RawInstructionsData {
- /// Main Opcode of the instructions going to be vectorized.
- unsigned Opcode = 0;
-
- /// The list of instructions have some instructions with alternate opcodes.
- bool HasAltOpcodes = false;
-};
-
-} // end anonymous namespace
-
-/// Checks the list of the vectorized instructions \p VL and returns info about
-/// this list.
-static RawInstructionsData getMainOpcode(ArrayRef<Value *> VL) {
- auto *I0 = dyn_cast<Instruction>(VL[0]);
- if (!I0)
- return {};
- RawInstructionsData Res;
- unsigned Opcode = I0->getOpcode();
- // Walk through the list of the vectorized instructions
- // in order to check its structure described by RawInstructionsData.
- for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
- auto *I = dyn_cast<Instruction>(VL[Cnt]);
- if (!I)
- return {};
- if (Opcode != I->getOpcode())
- Res.HasAltOpcodes = true;
- }
- Res.Opcode = Opcode;
- return Res;
-}
-
namespace {
/// Main data required for vectorization of instructions.
@@ -402,42 +306,90 @@ struct InstructionsState {
/// The very first instruction in the list with the main opcode.
Value *OpValue = nullptr;
- /// The main opcode for the list of instructions.
- unsigned Opcode = 0;
+ /// The main/alternate instruction.
+ Instruction *MainOp = nullptr;
+ Instruction *AltOp = nullptr;
+
+ /// The main/alternate opcodes for the list of instructions.
+ unsigned getOpcode() const {
+ return MainOp ? MainOp->getOpcode() : 0;
+ }
+
+ unsigned getAltOpcode() const {
+ return AltOp ? AltOp->getOpcode() : 0;
+ }
/// Some of the instructions in the list have alternate opcodes.
- bool IsAltShuffle = false;
+ bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
+
+ bool isOpcodeOrAlt(Instruction *I) const {
+ unsigned CheckedOpcode = I->getOpcode();
+ return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
+ }
- InstructionsState() = default;
- InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle)
- : OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle) {}
+ InstructionsState() = delete;
+ InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
+ : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};
} // end anonymous namespace
+/// Chooses the correct key for scheduling data. If \p Op has the same (or
+/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
+/// OpValue.
+static Value *isOneOf(const InstructionsState &S, Value *Op) {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (I && S.isOpcodeOrAlt(I))
+ return Op;
+ return S.OpValue;
+}
+
/// \returns analysis of the instructions in \p VL described in
/// InstructionsState: the opcode with which we suppose the whole list
/// could be vectorized, even if its structure is diverse.
-static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
- auto Res = getMainOpcode(VL);
- unsigned Opcode = Res.Opcode;
- if (!Res.HasAltOpcodes)
- return InstructionsState(VL[0], Opcode, false);
- auto *OpInst = cast<Instruction>(VL[0]);
- unsigned AltOpcode = getAltOpcode(Opcode);
- // Examine each element in the list instructions VL to determine
- // if some operations there could be considered as an alternative
- // (for example as subtraction relates to addition operation).
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+ unsigned BaseIndex = 0) {
+ // Make sure these are all Instructions.
+ if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+
+ bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
+ bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
+ unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
+ unsigned AltOpcode = Opcode;
+ unsigned AltIndex = BaseIndex;
+
+ // Check for one alternate opcode from another BinaryOperator.
+ // TODO - generalize to support all operators (types, calls etc.).
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
- auto *I = cast<Instruction>(VL[Cnt]);
- unsigned InstOpcode = I->getOpcode();
- if ((Res.HasAltOpcodes &&
- InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) ||
- (!Res.HasAltOpcodes && InstOpcode != Opcode)) {
- return InstructionsState(OpInst, 0, false);
- }
+ unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
+ if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
+ if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ if (Opcode == AltOpcode) {
+ AltOpcode = InstOpcode;
+ AltIndex = Cnt;
+ continue;
+ }
+ } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
+ Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
+ Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
+ if (Ty0 == Ty1) {
+ if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ if (Opcode == AltOpcode) {
+ AltOpcode = InstOpcode;
+ AltIndex = Cnt;
+ continue;
+ }
+ }
+ } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
}
- return InstructionsState(OpInst, Opcode, Res.HasAltOpcodes);
+
+ return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
+ cast<Instruction>(VL[AltIndex]));
}
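
Below is a standalone model of the new main/alternate scan, with opcodes as plain integers (names hypothetical; the real code additionally restricts alternates to BinaryOperators or casts with matching source types): the first opcode is the main one, the first differing opcode becomes the alternate, and a third distinct opcode fails the bundle.

#include <cstdio>
#include <vector>

// Result of scanning a bundle: a main and an alternate opcode (equal when
// the bundle is uniform), or invalid (Main == 0) on failure.
struct OpState {
  int Main = 0, Alt = 0;
  bool valid() const { return Main != 0; }
  bool isAltShuffle() const { return Main != Alt; }
};

// Scan a non-empty bundle of opcodes: the first opcode is the main one, the
// first differing opcode becomes the alternate, and any third distinct
// opcode makes the bundle non-vectorizable.
OpState sameOpcode(const std::vector<int> &Ops) {
  OpState S{Ops[0], Ops[0]};
  for (int Op : Ops) {
    if (Op == S.Main || Op == S.Alt)
      continue;
    if (S.Main == S.Alt) { // no alternate chosen yet
      S.Alt = Op;
      continue;
    }
    return OpState{};      // third distinct opcode
  }
  return S;
}

int main() {
  enum { Add = 1, Sub = 2, Mul = 3 };
  OpState S = sameOpcode({Add, Sub, Add, Sub});
  std::printf("valid=%d altshuffle=%d\n", S.valid(), S.isAltShuffle());
  std::printf("third opcode valid=%d\n", sameOpcode({Add, Sub, Mul}).valid());
}
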
/// \returns true if all of the values in \p VL have the same type or false
@@ -452,16 +404,21 @@ static bool allSameType(ArrayRef<Value *> VL) {
}
/// \returns True if Extract{Value,Element} instruction extracts element Idx.
-static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
- assert(Opcode == Instruction::ExtractElement ||
- Opcode == Instruction::ExtractValue);
+static Optional<unsigned> getExtractIndex(Instruction *E) {
+ unsigned Opcode = E->getOpcode();
+ assert((Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue) &&
+ "Expected extractelement or extractvalue instruction.");
if (Opcode == Instruction::ExtractElement) {
- ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
- return CI && CI->getZExtValue() == Idx;
- } else {
- ExtractValueInst *EI = cast<ExtractValueInst>(E);
- return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
+ auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+ if (!CI)
+ return None;
+ return CI->getZExtValue();
}
+ ExtractValueInst *EI = cast<ExtractValueInst>(E);
+ if (EI->getNumIndices() != 1)
+ return None;
+ return *EI->idx_begin();
}
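
The shape of this refactor, replacing a per-index boolean match with an Optional index getter, is what later allows non-identity extract orders to be recorded rather than rejected. A sketch of the pattern under assumed, simplified types (nothing here is LLVM API):

#include <cstdio>
#include <optional>

// Hypothetical flattened view of an extract instruction.
struct Extract {
  bool IsElement;      // extractelement (true) vs extractvalue (false)
  bool ConstIdx;       // element index is a compile-time constant
  unsigned NumIndices; // extractvalue may carry nested indices
  unsigned Idx;        // the (first) index value
};

// Return the extracted position, or nullopt when the extract cannot be
// matched against a lane: a variable element index, or a multi-level
// aggregate index.
std::optional<unsigned> extractIndex(const Extract &E) {
  if (E.IsElement)
    return E.ConstIdx ? std::optional<unsigned>(E.Idx) : std::nullopt;
  if (E.NumIndices != 1)
    return std::nullopt;
  return E.Idx;
}

int main() {
  if (auto I = extractIndex(Extract{true, true, 0, 3}))
    std::printf("extracts lane %u\n", *I);
}
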
/// \returns True if in-tree use also needs extract. This refers to
@@ -549,7 +506,7 @@ public:
MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
}
- /// \brief Vectorize the tree that starts with the elements in \p VL.
+ /// Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
@@ -585,8 +542,8 @@ public:
ScalarToTreeEntry.clear();
MustGather.clear();
ExternalUses.clear();
- NumLoadsWantToKeepOrder = 0;
- NumLoadsWantToChangeOrder = 0;
+ NumOpsWantToKeepOrder.clear();
+ NumOpsWantToKeepOriginalOrder = 0;
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
@@ -596,12 +553,22 @@ public:
unsigned getTreeSize() const { return VectorizableTree.size(); }
- /// \brief Perform LICM and CSE on the newly generated gather sequences.
- void optimizeGatherSequence(Function &F);
+ /// Perform LICM and CSE on the newly generated gather sequences.
+ void optimizeGatherSequence();
+
+ /// \returns The best order of instructions for vectorization.
+ Optional<ArrayRef<unsigned>> bestOrder() const {
+ auto I = std::max_element(
+ NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
+ [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
+ const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
+ return D1.second < D2.second;
+ });
+ if (I == NumOpsWantToKeepOrder.end() ||
+ I->getSecond() <= NumOpsWantToKeepOriginalOrder)
+ return None;
- /// \returns true if it is beneficial to reverse the vector order.
- bool shouldReorder() const {
- return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
+ return makeArrayRef(I->getFirst());
}
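
The old code could only vote "keep" or "reverse"; the new code lets every bundle vote for an arbitrary permutation. A standalone sketch of the voting logic (std::map standing in for the DenseMap; names hypothetical):

#include <cstdio>
#include <map>
#include <optional>
#include <vector>

using Order = std::vector<unsigned>;

struct OrderVotes {
  std::map<Order, unsigned> WantOrder; // non-identity orders -> vote count
  unsigned WantOriginal = 0;           // votes for keeping the original order

  void voteIdentity() { ++WantOriginal; }
  void vote(const Order &O) { ++WantOrder[O]; }

  // As in bestOrder(): return the most popular non-identity order, but only
  // if it strictly beats keeping the original order.
  std::optional<Order> best() const {
    const Order *Best = nullptr;
    unsigned BestCount = 0;
    for (const auto &KV : WantOrder) {
      if (KV.second > BestCount) {
        Best = &KV.first;
        BestCount = KV.second;
      }
    }
    if (!Best || BestCount <= WantOriginal)
      return std::nullopt;
    return *Best;
  }
};

int main() {
  OrderVotes V;
  V.voteIdentity();     // one bundle is happy as-is
  V.vote({1, 0, 3, 2}); // two bundles want the same swap
  V.vote({1, 0, 3, 2});
  std::printf("reorder? %d\n", V.best().has_value()); // 2 > 1 -> yes
}

Note the asymmetry: a non-identity order must strictly beat the identity count, so reordering never happens on a tie.
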
/// \return The vector element size in bits to use when vectorizing the
@@ -625,7 +592,7 @@ public:
return MinVecRegSize;
}
- /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
+ /// Check if ArrayType or StructType is isomorphic to some VectorType.
///
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
unsigned canMapToVector(Type *T, const DataLayout &DL) const;
@@ -648,9 +615,13 @@ private:
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
- /// \returns True if the ExtractElement/ExtractValue instructions in VL can
- /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
- bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
+ /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
+ /// be vectorized to use the original vector (or aggregate "bitcast" to a
+ /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
+ /// returns false, setting \p CurrentOrder to either an empty vector or a
+  /// non-identity permutation that allows the extract instructions to be
+  /// reused.
+ bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+ SmallVectorImpl<unsigned> &CurrentOrder) const;
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
@@ -658,22 +629,19 @@ private:
/// Vectorize a single entry in the tree, starting in \p VL.
Value *vectorizeTree(ArrayRef<Value *> VL);
- /// \returns the pointer to the vectorized value if \p VL is already
- /// vectorized, or NULL. They may happen in cycles.
- Value *alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const;
-
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
- int getGatherCost(Type *Ty);
+ int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices);
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
int getGatherCost(ArrayRef<Value *> VL);
- /// \brief Set the Builder insert point to one after the last instruction in
+ /// Set the Builder insert point to one after the last instruction in
/// the bundle
- void setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue);
+ void setInsertPointAfterBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S);
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
@@ -684,7 +652,8 @@ private:
/// Reorder commutative operands in an alt shuffle if doing so results in
/// vectorized code.
- void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
+ void reorderAltShuffleOperands(const InstructionsState &S,
+ ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
@@ -698,8 +667,12 @@ private:
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
- assert(VL.size() == Scalars.size() && "Invalid size");
- return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ if (VL.size() == Scalars.size())
+ return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ return VL.size() == ReuseShuffleIndices.size() &&
+ std::equal(
+ VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
+ [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
}
/// A vector of scalars.
@@ -711,6 +684,12 @@ private:
/// Do we need to gather this sequence?
bool NeedToGather = false;
+ /// Does this sequence require some shuffling?
+ SmallVector<unsigned, 4> ReuseShuffleIndices;
+
+ /// Does this entry require reordering?
+ ArrayRef<unsigned> ReorderIndices;
+
/// Points back to the VectorizableTree.
///
/// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
@@ -725,13 +704,17 @@ private:
};
/// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
- int &UserTreeIdx) {
+ void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx,
+ ArrayRef<unsigned> ReuseShuffleIndices = None,
+ ArrayRef<unsigned> ReorderIndices = None) {
VectorizableTree.emplace_back(VectorizableTree);
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized;
+ Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
+ ReuseShuffleIndices.end());
+ Last->ReorderIndices = ReorderIndices;
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
@@ -744,7 +727,6 @@ private:
if (UserTreeIdx >= 0)
Last->UserTreeIndices.push_back(UserTreeIdx);
UserTreeIdx = idx;
- return Last;
}
/// -- Vectorization State --
@@ -758,13 +740,6 @@ private:
return nullptr;
}
- const TreeEntry *getTreeEntry(Value *V) const {
- auto I = ScalarToTreeEntry.find(V);
- if (I != ScalarToTreeEntry.end())
- return &VectorizableTree[I->second];
- return nullptr;
- }
-
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, int> ScalarToTreeEntry;
@@ -1038,7 +1013,7 @@ private:
template <typename ReadyListType>
void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
SD->IsScheduled = true;
- DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
ScheduleData *BundleMember = SD;
while (BundleMember) {
@@ -1061,8 +1036,8 @@ private:
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
- DEBUG(dbgs()
- << "SLP: gets ready (def): " << *DepBundle << "\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (def): " << *DepBundle << "\n");
}
});
}
@@ -1075,8 +1050,8 @@ private:
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
- DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle
- << "\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (mem): " << *DepBundle << "\n");
}
}
BundleMember = BundleMember->NextInBundle;
@@ -1101,7 +1076,8 @@ private:
doForAllOpcodes(I, [&](ScheduleData *SD) {
if (SD->isSchedulingEntity() && SD->isReady()) {
ReadyList.insert(SD);
- DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: initially in ready list: " << *I << "\n");
}
});
}
@@ -1110,7 +1086,8 @@ private:
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
- bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, Value *OpValue);
+ bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+ const InstructionsState &S);
/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
@@ -1120,7 +1097,7 @@ private:
/// Extends the scheduling region so that V is inside the region.
/// \returns true if the region size is within the limit.
- bool extendSchedulingRegion(Value *V, Value *OpValue);
+ bool extendSchedulingRegion(Value *V, const InstructionsState &S);
/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
@@ -1201,11 +1178,38 @@ private:
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
- // Number of load bundles that contain consecutive loads.
- int NumLoadsWantToKeepOrder = 0;
+ using OrdersType = SmallVector<unsigned, 4>;
+ /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
+ /// sorted SmallVectors of unsigned.
+ struct OrdersTypeDenseMapInfo {
+ static OrdersType getEmptyKey() {
+ OrdersType V;
+ V.push_back(~1U);
+ return V;
+ }
+
+ static OrdersType getTombstoneKey() {
+ OrdersType V;
+ V.push_back(~2U);
+ return V;
+ }
+
+ static unsigned getHashValue(const OrdersType &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
+ return LHS == RHS;
+ }
+ };
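
Why the ~1U / ~2U vectors: DenseMap stores keys inline, so it needs two key values no real entry can ever take in order to mark empty and tombstoned buckets, and a genuine order is a permutation of 0..N-1 that never contains ~1U or ~2U. A standalone analogue with std::unordered_map needs only the hash, since its keys live out of line (illustrative sketch, not LLVM API):

#include <cstdio>
#include <unordered_map>
#include <vector>

using Order = std::vector<unsigned>;

// std::unordered_map keeps keys out of line, so a hash functor is all it
// needs. DenseMap stores keys inline and must also reserve two impossible
// key values (the ~1U / ~2U vectors above) to mark empty and erased slots.
struct OrderHash {
  size_t operator()(const Order &V) const {
    size_t H = 0;
    for (unsigned U : V) // simple hash-combine over the elements
      H = H * 31 + U;
    return H;
  }
};

int main() {
  std::unordered_map<Order, unsigned, OrderHash> Votes;
  ++Votes[{1, 0, 3, 2}];
  ++Votes[{1, 0, 3, 2}];
  std::printf("votes = %u\n", Votes[{1, 0, 3, 2}]);
}
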
- // Number of load bundles that contain consecutive loads in reversed order.
- int NumLoadsWantToChangeOrder = 0;
+ /// Contains orders of operations along with the number of bundles that have
+ /// operations in this order. It stores only those orders that require
+  /// reordering; bundles that keep the original order are counted in \a
+ /// NumOpsWantToKeepOriginalOrder.
+ DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
+ /// Number of bundles that do not require reordering.
+ unsigned NumOpsWantToKeepOriginalOrder = 0;
// Analysis and block reference.
Function *F;
@@ -1242,7 +1246,7 @@ template <> struct GraphTraits<BoUpSLP *> {
/// NodeRef has to be a pointer per the GraphWriter.
using NodeRef = TreeEntry *;
- /// \brief Add the VectorizableTree to the index iterator to be able to return
+ /// Add the VectorizableTree to the index iterator to be able to return
/// TreeEntry pointers.
struct ChildIteratorType
: public iterator_adaptor_base<ChildIteratorType,
@@ -1340,17 +1344,22 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
+ int FoundLane = Lane;
+ if (!Entry->ReuseShuffleIndices.empty()) {
+ FoundLane =
+ std::distance(Entry->ReuseShuffleIndices.begin(),
+ llvm::find(Entry->ReuseShuffleIndices, FoundLane));
+ }
// Check if the scalar is externally used as an extra arg.
auto ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
- DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
- Lane << " from " << *Scalar << ".\n");
- ExternalUses.emplace_back(Scalar, nullptr, Lane);
- continue;
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
+ << Lane << " from " << *Scalar << ".\n");
+ ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
}
for (User *U : Scalar->users()) {
- DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
Instruction *UserInst = dyn_cast<Instruction>(U);
if (!UserInst)
@@ -1364,8 +1373,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
// be used.
if (UseScalar != U ||
!InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
- DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
- << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+ << ".\n");
assert(!UseEntry->NeedToGather && "Bad state");
continue;
}
@@ -1375,9 +1384,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
if (is_contained(UserIgnoreList, UserInst))
continue;
- DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
- Lane << " from " << *Scalar << ".\n");
- ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
+ << Lane << " from " << *Scalar << ".\n");
+ ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
}
}
}
@@ -1389,28 +1398,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
InstructionsState S = getSameOpcode(VL);
if (Depth == RecursionMaxDepth) {
- DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Don't handle vectors.
if (S.OpValue->getType()->isVectorTy()) {
- DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
if (SI->getValueOperand()->getType()->isVectorTy()) {
- DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
- DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+ if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
+    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1421,8 +1430,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Don't vectorize ephemeral values.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (EphValues.count(VL[i])) {
- DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
- ") is ephemeral.\n");
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
+ << ") is ephemeral.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1430,18 +1439,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Check if this is a duplicate of another entry.
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
- for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
- if (E->Scalars[i] != VL[i]) {
- DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, false, UserTreeIdx);
- return;
- }
+ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
+ if (!E->isSame(VL)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
}
// Record the reuse of the tree node. FIXME, currently this is only used to
// properly draw the graph rather than for the actual vectorization.
E->UserTreeIndices.push_back(UserTreeIdx);
- DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
+ << ".\n");
return;
}
@@ -1451,8 +1459,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!I)
continue;
if (getTreeEntry(I)) {
- DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
- ") is already in tree.\n");
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
+ << ") is already in tree.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1462,7 +1470,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// we need to gather the scalars.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i])) {
- DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1476,19 +1484,32 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!DT->isReachableFromEntry(BB)) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
- DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
+ LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Check that every instruction appears once in this bundle.
- for (unsigned i = 0, e = VL.size(); i < e; ++i)
- for (unsigned j = i + 1; j < e; ++j)
- if (VL[i] == VL[j]) {
- DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, false, UserTreeIdx);
- return;
- }
+ SmallVector<unsigned, 4> ReuseShuffleIndicies;
+ SmallVector<Value *, 4> UniqueValues;
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
+ if (Res.second)
+ UniqueValues.emplace_back(V);
+ }
+ if (UniqueValues.size() == VL.size()) {
+ ReuseShuffleIndicies.clear();
+ } else {
+ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+    if (UniqueValues.size() <= 1 ||
+        !llvm::isPowerOf2_32(UniqueValues.size())) {
+ LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+ VL = UniqueValues;
+ }
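+  // For illustration (example values, not from the patch): VL = {A, B, A, C}
+  // yields UniqueValues = {A, B, C} and ReuseShuffleIndicies = {0, 1, 0, 2};
+  // the tree is built over the three unique scalars and one final shuffle
+  // with that mask reconstructs the original four-lane value.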
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
@@ -1496,18 +1517,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BlockScheduling &BS = *BSRef.get();
- if (!BS.tryScheduleBundle(VL, this, S.OpValue)) {
- DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
+ if (!BS.tryScheduleBundle(VL, this, S)) {
+ LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL0) ||
!BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
- DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
+ LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
- unsigned ShuffleOrOp = S.IsAltShuffle ?
- (unsigned) Instruction::ShuffleVector : S.Opcode;
+ unsigned ShuffleOrOp = S.isAltShuffle() ?
+ (unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
@@ -1518,15 +1539,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TerminatorInst *Term = dyn_cast<TerminatorInst>(
cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
if (Term) {
- DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
@@ -1541,13 +1564,35 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- bool Reuse = canReuseExtract(VL, VL0);
+ OrdersType CurrentOrder;
+ bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
if (Reuse) {
- DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
- } else {
- BS.cancelScheduling(VL, VL0);
+ LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
+ ++NumOpsWantToKeepOriginalOrder;
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
}
- newTreeEntry(VL, Reuse, UserTreeIdx);
+ if (!CurrentOrder.empty()) {
+ LLVM_DEBUG({
+ dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
+ "with order";
+ for (unsigned Idx : CurrentOrder)
+ dbgs() << " " << Idx;
+ dbgs() << "\n";
+ });
+ // Insert new order with initial value 0, if it does not exist,
+ // otherwise return the iterator to the existing one.
+ auto StoredCurrentOrderAndNum =
+ NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+ ++StoredCurrentOrderAndNum->getSecond();
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies,
+ StoredCurrentOrderAndNum->getFirst());
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
+ newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies);
+ BS.cancelScheduling(VL, VL0);
return;
}
case Instruction::Load: {
@@ -1562,62 +1607,67 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
// Make sure all loads in the bundle are simple - we can't vectorize
// atomic or volatile loads.
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
- LoadInst *L = cast<LoadInst>(VL[i]);
+ SmallVector<Value *, 4> PointerOps(VL.size());
+ auto POIter = PointerOps.begin();
+ for (Value *V : VL) {
+ auto *L = cast<LoadInst>(V);
if (!L->isSimple()) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
+ *POIter = L->getPointerOperand();
+ ++POIter;
}
- // Check if the loads are consecutive, reversed, or neither.
- // TODO: What we really want is to sort the loads, but for now, check
- // the two likely directions.
- bool Consecutive = true;
- bool ReverseConsecutive = true;
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
- if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
- Consecutive = false;
- break;
+ OrdersType CurrentOrder;
+ // Check the order of pointer operands.
+ if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (CurrentOrder.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
} else {
- ReverseConsecutive = false;
+ Ptr0 = PointerOps[CurrentOrder.front()];
+ PtrN = PointerOps[CurrentOrder.back()];
}
- }
-
- if (Consecutive) {
- ++NumLoadsWantToKeepOrder;
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of loads.\n");
- return;
- }
-
- // If none of the load pairs were consecutive when checked in order,
- // check the reverse order.
- if (ReverseConsecutive)
- for (unsigned i = VL.size() - 1; i > 0; --i)
- if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
- ReverseConsecutive = false;
- break;
+ const SCEV *Scev0 = SE->getSCEV(Ptr0);
+ const SCEV *ScevN = SE->getSCEV(PtrN);
+ const auto *Diff =
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
+ uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+ // Check that the sorted loads are consecutive.
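+      // Illustrative example: four i32 loads at p, p+4, p+8, p+12 (after
+      // sorting) give ScevN - Scev0 == 12 == (4 - 1) * 4, so the bundle is
+      // one consecutive run even if the IR order was jumbled.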
+ if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) {
+ if (CurrentOrder.empty()) {
+          // Original loads are consecutive and do not require reordering.
+ ++NumOpsWantToKeepOriginalOrder;
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+ } else {
+ // Need to reorder.
+ auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+ ++I->getSecond();
+ newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+ ReuseShuffleIndicies, I->getFirst());
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
}
+ return;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
-
- if (ReverseConsecutive) {
- ++NumLoadsWantToChangeOrder;
- DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
- } else {
- DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
- }
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
case Instruction::ZExt:
@@ -1637,13 +1687,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs()
+ << "SLP: Gathering casts with different src types.\n");
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of casts.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
@@ -1665,14 +1716,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs()
+ << "SLP: Gathering cmp with different predicate.\n");
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of compares.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
@@ -1703,14 +1755,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
- reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);
+ reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right);
buildTree_rec(Left, Depth + 1, UserTreeIdx);
buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
@@ -1730,9 +1782,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// We don't combine GEPs with complicated (nested) indexing.
for (unsigned j = 0; j < VL.size(); ++j) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
- DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
@@ -1743,9 +1795,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned j = 0; j < VL.size(); ++j) {
Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
if (Ty0 != CurTy) {
- DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
@@ -1754,16 +1807,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned j = 0; j < VL.size(); ++j) {
auto Op = cast<Instruction>(VL[j])->getOperand(1);
if (!isa<ConstantInt>(Op)) {
- DEBUG(
- dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ LLVM_DEBUG(dbgs()
+ << "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1779,13 +1832,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
for (Value *j : VL)
@@ -1802,8 +1855,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
Function *Int = CI->getCalledFunction();
@@ -1816,9 +1869,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
- << "\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
+ << "\n");
return;
}
// ctlz,cttz and powi are special intrinsics whose second argument
@@ -1827,10 +1880,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
- << " argument "<< A1I<<"!=" << A1J
- << "\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
+ << " argument " << A1I << "!=" << A1J << "\n");
return;
}
}
@@ -1840,14 +1892,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
- << *VL[i] << '\n');
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
+ << *CI << "!=" << *VL[i] << '\n');
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx);
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1862,19 +1914,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::ShuffleVector:
// If this is not an alternate sequence of opcode like add-sub
// then do not vectorize this instruction.
- if (!S.IsAltShuffle) {
+ if (!S.isAltShuffle()) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
- newTreeEntry(VL, true, UserTreeIdx);
- DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+ newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
- reorderAltShuffleOperands(S.Opcode, VL, Left, Right);
+ reorderAltShuffleOperands(S, VL, Left, Right);
buildTree_rec(Left, Depth + 1, UserTreeIdx);
buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
@@ -1892,8 +1944,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
default:
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx);
- DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+ newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
}
@@ -1923,15 +1975,18 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
return N;
}
-bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+ SmallVectorImpl<unsigned> &CurrentOrder) const {
Instruction *E0 = cast<Instruction>(OpValue);
assert(E0->getOpcode() == Instruction::ExtractElement ||
E0->getOpcode() == Instruction::ExtractValue);
- assert(E0->getOpcode() == getSameOpcode(VL).Opcode && "Invalid opcode");
+ assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
// Check if all of the extracts come from the same vector and from the
// correct offset.
Value *Vec = E0->getOperand(0);
+ CurrentOrder.clear();
+
// We have to extract from a vector/aggregate with the same number of elements.
unsigned NElts;
if (E0->getOpcode() == Instruction::ExtractValue) {
@@ -1951,15 +2006,40 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
return false;
// Check that all of the indices extract from the correct offset.
- for (unsigned I = 0, E = VL.size(); I < E; ++I) {
- Instruction *Inst = cast<Instruction>(VL[I]);
- if (!matchExtractIndex(Inst, I, Inst->getOpcode()))
- return false;
+ bool ShouldKeepOrder = true;
+ unsigned E = VL.size();
+  // Initialize all entries to E + 1 so we can tell whether a given extract
+  // index has already been used.
+  // Later this also lets us verify that all indices are used and the extracts
+  // form a consecutive access, by checking that no element of CurrentOrder
+  // still holds the value E + 1.
+ CurrentOrder.assign(E, E + 1);
+ unsigned I = 0;
+ for (; I < E; ++I) {
+ auto *Inst = cast<Instruction>(VL[I]);
if (Inst->getOperand(0) != Vec)
- return false;
+ break;
+ Optional<unsigned> Idx = getExtractIndex(Inst);
+ if (!Idx)
+ break;
+ const unsigned ExtIdx = *Idx;
+ if (ExtIdx != I) {
+ if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
+ break;
+ ShouldKeepOrder = false;
+ CurrentOrder[ExtIdx] = I;
+ } else {
+ if (CurrentOrder[I] != E + 1)
+ break;
+ CurrentOrder[I] = I;
+ }
+ }
+ if (I < E) {
+ CurrentOrder.clear();
+ return false;
}
- return true;
+ return ShouldKeepOrder;
}
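
The sentinel scheme above, extracted into a standalone sketch (plain index lists instead of instructions; not LLVM API): initialize every slot of the order to the impossible value E + 1, let each extract claim a distinct in-range slot, and fail on any clash. An identity result means the extracts can be reused without reordering.

#include <cstdio>
#include <optional>
#include <vector>

// Recover the reuse order of an extract bundle, as canReuseExtract does:
// every slot starts at the impossible sentinel E + 1, each extract must
// claim a distinct in-range slot, and the result maps extract index ->
// bundle position. nullopt means the extracts cannot be reused; an
// identity result means no reordering is needed.
std::optional<std::vector<unsigned>>
computeOrder(const std::vector<unsigned> &ExtIdx) {
  const unsigned E = ExtIdx.size();
  std::vector<unsigned> Order(E, E + 1); // E + 1 == "slot not claimed yet"
  for (unsigned I = 0; I < E; ++I) {
    unsigned Idx = ExtIdx[I];
    if (Idx >= E || Order[Idx] != E + 1) // out of range, or claimed twice
      return std::nullopt;
    Order[Idx] = I;
  }
  return Order;
}

int main() {
  if (auto O = computeOrder({2, 0, 1, 3}))
    std::printf("order = [%u %u %u %u]\n", (*O)[0], (*O)[1], (*O)[2], (*O)[3]);
  std::printf("duplicate fails: %d\n", !computeOrder({0, 0, 1}).has_value());
}
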
bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
@@ -1985,13 +2065,22 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
VecTy = VectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+ unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+ int ReuseShuffleCost = 0;
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost =
+ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
if (E->NeedToGather) {
if (allConstant(VL))
return 0;
if (isSplat(VL)) {
- return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+ return ReuseShuffleCost +
+ TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
}
- if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) {
+ if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement &&
+ allSameType(VL) && allSameBlock(VL)) {
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
if (ShuffleKind.hasValue()) {
int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
@@ -2008,37 +2097,86 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
IO->getZExtValue());
}
}
- return Cost;
+ return ReuseShuffleCost + Cost;
}
}
- return getGatherCost(E->Scalars);
+ return ReuseShuffleCost + getGatherCost(VL);
}
InstructionsState S = getSameOpcode(VL);
- assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+ assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = cast<Instruction>(S.OpValue);
- unsigned ShuffleOrOp = S.IsAltShuffle ?
- (unsigned) Instruction::ShuffleVector : S.Opcode;
+ unsigned ShuffleOrOp = S.isAltShuffle() ?
+ (unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI:
return 0;
case Instruction::ExtractValue:
case Instruction::ExtractElement:
- if (canReuseExtract(VL, S.OpValue)) {
- int DeadCost = 0;
+ if (NeedToShuffleReuses) {
+ unsigned Idx = 0;
+ for (unsigned I : E->ReuseShuffleIndices) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(VL[I])->getIndexOperand());
+ Idx = IO->getZExtValue();
+ ReuseShuffleCost -= TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Idx);
+ } else {
+ ReuseShuffleCost -= TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Idx);
+ ++Idx;
+ }
+ }
+ Idx = ReuseShuffleNumbers;
+ for (Value *V : VL) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(V)->getIndexOperand());
+ Idx = IO->getZExtValue();
+ } else {
+ --Idx;
+ }
+ ReuseShuffleCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
+ }
+ }
+ if (!E->NeedToGather) {
+ int DeadCost = ReuseShuffleCost;
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ DeadCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
// If all of its users are going to be vectorized, the instruction can be
// considered dead.
// Likewise, if it has only one user, it will be vectorized for sure.
- if (areAllUsersVectorized(E))
+ if (areAllUsersVectorized(E)) {
// Take credit for instruction that will become dead.
- DeadCost +=
+ if (E->hasOneUse()) {
+ Instruction *Ext = E->user_back();
+ if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+ all_of(Ext->users(),
+ [](User *U) { return isa<GetElementPtrInst>(U); })) {
+ // Use getExtractWithExtendCost() to calculate the cost of
+ // extractelement/ext pair.
+ DeadCost -= TTI->getExtractWithExtendCost(
+ Ext->getOpcode(), Ext->getType(), VecTy, i);
+ // Add back the cost of s|zext, which is subtracted separately.
+ DeadCost += TTI->getCastInstrCost(
+ Ext->getOpcode(), Ext->getType(), E->getType(), Ext);
+ continue;
+ }
+ }
+ DeadCost -=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
+ }
}
- return -DeadCost;
+ return DeadCost;
}
- return getGatherCost(VecTy);
+ return ReuseShuffleCost + getGatherCost(VL);
case Instruction::ZExt:
case Instruction::SExt:
@@ -2053,24 +2191,37 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
+ int ScalarEltCost =
+ TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
// Calculate the cost of this instruction.
- int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
- VL0->getType(), SrcTy, VL0);
+ int ScalarCost = VL.size() * ScalarEltCost;
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
- int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
+ int VecCost = 0;
+ // Check if the values are candidates to demote.
+ if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
+ VecCost = ReuseShuffleCost +
+ TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0);
+ }
return VecCost - ScalarCost;
}
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select: {
// Calculate the cost of this instruction.
+ int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy,
+ Builder.getInt1Ty(), VL0);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
- int ScalarCost = VecTy->getNumElements() *
- TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
- int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
- return VecCost - ScalarCost;
+ int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0);
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Add:
case Instruction::FAdd:
@@ -2099,42 +2250,43 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OperandValueProperties Op1VP =
TargetTransformInfo::OP_None;
TargetTransformInfo::OperandValueProperties Op2VP =
- TargetTransformInfo::OP_None;
+ TargetTransformInfo::OP_PowerOf2;
// If all operands are exactly the same ConstantInt then set the
// operand kind to OK_UniformConstantValue.
// If instead not all operands are constants, then set the operand kind
// to OK_AnyValue. If all operands are constants but not the same,
// then set the operand kind to OK_NonUniformConstantValue.
- ConstantInt *CInt = nullptr;
- for (unsigned i = 0; i < VL.size(); ++i) {
+ ConstantInt *CInt0 = nullptr;
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
const Instruction *I = cast<Instruction>(VL[i]);
- if (!isa<ConstantInt>(I->getOperand(1))) {
+ ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!CInt) {
Op2VK = TargetTransformInfo::OK_AnyValue;
+ Op2VP = TargetTransformInfo::OP_None;
break;
}
+ if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
+ !CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_None;
if (i == 0) {
- CInt = cast<ConstantInt>(I->getOperand(1));
+ CInt0 = CInt;
continue;
}
- if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
- CInt != cast<ConstantInt>(I->getOperand(1)))
+ if (CInt0 != CInt)
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
}
- // FIXME: Currently cost of model modification for division by power of
- // 2 is handled for X86 and AArch64. Add support for other targets.
- if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
- CInt->getValue().isPowerOf2())
- Op2VP = TargetTransformInfo::OP_PowerOf2;
SmallVector<const Value *, 4> Operands(VL0->operand_values());
- int ScalarCost =
- VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
- Op2VP, Operands);
- int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
- Op1VP, Op2VP, Operands);
- return VecCost - ScalarCost;
+ int ScalarEltCost = TTI->getArithmeticInstrCost(
+ S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK,
+ Op2VK, Op1VP, Op2VP, Operands);
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
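
A hedged standalone sketch of the operand-kind scan above, with std::optional<uint64_t> standing in for dyn_cast<ConstantInt> (engaged means "constant with this value"); the enum names mirror the TargetTransformInfo ones, but everything here is a stand-in. The real code compares uniqued ConstantInt pointers, which this models by comparing values.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <optional>
#include <vector>

enum OperandKind { OK_AnyValue, OK_UniformConstantValue, OK_NonUniformConstantValue };
enum OperandProps { OP_None, OP_PowerOf2 };

static bool isPowerOf2(uint64_t V) { return V && !(V & (V - 1)); }

static void classify(const std::vector<std::optional<uint64_t>> &Op2s,
                     OperandKind &Kind, OperandProps &Props) {
  Kind = OK_UniformConstantValue;
  Props = OP_PowerOf2;
  std::optional<uint64_t> First;
  for (std::size_t i = 0, e = Op2s.size(); i < e; ++i) {
    if (!Op2s[i]) {
      // Any non-constant operand downgrades both classifications.
      Kind = OK_AnyValue;
      Props = OP_None;
      break;
    }
    if (Props == OP_PowerOf2 && !isPowerOf2(*Op2s[i]))
      Props = OP_None;
    if (i == 0) {
      First = Op2s[i];
      continue;
    }
    if (*First != *Op2s[i])
      Kind = OK_NonUniformConstantValue;
  }
}

int main() {
  OperandKind K;
  OperandProps P;
  classify({4, 4, 4, 4}, K, P);  // uniform, power of two
  std::printf("%d %d\n", K, P);
  classify({4, 8, 2, 16}, K, P); // non-uniform, still powers of two
  std::printf("%d %d\n", K, P);
}
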
case Instruction::GetElementPtr: {
TargetTransformInfo::OperandValueKind Op1VK =
@@ -2142,83 +2294,119 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue;
- int ScalarCost =
- VecTy->getNumElements() *
+ int ScalarEltCost =
TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost =
TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
-
- return VecCost - ScalarCost;
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
- unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
- int ScalarLdCost = VecTy->getNumElements() *
+ unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
+ int ScalarEltCost =
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
- int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
- VecTy, alignment, 0, VL0);
- return VecLdCost - ScalarLdCost;
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecLdCost =
+ TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0);
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ VecLdCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+ return ReuseShuffleCost + VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
- unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
- int ScalarStCost = VecTy->getNumElements() *
+ unsigned alignment = cast<StoreInst>(VL0)->getAlignment();
+ int ScalarEltCost =
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
- int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
- VecTy, alignment, 0, VL0);
- return VecStCost - ScalarStCost;
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
+ int VecStCost =
+ TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0);
+ return ReuseShuffleCost + VecStCost - ScalarStCost;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- SmallVector<Type*, 4> ScalarTys;
- for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
+ SmallVector<Type *, 4> ScalarTys;
+ for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op)
ScalarTys.push_back(CI->getArgOperand(op)->getType());
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
- int ScalarCallCost = VecTy->getNumElements() *
+ int ScalarEltCost =
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
SmallVector<Value *, 4> Args(CI->arg_operands());
int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
VecTy->getNumElements());
- DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
- << " (" << VecCallCost << "-" << ScalarCallCost << ")"
- << " for " << *CI << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
+ << " (" << VecCallCost << "-" << ScalarCallCost << ")"
+ << " for " << *CI << "\n");
- return VecCallCost - ScalarCallCost;
+ return ReuseShuffleCost + VecCallCost - ScalarCallCost;
}
case Instruction::ShuffleVector: {
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_AnyValue;
+ assert(S.isAltShuffle() &&
+ ((Instruction::isBinaryOp(S.getOpcode()) &&
+ Instruction::isBinaryOp(S.getAltOpcode())) ||
+ (Instruction::isCast(S.getOpcode()) &&
+ Instruction::isCast(S.getAltOpcode()))) &&
+ "Invalid Shuffle Vector Operand");
int ScalarCost = 0;
- int VecCost = 0;
+ if (NeedToShuffleReuses) {
+ for (unsigned Idx : E->ReuseShuffleIndices) {
+ Instruction *I = cast<Instruction>(VL[Idx]);
+ ReuseShuffleCost -= TTI->getInstructionCost(
+ I, TargetTransformInfo::TCK_RecipThroughput);
+ }
+ for (Value *V : VL) {
+ Instruction *I = cast<Instruction>(V);
+ ReuseShuffleCost += TTI->getInstructionCost(
+ I, TargetTransformInfo::TCK_RecipThroughput);
+ }
+ }
for (Value *i : VL) {
Instruction *I = cast<Instruction>(i);
- if (!I)
- break;
- ScalarCost +=
- TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+ assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ ScalarCost += TTI->getInstructionCost(
+ I, TargetTransformInfo::TCK_RecipThroughput);
}
// VecCost is the sum of the cost of creating the two vectors
// and the cost of creating the shuffle.
- Instruction *I0 = cast<Instruction>(VL[0]);
- VecCost =
- TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
- Instruction *I1 = cast<Instruction>(VL[1]);
- VecCost +=
- TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
- VecCost +=
- TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
- return VecCost - ScalarCost;
+ int VecCost = 0;
+ if (Instruction::isBinaryOp(S.getOpcode())) {
+ VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy);
+ VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy);
+ } else {
+ Type *Src0SclTy = S.MainOp->getOperand(0)->getType();
+ Type *Src1SclTy = S.AltOp->getOperand(0)->getType();
+ VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size());
+ VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size());
+ VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty);
+ VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty);
+ }
+ VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
+ return ReuseShuffleCost + VecCost - ScalarCost;
}
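
The alternate-shuffle cost model above reduces to simple arithmetic: two vector operations plus one blend versus one scalar operation per lane. A toy sketch assuming every operation costs 1 (the real numbers come from TargetTransformInfo and are target dependent):

#include <cstdio>

int main() {
  const int Lanes = 4;
  const int ScalarOpCost = 1, VectorOpCost = 1, SelectShuffleCost = 1;
  // Scalar side: one instruction per lane (e.g. add, sub, add, sub).
  int ScalarCost = Lanes * ScalarOpCost;
  // Vector side: one vector op per distinct opcode, plus a blend that picks
  // main-opcode lanes from one result and alternate lanes from the other.
  int VecCost = 2 * VectorOpCost + SelectShuffleCost;
  std::printf("net = %d\n", VecCost - ScalarCost); // -1: profitable here
}
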
default:
llvm_unreachable("Unknown instruction");
@@ -2226,8 +2414,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
bool BoUpSLP::isFullyVectorizableTinyTree() {
- DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
- VectorizableTree.size() << " is fully vectorizable .\n");
+ LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
+ << VectorizableTree.size() << " is fully vectorizable.\n");
// We only handle trees of heights 1 and 2.
if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
@@ -2297,13 +2485,13 @@ int BoUpSLP::getSpillCost() {
LiveValues.insert(cast<Instruction>(&*J));
}
- DEBUG(
+ LLVM_DEBUG({
dbgs() << "SLP: #LV: " << LiveValues.size();
for (auto *X : LiveValues)
dbgs() << " " << X->getName();
dbgs() << ", Looking at ";
Inst->dump();
- );
+ });
// Now find the sequence of instructions between PrevInst and Inst.
BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
@@ -2315,7 +2503,10 @@ int BoUpSLP::getSpillCost() {
continue;
}
- if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
+ // Debug information doesn't impact spill cost.
+ if ((isa<CallInst>(&*PrevInstIt) &&
+ !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
+ &*PrevInstIt != PrevInst) {
SmallVector<Type*, 4> V;
for (auto *II : LiveValues)
V.push_back(VectorType::get(II->getType(), BundleWidth));
@@ -2333,19 +2524,41 @@ int BoUpSLP::getSpillCost() {
int BoUpSLP::getTreeCost() {
int Cost = 0;
- DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
- VectorizableTree.size() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
+ << VectorizableTree.size() << ".\n");
unsigned BundleWidth = VectorizableTree[0].Scalars.size();
- for (TreeEntry &TE : VectorizableTree) {
+ for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+ TreeEntry &TE = VectorizableTree[I];
+
+ // We create duplicate tree entries for gather sequences that have multiple
+ // uses. However, we should not compute the cost of duplicate sequences.
+ // For example, if we have a build vector (i.e., insertelement sequence)
+ // that is used by more than one vector instruction, we only need to
+ // compute the cost of the insertelement instructions once. The redundant
+ // instructions will be eliminated by CSE.
+ //
+ // We should consider not creating duplicate tree entries for gather
+ // sequences, and instead add additional edges to the tree representing
+ // their uses. Since such an approach results in fewer total entries,
+ // existing heuristics based on tree size may yield different results.
+ //
+ if (TE.NeedToGather &&
+ std::any_of(std::next(VectorizableTree.begin(), I + 1),
+ VectorizableTree.end(), [TE](TreeEntry &Entry) {
+ return Entry.NeedToGather && Entry.isSame(TE.Scalars);
+ }))
+ continue;
+
int C = getEntryCost(&TE);
- DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
- << *TE.Scalars[0] << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for bundle that starts with " << *TE.Scalars[0]
+ << ".\n");
Cost += C;
}
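
A standalone sketch of the duplicate-gather skip above, with a tiny Entry struct standing in for TreeEntry and a unit cost standing in for getEntryCost; only the std::any_of dedup logic is the point.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

struct Entry {
  bool NeedToGather;
  std::vector<int> Scalars;
  bool isSame(const std::vector<int> &VL) const { return Scalars == VL; }
};

int main() {
  std::vector<Entry> Tree = {
      {false, {1, 2, 3, 4}},
      {true, {5, 6}}, // gather ...
      {true, {5, 6}}, // ... repeated later; its cost must be counted once
  };
  int Cost = 0;
  for (std::size_t I = 0, E = Tree.size(); I < E; ++I) {
    const Entry &TE = Tree[I];
    // Skip a gather entry if an identical gather appears later in the tree;
    // the duplicated sequences will be merged by CSE anyway.
    if (TE.NeedToGather &&
        std::any_of(Tree.begin() + I + 1, Tree.end(), [&TE](const Entry &En) {
          return En.NeedToGather && En.isSame(TE.Scalars);
        }))
      continue;
    Cost += 1; // stand-in for getEntryCost(&TE)
  }
  std::printf("entries costed: %d\n", Cost); // 2, not 3
}
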
- SmallSet<Value *, 16> ExtractCostCalculated;
+ SmallPtrSet<Value *, 16> ExtractCostCalculated;
int ExtractCost = 0;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
@@ -2386,7 +2599,7 @@ int BoUpSLP::getTreeCost() {
<< "SLP: Extract Cost = " << ExtractCost << ".\n"
<< "SLP: Total Cost = " << Cost << ".\n";
}
- DEBUG(dbgs() << Str);
+ LLVM_DEBUG(dbgs() << Str);
if (ViewSLPTree)
ViewGraph(this, "SLP" + F->getName(), false, Str);
@@ -2394,10 +2607,14 @@ int BoUpSLP::getTreeCost() {
return Cost;
}
-int BoUpSLP::getGatherCost(Type *Ty) {
+int BoUpSLP::getGatherCost(Type *Ty,
+ const DenseSet<unsigned> &ShuffledIndices) {
int Cost = 0;
for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
- Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (!ShuffledIndices.count(i))
+ Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (!ShuffledIndices.empty())
+ Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
}
@@ -2408,7 +2625,17 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
// Find the cost of inserting/extracting values from the vector.
- return getGatherCost(VecTy);
+ // Check if the same elements are inserted several times and count them as
+ // shuffle candidates.
+ DenseSet<unsigned> ShuffledElements;
+ DenseSet<Value *> UniqueElements;
+ // Iterate in reverse order to consider insert elements with the highest cost.
+ for (unsigned I = VL.size(); I > 0; --I) {
+ unsigned Idx = I - 1;
+ if (!UniqueElements.insert(VL[Idx]).second)
+ ShuffledElements.insert(Idx);
+ }
+ return getGatherCost(VecTy, ShuffledElements);
}
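
Taken together, the two routines above charge one insertelement per unique lane plus a single permute whenever any lane repeats. A minimal sketch with unit costs standing in for the TTI queries:

#include <cstdio>
#include <unordered_set>
#include <vector>

static int gatherCost(unsigned NumElts,
                      const std::unordered_set<unsigned> &ShuffledIndices) {
  int Cost = 0;
  for (unsigned i = 0; i < NumElts; ++i)
    if (!ShuffledIndices.count(i))
      Cost += 1; // one insertelement per unique lane
  if (!ShuffledIndices.empty())
    Cost += 1; // one shuffle rebroadcasts all duplicated lanes
  return Cost;
}

int main() {
  std::vector<int> VL = {7, 9, 7, 7}; // value 7 repeats in lanes 0, 2, 3
  std::unordered_set<unsigned> Shuffled;
  std::unordered_set<int> Unique;
  // Walk backwards so the highest-lane insert of each value is the one
  // kept as the "unique" insert.
  for (unsigned I = VL.size(); I > 0; --I) {
    unsigned Idx = I - 1;
    if (!Unique.insert(VL[Idx]).second)
      Shuffled.insert(Idx);
  }
  std::printf("cost = %d\n", gatherCost(VL.size(), Shuffled)); // 2 + 1 = 3
}
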
// Reorder commutative operations in alternate shuffle if the resulting vectors
@@ -2420,16 +2647,14 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
// load a[3] + load b[3]
// Reordering the second load b[1] load a[1] would allow us to vectorize this
// code.
-void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
+void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S,
+ ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
// Push left and right operands of binary operation into Left and Right
- unsigned AltOpcode = getAltOpcode(Opcode);
- (void)AltOpcode;
for (Value *V : VL) {
auto *I = cast<Instruction>(V);
- assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
- "Incorrect instruction in vector");
+ assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector");
Left.push_back(I->getOperand(0));
Right.push_back(I->getOperand(1));
}
@@ -2609,7 +2834,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
// add a[1],c[2] load b[1]
// b[2] load b[2]
// add a[3],c[3] load b[3]
- for (unsigned j = 0; j < VL.size() - 1; ++j) {
+ for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) {
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
if (isConsecutiveAccess(L, L1, *DL, *SE)) {
@@ -2630,17 +2855,15 @@ void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
}
}
-void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
+void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S) {
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block.
- auto *Front = cast<Instruction>(OpValue);
+ auto *Front = cast<Instruction>(S.OpValue);
auto *BB = Front->getParent();
- const unsigned Opcode = cast<Instruction>(OpValue)->getOpcode();
- const unsigned AltOpcode = getAltOpcode(Opcode);
assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool {
- return !sameOpcodeOrAlt(Opcode, AltOpcode,
- cast<Instruction>(V)->getOpcode()) ||
- cast<Instruction>(V)->getParent() == BB;
+ auto *I = cast<Instruction>(V);
+ return !S.isOpcodeOrAlt(I) || I->getParent() == BB;
}));
// The last instruction in the bundle in program order.
@@ -2652,7 +2875,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
// bundle. The end of the bundle is marked by null ScheduleData.
if (BlocksSchedules.count(BB)) {
auto *Bundle =
- BlocksSchedules[BB]->getScheduleData(isOneOf(OpValue, VL.back()));
+ BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back()));
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
if (Bundle->OpValue == Bundle->Inst)
@@ -2680,7 +2903,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
if (!LastInst) {
SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
- if (Bundle.erase(&I) && sameOpcodeOrAlt(Opcode, AltOpcode, I.getOpcode()))
+ if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I))
LastInst = &I;
if (Bundle.empty())
break;
@@ -2706,7 +2929,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
if (TreeEntry *E = getTreeEntry(VL[i])) {
// Find which lane we need to extract.
int FoundLane = -1;
- for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
+ for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {
// Is this the lane of the scalar that we are looking for ?
if (E->Scalars[Lane] == VL[i]) {
FoundLane = Lane;
@@ -2714,6 +2937,11 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
}
}
assert(FoundLane >= 0 && "Could not find the correct lane");
+ if (!E->ReuseShuffleIndices.empty()) {
+ FoundLane =
+ std::distance(E->ReuseShuffleIndices.begin(),
+ llvm::find(E->ReuseShuffleIndices, FoundLane));
+ }
ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
}
}
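
The lane remap above is plain STL machinery: when the vectorized value is followed by a reuse shuffle, an external extract must target the first shuffled position that refers to the scalar's unique lane. A small sketch with made-up values:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Reuse shuffle duplicating two unique lanes across four positions.
  std::vector<unsigned> ReuseShuffleIndices = {1, 0, 1, 0};
  unsigned FoundLane = 1; // lane of the scalar among the unique values
  FoundLane = std::distance(
      ReuseShuffleIndices.begin(),
      std::find(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end(),
                FoundLane));
  std::printf("extract from shuffled lane %u\n", FoundLane); // 0
}
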
@@ -2722,66 +2950,128 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
return Vec;
}
-Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
- if (const TreeEntry *En = getTreeEntry(OpValue)) {
- if (En->isSame(VL) && En->VectorizedValue)
- return En->VectorizedValue;
- }
- return nullptr;
-}
-
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
InstructionsState S = getSameOpcode(VL);
- if (S.Opcode) {
+ if (S.getOpcode()) {
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
- if (E->isSame(VL))
- return vectorizeTree(E);
+ if (E->isSame(VL)) {
+ Value *V = vectorizeTree(E);
+ if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
+ // We need to get the vectorized value but without shuffle.
+ if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
+ V = SV->getOperand(0);
+ } else {
+ // Reshuffle to get only unique values.
+ SmallVector<unsigned, 4> UniqueIdxs;
+ SmallSet<unsigned, 4> UsedIdxs;
+ for (unsigned Idx : E->ReuseShuffleIndices)
+ if (UsedIdxs.insert(Idx).second)
+ UniqueIdxs.emplace_back(Idx);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ UniqueIdxs);
+ }
+ }
+ return V;
+ }
}
}
Type *ScalarTy = S.OpValue->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
ScalarTy = SI->getValueOperand()->getType();
+
+ // Check that every instruction appears once in this bundle.
+ SmallVector<unsigned, 4> ReuseShuffleIndicies;
+ SmallVector<Value *, 4> UniqueValues;
+ if (VL.size() > 2) {
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
+ if (Res.second || isa<Constant>(V))
+ UniqueValues.emplace_back(V);
+ }
+ // Do not shuffle a single element, or if the number of unique values is
+ // not a power of 2.
+ if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
+ !llvm::isPowerOf2_32(UniqueValues.size()))
+ ReuseShuffleIndicies.clear();
+ else
+ VL = UniqueValues;
+ }
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
- return Gather(VL, VecTy);
+ Value *V = Gather(VL, VecTy);
+ if (!ReuseShuffleIndicies.empty()) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ ReuseShuffleIndicies, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
+ return V;
+}
+
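
A standalone sketch of the dedup step above, with unordered containers in place of DenseMap; the isa<Constant> special case from the real code is omitted, and the power-of-two guard mirrors the bail-out condition.

#include <cstddef>
#include <cstdio>
#include <unordered_map>
#include <vector>

static bool isPowerOf2(std::size_t N) { return N && !(N & (N - 1)); }

int main() {
  // Bundle with repeated values: gather only the unique ones and record,
  // per original lane, which unique lane it reuses.
  std::vector<int> VL = {10, 20, 10, 20};
  std::vector<unsigned> ReuseIdx;
  std::vector<int> Unique;
  std::unordered_map<int, unsigned> Pos;
  for (int V : VL) {
    auto Res = Pos.emplace(V, Unique.size());
    ReuseIdx.push_back(Res.first->second);
    if (Res.second)
      Unique.push_back(V);
  }
  // Only worthwhile if the bundle actually shrank and the unique count is
  // still a power of two greater than one.
  if (Unique.size() == VL.size() || Unique.size() <= 1 ||
      !isPowerOf2(Unique.size()))
    ReuseIdx.clear();
  std::printf("unique=%zu mask:", Unique.size());
  for (unsigned I : ReuseIdx)
    std::printf(" %u", I); // 0 1 0 1
  std::printf("\n");
}
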
+static void inversePermutation(ArrayRef<unsigned> Indices,
+ SmallVectorImpl<unsigned> &Mask) {
+ Mask.clear();
+ const unsigned E = Indices.size();
+ Mask.resize(E);
+ for (unsigned I = 0; I < E; ++I)
+ Mask[Indices[I]] = I;
}
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
IRBuilder<>::InsertPointGuard Guard(Builder);
if (E->VectorizedValue) {
- DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
return E->VectorizedValue;
}
InstructionsState S = getSameOpcode(E->Scalars);
- Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
+ Instruction *VL0 = cast<Instruction>(S.OpValue);
Type *ScalarTy = VL0->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+
if (E->NeedToGather) {
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
auto *V = Gather(E->Scalars, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
E->VectorizedValue = V;
return V;
}
- unsigned ShuffleOrOp = S.IsAltShuffle ?
- (unsigned) Instruction::ShuffleVector : S.Opcode;
+ unsigned ShuffleOrOp = S.isAltShuffle() ?
+ (unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
- E->VectorizedValue = NewPhi;
+ Value *V = NewPhi;
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
// PHINodes may have multiple entries from the same block. We want to
// visit every block once.
- SmallSet<BasicBlock*, 4> VisitedBBs;
+ SmallPtrSet<BasicBlock*, 4> VisitedBBs;
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
@@ -2804,32 +3094,74 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
"Invalid number of incoming values");
- return NewPhi;
+ return V;
}
case Instruction::ExtractElement: {
- if (canReuseExtract(E->Scalars, VL0)) {
+ if (!E->NeedToGather) {
Value *V = VL0->getOperand(0);
+ if (!E->ReorderIndices.empty()) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ Builder.SetInsertPoint(VL0);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ if (!E->ReorderIndices.empty())
+ Builder.SetInsertPoint(VL0);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
return V;
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
auto *V = Gather(E->Scalars, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
- if (canReuseExtract(E->Scalars, VL0)) {
+ if (!E->NeedToGather) {
LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
Builder.SetInsertPoint(LI);
PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
- E->VectorizedValue = V;
- return propagateMetadata(V, E->Scalars);
+ Value *NewV = propagateMetadata(V, E->Scalars);
+ if (!E->ReorderIndices.empty()) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ NewV = Builder.CreateShuffleVector(
+ NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = NewV;
+ return NewV;
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
auto *V = Gather(E->Scalars, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
E->VectorizedValue = V;
return V;
}
@@ -2849,15 +3181,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (Value *V : E->Scalars)
INVL.push_back(cast<Instruction>(V)->getOperand(0));
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *InVec = vectorizeTree(INVL);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
CastInst *CI = dyn_cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -2870,23 +3208,29 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
RHSV.push_back(cast<Instruction>(V)->getOperand(1));
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *L = vectorizeTree(LHSV);
Value *R = vectorizeTree(RHSV);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V;
- if (S.Opcode == Instruction::FCmp)
+ if (S.getOpcode() == Instruction::FCmp)
V = Builder.CreateFCmp(P0, L, R);
else
V = Builder.CreateICmp(P0, L, R);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
return V;
}
@@ -2898,16 +3242,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *Cond = vectorizeTree(CondVec);
Value *True = vectorizeTree(TrueVec);
Value *False = vectorizeTree(FalseVec);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
Value *V = Builder.CreateSelect(Cond, True, False);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -2932,7 +3282,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Xor: {
ValueList LHSVL, RHSVL;
if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
- reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
+ reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL,
RHSVL);
else
for (Value *V : E->Scalars) {
@@ -2941,29 +3291,40 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
RHSVL.push_back(I->getOperand(1));
}
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *LHS = vectorizeTree(LHSVL);
Value *RHS = vectorizeTree(RHSVL);
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
Value *V = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
+ static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return propagateMetadata(I, E->Scalars);
-
return V;
}
case Instruction::Load: {
// Loads are inserted at the head of the tree because we don't want to
// sink them all the way down past store instructions.
- setInsertPointAfterBundle(E->Scalars, VL0);
+ bool IsReorder = !E->ReorderIndices.empty();
+ if (IsReorder) {
+ S = getSameOpcode(E->Scalars, E->ReorderIndices.front());
+ VL0 = cast<Instruction>(S.OpValue);
+ }
+ setInsertPointAfterBundle(E->Scalars, S);
LoadInst *LI = cast<LoadInst>(VL0);
Type *ScalarLoadTy = LI->getType();
@@ -2985,9 +3346,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Alignment = DL->getABITypeAlignment(ScalarLoadTy);
}
LI->setAlignment(Alignment);
- E->VectorizedValue = LI;
+ Value *V = propagateMetadata(LI, E->Scalars);
+ if (IsReorder) {
+ OrdersType Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ Mask, "reorder_shuffle");
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
++NumVectorInstructions;
- return propagateMetadata(LI, E->Scalars);
+ return V;
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(VL0);
@@ -2998,12 +3371,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (Value *V : E->Scalars)
ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand());
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Value *VecValue = vectorizeTree(ScalarStoreValues);
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
- StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+ StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
// The pointer operand uses an in-tree scalar, so add the new BitCast to
// ExternalUses to make sure that an extract will be generated in the
@@ -3014,13 +3387,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (!Alignment)
Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
- S->setAlignment(Alignment);
- E->VectorizedValue = S;
+ ST->setAlignment(Alignment);
+ Value *V = propagateMetadata(ST, E->Scalars);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
+ E->VectorizedValue = V;
++NumVectorInstructions;
- return propagateMetadata(S, E->Scalars);
+ return V;
}
case Instruction::GetElementPtr: {
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
ValueList Op0VL;
for (Value *V : E->Scalars)
@@ -3041,17 +3419,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *V = Builder.CreateGEP(
cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return propagateMetadata(I, E->Scalars);
-
return V;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
- setInsertPointAfterBundle(E->Scalars, VL0);
+ setInsertPointAfterBundle(E->Scalars, S);
Function *FI;
Intrinsic::ID IID = Intrinsic::not_intrinsic;
Value *ScalarArg = nullptr;
@@ -3075,7 +3457,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *OpVec = vectorizeTree(OpVL);
- DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
}
@@ -3093,58 +3475,87 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (ScalarArg && getTreeEntry(ScalarArg))
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
- propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
++NumVectorInstructions;
return V;
}
case Instruction::ShuffleVector: {
ValueList LHSVL, RHSVL;
- assert(Instruction::isBinaryOp(S.Opcode) &&
+ assert(S.isAltShuffle() &&
+ ((Instruction::isBinaryOp(S.getOpcode()) &&
+ Instruction::isBinaryOp(S.getAltOpcode())) ||
+ (Instruction::isCast(S.getOpcode()) &&
+ Instruction::isCast(S.getAltOpcode()))) &&
"Invalid Shuffle Vector Operand");
- reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
- setInsertPointAfterBundle(E->Scalars, VL0);
- Value *LHS = vectorizeTree(LHSVL);
- Value *RHS = vectorizeTree(RHSVL);
-
- if (Value *V = alreadyVectorized(E->Scalars, VL0))
- return V;
+ Value *LHS, *RHS;
+ if (Instruction::isBinaryOp(S.getOpcode())) {
+ reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL);
+ setInsertPointAfterBundle(E->Scalars, S);
+ LHS = vectorizeTree(LHSVL);
+ RHS = vectorizeTree(RHSVL);
+ } else {
+ ValueList INVL;
+ for (Value *V : E->Scalars)
+ INVL.push_back(cast<Instruction>(V)->getOperand(0));
+ setInsertPointAfterBundle(E->Scalars, S);
+ LHS = vectorizeTree(INVL);
+ }
- // Create a vector of LHS op1 RHS
- Value *V0 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
- unsigned AltOpcode = getAltOpcode(S.Opcode);
- // Create a vector of LHS op2 RHS
- Value *V1 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);
+ Value *V0, *V1;
+ if (Instruction::isBinaryOp(S.getOpcode())) {
+ V0 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS);
+ V1 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(S.getAltOpcode()), LHS, RHS);
+ } else {
+ V0 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(S.getOpcode()), LHS, VecTy);
+ V1 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(S.getAltOpcode()), LHS, VecTy);
+ }
// Create shuffle to take alternate operations from the vector.
- // Also, gather up odd and even scalar ops to propagate IR flags to
+ // Also, gather up main and alt scalar ops to propagate IR flags to
// each vector operation.
- ValueList OddScalars, EvenScalars;
+ ValueList OpScalars, AltScalars;
unsigned e = E->Scalars.size();
SmallVector<Constant *, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
- if (isOdd(i)) {
+ auto *OpInst = cast<Instruction>(E->Scalars[i]);
+ assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
+ if (OpInst->getOpcode() == S.getAltOpcode()) {
Mask[i] = Builder.getInt32(e + i);
- OddScalars.push_back(E->Scalars[i]);
+ AltScalars.push_back(E->Scalars[i]);
} else {
Mask[i] = Builder.getInt32(i);
- EvenScalars.push_back(E->Scalars[i]);
+ OpScalars.push_back(E->Scalars[i]);
}
}
Value *ShuffleMask = ConstantVector::get(Mask);
- propagateIRFlags(V0, EvenScalars);
- propagateIRFlags(V1, OddScalars);
+ propagateIRFlags(V0, OpScalars);
+ propagateIRFlags(V1, AltScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+ if (NeedToShuffleReuses) {
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
+ }
E->VectorizedValue = V;
++NumVectorInstructions;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return propagateMetadata(I, E->Scalars);
return V;
}
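
The mask construction above is worth seeing in isolation: lanes whose scalar used the alternate opcode select from the second shufflevector input (indices e..2e-1), the rest from the first. A standalone sketch with hypothetical per-lane opcodes:

#include <cstdio>
#include <vector>

int main() {
  // Per-lane opcode of the bundle, e.g. {add, sub, add, sub}.
  enum Op { Main, Alt };
  std::vector<Op> LaneOps = {Main, Alt, Main, Alt};
  const unsigned e = LaneOps.size();
  std::vector<unsigned> Mask(e);
  for (unsigned i = 0; i < e; ++i)
    Mask[i] = (LaneOps[i] == Alt) ? e + i : i;
  for (unsigned M : Mask)
    std::printf("%u ", M); // 0 5 2 7
  std::printf("\n");
}
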
@@ -3183,7 +3594,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
VectorizableTree[0].VectorizedValue = Trunc;
}
- DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
+ LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
+ << " values .\n");
// If necessary, sign-extend or zero-extend ScalarRoot to the larger type
// specified by ScalarType.
@@ -3260,7 +3672,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);
- }
+ }
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
@@ -3269,7 +3681,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
User->replaceUsesOfWith(Scalar, Ex);
}
- DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
}
// For each vectorized value:
@@ -3290,7 +3702,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
if (!Ty->isVoidTy()) {
#ifndef NDEBUG
for (User *U : Scalar->users()) {
- DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
// It is legal to replace users in the ignorelist by undef.
assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
@@ -3300,7 +3712,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Value *Undef = UndefValue::get(Ty);
Scalar->replaceAllUsesWith(Undef);
}
- DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
eraseInstruction(cast<Instruction>(Scalar));
}
}
@@ -3310,18 +3722,16 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
return VectorizableTree[0].VectorizedValue;
}
-void BoUpSLP::optimizeGatherSequence(Function &F) {
- DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
- << " gather sequences instructions.\n");
+void BoUpSLP::optimizeGatherSequence() {
+ LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+ << " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
- for (Instruction *it : GatherSeq) {
- InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
-
- if (!Insert)
+ for (Instruction *I : GatherSeq) {
+ if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I))
continue;
// Check if this block is inside a loop.
- Loop *L = LI->getLoopFor(Insert->getParent());
+ Loop *L = LI->getLoopFor(I->getParent());
if (!L)
continue;
@@ -3333,27 +3743,41 @@ void BoUpSLP::optimizeGatherSequence(Function &F) {
// If the vector or the element that we insert into it are
// instructions that are defined in this basic block, then we can't
// hoist this instruction.
- Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
- Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
- if (CurrVec && L->contains(CurrVec))
+ auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (Op0 && L->contains(Op0))
continue;
- if (NewElem && L->contains(NewElem))
+ if (Op1 && L->contains(Op1))
continue;
// We can hoist this instruction. Move it to the pre-header.
- Insert->moveBefore(PreHeader->getTerminator());
+ I->moveBefore(PreHeader->getTerminator());
}
+ // Make a list of all reachable blocks in our CSE queue.
+ SmallVector<const DomTreeNode *, 8> CSEWorkList;
+ CSEWorkList.reserve(CSEBlocks.size());
+ for (BasicBlock *BB : CSEBlocks)
+ if (DomTreeNode *N = DT->getNode(BB)) {
+ assert(DT->isReachableFromEntry(N));
+ CSEWorkList.push_back(N);
+ }
+
+ // Sort blocks by domination. This ensures we visit a block after all blocks
+ // dominating it are visited.
+ std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
+ [this](const DomTreeNode *A, const DomTreeNode *B) {
+ return DT->properlyDominates(A, B);
+ });
+
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into different buckets based on the insert lane.
SmallVector<Instruction *, 16> Visited;
- ReversePostOrderTraversal<Function *> RPOT(&F);
- for (auto BB : RPOT) {
- // Traverse CSEBlocks by RPOT order.
- if (!CSEBlocks.count(BB))
- continue;
-
+ for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
+ assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
+ "Worklist not sorted properly!");
+ BasicBlock *BB = (*I)->getBlock();
// For all instructions in blocks containing gather sequences:
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
Instruction *In = &*it++;
@@ -3384,8 +3808,9 @@ void BoUpSLP::optimizeGatherSequence(Function &F) {
// Groups the instructions into a bundle (which is then a single scheduling entity)
// and schedules instructions until the bundle gets ready.
bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
- BoUpSLP *SLP, Value *OpValue) {
- if (isa<PHINode>(OpValue))
+ BoUpSLP *SLP,
+ const InstructionsState &S) {
+ if (isa<PHINode>(S.OpValue))
return true;
// Initialize the instruction bundle.
@@ -3393,12 +3818,12 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
ScheduleData *PrevInBundle = nullptr;
ScheduleData *Bundle = nullptr;
bool ReSchedule = false;
- DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
- if (!extendSchedulingRegion(V, OpValue))
+ if (!extendSchedulingRegion(V, S))
return false;
}
@@ -3410,8 +3835,8 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
// A bundle member was scheduled as single instruction before and now
// needs to be scheduled as part of the bundle. We just get rid of the
// existing schedule.
- DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
- << " was already scheduled\n");
+ LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
+ << " was already scheduled\n");
ReSchedule = true;
}
assert(BundleMember->isSchedulingEntity() &&
@@ -3446,8 +3871,8 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
initialFillReadyList(ReadyInsts);
}
- DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
- << BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
+ << BB->getName() << "\n");
calculateDependencies(Bundle, true, SLP);
@@ -3465,7 +3890,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
}
}
if (!Bundle->isReady()) {
- cancelScheduling(VL, OpValue);
+ cancelScheduling(VL, S.OpValue);
return false;
}
return true;
@@ -3477,7 +3902,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
return;
ScheduleData *Bundle = getScheduleData(OpValue);
- DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
assert(!Bundle->IsScheduled &&
"Can't cancel bundle which is already scheduled");
assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
@@ -3508,13 +3933,13 @@ BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
}
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
- Value *OpValue) {
- if (getScheduleData(V, isOneOf(OpValue, V)))
+ const InstructionsState &S) {
+ if (getScheduleData(V, isOneOf(S, V)))
return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
- auto &&CheckSheduleForI = [this, OpValue](Instruction *I) -> bool {
+ auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
ScheduleData *ISD = getScheduleData(I);
if (!ISD)
return false;
@@ -3522,8 +3947,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
"ScheduleData not in scheduling region");
ScheduleData *SD = allocateScheduleDataChunks();
SD->Inst = I;
- SD->init(SchedulingRegionID, OpValue);
- ExtraScheduleDataMap[I][OpValue] = SD;
+ SD->init(SchedulingRegionID, S.OpValue);
+ ExtraScheduleDataMap[I][S.OpValue] = SD;
return true;
};
if (CheckSheduleForI(I))
@@ -3533,10 +3958,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
initScheduleData(I, I->getNextNode(), nullptr, nullptr);
ScheduleStart = I;
ScheduleEnd = I->getNextNode();
- if (isOneOf(OpValue, I) != I)
+ if (isOneOf(S, I) != I)
CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
- DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
return true;
}
// Search up and down at the same time, because we don't know if the new
@@ -3548,7 +3973,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
BasicBlock::iterator LowerEnd = BB->end();
while (true) {
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
- DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
+ LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
return false;
}
@@ -3556,9 +3981,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
if (&*UpIter == I) {
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
- if (isOneOf(OpValue, I) != I)
+ if (isOneOf(S, I) != I)
CheckSheduleForI(I);
- DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
+ << "\n");
return true;
}
UpIter++;
@@ -3568,10 +3994,11 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
nullptr);
ScheduleEnd = I->getNextNode();
- if (isOneOf(OpValue, I) != I)
+ if (isOneOf(S, I) != I)
CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
- DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I
+ << "\n");
return true;
}
DownIter++;
@@ -3635,7 +4062,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
assert(isInSchedulingRegion(BundleMember));
if (!BundleMember->hasValidDependencies()) {
- DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
+ << "\n");
BundleMember->Dependencies = 0;
BundleMember->resetUnscheduledDeps();
@@ -3727,7 +4155,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
// and we can abort this loop at i6.
if (DistToSrc >= 2 * MaxMemDepDistance)
- break;
+ break;
DistToSrc++;
}
}
@@ -3736,7 +4164,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
if (InsertInReadyList && SD->isReady()) {
ReadyInsts.push_back(SD);
- DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
+ << "\n");
}
}
}
@@ -3759,7 +4188,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
if (!BS->ScheduleStart)
return;
- DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
BS->resetSchedule();
@@ -4025,7 +4454,11 @@ void BoUpSLP::computeMinimumValueSizes() {
// We start by looking at each entry that can be demoted. We compute the
// maximum bit width required to store the scalar by using ValueTracking to
// compute the number of high-order bits we can truncate.
- if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
+ if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
+ llvm::all_of(TreeRoot, [](Value *R) {
+ assert(R->hasOneUse() && "Root should have only one use!");
+ return isa<GetElementPtrInst>(R->user_back());
+ })) {
MaxBitWidth = 8u;
// Determine if the sign bit of all the roots is known to be zero. If not,
@@ -4188,7 +4621,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
- DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
@@ -4203,8 +4636,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// Vectorize trees that end at stores.
if (!Stores.empty()) {
- DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
- << " underlying objects.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
+ << " underlying objects.\n");
Changed |= vectorizeStoreChains(R);
}
@@ -4215,21 +4648,21 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// is primarily intended to catch gather-like idioms ending at
// non-consecutive loads.
if (!GEPs.empty()) {
- DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
- << " underlying objects.\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
+ << " underlying objects.\n");
Changed |= vectorizeGEPIndices(BB, R);
}
}
if (Changed) {
- R.optimizeGatherSequence(F);
- DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
- DEBUG(verifyFunction(F));
+ R.optimizeGatherSequence();
+ LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
+ LLVM_DEBUG(verifyFunction(F));
}
return Changed;
}
-/// \brief Check that the Values in the slice in VL array are still existent in
+/// Check that the Values in the slice of the VL array still exist in
/// the WeakTrackingVH array.
/// Vectorization of part of the VL array may cause later values in the VL array
/// to become invalid. We track when this has happened in the WeakTrackingVH
@@ -4244,30 +4677,28 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned VecRegSize) {
- unsigned ChainLen = Chain.size();
- DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
- << "\n");
- unsigned Sz = R.getVectorElementSize(Chain[0]);
- unsigned VF = VecRegSize / Sz;
+ const unsigned ChainLen = Chain.size();
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+ << "\n");
+ const unsigned Sz = R.getVectorElementSize(Chain[0]);
+ const unsigned VF = VecRegSize / Sz;
if (!isPowerOf2_32(Sz) || VF < 2)
return false;
// Keep track of values that were deleted by vectorizing in the loop below.
- SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
+ const SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
bool Changed = false;
// Look for profitable vectorizable trees at all offsets, starting at zero.
- for (unsigned i = 0, e = ChainLen; i < e; ++i) {
- if (i + VF > e)
- break;
+ for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {
// Check that a previous iteration of this loop did not delete the Value.
if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
continue;
- DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
- << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+ << "\n");
ArrayRef<Value *> Operands = Chain.slice(i, VF);
R.buildTree(Operands);
@@ -4278,9 +4709,10 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
int Cost = R.getTreeCost();
- DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF
+ << "\n");
if (Cost < -SLPCostThreshold) {
- DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
using namespace ore;
@@ -4417,66 +4849,48 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
Value *VL[] = { A, B };
- return tryToVectorizeList(VL, R, None, true);
+ return tryToVectorizeList(VL, R, /*UserCost=*/0, true);
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- ArrayRef<Value *> BuildVector,
- bool AllowReorder,
- bool NeedExtraction) {
+ int UserCost, bool AllowReorder) {
if (VL.size() < 2)
return false;
- DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
- << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
+ << VL.size() << ".\n");
- // Check that all of the parts are scalar instructions of the same type.
- Instruction *I0 = dyn_cast<Instruction>(VL[0]);
- if (!I0)
+ // Check that all of the parts are scalar instructions of the same type;
+ // we permit an alternate opcode via InstructionsState.
+ InstructionsState S = getSameOpcode(VL);
+ if (!S.getOpcode())
return false;
- unsigned Opcode0 = I0->getOpcode();
-
+ Instruction *I0 = cast<Instruction>(S.OpValue);
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
if (MaxVF < 2) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "SmallVF", I0)
- << "Cannot SLP vectorize list: vectorization factor "
- << "less than 2 is not supported";
- });
- return false;
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
+ << "Cannot SLP vectorize list: vectorization factor "
+ << "less than 2 is not supported";
+ });
+ return false;
}
for (Value *V : VL) {
Type *Ty = V->getType();
if (!isValidElementType(Ty)) {
- // NOTE: the following will give user internal llvm type name, which may not be useful
+ // NOTE: the following will give the user an internal llvm type name, which
+ // may not be useful.
R.getORE()->emit([&]() {
- std::string type_str;
- llvm::raw_string_ostream rso(type_str);
- Ty->print(rso);
- return OptimizationRemarkMissed(
- SV_NAME, "UnsupportedType", I0)
- << "Cannot SLP vectorize list: type "
- << rso.str() + " is unsupported by vectorizer";
- });
- return false;
- }
- Instruction *Inst = dyn_cast<Instruction>(V);
-
- if (!Inst)
- return false;
- if (Inst->getOpcode() != Opcode0) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "InequableTypes", I0)
- << "Cannot SLP vectorize list: not all of the "
- << "parts of scalar instructions are of the same type: "
- << ore::NV("Instruction1Opcode", I0) << " and "
- << ore::NV("Instruction2Opcode", Inst);
+ std::string type_str;
+ llvm::raw_string_ostream rso(type_str);
+ Ty->print(rso);
+ return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
+ << "Cannot SLP vectorize list: type "
+ << rso.str() + " is unsupported by vectorizer";
});
return false;
}
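
getSameOpcode replaces the hand-rolled opcode check deleted above; its InstructionsState can also accept one alternate opcode (e.g. mixed add/sub sequences). As a point of comparison only, a simplified stand-in for the strict check the old code performed (this is not the BoUpSLP API):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/Instruction.h"

    // Returns the opcode shared by every value in VL, or 0 if the values are
    // not all Instructions of a single opcode. The real getSameOpcode is more
    // permissive and may additionally record an alternate opcode.
    static unsigned getStrictCommonOpcode(llvm::ArrayRef<llvm::Value *> VL) {
      auto *I0 = llvm::dyn_cast<llvm::Instruction>(VL.front());
      if (!I0)
        return 0;
      for (llvm::Value *V : VL) {
        auto *I = llvm::dyn_cast<llvm::Instruction>(V);
        if (!I || I->getOpcode() != I0->getOpcode())
          return 0;
      }
      return I0->getOpcode();
    }
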
@@ -4513,24 +4927,20 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
continue;
- DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
- << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ << "\n");
ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
- ArrayRef<Value *> EmptyArray;
- ArrayRef<Value *> BuildVectorSlice;
- if (!BuildVector.empty())
- BuildVectorSlice = BuildVector.slice(I, OpsWidth);
-
- R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice);
+ R.buildTree(Ops);
+ Optional<ArrayRef<unsigned>> Order = R.bestOrder();
// TODO: check if we can allow reordering for more cases.
- if (AllowReorder && R.shouldReorder()) {
+ if (AllowReorder && Order) {
+ // TODO: reorder tree nodes without tree rebuilding.
// Conceptually, there is nothing actually preventing us from trying to
// reorder a larger list. In fact, we do exactly this when vectorizing
// reductions. However, at this point, we only expect to get here when
// there are exactly two operations.
assert(Ops.size() == 2);
- assert(BuildVectorSlice.empty());
Value *ReorderedOps[] = {Ops[1], Ops[0]};
R.buildTree(ReorderedOps, None);
}
@@ -4538,43 +4948,19 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
continue;
R.computeMinimumValueSizes();
- int Cost = R.getTreeCost();
+ int Cost = R.getTreeCost() - UserCost;
CandidateFound = true;
MinCost = std::min(MinCost, Cost);
if (Cost < -SLPCostThreshold) {
- DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
cast<Instruction>(Ops[0]))
<< "SLP vectorized with cost " << ore::NV("Cost", Cost)
<< " and with tree size "
<< ore::NV("TreeSize", R.getTreeSize()));
- Value *VectorizedRoot = R.vectorizeTree();
-
- // Reconstruct the build vector by extracting the vectorized root. This
- // way we handle the case where some elements of the vector are
- // undefined.
- // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
- if (!BuildVectorSlice.empty()) {
- // The insert point is the last build vector instruction. The
- // vectorized root will precede it. This guarantees that we get an
- // instruction. The vectorized tree could have been constant folded.
- Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
- unsigned VecIdx = 0;
- for (auto &V : BuildVectorSlice) {
- IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
- ++BasicBlock::iterator(InsertAfter));
- Instruction *I = cast<Instruction>(V);
- assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
- Instruction *Extract =
- cast<Instruction>(Builder.CreateExtractElement(
- VectorizedRoot, Builder.getInt32(VecIdx++)));
- I->setOperand(1, Extract);
- I->moveAfter(Extract);
- InsertAfter = I;
- }
- }
+ R.vectorizeTree();
// Move to the next bundle.
I += VF - 1;
NextInst = I + 1;
@@ -4585,18 +4971,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (!Changed && CandidateFound) {
R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "NotBeneficial", I0)
- << "List vectorization was possible but not beneficial with cost "
- << ore::NV("Cost", MinCost) << " >= "
- << ore::NV("Treshold", -SLPCostThreshold);
+ return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
+ << "List vectorization was possible but not beneficial with cost "
+ << ore::NV("Cost", MinCost) << " >= "
+ << ore::NV("Treshold", -SLPCostThreshold);
});
} else if (!Changed) {
R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "NotPossible", I0)
- << "Cannot SLP vectorize list: vectorization was impossible"
- << " with available vectorization factors";
+ return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
+ << "Cannot SLP vectorize list: vectorization was impossible"
+ << " with available vectorization factors";
});
}
return Changed;
@@ -4645,7 +5029,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
return false;
}
-/// \brief Generate a shuffle mask to be used in a reduction tree.
+/// Generate a shuffle mask to be used in a reduction tree.
///
/// \param VecLen The length of the vector to be reduced.
/// \param NumEltsToRdx The number of elements that should be reduced in the
@@ -5128,6 +5512,77 @@ class HorizontalReduction {
return OperationData(
Instruction::FCmp, LHS, RHS, RK_Max,
cast<Instruction>(Select->getCondition())->hasNoNaNs());
+ } else {
+ // Try harder: look for a min/max pattern based on instructions producing
+ // the same values, such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
+ // During the intermediate stages of SLP it's very common to have
+ // patterns like this (since optimizeGatherSequence is run only once
+ // at the end):
+ // %1 = extractelement <2 x i32> %a, i32 0
+ // %2 = extractelement <2 x i32> %a, i32 1
+ // %cond = icmp sgt i32 %1, %2
+ // %3 = extractelement <2 x i32> %a, i32 0
+ // %4 = extractelement <2 x i32> %a, i32 1
+ // %select = select i1 %cond, i32 %3, i32 %4
+ CmpInst::Predicate Pred;
+ Instruction *L1;
+ Instruction *L2;
+
+ LHS = Select->getTrueValue();
+ RHS = Select->getFalseValue();
+ Value *Cond = Select->getCondition();
+
+ // TODO: Support inverse predicates.
+ if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
+ if (!isa<ExtractElementInst>(RHS) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return OperationData(V);
+ } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
+ if (!isa<ExtractElementInst>(LHS) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)))
+ return OperationData(V);
+ } else {
+ if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
+ return OperationData(V);
+ if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return OperationData(V);
+ }
+ switch (Pred) {
+ default:
+ return OperationData(V);
+
+ case CmpInst::ICMP_ULT:
+ case CmpInst::ICMP_ULE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
+
+ case CmpInst::ICMP_SLT:
+ case CmpInst::ICMP_SLE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
+
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULT:
+ case CmpInst::FCMP_ULE:
+ return OperationData(Instruction::FCmp, LHS, RHS, RK_Min,
+ cast<Instruction>(Cond)->hasNoNaNs());
+
+ case CmpInst::ICMP_UGT:
+ case CmpInst::ICMP_UGE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
+
+ case CmpInst::ICMP_SGT:
+ case CmpInst::ICMP_SGE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
+
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGT:
+ case CmpInst::FCMP_UGE:
+ return OperationData(Instruction::FCmp, LHS, RHS, RK_Max,
+ cast<Instruction>(Cond)->hasNoNaNs());
+ }
}
}
return OperationData(V);
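
The simple shape this code generalizes, a select whose operands are exactly the compared values, can be matched directly with llvm/IR/PatternMatch.h. A hedged sketch for contrast with the duplicated-extractelement case handled above (the helper and its name are illustrative):

    #include "llvm/IR/PatternMatch.h"

    // True if Sel is "select (icmp sgt A, B), A, B", i.e. smax(A, B) written
    // in select form; other predicates give smin/umin/umax analogously.
    static bool isSMaxSelect(llvm::Value *Sel) {
      using namespace llvm;
      using namespace llvm::PatternMatch;
      CmpInst::Predicate Pred;
      Value *A, *B;
      return match(Sel, m_Select(m_Cmp(Pred, m_Value(A), m_Value(B)),
                                 m_Deferred(A), m_Deferred(B))) &&
             Pred == CmpInst::ICMP_SGT;
    }
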
@@ -5136,7 +5591,7 @@ class HorizontalReduction {
public:
HorizontalReduction() = default;
- /// \brief Try to find a reduction tree.
+ /// Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
assert((!Phi || is_contained(Phi->operands(), B)) &&
"Thi phi needs to use the binary operator");
@@ -5164,6 +5619,8 @@ public:
Type *Ty = B->getType();
if (!isValidElementType(Ty))
return false;
+ if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy())
+ return false;
ReducedValueData.clear();
ReductionRoot = B;
@@ -5262,7 +5719,7 @@ public:
return true;
}
- /// \brief Attempt to vectorize the tree found by
+ /// Attempt to vectorize the tree found by
/// matchAssociativeReduction.
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
if (ReducedVals.empty())
@@ -5295,9 +5752,14 @@ public:
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
V.buildTree(VL, ExternallyUsedValues, IgnoreList);
- if (V.shouldReorder()) {
- SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
- V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
+ Optional<ArrayRef<unsigned>> Order = V.bestOrder();
+ // TODO: Handle orders of size less than the number of elements in the vector.
+ if (Order && Order->size() == VL.size()) {
+ // TODO: reorder tree nodes without tree rebuilding.
+ SmallVector<Value *, 4> ReorderedOps(VL.size());
+ llvm::transform(*Order, ReorderedOps.begin(),
+ [VL](const unsigned Idx) { return VL[Idx]; });
+ V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
}
if (V.isTreeTinyAndNotFullyVectorizable())
break;
@@ -5305,8 +5767,9 @@ public:
V.computeMinimumValueSizes();
// Estimate cost.
- int Cost =
- V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+ int TreeCost = V.getTreeCost();
+ int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+ int Cost = TreeCost + ReductionCost;
if (Cost >= -SLPCostThreshold) {
V.getORE()->emit([&]() {
return OptimizationRemarkMissed(
@@ -5319,8 +5782,8 @@ public:
break;
}
- DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
- << ". (HorRdx)\n");
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
+ << Cost << ". (HorRdx)\n");
V.getORE()->emit([&]() {
return OptimizationRemark(
SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
@@ -5382,7 +5845,7 @@ public:
}
private:
- /// \brief Calculate the cost of a reduction.
+ /// Calculate the cost of a reduction.
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
unsigned ReduxWidth) {
Type *ScalarTy = FirstReducedVal->getType();
@@ -5441,16 +5904,16 @@ private:
}
ScalarReduxCost *= (ReduxWidth - 1);
- DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
- << " for reduction that starts with " << *FirstReducedVal
- << " (It is a "
- << (IsPairwiseReduction ? "pairwise" : "splitting")
- << " reduction)\n");
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+ << " for reduction that starts with " << *FirstReducedVal
+ << " (It is a "
+ << (IsPairwiseReduction ? "pairwise" : "splitting")
+ << " reduction)\n");
return VecReduxCost - ScalarReduxCost;
}
- /// \brief Emit a horizontal reduction of the vectorized value.
+ /// Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
unsigned ReduxWidth, const TargetTransformInfo *TTI) {
assert(VectorizedValue && "Need to have a vectorized tree node");
@@ -5486,7 +5949,7 @@ private:
} // end anonymous namespace
-/// \brief Recognize construction of vectors like
+/// Recognize construction of vectors like
/// %ra = insertelement <4 x float> undef, float %s0, i32 0
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
@@ -5495,11 +5958,17 @@ private:
///
/// Returns true if it matches
static bool findBuildVector(InsertElementInst *LastInsertElem,
- SmallVectorImpl<Value *> &BuildVector,
- SmallVectorImpl<Value *> &BuildVectorOpds) {
+ TargetTransformInfo *TTI,
+ SmallVectorImpl<Value *> &BuildVectorOpds,
+ int &UserCost) {
+ UserCost = 0;
Value *V = nullptr;
do {
- BuildVector.push_back(LastInsertElem);
+ if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) {
+ UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
+ LastInsertElem->getType(),
+ CI->getZExtValue());
+ }
BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
V = LastInsertElem->getOperand(0);
if (isa<UndefValue>(V))
@@ -5508,20 +5977,17 @@ static bool findBuildVector(InsertElementInst *LastInsertElem,
if (!LastInsertElem || !LastInsertElem->hasOneUse())
return false;
} while (true);
- std::reverse(BuildVector.begin(), BuildVector.end());
std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
return true;
}
-/// \brief Like findBuildVector, but looks for construction of aggregate.
+/// Like findBuildVector, but looks for construction of aggregate.
///
/// \return true if it matches.
static bool findBuildAggregate(InsertValueInst *IV,
- SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
Value *V;
do {
- BuildVector.push_back(IV);
BuildVectorOpds.push_back(IV->getInsertedValueOperand());
V = IV->getAggregateOperand();
if (isa<UndefValue>(V))
@@ -5530,7 +5996,6 @@ static bool findBuildAggregate(InsertValueInst *IV,
if (!IV || !IV->hasOneUse())
return false;
} while (true);
- std::reverse(BuildVector.begin(), BuildVector.end());
std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
return true;
}
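
The UserCost collected by the new findBuildVector is later subtracted from the tree cost (see the getTreeCost() - UserCost hunk above), crediting the insertelements the vectorized tree makes dead. The same accounting in isolation, as a sketch under the signatures used in this patch (the helper name is illustrative):

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Instructions.h"

    // Walk a build-vector chain from its last insertelement up through
    // operand 0, summing the target's per-lane insertion cost.
    static int priceBuildVector(llvm::InsertElementInst *Last,
                                const llvm::TargetTransformInfo *TTI) {
      int UserCost = 0;
      for (llvm::Value *V = Last;
           auto *IE = llvm::dyn_cast<llvm::InsertElementInst>(V);
           V = IE->getOperand(0)) {
        // Only constant lane indices have a statically known cost.
        if (auto *CI = llvm::dyn_cast<llvm::ConstantInt>(IE->getOperand(2)))
          UserCost += TTI->getVectorInstrCost(llvm::Instruction::InsertElement,
                                              IE->getType(),
                                              CI->getZExtValue());
      }
      return UserCost;
    }
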
@@ -5539,7 +6004,7 @@ static bool PhiTypeSorterFunc(Value *V, Value *V2) {
return V->getType() < V2->getType();
}
-/// \brief Try and get a reduction value from a phi node.
+/// Try and get a reduction value from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
@@ -5552,9 +6017,8 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
// reduction phi. Vectorizing such cases has been reported to cause
// miscompiles. See PR25787.
auto DominatedReduxValue = [&](Value *R) {
- return (
- dyn_cast<Instruction>(R) &&
- DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
+ return isa<Instruction>(R) &&
+ DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
};
Value *Rdx = nullptr;
@@ -5624,7 +6088,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// Interrupt the process if the Root instruction itself was vectorized or all
// sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
- SmallSet<Value *, 8> VisitedInstrs;
+ SmallPtrSet<Value *, 8> VisitedInstrs;
bool Res = false;
while (!Stack.empty()) {
Value *V;
@@ -5706,27 +6170,29 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
if (!R.canMapToVector(IVI->getType(), DL))
return false;
- SmallVector<Value *, 16> BuildVector;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds))
+ if (!findBuildAggregate(IVI, BuildVectorOpds))
return false;
- DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// Aggregate value is unlikely to be processed in vector registers, so the
// scalars will need to be extracted back out of the vectorized tree.
- return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true);
+ return tryToVectorizeList(BuildVectorOpds, R);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
BasicBlock *BB, BoUpSLP &R) {
- SmallVector<Value *, 16> BuildVector;
+ int UserCost;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
+ if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) ||
+ (llvm::all_of(BuildVectorOpds,
+ [](Value *V) { return isa<ExtractElementInst>(V); }) &&
+ isShuffle(BuildVectorOpds)))
return false;
// Vectorize starting with the build vector operands, ignoring the BuildVector
// instructions for the purpose of scheduling and user extraction.
- return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
+ return tryToVectorizeList(BuildVectorOpds, R, UserCost);
}
bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@@ -5763,7 +6229,7 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
- SmallSet<Value *, 16> VisitedInstrs;
+ SmallPtrSet<Value *, 16> VisitedInstrs;
bool HaveVectorizedPhiNodes = true;
while (HaveVectorizedPhiNodes) {
@@ -5798,14 +6264,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Try to vectorize them.
unsigned NumElts = (SameTypeIt - IncIt);
- DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
+ << NumElts << ")\n");
// The order in which the phi nodes appear in the program does not matter.
// So allow tryToVectorizeList to reorder them if it is beneficial. This
// is done when there are exactly two elements since tryToVectorizeList
// asserts that there are only two values when AllowReorder is true.
bool AllowReorder = NumElts == 2;
if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
- None, AllowReorder)) {
+ /*UserCost=*/0, AllowReorder)) {
// Success start over because instructions might have been changed.
HaveVectorizedPhiNodes = true;
Changed = true;
@@ -5885,7 +6352,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
isa<InsertValueInst>(it))
PostProcessInstructions.push_back(&*it);
-
}
return Changed;
@@ -5899,8 +6365,8 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
if (Entry.second.size() < 2)
continue;
- DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
- << Entry.second.size() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
+ << Entry.second.size() << ".\n");
// We process the getelementptr list in chunks of 16 (like we do for
// stores) to minimize compile-time.
@@ -5982,14 +6448,14 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
if (it->second.size() < 2)
continue;
- DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
- << it->second.size() << ".\n");
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
+ << it->second.size() << ".\n");
// Process the stores in chunks of 16.
// TODO: The limit of 16 inhibits greater vectorization factors.
// For example, AVX2 supports v32i8. Increasing this limit, however,
// may cause a significant compile-time increase.
- for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
+ for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
unsigned Len = std::min<unsigned>(CE - CI, 16);
Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
}
diff --git a/lib/Transforms/Vectorize/VPRecipeBuilder.h b/lib/Transforms/Vectorize/VPRecipeBuilder.h
new file mode 100644
index 000000000000..f43a8bb123b1
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -0,0 +1,131 @@
+//===- VPRecipeBuilder.h - Helper class to build recipes --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+
+#include "LoopVectorizationPlanner.h"
+#include "VPlan.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/IRBuilder.h"
+
+namespace llvm {
+
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+class TargetTransformInfo;
+class TargetLibraryInfo;
+
+/// Helper class to create VPRecipes from IR instructions.
+class VPRecipeBuilder {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+ /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ VPBuilder &Builder;
+
+ /// When we if-convert we need to create edge masks. We have to cache values
+ /// so that we don't end up with exponential recursion/IR. Note that
+ /// if-conversion currently takes place during VPlan-construction, so these
+ /// caches are only used at that stage.
+ using EdgeMaskCacheTy =
+ DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
+ using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
+ EdgeMaskCacheTy EdgeMaskCache;
+ BlockMaskCacheTy BlockMaskCache;
+
+public:
+ /// A helper function that computes the predicate of the block BB, assuming
+ /// that the header block of the loop is set to True. It returns the *entry*
+ /// mask for the block BB.
+ VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
+
+ /// A helper function that computes the predicate of the edge between SRC
+ /// and DST.
+ VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
+
+ /// Check if \p I belongs to an Interleave Group within the given VF \p Range.
+ /// Build and return a new VPInterleaveRecipe if \p I is the primary member
+ /// of an IG for \p Range.Start; return nullptr otherwise. In particular, if
+ /// \p I is an adjunct member of an IG for \p Range.Start, it is handled by
+ /// the recipe built for the IG's primary member.
+ /// \p Range.End may be decreased to ensure same decision from \p Range.Start
+ /// to \p Range.End.
+ VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
+
+ /// Check if \p I is a memory instruction to be widened for \p Range.Start and
+ /// potentially masked. Such instructions are handled by a recipe that takes
+ /// an additional VPInstruction for the mask.
+ VPWidenMemoryInstructionRecipe *
+ tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan);
+
+ /// Check if an induction recipe should be constructed for \p I within the given
+ /// VF \p Range. If so build and return it. If not, return null. \p Range.End
+ /// may be decreased to ensure same decision from \p Range.Start to
+ /// \p Range.End.
+ VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
+ VFRange &Range);
+
+ /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
+ /// a sequence of select instructions as the vectorizer currently performs
+ /// full if-conversion.
+ VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
+
+ /// Check if \p I can be widened within the given VF \p Range. If \p I can be
+ /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
+ /// extended to include \p I or else build a new VPWidenRecipe for it and
+ /// append it to \p VPBB. Return true if \p I can be widened for Range.Start,
+ /// false otherwise. Range.End may be decreased to ensure same decision from
+ /// \p Range.Start to \p Range.End.
+ bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
+
+ /// Create a replicating region for instruction \p I that requires
+ /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
+ VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan);
+
+public:
+ VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM, VPBuilder &Builder)
+ : OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
+ Builder(Builder) {}
+
+ /// Check if a recipe can be created for \p I within the given VF \p Range.
+ /// If a recipe can be created, it adds it to \p VPBB.
+ bool tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan,
+ VPBasicBlock *VPBB);
+
+ /// Build a VPReplicateRecipe for \p I and enclose it within a Region if it
+ /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
+ /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
+ /// Region. Update the packing decision of predicated instructions if they
+ /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
+ /// \p Range.Start to \p Range.End.
+ VPBasicBlock *handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
+ VPlanPtr &Plan);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
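
A sketch of how a planner loop might drive this class, inferred from the declarations above (the real call site lives in LoopVectorize.cpp and is not shown in this hunk; Range, Plan, VPBB and PredInst2Recipe stand for state the caller owns):

    // Wire each scalar instruction of a basic block into the plan, preferring
    // a widening recipe and falling back to per-lane replication.
    void buildRecipesFor(llvm::BasicBlock *BB, VPRecipeBuilder &RecipeBuilder,
                         VFRange &Range, VPlanPtr &Plan, VPBasicBlock *&VPBB,
                         llvm::DenseMap<llvm::Instruction *,
                                        VPReplicateRecipe *> &PredInst2Recipe) {
      for (llvm::Instruction &I : *BB) {
        if (RecipeBuilder.tryToCreateRecipe(&I, Range, Plan, VPBB))
          continue; // A recipe for I was appended to VPBB.
        // handleReplication may return a fresh VPBasicBlock after a region.
        VPBB = RecipeBuilder.handleReplication(&I, Range, VPBB,
                                               PredInst2Recipe, Plan);
      }
    }
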
diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp
index 4e54fc6db2a5..f7b07b722bb1 100644
--- a/lib/Transforms/Vectorize/VPlan.cpp
+++ b/lib/Transforms/Vectorize/VPlan.cpp
@@ -116,7 +116,7 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
BasicBlock *PrevBB = CFG.PrevBB;
BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
PrevBB->getParent(), CFG.LastBB);
- DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
// Hook up the new basic block to its predecessors.
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
@@ -125,7 +125,7 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
assert(PredBB && "Predecessor basic-block not found building successor.");
auto *PredBBTerminator = PredBB->getTerminator();
- DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
if (isa<UnreachableInst>(PredBBTerminator)) {
assert(PredVPSuccessors.size() == 1 &&
"Predecessor ending w/o branch must have single successor.");
@@ -175,8 +175,8 @@ void VPBasicBlock::execute(VPTransformState *State) {
}
// 2. Fill the IR basic block with IR instructions.
- DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
- << " in BB:" << NewBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
+ << " in BB:" << NewBB->getName() << '\n');
State->CFG.VPBB2IRBB[this] = NewBB;
State->CFG.PrevVPBB = this;
@@ -184,7 +184,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
for (VPRecipeBase &Recipe : Recipes)
Recipe.execute(*State);
- DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
+ LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
}
void VPRegionBlock::execute(VPTransformState *State) {
@@ -193,7 +193,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
if (!isReplicator()) {
// Visit the VPBlocks connected to "this", starting from it.
for (VPBlockBase *Block : RPOT) {
- DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
Block->execute(State);
}
return;
@@ -210,7 +210,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
State->Instance->Lane = Lane;
// Visit the VPBlocks connected to \p this, starting from it.
for (VPBlockBase *Block : RPOT) {
- DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
Block->execute(State);
}
}
@@ -220,6 +220,15 @@ void VPRegionBlock::execute(VPTransformState *State) {
State->Instance.reset();
}
+void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
+ Parent = InsertPos->getParent();
+ Parent->getRecipeList().insert(InsertPos->getIterator(), this);
+}
+
+iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
+ return getParent()->getRecipeList().erase(getIterator());
+}
+
void VPInstruction::generateInstruction(VPTransformState &State,
unsigned Part) {
IRBuilder<> &Builder = State.Builder;
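
Together, the new insertBefore and eraseFromParent enable in-place recipe rewrites without exposing the recipe list at every call site. The intended idiom, sketched with illustrative names:

    // Replace OldR with NewR inside OldR's VPBasicBlock: insertBefore links
    // NewR into the block, eraseFromParent unlinks and deletes OldR.
    static void replaceRecipe(VPRecipeBase *OldR, VPRecipeBase *NewR) {
      NewR->insertBefore(OldR);
      OldR->eraseFromParent();
    }
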
@@ -356,7 +365,7 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
"One successor of a basic block does not lead to the other.");
assert(InterimSucc->getSinglePredecessor() &&
"Interim successor has more than one predecessor.");
- assert(std::distance(pred_begin(PostDomSucc), pred_end(PostDomSucc)) == 2 &&
+ assert(pred_size(PostDomSucc) == 2 &&
"PostDom successor has more than two predecessors.");
DT->addNewBlock(InterimSucc, BB);
DT->addNewBlock(PostDomSucc, BB);
@@ -448,6 +457,18 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
bumpIndent(1);
for (const VPRecipeBase &Recipe : *BasicBlock)
Recipe.print(OS, Indent);
+
+ // Dump the condition bit.
+ const VPValue *CBV = BasicBlock->getCondBit();
+ if (CBV) {
+ OS << " +\n" << Indent << " \"CondBit: ";
+ if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) {
+ CBI->printAsOperand(OS);
+ OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\"";
+ } else
+ CBV->printAsOperand(OS);
+ }
+
bumpIndent(-2);
OS << "\n" << Indent << "]\n";
dumpEdges(BasicBlock);
diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h
index 2ccabfd6af25..866951cb79a4 100644
--- a/lib/Transforms/Vectorize/VPlan.h
+++ b/lib/Transforms/Vectorize/VPlan.h
@@ -30,6 +30,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
@@ -42,15 +43,10 @@
#include <map>
#include <string>
-// The (re)use of existing LoopVectorize classes is subject to future VPlan
-// refactoring.
-namespace {
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
-} // namespace
-
namespace llvm {
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
class BasicBlock;
class DominatorTree;
class InnerLoopVectorizer;
@@ -60,6 +56,20 @@ class raw_ostream;
class Value;
class VPBasicBlock;
class VPRegionBlock;
+class VPlan;
+
+/// A range of powers-of-2 vectorization factors with fixed start and
+/// adjustable end. The range includes start and excludes end, e.g.:
+/// [1, 9) = {1, 2, 4, 8}
+struct VFRange {
+ // A power of 2.
+ const unsigned Start;
+
+ // Need not be a power of 2. If End <= Start, the range is empty.
+ unsigned End;
+};
+
+using VPlanPtr = std::unique_ptr<VPlan>;
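
The mutable End exists so a planner can shrink a range until one decision holds for every VF in it. A sketch of that clamping pattern, assuming a caller-supplied per-VF predicate (the names here are illustrative):

    // Narrow Range so Predicate holds uniformly on [Range.Start, Range.End).
    // If Predicate already fails at Range.Start, the range becomes empty.
    template <typename PredT>
    void clampVFRange(VFRange &Range, PredT Predicate) {
      for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
        if (!Predicate(VF)) {
          Range.End = VF; // Exclude VF and every larger factor.
          break;
        }
    }
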
/// In what follows, the term "input IR" refers to code that is fed into the
/// vectorizer whereas the term "output IR" refers to code that is generated by
@@ -311,6 +321,8 @@ struct VPTransformState {
/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
class VPBlockBase {
+ friend class VPBlockUtils;
+
private:
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
@@ -327,6 +339,9 @@ private:
/// List of successor blocks.
SmallVector<VPBlockBase *, 1> Successors;
+ /// Successor selector, null for zero or single successor blocks.
+ VPValue *CondBit = nullptr;
+
/// Add \p Successor as the last successor to this block.
void appendSuccessor(VPBlockBase *Successor) {
assert(Successor && "Cannot add nullptr successor!");
@@ -377,6 +392,7 @@ public:
/// for any other purpose, as the values may change as LLVM evolves.
unsigned getVPBlockID() const { return SubclassID; }
+ VPRegionBlock *getParent() { return Parent; }
const VPRegionBlock *getParent() const { return Parent; }
void setParent(VPRegionBlock *P) { Parent = P; }
@@ -411,6 +427,9 @@ public:
return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
}
+ size_t getNumSuccessors() const { return Successors.size(); }
+ size_t getNumPredecessors() const { return Predecessors.size(); }
+
/// An Enclosing Block of a block B is any block containing B, including B
/// itself. \return the closest enclosing block starting from "this", which
/// has successors. \return the root enclosing block if all enclosing blocks
@@ -454,34 +473,41 @@ public:
return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
}
- /// Sets a given VPBlockBase \p Successor as the single successor and \return
- /// \p Successor. The parent of this Block is copied to be the parent of
- /// \p Successor.
- VPBlockBase *setOneSuccessor(VPBlockBase *Successor) {
+ /// \return the condition bit selecting the successor.
+ VPValue *getCondBit() { return CondBit; }
+
+ const VPValue *getCondBit() const { return CondBit; }
+
+ void setCondBit(VPValue *CV) { CondBit = CV; }
+
+ /// Set a given VPBlockBase \p Successor as the single successor of this
+ /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
+ /// This VPBlockBase must have no successors.
+ void setOneSuccessor(VPBlockBase *Successor) {
assert(Successors.empty() && "Setting one successor when others exist.");
appendSuccessor(Successor);
- Successor->appendPredecessor(this);
- Successor->Parent = Parent;
- return Successor;
}
- /// Sets two given VPBlockBases \p IfTrue and \p IfFalse to be the two
- /// successors. The parent of this Block is copied to be the parent of both
- /// \p IfTrue and \p IfFalse.
- void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) {
+ /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
+ /// successors of this VPBlockBase. \p Condition is set as the successor
+ /// selector. This VPBlockBase is not added as predecessor of \p IfTrue or \p
+ /// IfFalse. This VPBlockBase must have no successors.
+ void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
+ VPValue *Condition) {
assert(Successors.empty() && "Setting two successors when others exist.");
+ assert(Condition && "Setting two successors without condition!");
+ CondBit = Condition;
appendSuccessor(IfTrue);
appendSuccessor(IfFalse);
- IfTrue->appendPredecessor(this);
- IfFalse->appendPredecessor(this);
- IfTrue->Parent = Parent;
- IfFalse->Parent = Parent;
}
- void disconnectSuccessor(VPBlockBase *Successor) {
- assert(Successor && "Successor to disconnect is null.");
- removeSuccessor(Successor);
- Successor->removePredecessor(this);
+ /// Set each VPBlockBase in \p NewPreds as predecessor of this VPBlockBase.
+ /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
+ /// as successor of any VPBlockBase in \p NewPreds.
+ void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
+ assert(Predecessors.empty() && "Block predecessors already set.");
+ for (auto *Pred : NewPreds)
+ appendPredecessor(Pred);
}
/// The method which generates the output IR that correspond to this
@@ -539,6 +565,15 @@ public:
/// Each recipe prints itself.
virtual void print(raw_ostream &O, const Twine &Indent) const = 0;
+
+ /// Insert an unlinked recipe into a basic block immediately before
+ /// the specified recipe.
+ void insertBefore(VPRecipeBase *InsertPos);
+
+ /// This method unlinks 'this' from the containing basic block and deletes it.
+ ///
+ /// \returns an iterator pointing to the element after the erased one
+ iplist<VPRecipeBase>::iterator eraseFromParent();
};
/// This is a concrete Recipe that models a single VPlan-level instruction.
@@ -546,6 +581,8 @@ public:
/// executed, these instructions would always form a single-def expression as
/// the VPInstruction is also a single def-use vertex.
class VPInstruction : public VPUser, public VPRecipeBase {
+ friend class VPlanHCFGTransforms;
+
public:
/// VPlan opcodes, extending LLVM IR with idiomatics instructions.
enum { Not = Instruction::OtherOpsEnd + 1 };
@@ -559,10 +596,13 @@ private:
void generateInstruction(VPTransformState &State, unsigned Part);
public:
- VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
+ VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
: VPUser(VPValue::VPInstructionSC, Operands),
VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode) {}
+ VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
+ : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {}
+
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPValue *V) {
return V->getVPValueID() == VPValue::VPInstructionSC;
@@ -907,7 +947,10 @@ public:
inline const VPRecipeBase &back() const { return Recipes.back(); }
inline VPRecipeBase &back() { return Recipes.back(); }
- /// \brief Returns a pointer to a member of the recipe list.
+ /// Returns a reference to the list of recipes.
+ RecipeListTy &getRecipeList() { return Recipes; }
+
+ /// Returns a pointer to a member of the recipe list.
static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
return &VPBasicBlock::Recipes;
}
@@ -968,6 +1011,9 @@ public:
Entry->setParent(this);
Exit->setParent(this);
}
+ VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
+ : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exit(nullptr),
+ IsReplicator(IsReplicator) {}
~VPRegionBlock() override {
if (Entry)
@@ -982,9 +1028,27 @@ public:
const VPBlockBase *getEntry() const { return Entry; }
VPBlockBase *getEntry() { return Entry; }
+ /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
+ /// EntryBlock must have no predecessors.
+ void setEntry(VPBlockBase *EntryBlock) {
+ assert(EntryBlock->getPredecessors().empty() &&
+ "Entry block cannot have predecessors.");
+ Entry = EntryBlock;
+ EntryBlock->setParent(this);
+ }
+
const VPBlockBase *getExit() const { return Exit; }
VPBlockBase *getExit() { return Exit; }
+ /// Set \p ExitBlock as the exit VPBlockBase of this VPRegionBlock. \p
+ /// ExitBlock must have no successors.
+ void setExit(VPBlockBase *ExitBlock) {
+ assert(ExitBlock->getSuccessors().empty() &&
+ "Exit block cannot have successors.");
+ Exit = ExitBlock;
+ ExitBlock->setParent(this);
+ }
+
/// An indicator whether this region is to generate multiple replicated
/// instances of output IR corresponding to its VPBlockBases.
bool isReplicator() const { return IsReplicator; }
@@ -1012,6 +1076,13 @@ private:
/// Holds the name of the VPlan, for printing.
std::string Name;
+ /// Holds all the external definitions created for this VPlan.
+ // TODO: Introduce a specific representation for external definitions in
+ // VPlan. External definitions must be immutable and hold a pointer to their
+ // underlying IR that will be used to implement their structural comparison
+ // (operators '==' and '<').
+ SmallPtrSet<VPValue *, 16> VPExternalDefs;
+
/// Holds a mapping between Values and their corresponding VPValue inside
/// VPlan.
Value2VPValueTy Value2VPValue;
@@ -1024,6 +1095,8 @@ public:
VPBlockBase::deleteCFG(Entry);
for (auto &MapEntry : Value2VPValue)
delete MapEntry.second;
+ for (VPValue *Def : VPExternalDefs)
+ delete Def;
}
/// Generate the IR code for this VPlan.
@@ -1042,6 +1115,12 @@ public:
void setName(const Twine &newName) { Name = newName.str(); }
+ /// Add \p VPVal to the pool of external definitions if it's not already
+ /// in the pool.
+ void addExternalDef(VPValue *VPVal) {
+ VPExternalDefs.insert(VPVal);
+ }
+
void addVPValue(Value *V) {
assert(V && "Trying to add a null Value to VPlan");
assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
@@ -1189,6 +1268,72 @@ template <> struct GraphTraits<Inverse<VPBlockBase *>> {
}
};
+//===----------------------------------------------------------------------===//
+// VPlan Utilities
+//===----------------------------------------------------------------------===//
+
+/// Class that provides utilities for VPBlockBases in VPlan.
+class VPBlockUtils {
+public:
+ VPBlockUtils() = delete;
+
+ /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
+ /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
+ /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr
+ /// must have no successors (see the TODO below) and \p NewBlock must have
+ /// neither successors nor predecessors.
+ static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
+ assert(NewBlock->getSuccessors().empty() &&
+ "Can't insert new block with successors.");
+ // TODO: move successors from BlockPtr to NewBlock when this functionality
+ // is necessary. For now, setOneSuccessor will assert if BlockPtr
+ // already has successors.
+ BlockPtr->setOneSuccessor(NewBlock);
+ NewBlock->setPredecessors({BlockPtr});
+ NewBlock->setParent(BlockPtr->getParent());
+ }
+
+ /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
+ /// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and \p
+ /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr
+ /// parent to \p IfTrue and \p IfFalse. \p Condition is set as the successor
+ /// selector. \p BlockPtr must have no successors and \p IfTrue and \p IfFalse
+ /// must have neither successors nor predecessors.
+ static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
+ VPValue *Condition, VPBlockBase *BlockPtr) {
+ assert(IfTrue->getSuccessors().empty() &&
+ "Can't insert IfTrue with successors.");
+ assert(IfFalse->getSuccessors().empty() &&
+ "Can't insert IfFalse with successors.");
+ BlockPtr->setTwoSuccessors(IfTrue, IfFalse, Condition);
+ IfTrue->setPredecessors({BlockPtr});
+ IfFalse->setPredecessors({BlockPtr});
+ IfTrue->setParent(BlockPtr->getParent());
+ IfFalse->setParent(BlockPtr->getParent());
+ }
+
+ /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to
+ /// the successors of \p From and \p From to the predecessors of \p To. Both
+ /// VPBlockBases must have the same parent, which can be null. Both
+ /// VPBlockBases may already be connected to other VPBlockBases.
+ static void connectBlocks(VPBlockBase *From, VPBlockBase *To) {
+ assert((From->getParent() == To->getParent()) &&
+ "Can't connect two block with different parents");
+ assert(From->getNumSuccessors() < 2 &&
+ "Blocks can't have more than two successors.");
+ From->appendSuccessor(To);
+ To->appendPredecessor(From);
+ }
+
+ /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To
+ /// from the successors of \p From and \p From from the predecessors of \p To.
+ static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) {
+ assert(To && "Successor to disconnect is null.");
+ From->removeSuccessor(To);
+ To->removePredecessor(From);
+ }
+};
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
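
These helpers let VPlan-to-VPlan transforms stitch control flow without touching the private edge lists. A minimal sketch building an if-then-else diamond, where Cond is an assumed VPValue * already defined in the plan:

    VPBasicBlock *Entry = new VPBasicBlock("entry");
    VPBasicBlock *Then = new VPBasicBlock("if.then");
    VPBasicBlock *Else = new VPBasicBlock("if.else");
    VPBasicBlock *Merge = new VPBasicBlock("merge");
    // Entry branches on Cond; Then and Else inherit Entry's (null) parent.
    VPBlockUtils::insertTwoBlocksAfter(Then, Else, Cond, Entry);
    VPBlockUtils::connectBlocks(Then, Merge); // Merge keeps both predecessors.
    VPBlockUtils::connectBlocks(Else, Merge);
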
diff --git a/lib/Transforms/Vectorize/VPlanBuilder.h b/lib/Transforms/Vectorize/VPlanBuilder.h
deleted file mode 100644
index d6eb3397d044..000000000000
--- a/lib/Transforms/Vectorize/VPlanBuilder.h
+++ /dev/null
@@ -1,61 +0,0 @@
-//===- VPlanBuilder.h - A VPlan utility for constructing VPInstructions ---===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file provides a VPlan-based builder utility analogous to IRBuilder.
-/// It provides an instruction-level API for generating VPInstructions while
-/// abstracting away the Recipe manipulation details.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H
-
-#include "VPlan.h"
-
-namespace llvm {
-
-class VPBuilder {
-private:
- VPBasicBlock *BB = nullptr;
- VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
-
- VPInstruction *createInstruction(unsigned Opcode,
- std::initializer_list<VPValue *> Operands) {
- VPInstruction *Instr = new VPInstruction(Opcode, Operands);
- BB->insert(Instr, InsertPt);
- return Instr;
- }
-
-public:
- VPBuilder() {}
-
- /// \brief This specifies that created VPInstructions should be appended to
- /// the end of the specified block.
- void setInsertPoint(VPBasicBlock *TheBB) {
- assert(TheBB && "Attempting to set a null insert point");
- BB = TheBB;
- InsertPt = BB->end();
- }
-
- VPValue *createNot(VPValue *Operand) {
- return createInstruction(VPInstruction::Not, {Operand});
- }
-
- VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
- return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
- }
-
- VPValue *createOr(VPValue *LHS, VPValue *RHS) {
- return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
- }
-};
-
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H
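
The builder itself is not deleted for good: VPRecipeBuilder.h above reaches it through LoopVectorizationPlanner.h, and the new VPlanHCFGBuilder.cpp below calls a createNaryOp method on it. Reconstructing the usage from those call sites (the exact signature is not shown in this patch; VPBB, LHSVal, RHSVal and Inst are illustrative):

    // Assumed from the calls below:
    //   createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, Value *Inst)
    VPBuilder VPIRBuilder;
    VPIRBuilder.setInsertPoint(VPBB); // append new VPInstructions to VPBB
    llvm::SmallVector<VPValue *, 2> Ops = {LHSVal, RHSVal};
    VPValue *Sum = VPIRBuilder.createNaryOp(llvm::Instruction::Add, Ops, Inst);
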
diff --git a/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
new file mode 100644
index 000000000000..08129b74cddf
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -0,0 +1,336 @@
+//===-- VPlanHCFGBuilder.cpp ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the construction of a VPlan-based Hierarchical CFG
+/// (H-CFG) for an incoming IR. This construction comprises the following
+/// components and steps:
+///
+/// 1. PlainCFGBuilder class: builds a plain VPBasicBlock-based CFG that
+/// faithfully represents the CFG in the incoming IR. A VPRegionBlock (Top
+/// Region) is created to enclose and serve as parent of all the VPBasicBlocks
+/// in the plain CFG.
+/// NOTE: At this point, there is a direct correspondence between all the
+/// VPBasicBlocks created for the initial plain CFG and the incoming
+/// BasicBlocks. However, this might change in the future.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanHCFGBuilder.h"
+#include "LoopVectorizationPlanner.h"
+#include "llvm/Analysis/LoopIterator.h"
+
+#define DEBUG_TYPE "loop-vectorize"
+
+using namespace llvm;
+
+namespace {
+// Class that is used to build the plain CFG for the incoming IR.
+class PlainCFGBuilder {
+private:
+ // The outermost loop of the input loop nest considered for vectorization.
+ Loop *TheLoop;
+
+ // Loop Info analysis.
+ LoopInfo *LI;
+
+ // Vectorization plan that we are working on.
+ VPlan &Plan;
+
+ // Output Top Region.
+ VPRegionBlock *TopRegion = nullptr;
+
+ // Builder of the VPlan instruction-level representation.
+ VPBuilder VPIRBuilder;
+
+ // NOTE: The following maps are intentionally destroyed after the plain CFG
+ // construction because subsequent VPlan-to-VPlan transformation may
+ // invalidate them.
+ // Map incoming BasicBlocks to their newly-created VPBasicBlocks.
+ DenseMap<BasicBlock *, VPBasicBlock *> BB2VPBB;
+ // Map incoming Value definitions to their newly-created VPValues.
+ DenseMap<Value *, VPValue *> IRDef2VPValue;
+
+ // Holds phi nodes that need to be fixed once the plain CFG has been built.
+ SmallVector<PHINode *, 8> PhisToFix;
+
+ // Utility functions.
+ void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
+ void fixPhiNodes();
+ VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
+ bool isExternalDef(Value *Val);
+ VPValue *getOrCreateVPOperand(Value *IRVal);
+ void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);
+
+public:
+ PlainCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
+ : TheLoop(Lp), LI(LI), Plan(P) {}
+
+ // Build the plain CFG and return its Top Region.
+ VPRegionBlock *buildPlainCFG();
+};
+} // anonymous namespace
+
+// Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
+// must have no predecessors.
+void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
+ SmallVector<VPBlockBase *, 8> VPBBPreds;
+ // Collect VPBB predecessors.
+ for (BasicBlock *Pred : predecessors(BB))
+ VPBBPreds.push_back(getOrCreateVPBB(Pred));
+
+ VPBB->setPredecessors(VPBBPreds);
+}
+
+// Add operands to VPInstructions representing phi nodes from the input IR.
+void PlainCFGBuilder::fixPhiNodes() {
+ for (auto *Phi : PhisToFix) {
+ assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
+ VPValue *VPVal = IRDef2VPValue[Phi];
+ assert(isa<VPInstruction>(VPVal) && "Expected VPInstruction for phi node.");
+ auto *VPPhi = cast<VPInstruction>(VPVal);
+ assert(VPPhi->getNumOperands() == 0 &&
+ "Expected VPInstruction with no operands.");
+
+ for (Value *Op : Phi->operands())
+ VPPhi->addOperand(getOrCreateVPOperand(Op));
+ }
+}
+
+// Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an
+// existing one if it was already created.
+VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
+ auto BlockIt = BB2VPBB.find(BB);
+ if (BlockIt != BB2VPBB.end())
+ // Retrieve existing VPBB.
+ return BlockIt->second;
+
+ // Create new VPBB.
+ LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << BB->getName() << "\n");
+ VPBasicBlock *VPBB = new VPBasicBlock(BB->getName());
+ BB2VPBB[BB] = VPBB;
+ VPBB->setParent(TopRegion);
+ return VPBB;
+}
+
+// Return true if \p Val is considered an external definition. An external
+// definition is either:
+// 1. A Value that is not an Instruction. This will be refined in the future.
+// 2. An Instruction that is outside of the CFG snippet represented in VPlan,
+// i.e., is not part of: a) the loop nest, b) the outermost loop PH, or c) the
+// outermost loop exits.
+bool PlainCFGBuilder::isExternalDef(Value *Val) {
+ // All the Values that are not Instructions are considered external
+ // definitions for now.
+ Instruction *Inst = dyn_cast<Instruction>(Val);
+ if (!Inst)
+ return true;
+
+ BasicBlock *InstParent = Inst->getParent();
+ assert(InstParent && "Expected instruction parent.");
+
+ // Check whether Instruction definition is in loop PH.
+ BasicBlock *PH = TheLoop->getLoopPreheader();
+ assert(PH && "Expected loop pre-header.");
+
+ if (InstParent == PH)
+ // Instruction definition is in outermost loop PH.
+ return false;
+
+ // Check whether Instruction definition is in the loop exit.
+ BasicBlock *Exit = TheLoop->getUniqueExitBlock();
+ assert(Exit && "Expected loop with single exit.");
+ if (InstParent == Exit) {
+ // Instruction definition is in outermost loop exit.
+ return false;
+ }
+
+ // Check whether Instruction definition is in loop body.
+ return !TheLoop->contains(Inst);
+}
+
+// Create a new VPValue or retrieve an existing one for the Instruction's
+// operand \p IRVal. This function must only be used to create/retrieve VPValues
+// for *Instruction's operands* and not to create regular VPInstructions. For
+// the latter, please look at 'createVPInstructionsForVPBB'.
+VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
+ auto VPValIt = IRDef2VPValue.find(IRVal);
+ if (VPValIt != IRDef2VPValue.end())
+ // Operand has an associated VPInstruction or VPValue that was previously
+ // created.
+ return VPValIt->second;
+
+ // Operand doesn't have a previously created VPInstruction/VPValue. This
+ // means that operand is:
+ // A) a definition external to VPlan,
+ // B) any other Value without specific representation in VPlan.
+ // For now, we use VPValue to represent A and B and classify both as external
+ // definitions. We may introduce specific VPValue subclasses for them in the
+ // future.
+ assert(isExternalDef(IRVal) && "Expected external definition as operand.");
+
+ // A and B: Create VPValue and add it to the pool of external definitions and
+ // to the Value->VPValue map.
+ VPValue *NewVPVal = new VPValue(IRVal);
+ Plan.addExternalDef(NewVPVal);
+ IRDef2VPValue[IRVal] = NewVPVal;
+ return NewVPVal;
+}
+
+// Create new VPInstructions in a VPBasicBlock, given its BasicBlock
+// counterpart. This function must be invoked in RPO so that the operands of a
+// VPInstruction in \p BB have been visited before (except for Phi nodes).
+void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
+ BasicBlock *BB) {
+ VPIRBuilder.setInsertPoint(VPBB);
+ for (Instruction &InstRef : *BB) {
+ Instruction *Inst = &InstRef;
+
+ // There shouldn't be any VPValue for Inst at this point. Otherwise, we
+ // visited Inst when we shouldn't have, breaking the RPO traversal order.
+ assert(!IRDef2VPValue.count(Inst) &&
+ "Instruction shouldn't have been visited.");
+
+ if (auto *Br = dyn_cast<BranchInst>(Inst)) {
+ // Branch instruction is not explicitly represented in VPlan but we need
+ // to represent its condition bit when it's conditional.
+ if (Br->isConditional())
+ getOrCreateVPOperand(Br->getCondition());
+
+ // Skip the rest of the Instruction processing for Branch instructions.
+ continue;
+ }
+
+ VPInstruction *NewVPInst;
+ if (auto *Phi = dyn_cast<PHINode>(Inst)) {
+ // Phi node's operands may not have been visited at this point. We create
+ // an empty VPInstruction that we will fix once the whole plain CFG has
+ // been built.
+ NewVPInst = cast<VPInstruction>(VPIRBuilder.createNaryOp(
+ Inst->getOpcode(), {} /*No operands*/, Inst));
+ PhisToFix.push_back(Phi);
+ } else {
+ // Translate LLVM-IR operands into VPValue operands and set them in the
+ // new VPInstruction.
+ SmallVector<VPValue *, 4> VPOperands;
+ for (Value *Op : Inst->operands())
+ VPOperands.push_back(getOrCreateVPOperand(Op));
+
+      // Build a VPInstruction for any arbitrary Instruction without specific
+      // representation in VPlan.
+ NewVPInst = cast<VPInstruction>(
+ VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
+ }
+
+ IRDef2VPValue[Inst] = NewVPInst;
+ }
+}
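+
+// Illustrative corner case for createVPInstructionsForVPBB (example only):
+// for a header phi such as
+//   %p = phi i32 [ 0, %ph ], [ %p.next, %latch ]
+// %p.next is defined in a block visited after the header, so the phi's
+// VPInstruction is created with no operands and completed later by
+// fixPhiNodes().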
+
+// Main interface to build the plain CFG.
+VPRegionBlock *PlainCFGBuilder::buildPlainCFG() {
+ // 1. Create the Top Region. It will be the parent of all VPBBs.
+ TopRegion = new VPRegionBlock("TopRegion", false /*isReplicator*/);
+
+  // 2. Scan the body of the loop in topological order to visit each basic
+  // block after having visited its predecessor basic blocks. Create a VPBB for
+  // each BB and link it to its successor and predecessor VPBBs. Note that
+  // predecessors must be set in the same order as they appear in the incoming
+  // IR. Otherwise, phi node handling and any algorithm based on predecessor
+  // traversal may break.
+
+ // Loop PH needs to be explicitly visited since it's not taken into account by
+ // LoopBlocksDFS.
+ BasicBlock *PreheaderBB = TheLoop->getLoopPreheader();
+ assert((PreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
+ "Unexpected loop preheader");
+ VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(PreheaderBB);
+ createVPInstructionsForVPBB(PreheaderVPBB, PreheaderBB);
+ // Create empty VPBB for Loop H so that we can link PH->H.
+ VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader());
+  // The header's predecessors will be set during the loop RPO traversal below.
+ PreheaderVPBB->setOneSuccessor(HeaderVPBB);
+
+ LoopBlocksRPO RPO(TheLoop);
+ RPO.perform(LI);
+
+ for (BasicBlock *BB : RPO) {
+ // Create or retrieve the VPBasicBlock for this BB and create its
+ // VPInstructions.
+ VPBasicBlock *VPBB = getOrCreateVPBB(BB);
+ createVPInstructionsForVPBB(VPBB, BB);
+
+    // Set VPBB successors. We create empty VPBBs for successors if they don't
+    // exist already. Their VPInstructions will be created when the successor
+    // is visited during the RPO traversal.
+ TerminatorInst *TI = BB->getTerminator();
+ assert(TI && "Terminator expected.");
+ unsigned NumSuccs = TI->getNumSuccessors();
+
+ if (NumSuccs == 1) {
+ VPBasicBlock *SuccVPBB = getOrCreateVPBB(TI->getSuccessor(0));
+ assert(SuccVPBB && "VPBB Successor not found.");
+ VPBB->setOneSuccessor(SuccVPBB);
+ } else if (NumSuccs == 2) {
+ VPBasicBlock *SuccVPBB0 = getOrCreateVPBB(TI->getSuccessor(0));
+ assert(SuccVPBB0 && "Successor 0 not found.");
+ VPBasicBlock *SuccVPBB1 = getOrCreateVPBB(TI->getSuccessor(1));
+ assert(SuccVPBB1 && "Successor 1 not found.");
+
+ // Get VPBB's condition bit.
+ assert(isa<BranchInst>(TI) && "Unsupported terminator!");
+ auto *Br = cast<BranchInst>(TI);
+ Value *BrCond = Br->getCondition();
+ // Look up the branch condition to get the corresponding VPValue
+ // representing the condition bit in VPlan (which may be in another VPBB).
+ assert(IRDef2VPValue.count(BrCond) &&
+ "Missing condition bit in IRDef2VPValue!");
+ VPValue *VPCondBit = IRDef2VPValue[BrCond];
+
+ // Link successors using condition bit.
+ VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1, VPCondBit);
+ } else
+ llvm_unreachable("Number of successors not supported.");
+
+ // Set VPBB predecessors in the same order as they are in the incoming BB.
+ setVPBBPredsFromBB(VPBB, BB);
+ }
+
+  // 3. Process the outermost loop exit. We created an empty VPBB for the
+  // loop's single exit BB during the RPO traversal of the loop body, but its
+  // Instructions weren't visited because the exit is not part of the loop.
+ BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock();
+ assert(LoopExitBB && "Loops with multiple exits are not supported.");
+ VPBasicBlock *LoopExitVPBB = BB2VPBB[LoopExitBB];
+ createVPInstructionsForVPBB(LoopExitVPBB, LoopExitBB);
+ // Loop exit was already set as successor of the loop exiting BB.
+ // We only set its predecessor VPBB now.
+ setVPBBPredsFromBB(LoopExitVPBB, LoopExitBB);
+
+  // 4. The whole CFG has been built at this point so all the input Values must
+  // have a VPlan counterpart. Fix VPlan phi nodes by adding their
+  // corresponding VPlan operands.
+ fixPhiNodes();
+
+ // 5. Final Top Region setup. Set outermost loop pre-header and single exit as
+ // Top Region entry and exit.
+ TopRegion->setEntry(PreheaderVPBB);
+ TopRegion->setExit(LoopExitVPBB);
+ return TopRegion;
+}
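+
+// Illustrative outcome of buildPlainCFG (example only): for an input loop
+//   ph -> header -> latch -> (header | exit)
+// the resulting Top Region has VPBB(ph) as entry, VPBB(exit) as exit, and
+// VPBB(header)/VPBB(latch) linked exactly as in the incoming CFG, with the
+// latch's conditional branch modeled as the condition bit of VPBB(latch).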
+
+// Public interface to build an H-CFG.
+void VPlanHCFGBuilder::buildHierarchicalCFG(VPlan &Plan) {
+ // Build Top Region enclosing the plain CFG and set it as VPlan entry.
+ PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
+ VPRegionBlock *TopRegion = PCFGBuilder.buildPlainCFG();
+ Plan.setEntry(TopRegion);
+ LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan);
+
+ Verifier.verifyHierarchicalCFG(TopRegion);
+}
diff --git a/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
new file mode 100644
index 000000000000..c4e69843615a
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -0,0 +1,55 @@
+//===-- VPlanHCFGBuilder.h --------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the VPlanHCFGBuilder class which contains the public
+/// interface (buildHierarchicalCFG) to build a VPlan-based Hierarchical CFG
+/// (H-CFG) for an incoming IR.
+///
+/// An H-CFG in VPlan is a control-flow graph whose nodes are VPBasicBlocks
+/// and/or VPRegionBlocks (i.e., other H-CFGs). The outermost H-CFG of a VPlan
+/// consists of a VPRegionBlock, denoted Top Region, which encloses any other
+/// VPBlockBase in the H-CFG. This guarantees that any VPBlockBase in the H-CFG
+/// other than the Top Region will have a parent VPRegionBlock and allows us
+/// to easily add more nodes before/after the main vector loop (such as the
+/// reduction epilogue).
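+///
+/// An illustrative shape of the H-CFG built here (example only):
+///
+///   [Top Region]
+///     entry: VPBB for the loop preheader
+///     body:  VPBBs mirroring the loop blocks and their CFG edges
+///     exit:  VPBB for the unique loop exit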
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
+
+#include "VPlan.h"
+#include "VPlanVerifier.h"
+
+namespace llvm {
+
+class Loop;
+
+/// Main class to build the VPlan H-CFG for an incoming IR.
+class VPlanHCFGBuilder {
+private:
+ // The outermost loop of the input loop nest considered for vectorization.
+ Loop *TheLoop;
+
+ // Loop Info analysis.
+ LoopInfo *LI;
+
+ // VPlan verifier utility.
+ VPlanVerifier Verifier;
+
+public:
+ VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI) : TheLoop(Lp), LI(LI) {}
+
+ /// Build H-CFG for TheLoop and update \p Plan accordingly.
+ void buildHierarchicalCFG(VPlan &Plan);
+};
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
diff --git a/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
new file mode 100644
index 000000000000..e3cbab077e61
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
@@ -0,0 +1,73 @@
+//===-- VPlanHCFGTransforms.cpp - Utility VPlan to VPlan transforms -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements a set of utility VPlan to VPlan transformations.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanHCFGTransforms.h"
+#include "llvm/ADT/PostOrderIterator.h"
+
+using namespace llvm;
+
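+// Illustrative result of this transform (example only): for VPInstructions
+// modeling {phi %i, %l = load, %m = mul %l, store %m, %i.next = add, %c = icmp}
+// in a loop body VPBB, the transform emits:
+//   phi %i      -> VPWidenIntOrFpInductionRecipe (integer induction)
+//   load, store -> one VPWidenMemoryInstructionRecipe each (unmasked)
+//   mul         -> a new VPWidenRecipe (the previous recipe widens memory)
+//   add, icmp   -> a second VPWidenRecipe (icmp appended to the add's recipe)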
+void VPlanHCFGTransforms::VPInstructionsToVPRecipes(
+ VPlanPtr &Plan,
+ LoopVectorizationLegality::InductionList *Inductions,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+
+  VPRegionBlock *TopRegion = cast<VPRegionBlock>(Plan->getEntry());
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry());
+ for (VPBlockBase *Base : RPOT) {
+ // Do not widen instructions in pre-header and exit blocks.
+ if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0)
+ continue;
+
+ VPBasicBlock *VPBB = Base->getEntryBasicBlock();
+ VPRecipeBase *LastRecipe = nullptr;
+ // Introduce each ingredient into VPlan.
+ for (auto I = VPBB->begin(), E = VPBB->end(); I != E;) {
+ VPRecipeBase *Ingredient = &*I++;
+ // Can only handle VPInstructions.
+ VPInstruction *VPInst = cast<VPInstruction>(Ingredient);
+ Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
+ if (DeadInstructions.count(Inst)) {
+ Ingredient->eraseFromParent();
+ continue;
+ }
+
+ VPRecipeBase *NewRecipe = nullptr;
+ // Create VPWidenMemoryInstructionRecipe for loads and stores.
+ if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
+ NewRecipe = new VPWidenMemoryInstructionRecipe(*Inst, nullptr /*Mask*/);
+ else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
+ InductionDescriptor II = Inductions->lookup(Phi);
+ if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+ II.getKind() == InductionDescriptor::IK_FpInduction) {
+ NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi);
+ } else
+ NewRecipe = new VPWidenPHIRecipe(Phi);
+ } else {
+ // If the last recipe is a VPWidenRecipe, add Inst to it instead of
+ // creating a new recipe.
+ if (VPWidenRecipe *WidenRecipe =
+ dyn_cast_or_null<VPWidenRecipe>(LastRecipe)) {
+ WidenRecipe->appendInstruction(Inst);
+ Ingredient->eraseFromParent();
+ continue;
+ }
+ NewRecipe = new VPWidenRecipe(Inst);
+ }
+
+ NewRecipe->insertBefore(Ingredient);
+ LastRecipe = NewRecipe;
+ Ingredient->eraseFromParent();
+ }
+ }
+}
diff --git a/lib/Transforms/Vectorize/VPlanHCFGTransforms.h b/lib/Transforms/Vectorize/VPlanHCFGTransforms.h
new file mode 100644
index 000000000000..ae549c6871b3
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanHCFGTransforms.h
@@ -0,0 +1,36 @@
+//===- VPlanHCFGTransforms.h - Utility VPlan to VPlan transforms ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility VPlan to VPlan transformations.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
+
+#include "VPlan.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+
+namespace llvm {
+
+class VPlanHCFGTransforms {
+
+public:
+ /// Replaces the VPInstructions in \p Plan with corresponding
+ /// widen recipes.
+ static void VPInstructionsToVPRecipes(
+ VPlanPtr &Plan,
+ LoopVectorizationLegality::InductionList *Inductions,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+};
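+
+// Illustrative call site (example only; assumes a LoopVectorizationLegality
+// instance 'Legal' and a set of dead instructions computed by the caller):
+//   VPlanHCFGTransforms::VPInstructionsToVPRecipes(
+//       Plan, Legal->getInductionVars(), DeadInstructions);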
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
diff --git a/lib/Transforms/Vectorize/VPlanValue.h b/lib/Transforms/Vectorize/VPlanValue.h
index 50966891e0eb..08f142915b49 100644
--- a/lib/Transforms/Vectorize/VPlanValue.h
+++ b/lib/Transforms/Vectorize/VPlanValue.h
@@ -37,13 +37,34 @@ class VPUser;
// coming from the input IR, instructions which VPlan will generate if executed
// and live-outs which the VPlan will need to fix accordingly.
class VPValue {
+ friend class VPBuilder;
private:
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
SmallVector<VPUser *, 1> Users;
protected:
- VPValue(const unsigned char SC) : SubclassID(SC) {}
+ // Hold the underlying Value, if any, attached to this VPValue.
+ Value *UnderlyingVal;
+
+ VPValue(const unsigned char SC, Value *UV = nullptr)
+ : SubclassID(SC), UnderlyingVal(UV) {}
+
+ // DESIGN PRINCIPLE: Access to the underlying IR must be strictly limited to
+ // the front-end and back-end of VPlan so that the middle-end is as
+ // independent as possible of the underlying IR. We grant access to the
+ // underlying IR using friendship. In that way, we should be able to use VPlan
+ // for multiple underlying IRs (Polly?) by providing a new VPlan front-end,
+ // back-end and analysis information for the new IR.
+
+ /// Return the underlying Value attached to this VPValue.
+ Value *getUnderlyingValue() { return UnderlyingVal; }
+
+ // Set \p Val as the underlying Value of this VPValue.
+ void setUnderlyingValue(Value *Val) {
+ assert(!UnderlyingVal && "Underlying Value is already set.");
+ UnderlyingVal = Val;
+ }
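+
+  // Illustrative front-end use (example only): a VPlan front-end wraps a
+  // live-in IR Value as an external definition via the public constructor:
+  //   VPValue *NewVPVal = new VPValue(IRVal);
+  //   Plan.addExternalDef(NewVPVal);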
public:
/// An enumeration for keeping track of the concrete subclass of VPValue that
@@ -52,7 +73,7 @@ public:
/// type identification.
enum { VPValueSC, VPUserSC, VPInstructionSC };
- VPValue() : SubclassID(VPValueSC) {}
+ VPValue(Value *UV = nullptr) : VPValue(VPValueSC, UV) {}
VPValue(const VPValue &) = delete;
VPValue &operator=(const VPValue &) = delete;
@@ -94,11 +115,6 @@ class VPUser : public VPValue {
private:
SmallVector<VPValue *, 2> Operands;
- void addOperand(VPValue *Operand) {
- Operands.push_back(Operand);
- Operand->addUser(*this);
- }
-
protected:
VPUser(const unsigned char SC) : VPValue(SC) {}
VPUser(const unsigned char SC, ArrayRef<VPValue *> Operands) : VPValue(SC) {
@@ -120,6 +136,11 @@ public:
V->getVPValueID() <= VPInstructionSC;
}
+ void addOperand(VPValue *Operand) {
+ Operands.push_back(Operand);
+ Operand->addUser(*this);
+ }
+
unsigned getNumOperands() const { return Operands.size(); }
inline VPValue *getOperand(unsigned N) const {
assert(N < Operands.size() && "Operand index out of bounds");
diff --git a/lib/Transforms/Vectorize/VPlanVerifier.cpp b/lib/Transforms/Vectorize/VPlanVerifier.cpp
new file mode 100644
index 000000000000..054bed4e177f
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -0,0 +1,133 @@
+//===-- VPlanVerifier.cpp -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the class VPlanVerifier, which contains utility functions
+/// to check the consistency and invariants of a VPlan.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanVerifier.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+
+#define DEBUG_TYPE "loop-vectorize"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableHCFGVerifier("vplan-verify-hcfg", cl::init(false),
+ cl::Hidden,
+ cl::desc("Verify VPlan H-CFG."));
+
+#ifndef NDEBUG
+/// Utility function that checks whether \p VPBlockVec has duplicate
+/// VPBlockBases.
+static bool hasDuplicates(const SmallVectorImpl<VPBlockBase *> &VPBlockVec) {
+ SmallDenseSet<const VPBlockBase *, 8> VPBlockSet;
+ for (const auto *Block : VPBlockVec) {
+ if (VPBlockSet.count(Block))
+ return true;
+ VPBlockSet.insert(Block);
+ }
+ return false;
+}
+#endif
+
+/// Helper function that verifies the CFG invariants of the VPBlockBases within
+/// \p Region. Checks in this function are generic for VPBlockBases. They are
+/// not specific for VPBasicBlocks or VPRegionBlocks.
+static void verifyBlocksInRegion(const VPRegionBlock *Region) {
+ for (const VPBlockBase *VPB :
+ make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+ df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+ // Check block's parent.
+ assert(VPB->getParent() == Region && "VPBlockBase has wrong parent");
+
+ // Check block's condition bit.
+ if (VPB->getNumSuccessors() > 1)
+ assert(VPB->getCondBit() && "Missing condition bit!");
+ else
+ assert(!VPB->getCondBit() && "Unexpected condition bit!");
+
+ // Check block's successors.
+ const auto &Successors = VPB->getSuccessors();
+ // There must be only one instance of a successor in block's successor list.
+ // TODO: This won't work for switch statements.
+ assert(!hasDuplicates(Successors) &&
+ "Multiple instances of the same successor.");
+
+ for (const VPBlockBase *Succ : Successors) {
+ // There must be a bi-directional link between block and successor.
+ const auto &SuccPreds = Succ->getPredecessors();
+ assert(std::find(SuccPreds.begin(), SuccPreds.end(), VPB) !=
+ SuccPreds.end() &&
+ "Missing predecessor link.");
+ (void)SuccPreds;
+ }
+
+ // Check block's predecessors.
+ const auto &Predecessors = VPB->getPredecessors();
+ // There must be only one instance of a predecessor in block's predecessor
+ // list.
+ // TODO: This won't work for switch statements.
+ assert(!hasDuplicates(Predecessors) &&
+ "Multiple instances of the same predecessor.");
+
+ for (const VPBlockBase *Pred : Predecessors) {
+ // Block and predecessor must be inside the same region.
+ assert(Pred->getParent() == VPB->getParent() &&
+ "Predecessor is not in the same region.");
+
+ // There must be a bi-directional link between block and predecessor.
+ const auto &PredSuccs = Pred->getSuccessors();
+ assert(std::find(PredSuccs.begin(), PredSuccs.end(), VPB) !=
+ PredSuccs.end() &&
+ "Missing successor link.");
+ (void)PredSuccs;
+ }
+ }
+}
+
+/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+/// VPBlockBases. Do not recurse inside nested VPRegionBlocks.
+static void verifyRegion(const VPRegionBlock *Region) {
+ const VPBlockBase *Entry = Region->getEntry();
+ const VPBlockBase *Exit = Region->getExit();
+
+ // Entry and Exit shouldn't have any predecessor/successor, respectively.
+ assert(!Entry->getNumPredecessors() && "Region entry has predecessors.");
+ assert(!Exit->getNumSuccessors() && "Region exit has successors.");
+ (void)Entry;
+ (void)Exit;
+
+ verifyBlocksInRegion(Region);
+}
+
+/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+/// VPBlockBases. Recurse inside nested VPRegionBlocks.
+static void verifyRegionRec(const VPRegionBlock *Region) {
+ verifyRegion(Region);
+
+ // Recurse inside nested regions.
+ for (const VPBlockBase *VPB :
+ make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+ df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+ if (const auto *SubRegion = dyn_cast<VPRegionBlock>(VPB))
+ verifyRegionRec(SubRegion);
+ }
+}
+
+void VPlanVerifier::verifyHierarchicalCFG(
+ const VPRegionBlock *TopRegion) const {
+ if (!EnableHCFGVerifier)
+ return;
+
+ LLVM_DEBUG(dbgs() << "Verifying VPlan H-CFG.\n");
+ assert(!TopRegion->getParent() && "VPlan Top Region should have no parent.");
+ verifyRegionRec(TopRegion);
+}
diff --git a/lib/Transforms/Vectorize/VPlanVerifier.h b/lib/Transforms/Vectorize/VPlanVerifier.h
new file mode 100644
index 000000000000..d2f99d006a66
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanVerifier.h
@@ -0,0 +1,44 @@
+//===-- VPlanVerifier.h -----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the class VPlanVerifier, which contains utility functions
+/// to check the consistency of a VPlan. This includes the following kinds of
+/// invariants:
+///
+/// 1. Region/Block invariants:
+/// - Region's entry/exit block must have no predecessors/successors,
+/// respectively.
+/// - Block's parent must be the region immediately containing the block.
+/// - Linked blocks must have a bi-directional link (successor/predecessor).
+/// - All predecessors/successors of a block must belong to the same region.
+/// - Blocks must have no duplicated successor/predecessor.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+
+#include "VPlan.h"
+
+namespace llvm {
+
+/// Class with utility functions that can be used to check the consistency and
+/// invariants of a VPlan, including the components of its H-CFG.
+class VPlanVerifier {
+public:
+ /// Verify the invariants of the H-CFG starting from \p TopRegion. The
+ /// verification process comprises the following steps:
+ /// 1. Region/Block verification: Check the Region/Block verification
+ /// invariants for every region in the H-CFG.
+ void verifyHierarchicalCFG(const VPRegionBlock *TopRegion) const;
+};
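+
+// Illustrative driver (example only): VPlanHCFGBuilder runs the verifier
+// right after building the plain CFG, gated by -vplan-verify-hcfg:
+//   VPlanVerifier Verifier;
+//   Verifier.verifyHierarchicalCFG(TopRegion);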
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index b04905bfc6fa..f62a88558328 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -34,10 +34,6 @@ void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
initializeVectorization(*unwrap(R));
}
-// DEPRECATED: Remove after the LLVM 5 release.
-void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
-}
-
void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopVectorizePass());
}