author    Dimitry Andric <dim@FreeBSD.org>  2016-07-23 20:41:05 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2016-07-23 20:41:05 +0000
commit    01095a5d43bbfde13731688ddcf6048ebb8b7721 (patch)
tree      4def12e759965de927d963ac65840d663ef9d1ea /lib/Transforms/Vectorize/SLPVectorizer.cpp
parent    f0f4822ed4b66e3579e92a89f368f8fb860e218e (diff)
Diffstat (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp  | 1364
1 file changed, 895 insertions(+), 469 deletions(-)
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f69a4e52c7e1..8a3c4d14fecb 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15,21 +15,17 @@
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Vectorize.h"
-#include "llvm/ADT/MapVector.h"
+#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
@@ -44,12 +40,12 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
-#include <map>
#include <memory>
using namespace llvm;
+using namespace slpvectorizer;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
@@ -82,11 +78,11 @@ static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
cl::desc("Limit the size of the SLP scheduling region per block"));
-namespace {
+static cl::opt<int> MinVectorRegSizeOption(
+ "slp-min-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
// FIXME: Set this via cl::opt to allow overriding.
-static const unsigned MinVecRegSize = 128;
-
static const unsigned RecursionMaxDepth = 12;
// Limit the number of alias checks. The limit is chosen so that
@@ -134,8 +130,8 @@ static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
- for (unsigned i = 0, e = VL.size(); i < e; ++i)
- if (!isa<Constant>(VL[i]))
+ for (Value *i : VL)
+ if (!isa<Constant>(i))
return false;
return true;
}
@@ -223,46 +219,6 @@ static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
}
}
-/// \returns \p I after propagating metadata from \p VL.
-static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
- Instruction *I0 = cast<Instruction>(VL[0]);
- SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
- I0->getAllMetadataOtherThanDebugLoc(Metadata);
-
- for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
- unsigned Kind = Metadata[i].first;
- MDNode *MD = Metadata[i].second;
-
- for (int i = 1, e = VL.size(); MD && i != e; i++) {
- Instruction *I = cast<Instruction>(VL[i]);
- MDNode *IMD = I->getMetadata(Kind);
-
- switch (Kind) {
- default:
- MD = nullptr; // Remove unknown metadata
- break;
- case LLVMContext::MD_tbaa:
- MD = MDNode::getMostGenericTBAA(MD, IMD);
- break;
- case LLVMContext::MD_alias_scope:
- MD = MDNode::getMostGenericAliasScope(MD, IMD);
- break;
- case LLVMContext::MD_noalias:
- MD = MDNode::intersect(MD, IMD);
- break;
- case LLVMContext::MD_fpmath:
- MD = MDNode::getMostGenericFPMath(MD, IMD);
- break;
- case LLVMContext::MD_nontemporal:
- MD = MDNode::intersect(MD, IMD);
- break;
- }
- }
- I->setMetadata(Kind, MD);
- }
- return I;
-}
-
/// \returns The type that all of the values in \p VL have or null if there
/// are different types.
static Type* getSameType(ArrayRef<Value *> VL) {
@@ -274,36 +230,17 @@ static Type* getSameType(ArrayRef<Value *> VL) {
return Ty;
}
-/// \returns True if the ExtractElement instructions in VL can be vectorized
-/// to use the original vector.
-static bool CanReuseExtract(ArrayRef<Value *> VL) {
- assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
- // Check if all of the extracts come from the same vector and from the
- // correct offset.
- Value *VL0 = VL[0];
- ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
- Value *Vec = E0->getOperand(0);
-
- // We have to extract from the same vector type.
- unsigned NElts = Vec->getType()->getVectorNumElements();
-
- if (NElts != VL.size())
- return false;
-
- // Check that all of the indices extract from the correct offset.
- ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
- if (!CI || CI->getZExtValue())
- return false;
-
- for (unsigned i = 1, e = VL.size(); i < e; ++i) {
- ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+/// \returns True if Extract{Value,Element} instruction extracts element Idx.
+static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
+ assert(Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue);
+ if (Opcode == Instruction::ExtractElement) {
ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
-
- if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
- return false;
+ return CI && CI->getZExtValue() == Idx;
+ } else {
+ ExtractValueInst *EI = cast<ExtractValueInst>(E);
+ return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
}
-
- return true;
}
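An illustration of what this helper accepts (hypothetical IR and values, not part of the patch):

    // IR sketch:
    //   %e = extractelement <4 x float> %v, i32 1
    //   %s = extractvalue [4 x float] %agg, 1
    // Both calls below return true; the ExtractElement path compares the
    // constant index operand, the ExtractValue path the single-entry index
    // list:
    matchExtractIndex(cast<Instruction>(E), /*Idx=*/1, Instruction::ExtractElement);
    matchExtractIndex(cast<Instruction>(S), /*Idx=*/1, Instruction::ExtractValue);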
/// \returns True if in-tree use also needs extract. This refers to
@@ -323,7 +260,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(UserInst);
- Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (hasVectorInstrinsicScalarOpd(ID, 1)) {
return (CI->getArgOperand(1) == Scalar);
}
@@ -353,6 +290,8 @@ static bool isSimple(Instruction *I) {
return true;
}
+namespace llvm {
+namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
@@ -363,11 +302,24 @@ public:
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
- DominatorTree *Dt, AssumptionCache *AC)
+ DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
+ const DataLayout *DL)
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
- SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
- Builder(Se->getContext()) {
+ SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
+ DL(DL), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
+ // Use the vector register size specified by the target unless overridden
+ // by a command-line option.
+ // TODO: It would be better to limit the vectorization factor based on
+ // data type rather than just register size. For example, x86 AVX has
+ // 256-bit registers, but it does not support integer operations
+ // at that width (that requires AVX2).
+ if (MaxVectorRegSizeOption.getNumOccurrences())
+ MaxVecRegSize = MaxVectorRegSizeOption;
+ else
+ MaxVecRegSize = TTI->getRegisterBitWidth(true);
+
+ MinVecRegSize = MinVectorRegSizeOption;
}
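A usage sketch for the two limits (assuming the pre-existing MaxVectorRegSizeOption, referenced above, is registered as -slp-max-reg-size, mirroring the new -slp-min-reg-size option):

    // e.g. opt -slp-vectorizer -slp-max-reg-size=256 -slp-min-reg-size=128
    // Inside the pass, the accessors declared below then report:
    //   getMaxVecRegSize() == 256  // flag wins over TTI->getRegisterBitWidth(true)
    //   getMinVecRegSize() == 128  // always taken from the cl::opt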
/// \brief Vectorize the tree that starts with the elements in \p VL.
@@ -399,11 +351,9 @@ public:
BlockScheduling *BS = Iter.second.get();
BS->clear();
}
+ MinBWs.clear();
}
- /// \returns true if the memory operations A and B are consecutive.
- bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL);
-
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
@@ -412,6 +362,32 @@ public:
return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
}
+ /// \return The vector element size in bits to use when vectorizing the
+ /// expression tree ending at \p V. If V is a store, the size is the width of
+ /// the stored value. Otherwise, the size is the width of the largest loaded
+ /// value reaching V. This method is used by the vectorizer to calculate
+ /// vectorization factors.
+ unsigned getVectorElementSize(Value *V);
+
+ /// Compute the minimum type sizes required to represent the entries in a
+ /// vectorizable tree.
+ void computeMinimumValueSizes();
+
+ // \returns maximum vector register size as set by TTI or overridden by cl::opt.
+ unsigned getMaxVecRegSize() const {
+ return MaxVecRegSize;
+ }
+
+ // \returns minimum vector register size as set by cl::opt.
+ unsigned getMinVecRegSize() const {
+ return MinVecRegSize;
+ }
+
+ /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
+ ///
+ /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
+ unsigned canMapToVector(Type *T, const DataLayout &DL) const;
+
private:
struct TreeEntry;
@@ -421,6 +397,10 @@ private:
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
+ /// \returns True if the ExtractElement/ExtractValue instructions in VL can
+ /// be vectorized to use the original vector (or aggregate "bitcast" to a
+ /// vector).
+ bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
+
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
@@ -431,14 +411,6 @@ private:
/// vectorized, or NULL. They may happen in cycles.
Value *alreadyVectorized(ArrayRef<Value *> VL) const;
- /// \brief Take the pointer operand from the Load/Store instruction.
- /// \returns NULL if this is not a valid Load/Store instruction.
- static Value *getPointerOperand(Value *I);
-
- /// \brief Take the address space operand from the Load/Store instruction.
- /// \returns -1 if this is not a valid Load/Store instruction.
- static unsigned getAddressSpaceOperand(Value *I);
-
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
int getGatherCost(Type *Ty);
@@ -719,8 +691,11 @@ private:
};
#ifndef NDEBUG
- friend raw_ostream &operator<<(raw_ostream &os,
- const BoUpSLP::ScheduleData &SD);
+ friend inline raw_ostream &operator<<(raw_ostream &os,
+ const BoUpSLP::ScheduleData &SD) {
+ SD.dump(os);
+ return os;
+ }
#endif
/// Contains all scheduling data for a basic block.
@@ -917,16 +892,21 @@ private:
AliasAnalysis *AA;
LoopInfo *LI;
DominatorTree *DT;
+ AssumptionCache *AC;
+ DemandedBits *DB;
+ const DataLayout *DL;
+ unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
+ unsigned MinVecRegSize; // Set by cl::opt (default: 128).
/// Instruction builder to construct the vectorized tree.
IRBuilder<> Builder;
+
+ /// A map of scalar integer values to the smallest bit width with which they
+ /// can legally be represented.
+ MapVector<Value *, uint64_t> MinBWs;
};
-#ifndef NDEBUG
-raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
- SD.dump(os);
- return os;
-}
-#endif
+} // end namespace slpvectorizer
+} // end namespace llvm
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) {
@@ -937,8 +917,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
buildTree_rec(Roots, 0);
// Collect the values that we need to extract from the tree.
- for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
- TreeEntry *Entry = &VectorizableTree[EIdx];
+ for (TreeEntry &EIdx : VectorizableTree) {
+ TreeEntry *Entry = &EIdx;
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
@@ -987,7 +967,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
- bool SameTy = getSameType(VL); (void)SameTy;
+ bool SameTy = allConstant(VL) || getSameType(VL); (void)SameTy;
bool isAltShuffle = false;
assert(SameTy && "Invalid types!");
@@ -1138,16 +1118,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
+ for (Value *j : VL)
+ Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
buildTree_rec(Operands, Depth + 1);
}
return;
}
+ case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- bool Reuse = CanReuseExtract(VL);
+ bool Reuse = canReuseExtract(VL, Opcode);
if (Reuse) {
DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
} else {
@@ -1164,11 +1145,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// loading/storing it as an i8 struct. If we vectorize loads/stores from
// such a struct we read/write packed bits disagreeing with the
// unvectorized version.
- const DataLayout &DL = F->getParent()->getDataLayout();
Type *ScalarTy = VL[0]->getType();
- if (DL.getTypeSizeInBits(ScalarTy) !=
- DL.getTypeAllocSizeInBits(ScalarTy)) {
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
@@ -1184,8 +1164,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
return;
}
- if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
- if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
+ if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], *DL, *SE)) {
++NumLoadsWantToChangeOrder;
}
BS.cancelScheduling(VL);
@@ -1227,8 +1207,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth+1);
}
@@ -1256,8 +1236,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth+1);
}
@@ -1298,8 +1278,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth+1);
}
@@ -1346,18 +1326,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth + 1);
}
return;
}
case Instruction::Store: {
- const DataLayout &DL = F->getParent()->getDataLayout();
// Check if the stores are consecutive or if we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
- if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
@@ -1368,8 +1347,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(0));
buildTree_rec(Operands, Depth + 1);
return;
@@ -1379,7 +1358,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
CallInst *CI = cast<CallInst>(VL[0]);
// Check if this is an Intrinsic call or something that can be
// represented by an intrinsic call
- Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
@@ -1393,7 +1372,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
if (!CI2 || CI2->getCalledFunction() != Int ||
- getIntrinsicIDForCall(CI2, TLI) != ID) {
+ getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
+ !CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
@@ -1413,14 +1393,25 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
return;
}
}
+ // Verify that the bundle operands are identical between the two calls.
+ if (CI->hasOperandBundles() &&
+ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
+ CI->op_begin() + CI->getBundleOperandsEndIndex(),
+ CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
+ << *VL[i] << '\n');
+ return;
+ }
}
newTreeEntry(VL, true);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j) {
- CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
+ for (Value *j : VL) {
+ CallInst *CI2 = dyn_cast<CallInst>(j);
Operands.push_back(CI2->getArgOperand(i));
}
buildTree_rec(Operands, Depth + 1);
@@ -1451,8 +1442,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth + 1);
}
@@ -1466,6 +1457,74 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
}
+unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
+ unsigned N;
+ Type *EltTy;
+ auto *ST = dyn_cast<StructType>(T);
+ if (ST) {
+ N = ST->getNumElements();
+ EltTy = *ST->element_begin();
+ } else {
+ N = cast<ArrayType>(T)->getNumElements();
+ EltTy = cast<ArrayType>(T)->getElementType();
+ }
+ if (!isValidElementType(EltTy))
+ return 0;
+ uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
+ if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
+     VTSize != DL.getTypeStoreSizeInBits(T))
+ return 0;
+ if (ST) {
+ // Check that struct is homogeneous.
+ for (const auto *Ty : ST->elements())
+ if (Ty != EltTy)
+ return 0;
+ }
+ return N;
+}
+
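For example (a hedged sketch with made-up types, assuming MinVecRegSize <= 128 <= MaxVecRegSize):

    // canMapToVector({ float, float, float, float }, DL) == 4
    //   (homogeneous, 128-bit, maps to <4 x float>)
    // canMapToVector({ float, i32, float, i32 }, DL)    == 0
    //   (right size, but the struct is not homogeneous)
    // canMapToVector([3 x i8], DL)                      == 0
    //   (24 bits < MinVecRegSize)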
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
+ assert(Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue);
+ assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
+ // Check if all of the extracts come from the same vector and from the
+ // correct offset.
+ Value *VL0 = VL[0];
+ Instruction *E0 = cast<Instruction>(VL0);
+ Value *Vec = E0->getOperand(0);
+
+ // We have to extract from a vector/aggregate with the same number of
+ // elements.
+ unsigned NElts;
+ if (Opcode == Instruction::ExtractValue) {
+ const DataLayout &DL = E0->getModule()->getDataLayout();
+ NElts = canMapToVector(Vec->getType(), DL);
+ if (!NElts)
+ return false;
+ // Check if load can be rewritten as load of vector.
+ LoadInst *LI = dyn_cast<LoadInst>(Vec);
+ if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
+ return false;
+ } else {
+ NElts = Vec->getType()->getVectorNumElements();
+ }
+
+ if (NElts != VL.size())
+ return false;
+
+ // Check that all of the indices extract from the correct offset.
+ if (!matchExtractIndex(E0, 0, Opcode))
+ return false;
+
+ for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+ Instruction *E = cast<Instruction>(VL[i]);
+ if (!matchExtractIndex(E, i, Opcode))
+ return false;
+ if (E->getOperand(0) != Vec)
+ return false;
+ }
+
+ return true;
+}
+
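A bundle the new ExtractValue path accepts (hypothetical IR, not from the patch): the aggregate load is simple, has exactly VL.size() uses, and the extracts read indices 0..N-1 in order:

    // %agg = load { i32, i32 }, { i32, i32 }* %p   ; exactly two uses
    // %a   = extractvalue { i32, i32 } %agg, 0
    // %b   = extractvalue { i32, i32 } %agg, 1
    // canReuseExtract({%a, %b}, Instruction::ExtractValue) -> true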
int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
@@ -1474,6 +1533,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ // If we have computed a smaller type for the expression, update VecTy so
+ // that the costs will be accurate.
+ if (MinBWs.count(VL[0]))
+ VecTy = VectorType::get(IntegerType::get(F->getContext(), MinBWs[VL[0]]),
+ VL.size());
+
if (E->NeedToGather) {
if (allConstant(VL))
return 0;
@@ -1489,11 +1554,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::PHI: {
return 0;
}
+ case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- if (CanReuseExtract(VL)) {
+ if (canReuseExtract(VL, Opcode)) {
int DeadCost = 0;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+ Instruction *E = cast<Instruction>(VL[i]);
if (E->hasOneUse())
// Take credit for instruction that will become dead.
DeadCost +=
@@ -1527,7 +1593,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
case Instruction::FCmp:
case Instruction::ICmp:
- case Instruction::Select:
+ case Instruction::Select: {
+ // Calculate the cost of this instruction.
+ VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
+ int ScalarCost = VecTy->getNumElements() *
+ TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
+ int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
+ return VecCost - ScalarCost;
+ }
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -1546,59 +1619,48 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- // Calculate the cost of this instruction.
- int ScalarCost = 0;
- int VecCost = 0;
- if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
- Opcode == Instruction::Select) {
- VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
- ScalarCost = VecTy->getNumElements() *
- TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
- VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
- } else {
- // Certain instructions can be cheaper to vectorize if they have a
- // constant second vector operand.
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_UniformConstantValue;
- TargetTransformInfo::OperandValueProperties Op1VP =
- TargetTransformInfo::OP_None;
- TargetTransformInfo::OperandValueProperties Op2VP =
- TargetTransformInfo::OP_None;
-
- // If all operands are exactly the same ConstantInt then set the
- // operand kind to OK_UniformConstantValue.
- // If instead not all operands are constants, then set the operand kind
- // to OK_AnyValue. If all operands are constants but not the same,
- // then set the operand kind to OK_NonUniformConstantValue.
- ConstantInt *CInt = nullptr;
- for (unsigned i = 0; i < VL.size(); ++i) {
- const Instruction *I = cast<Instruction>(VL[i]);
- if (!isa<ConstantInt>(I->getOperand(1))) {
- Op2VK = TargetTransformInfo::OK_AnyValue;
- break;
- }
- if (i == 0) {
- CInt = cast<ConstantInt>(I->getOperand(1));
- continue;
- }
- if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
- CInt != cast<ConstantInt>(I->getOperand(1)))
- Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+ // Certain instructions can be cheaper to vectorize if they have a
+ // constant second vector operand.
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+ TargetTransformInfo::OperandValueProperties Op1VP =
+ TargetTransformInfo::OP_None;
+ TargetTransformInfo::OperandValueProperties Op2VP =
+ TargetTransformInfo::OP_None;
+
+ // If all operands are exactly the same ConstantInt then set the
+ // operand kind to OK_UniformConstantValue.
+ // If instead not all operands are constants, then set the operand kind
+ // to OK_AnyValue. If all operands are constants but not the same,
+ // then set the operand kind to OK_NonUniformConstantValue.
+ ConstantInt *CInt = nullptr;
+ for (unsigned i = 0; i < VL.size(); ++i) {
+ const Instruction *I = cast<Instruction>(VL[i]);
+ if (!isa<ConstantInt>(I->getOperand(1))) {
+ Op2VK = TargetTransformInfo::OK_AnyValue;
+ break;
+ }
+ if (i == 0) {
+ CInt = cast<ConstantInt>(I->getOperand(1));
+ continue;
}
- // FIXME: Currently cost of model modification for division by
- // power of 2 is handled only for X86. Add support for other targets.
- if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
- CInt->getValue().isPowerOf2())
- Op2VP = TargetTransformInfo::OP_PowerOf2;
-
- ScalarCost = VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
- Op1VP, Op2VP);
- VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
- Op1VP, Op2VP);
+ if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+ CInt != cast<ConstantInt>(I->getOperand(1)))
+ Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
}
+ // FIXME: Currently the cost-model modification for division by a power
+ // of 2 is handled only for X86 and AArch64. Add support for other targets.
+ if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
+ CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_PowerOf2;
+
+ int ScalarCost = VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
+ Op2VK, Op1VP, Op2VP);
+ int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
+ Op1VP, Op2VP);
return VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
@@ -1617,21 +1679,25 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
+ unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
int ScalarLdCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
- int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
+ int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
+ VecTy, alignment, 0);
return VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
+ unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
int ScalarStCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
- int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
+ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
+ int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
+ VecTy, alignment, 0);
return VecStCost - ScalarStCost;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
- Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
SmallVector<Type*, 4> ScalarTys, VecTys;
@@ -1641,10 +1707,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
VecTy->getNumElements()));
}
+ FastMathFlags FMF;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+ FMF = FPMO->getFastMathFlags();
+
int ScalarCallCost = VecTy->getNumElements() *
- TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
+ TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
- int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
+ int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
@@ -1659,8 +1729,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OK_AnyValue;
int ScalarCost = 0;
int VecCost = 0;
- for (unsigned i = 0; i < VL.size(); ++i) {
- Instruction *I = cast<Instruction>(VL[i]);
+ for (Value *i : VL) {
+ Instruction *I = cast<Instruction>(i);
if (!I)
break;
ScalarCost +=
@@ -1715,8 +1785,8 @@ int BoUpSLP::getSpillCost() {
SmallPtrSet<Instruction*, 4> LiveValues;
Instruction *PrevInst = nullptr;
- for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
- Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
+ for (const auto &N : VectorizableTree) {
+ Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
if (!Inst)
continue;
@@ -1725,6 +1795,13 @@ int BoUpSLP::getSpillCost() {
continue;
}
+ // Update LiveValues.
+ LiveValues.erase(PrevInst);
+ for (auto &J : PrevInst->operands()) {
+ if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
+ LiveValues.insert(cast<Instruction>(&*J));
+ }
+
DEBUG(
dbgs() << "SLP: #LV: " << LiveValues.size();
for (auto *X : LiveValues)
@@ -1733,13 +1810,6 @@ int BoUpSLP::getSpillCost() {
Inst->dump();
);
- // Update LiveValues.
- LiveValues.erase(PrevInst);
- for (auto &J : PrevInst->operands()) {
- if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
- LiveValues.insert(cast<Instruction>(&*J));
- }
-
// Now find the sequence of instructions between PrevInst and Inst.
BasicBlock::reverse_iterator InstIt(Inst->getIterator()),
PrevInstIt(PrevInst->getIterator());
@@ -1763,7 +1833,6 @@ int BoUpSLP::getSpillCost() {
PrevInst = Inst;
}
- DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
return Cost;
}
@@ -1785,7 +1854,7 @@ int BoUpSLP::getTreeCost() {
for (TreeEntry &TE : VectorizableTree) {
int C = getEntryCost(&TE);
DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
- << TE.Scalars[0] << " .\n");
+ << *TE.Scalars[0] << ".\n");
Cost += C;
}
@@ -1802,15 +1871,29 @@ int BoUpSLP::getTreeCost() {
if (EphValues.count(EU.User))
continue;
- VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
- ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
- EU.Lane);
+ // If we plan to rewrite the tree in a smaller type, we will need to sign
+ // extend the extracted value back to the original type. Here, we account
+ // for the extract and the added cost of the sign extend if needed.
+ auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+ auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+ if (MinBWs.count(ScalarRoot)) {
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
+ VecTy = VectorType::get(MinTy, BundleWidth);
+ ExtractCost += TTI->getExtractWithExtendCost(
+ Instruction::SExt, EU.Scalar->getType(), VecTy, EU.Lane);
+ } else {
+ ExtractCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+ }
}
- Cost += getSpillCost();
+ int SpillCost = getSpillCost();
+ Cost += SpillCost + ExtractCost;
- DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
- return Cost + ExtractCost;
+ DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n"
+ << "SLP: Extract Cost = " << ExtractCost << ".\n"
+ << "SLP: Total Cost = " << Cost << ".\n");
+ return Cost;
}
int BoUpSLP::getGatherCost(Type *Ty) {
@@ -1830,63 +1913,6 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
return getGatherCost(VecTy);
}
-Value *BoUpSLP::getPointerOperand(Value *I) {
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->getPointerOperand();
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->getPointerOperand();
- return nullptr;
-}
-
-unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
- if (LoadInst *L = dyn_cast<LoadInst>(I))
- return L->getPointerAddressSpace();
- if (StoreInst *S = dyn_cast<StoreInst>(I))
- return S->getPointerAddressSpace();
- return -1;
-}
-
-bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
- Value *PtrA = getPointerOperand(A);
- Value *PtrB = getPointerOperand(B);
- unsigned ASA = getAddressSpaceOperand(A);
- unsigned ASB = getAddressSpaceOperand(B);
-
- // Check that the address spaces match and that the pointers are valid.
- if (!PtrA || !PtrB || (ASA != ASB))
- return false;
-
- // Make sure that A and B are different pointers of the same type.
- if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
- return false;
-
- unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
- Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
- APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
-
- APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
- PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
- PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
-
- APInt OffsetDelta = OffsetB - OffsetA;
-
- // Check if they are based on the same pointer. That makes the offsets
- // sufficient.
- if (PtrA == PtrB)
- return OffsetDelta == Size;
-
- // Compute the necessary base pointer delta to have the necessary final delta
- // equal to the size.
- APInt BaseDelta = Size - OffsetDelta;
-
- // Otherwise compute the distance with SCEV between the base pointers.
- const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
- const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
- const SCEV *C = SE->getConstant(BaseDelta);
- const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
- return X == PtrSCEVB;
-}
-
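The removed helpers are not lost: the call sites above now use the shared llvm::isConsecutiveAccess from llvm/Analysis/LoopAccessAnalysis.h (included at the top of this patch), which takes the DataLayout and ScalarEvolution explicitly. A minimal sketch of the new call shape:

    // as in the Load and Store cases of buildTree_rec above:
    if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE))
      BS.cancelScheduling(VL);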
// Reorder commutative operations in alternate shuffle if the resulting vectors
// are consecutive loads. This would allow us to vectorize the tree.
// If we have something like-
@@ -1899,12 +1925,10 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
- const DataLayout &DL = F->getParent()->getDataLayout();
-
// Push left and right operands of binary operation into Left and Right
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
- Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
+ for (Value *i : VL) {
+ Left.push_back(cast<Instruction>(i)->getOperand(0));
+ Right.push_back(cast<Instruction>(i)->getOperand(1));
}
// Reorder if we have a commutative operation and consecutive access
@@ -1914,10 +1938,11 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
Instruction *VL1 = cast<Instruction>(VL[j]);
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
- if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
+ if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j], Right[j]);
continue;
- } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
+ } else if (VL2->isCommutative() &&
+ isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -1928,10 +1953,11 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
Instruction *VL1 = cast<Instruction>(VL[j]);
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
- if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
+ if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j], Right[j]);
continue;
- } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
+ } else if (VL2->isCommutative() &&
+ isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -2061,8 +2087,6 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
if (SplatRight || SplatLeft)
return;
- const DataLayout &DL = F->getParent()->getDataLayout();
-
// Finally check if we can get longer vectorizable chain by reordering
// without breaking the good operand order detected above.
// E.g. If we have something like-
@@ -2081,7 +2105,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
for (unsigned j = 0; j < VL.size() - 1; ++j) {
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
- if (isConsecutiveAccess(L, L1, DL)) {
+ if (isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -2089,7 +2113,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
}
if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
- if (isConsecutiveAccess(L, L1, DL)) {
+ if (isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -2185,7 +2209,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return Gather(E->Scalars, VecTy);
}
- const DataLayout &DL = F->getParent()->getDataLayout();
unsigned Opcode = getSameOpcode(E->Scalars);
switch (Opcode) {
@@ -2225,13 +2248,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::ExtractElement: {
- if (CanReuseExtract(E->Scalars)) {
+ if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
Value *V = VL0->getOperand(0);
E->VectorizedValue = V;
return V;
}
return Gather(E->Scalars, VecTy);
}
+ case Instruction::ExtractValue: {
+ if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
+ LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
+ Builder.SetInsertPoint(LI);
+ PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
+ Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+ LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
+ E->VectorizedValue = V;
+ return propagateMetadata(V, E->Scalars);
+ }
+ return Gather(E->Scalars, VecTy);
+ }
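Continuing the hypothetical IR from canReuseExtract (a sketch of the emitted code, not literal output): the aggregate load is rewritten as a vector load and each extractvalue becomes a lane of it:

    // %vp = bitcast { i32, i32 }* %p to <2 x i32>*
    // %v  = load <2 x i32>, <2 x i32>* %vp, align 4  ; alignment copied from LI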
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -2382,7 +2417,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
unsigned Alignment = LI->getAlignment();
LI = Builder.CreateLoad(VecPtr);
if (!Alignment) {
- Alignment = DL.getABITypeAlignment(ScalarLoadTy);
+ Alignment = DL->getABITypeAlignment(ScalarLoadTy);
}
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
@@ -2413,7 +2448,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
if (!Alignment) {
- Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType());
+ Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
}
S->setAlignment(Alignment);
E->VectorizedValue = S;
@@ -2481,10 +2516,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Module *M = F->getParent();
- Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
- Value *V = Builder.CreateCall(CF, OpVecs);
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
// The scalar argument uses an in-tree scalar so we add the new vectorized
// call to ExternalUses list to make sure that an extract will be
@@ -2559,15 +2596,28 @@ Value *BoUpSLP::vectorizeTree() {
}
Builder.SetInsertPoint(&F->getEntryBlock().front());
- vectorizeTree(&VectorizableTree[0]);
+ auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
+
+ // If the vectorized tree can be rewritten in a smaller type, we truncate the
+ // vectorized root. InstCombine will then rewrite the entire expression. We
+ // sign extend the extracted values below.
+ auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+ if (MinBWs.count(ScalarRoot)) {
+ if (auto *I = dyn_cast<Instruction>(VectorRoot))
+ Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
+ auto BundleWidth = VectorizableTree[0].Scalars.size();
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
+ auto *VecTy = VectorType::get(MinTy, BundleWidth);
+ auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
+ VectorizableTree[0].VectorizedValue = Trunc;
+ }
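For a bundle of four i32 scalars with MinBWs[ScalarRoot] == 8 (hypothetical numbers), this emits roughly:

    // %small = trunc <4 x i32> %vec to <4 x i8>
    // External users then read lanes of %small and sign-extend back to i32
    // (see the CreateSExt calls below); single-use interior values are left
    // for InstCombine to narrow.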
DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
// Extract all of the elements with the external uses.
- for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
- it != e; ++it) {
- Value *Scalar = it->Scalar;
- llvm::User *User = it->User;
+ for (const auto &ExternalUse : ExternalUses) {
+ Value *Scalar = ExternalUse.Scalar;
+ llvm::User *User = ExternalUse.User;
// Skip users that we already RAUW. This happens when one instruction
// has multiple uses of the same value.
@@ -2583,15 +2633,24 @@ Value *BoUpSLP::vectorizeTree() {
Value *Vec = E->VectorizedValue;
assert(Vec && "Can't find vectorizable value");
- Value *Lane = Builder.getInt32(it->Lane);
+ Value *Lane = Builder.getInt32(ExternalUse.Lane);
// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.
- if (isa<Instruction>(Vec)){
+ if (auto *VecI = dyn_cast<Instruction>(Vec)) {
if (PHINode *PH = dyn_cast<PHINode>(User)) {
for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
if (PH->getIncomingValue(i) == Scalar) {
- Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
+ TerminatorInst *IncomingTerminator =
+ PH->getIncomingBlock(i)->getTerminator();
+ if (isa<CatchSwitchInst>(IncomingTerminator)) {
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
+ } else {
+ Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
+ }
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ if (MinBWs.count(ScalarRoot))
+ Ex = Builder.CreateSExt(Ex, Scalar->getType());
CSEBlocks.insert(PH->getIncomingBlock(i));
PH->setOperand(i, Ex);
}
@@ -2599,12 +2658,16 @@ Value *BoUpSLP::vectorizeTree() {
} else {
Builder.SetInsertPoint(cast<Instruction>(User));
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ if (MinBWs.count(ScalarRoot))
+ Ex = Builder.CreateSExt(Ex, Scalar->getType());
CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);
}
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ if (MinBWs.count(ScalarRoot))
+ Ex = Builder.CreateSExt(Ex, Scalar->getType());
CSEBlocks.insert(&F->getEntryBlock());
User->replaceUsesOfWith(Scalar, Ex);
}
@@ -2613,8 +2676,8 @@ Value *BoUpSLP::vectorizeTree() {
}
// For each vectorized value:
- for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
- TreeEntry *Entry = &VectorizableTree[EIdx];
+ for (TreeEntry &EIdx : VectorizableTree) {
+ TreeEntry *Entry = &EIdx;
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
@@ -2655,9 +2718,8 @@ void BoUpSLP::optimizeGatherSequence() {
DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
<< " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
- for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
- e = GatherSeq.end(); it != e; ++it) {
- InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
+ for (Instruction *it : GatherSeq) {
+ InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
if (!Insert)
continue;
@@ -2718,12 +2780,10 @@ void BoUpSLP::optimizeGatherSequence() {
// Check if we can replace this instruction with any of the
// visited instructions.
- for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
- ve = Visited.end();
- v != ve; ++v) {
- if (In->isIdenticalTo(*v) &&
- DT->dominates((*v)->getParent(), In->getParent())) {
- In->replaceAllUsesWith(*v);
+ for (Instruction *v : Visited) {
+ if (In->isIdenticalTo(v) &&
+ DT->dominates(v->getParent(), In->getParent())) {
+ In->replaceAllUsesWith(v);
eraseInstruction(In);
In = nullptr;
break;
@@ -3139,90 +3199,265 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
BS->ScheduleStart = nullptr;
}
-/// The SLPVectorizer Pass.
-struct SLPVectorizer : public FunctionPass {
- typedef SmallVector<StoreInst *, 8> StoreList;
- typedef MapVector<Value *, StoreList> StoreListMap;
+unsigned BoUpSLP::getVectorElementSize(Value *V) {
+ // If V is a store, just return the width of the stored value without
+ // traversing the expression tree. This is the common case.
+ if (auto *Store = dyn_cast<StoreInst>(V))
+ return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+
+ // If V is not a store, we can traverse the expression tree to find loads
+ // that feed it. The type of the loaded value may indicate a more suitable
+ // width than V's type. We want to base the vector element size on the width
+ // of memory operations where possible.
+ SmallVector<Instruction *, 16> Worklist;
+ SmallPtrSet<Instruction *, 16> Visited;
+ if (auto *I = dyn_cast<Instruction>(V))
+ Worklist.push_back(I);
+
+ // Traverse the expression tree in bottom-up order looking for loads. If we
+ // encounter an instruction we don't yet handle, we give up.
+ auto MaxWidth = 0u;
+ auto FoundUnknownInst = false;
+ while (!Worklist.empty() && !FoundUnknownInst) {
+ auto *I = Worklist.pop_back_val();
+ Visited.insert(I);
+
+ // We should only be looking at scalar instructions here. If the current
+ // instruction has a vector type, give up.
+ auto *Ty = I->getType();
+ if (isa<VectorType>(Ty))
+ FoundUnknownInst = true;
+
+ // If the current instruction is a load, update MaxWidth to reflect the
+ // width of the loaded value.
+ else if (isa<LoadInst>(I))
+ MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
+
+ // Otherwise, we need to visit the operands of the instruction. We only
+ // handle the interesting cases from buildTree here. If an operand is an
+ // instruction we haven't yet visited, we add it to the worklist.
+ else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
+ isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get()))
+ if (!Visited.count(J))
+ Worklist.push_back(J);
+ }
- /// Pass identification, replacement for typeid
- static char ID;
+ // If we don't yet handle the instruction, give up.
+ else
+ FoundUnknownInst = true;
+ }
- explicit SLPVectorizer() : FunctionPass(ID) {
- initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
+ // If we didn't encounter a memory access in the expression tree, or if we
+ // gave up for some reason, just return the width of V.
+ if (!MaxWidth || FoundUnknownInst)
+ return DL->getTypeSizeInBits(V->getType());
+
+ // Otherwise, return the maximum width we found.
+ return MaxWidth;
+}
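An illustrative trace (hypothetical IR, not from the patch):

    // store i32 %x, i32* %p    ; getVectorElementSize(store) == 32 (store case)
    // %a = load i8, i8* %q
    // %b = zext i8 %a to i32
    // %c = add i32 %b, 1       ; getVectorElementSize(%c) == 8: the bottom-up
    //                          ; walk reaches the i8 load, so MaxWidth is 8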
+
+// Determine if a value V in a vectorizable expression Expr can be demoted to a
+// smaller type with a truncation. We collect the values that will be demoted
+// in ToDemote and additional roots that require investigating in Roots.
+static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
+ SmallVectorImpl<Value *> &ToDemote,
+ SmallVectorImpl<Value *> &Roots) {
+
+ // We can always demote constants.
+ if (isa<Constant>(V)) {
+ ToDemote.push_back(V);
+ return true;
}
- ScalarEvolution *SE;
- TargetTransformInfo *TTI;
- TargetLibraryInfo *TLI;
- AliasAnalysis *AA;
- LoopInfo *LI;
- DominatorTree *DT;
- AssumptionCache *AC;
+ // If the value is not an instruction in the expression with only one use, it
+ // cannot be demoted.
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I || !I->hasOneUse() || !Expr.count(I))
+ return false;
- bool runOnFunction(Function &F) override {
- if (skipOptnoneFunction(F))
+ switch (I->getOpcode()) {
+
+ // We can always demote truncations and extensions. Since truncations can
+ // seed additional demotion, we save the truncated value.
+ case Instruction::Trunc:
+ Roots.push_back(I->getOperand(0));
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ break;
+
+ // We can demote certain binary operations if we can demote both of their
+ // operands.
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
+ !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
return false;
+ break;
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- TLI = TLIP ? &TLIP->getTLI() : nullptr;
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
-
- StoreRefs.clear();
- bool Changed = false;
-
- // If the target claims to have no vector registers don't attempt
- // vectorization.
- if (!TTI->getNumberOfRegisters(true))
+ // We can demote selects if we can demote their true and false values.
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
+ !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
return false;
+ break;
+ }
- // Use the vector register size specified by the target unless overridden
- // by a command-line option.
- // TODO: It would be better to limit the vectorization factor based on
- // data type rather than just register size. For example, x86 AVX has
- // 256-bit registers, but it does not support integer operations
- // at that width (that requires AVX2).
- if (MaxVectorRegSizeOption.getNumOccurrences())
- MaxVecRegSize = MaxVectorRegSizeOption;
- else
- MaxVecRegSize = TTI->getRegisterBitWidth(true);
+ // We can demote phis if we can demote all their incoming operands. Note that
+ // we don't need to worry about cycles since we ensure single use above.
+ case Instruction::PHI: {
+ PHINode *PN = cast<PHINode>(I);
+ for (Value *IncValue : PN->incoming_values())
+ if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
+ return false;
+ break;
+ }
- // Don't vectorize when the attribute NoImplicitFloat is used.
- if (F.hasFnAttribute(Attribute::NoImplicitFloat))
- return false;
+ // Otherwise, conservatively give up.
+ default:
+ return false;
+ }
- DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
+ // Record the value that we can demote.
+ ToDemote.push_back(V);
+ return true;
+}
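A chain this accepts (hypothetical, all values single-use and inside Expr):

    // %t = trunc i64 %w to i32   ; demotable; %w is pushed onto Roots
    // %s = sext i8 %a to i32     ; demotable
    // %r = add i32 %t, %s        ; demotable because both operands are
    // A udiv or lshr anywhere in the chain hits the default case -> false.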
- // Use the bottom up slp vectorizer to construct chains that start with
- // store instructions.
- BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
+void BoUpSLP::computeMinimumValueSizes() {
+ // If there are no external uses, the expression tree must be rooted by a
+ // store. We can't demote in-memory values, so there is nothing to do here.
+ if (ExternalUses.empty())
+ return;
- // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
- // delete instructions.
+ // We only attempt to truncate integer expressions.
+ auto &TreeRoot = VectorizableTree[0].Scalars;
+ auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
+ if (!TreeRootIT)
+ return;
- // Scan the blocks in the function in post order.
- for (auto BB : post_order(&F.getEntryBlock())) {
- // Vectorize trees that end at stores.
- if (unsigned count = collectStores(BB, R)) {
- (void)count;
- DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
- Changed |= vectorizeStoreChains(R);
- }
+ // If the expression is not rooted by a store, these roots should have
+ // external uses. We will rely on InstCombine to rewrite the expression in
+ // the narrower type. However, InstCombine only rewrites single-use values.
+ // This means that if a tree entry other than a root is used externally, it
+ // must have multiple uses and InstCombine will not rewrite it. The code
+ // below ensures that only the roots are used externally.
+ SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
+ for (auto &EU : ExternalUses)
+ if (!Expr.erase(EU.Scalar))
+ return;
+ if (!Expr.empty())
+ return;
- // Vectorize trees that end at reductions.
- Changed |= vectorizeChainsInBlock(BB, R);
- }
+ // Collect the scalar values of the vectorizable expression. We will use this
+ // context to determine which values can be demoted. If we see a truncation,
+ // we mark it as seeding another demotion.
+ for (auto &Entry : VectorizableTree)
+ Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
- if (Changed) {
- R.optimizeGatherSequence();
- DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
- DEBUG(verifyFunction(F));
+ // Ensure the roots of the vectorizable tree don't form a cycle. They must
+ // have a single external user that is not in the vectorizable tree.
+ for (auto *Root : TreeRoot)
+ if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
+ return;
+
+ // Conservatively determine if we can actually truncate the roots of the
+ // expression. Collect the values that can be demoted in ToDemote and
+ // additional roots that require investigating in Roots.
+ SmallVector<Value *, 32> ToDemote;
+ SmallVector<Value *, 4> Roots;
+ for (auto *Root : TreeRoot)
+ if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
+ return;
+
+ // The maximum bit width required to represent all the values that can be
+ // demoted without loss of precision. It would be safe to truncate the roots
+ // of the expression to this width.
+ auto MaxBitWidth = 8u;
+
+ // We first check if all the bits of the roots are demanded. If they're not,
+ // we can truncate the roots to this narrower type.
+ for (auto *Root : TreeRoot) {
+ auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
+ MaxBitWidth = std::max<unsigned>(
+ Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
+ }
+
+ // If all the bits of the roots are demanded, we can try a little harder to
+ // compute a narrower type. This can happen, for example, if the roots are
+ // getelementptr indices. InstCombine promotes these indices to the pointer
+ // width. Thus, all their bits are technically demanded even though the
+ // address computation might be vectorized in a smaller type.
+ //
+ // We start by looking at each entry that can be demoted. We compute the
+ // maximum bit width required to store the scalar by using ValueTracking to
+ // compute the number of high-order bits we can truncate.
+ if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
+ MaxBitWidth = 8u;
+ for (auto *Scalar : ToDemote) {
+ auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
+ auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
+ MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
}
- return Changed;
+ }
+
+ // Round MaxBitWidth up to the next power-of-two.
+ if (!isPowerOf2_64(MaxBitWidth))
+ MaxBitWidth = NextPowerOf2(MaxBitWidth);
+
+ // If the maximum bit width we compute is less than the width of the roots'
+ // type, we can proceed with the narrowing. Otherwise, do nothing.
+ if (MaxBitWidth >= TreeRootIT->getBitWidth())
+ return;
+
+ // If we can truncate the root, we must collect additional values that might
+ // be demoted as a result. That is, those seeded by truncations we will
+ // modify.
+ while (!Roots.empty())
+ collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
+
+ // Finally, map the values we can demote to the maximum bit width we computed.
+ for (auto *Scalar : ToDemote)
+ MinBWs[Scalar] = MaxBitWidth;
+}
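A worked instance of the sign-bit fallback above (made-up numbers): for an i32 root whose ComputeNumSignBits is 25,

    // MaxBitWidth = max(32 - 25, 8) = 8; already a power of two, so kept.
    // MinBWs[Scalar] = 8 for every value in ToDemote; getEntryCost and
    // vectorizeTree then treat the tree as <N x i8> instead of <N x i32>.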
+
+namespace {
+/// The SLPVectorizer Pass.
+struct SLPVectorizer : public FunctionPass {
+ SLPVectorizerPass Impl;
+
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ explicit SLPVectorizer() : FunctionPass(ID) {
+ initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Module &M) override {
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+
+ return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -3233,51 +3468,105 @@ struct SLPVectorizer : public FunctionPass {
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<DemandedBitsWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.setPreservesCFG();
}
+};
+} // end anonymous namespace
-private:
+PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+ auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
+ auto *AA = &AM.getResult<AAManager>(F);
+ auto *LI = &AM.getResult<LoopAnalysis>(F);
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *AC = &AM.getResult<AssumptionAnalysis>(F);
+ auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
+
+ bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<AAManager>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
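+
+// A usage sketch (assumed setup, not part of this patch): under the new pass
+// manager the pass is added to a function pipeline once the analyses above
+// are registered in the FunctionAnalysisManager:
+//
+//   FunctionPassManager FPM;
+//   FPM.addPass(SLPVectorizerPass());
+//   PreservedAnalyses PA = FPM.run(F, FAM);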
- /// \brief Collect memory references and sort them according to their base
- /// object. We sort the stores to their base objects to reduce the cost of the
- /// quadratic search on the stores. TODO: We can further reduce this cost
- /// if we flush the chain creation every time we run into a memory barrier.
- unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
+bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
+ TargetTransformInfo *TTI_,
+ TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
+ LoopInfo *LI_, DominatorTree *DT_,
+ AssumptionCache *AC_, DemandedBits *DB_) {
+ SE = SE_;
+ TTI = TTI_;
+ TLI = TLI_;
+ AA = AA_;
+ LI = LI_;
+ DT = DT_;
+ AC = AC_;
+ DB = DB_;
+ DL = &F.getParent()->getDataLayout();
+
+ Stores.clear();
+ GEPs.clear();
+ bool Changed = false;
- /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
- bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
+ // If the target claims to have no vector registers don't attempt
+ // vectorization.
+ if (!TTI->getNumberOfRegisters(true))
+ return false;
- /// \brief Try to vectorize a list of operands.
- /// \@param BuildVector A list of users to ignore for the purpose of
- /// scheduling and that don't need extracting.
- /// \returns true if a value was vectorized.
- bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- ArrayRef<Value *> BuildVector = None,
- bool allowReorder = false);
+ // Don't vectorize when the attribute NoImplicitFloat is used.
+ if (F.hasFnAttribute(Attribute::NoImplicitFloat))
+ return false;
- /// \brief Try to vectorize a chain that may start at the operands of \V;
- bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
+ DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
- /// \brief Vectorize the stores that were collected in StoreRefs.
- bool vectorizeStoreChains(BoUpSLP &R);
+  // Use the bottom-up SLP vectorizer to construct chains that start with
+ // store instructions.
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
- /// \brief Scan the basic block and look for patterns that are likely to start
- /// a vectorization chain.
- bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
+ // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
+ // delete instructions.
- bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
- BoUpSLP &R, unsigned VecRegSize);
+ // Scan the blocks in the function in post order.
+ for (auto BB : post_order(&F.getEntryBlock())) {
+ collectSeedInstructions(BB);
- bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
- BoUpSLP &R);
-private:
- StoreListMap StoreRefs;
- unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
-};
+ // Vectorize trees that end at stores.
+ if (!Stores.empty()) {
+ DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
+ << " underlying objects.\n");
+ Changed |= vectorizeStoreChains(R);
+ }
+
+ // Vectorize trees that end at reductions.
+ Changed |= vectorizeChainsInBlock(BB, R);
+
+ // Vectorize the index computations of getelementptr instructions. This
+ // is primarily intended to catch gather-like idioms ending at
+ // non-consecutive loads.
+ if (!GEPs.empty()) {
+ DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
+ << " underlying objects.\n");
+ Changed |= vectorizeGEPIndices(BB, R);
+ }
+ }
+
+ if (Changed) {
+ R.optimizeGatherSequence();
+ DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
+ DEBUG(verifyFunction(F));
+ }
+ return Changed;
+}
/// \brief Check that the Values in the slice in VL array are still existent in
/// the WeakVH array.
@@ -3290,15 +3579,13 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
return !std::equal(VL.begin(), VL.end(), VH.begin());
}
-bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
- int CostThreshold, BoUpSLP &R,
- unsigned VecRegSize) {
+bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain,
+ int CostThreshold, BoUpSLP &R,
+ unsigned VecRegSize) {
unsigned ChainLen = Chain.size();
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
<< "\n");
- Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
- auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
- unsigned Sz = DL.getTypeSizeInBits(StoreTy);
+ unsigned Sz = R.getVectorElementSize(Chain[0]);
unsigned VF = VecRegSize / Sz;
if (!isPowerOf2_32(Sz) || VF < 2)
@@ -3322,6 +3609,7 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
ArrayRef<Value *> Operands = Chain.slice(i, VF);
R.buildTree(Operands);
+ R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
@@ -3339,8 +3627,8 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
return Changed;
}
-bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
- int costThreshold, BoUpSLP &R) {
+bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
+ int costThreshold, BoUpSLP &R) {
SetVector<StoreInst *> Heads, Tails;
SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
@@ -3353,7 +3641,6 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
// all of the pairs of stores that follow each other.
SmallVector<unsigned, 16> IndexQueue;
for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
- const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
IndexQueue.clear();
// If a store has multiple consecutive store candidates, search Stores
// array according to the sequence: from i+1 to e, then from i-1 to 0.
@@ -3366,7 +3653,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
IndexQueue.push_back(j - 1);
for (auto &k : IndexQueue) {
- if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) {
+ if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
Tails.insert(Stores[k]);
Heads.insert(Stores[i]);
ConsecutiveChain[Stores[i]] = Stores[k];
@@ -3396,7 +3683,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?
- for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+ for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); Size /= 2) {
if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
// Mark the vectorized stores so that we don't vectorize them again.
VectorizedStores.insert(Operands.begin(), Operands.end());
@@ -3409,45 +3696,53 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
return Changed;
}
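
// For example (illustrative sizes): with a 256-bit maximum and a 128-bit
// minimum register size, the loop above halves the width after each failed
// attempt, so each chain is tried as
//
//   for (unsigned Size = 256; Size >= 128; Size /= 2)
//     ...  // vectorizeStoreChain at 256, then 128 bits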
+void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
-unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
- unsigned count = 0;
- StoreRefs.clear();
- const DataLayout &DL = BB->getModule()->getDataLayout();
- for (Instruction &I : *BB) {
- StoreInst *SI = dyn_cast<StoreInst>(&I);
- if (!SI)
- continue;
-
- // Don't touch volatile stores.
- if (!SI->isSimple())
- continue;
+ // Initialize the collections. We will make a single pass over the block.
+ Stores.clear();
+ GEPs.clear();
- // Check that the pointer points to scalars.
- Type *Ty = SI->getValueOperand()->getType();
- if (!isValidElementType(Ty))
- continue;
+ // Visit the store and getelementptr instructions in BB and organize them in
+ // Stores and GEPs according to the underlying objects of their pointer
+ // operands.
+ for (Instruction &I : *BB) {
- // Find the base pointer.
- Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
+    // Ignore store instructions that are not simple (e.g., volatile) or
+    // whose stored value's type is not a valid vector element type.
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isSimple())
+ continue;
+ if (!isValidElementType(SI->getValueOperand()->getType()))
+ continue;
+ Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
+ }
- // Save the store locations.
- StoreRefs[Ptr].push_back(SI);
- count++;
+    // Ignore getelementptr instructions that have more than one index, a
+    // constant index, an index whose type is not a valid vector element
+    // type, or a vector result type.
+ else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ auto Idx = GEP->idx_begin()->get();
+ if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
+ continue;
+ if (!isValidElementType(Idx->getType()))
+ continue;
+ if (GEP->getType()->isVectorTy())
+ continue;
+ GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
+ }
}
- return count;
}
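
// For example (a sketch with assumed names): scanning a block containing
//
//   a[0] = x;   // simple store, underlying object "a"
//   a[1] = y;   // simple store, underlying object "a"
//
// places both stores in the same Stores list entry, so vectorizeStoreChains()
// later examines them together as one candidate group; GEPs is populated the
// same way from the collected getelementptr index computations.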
-bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
+bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
Value *VL[] = { A, B };
return tryToVectorizeList(VL, R, None, true);
}
-bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- ArrayRef<Value *> BuildVector,
- bool allowReorder) {
+bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+ ArrayRef<Value *> BuildVector,
+ bool allowReorder) {
if (VL.size() < 2)
return false;
@@ -3459,13 +3754,11 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
return false;
unsigned Opcode0 = I0->getOpcode();
- const DataLayout &DL = I0->getModule()->getDataLayout();
- Type *Ty0 = I0->getType();
- unsigned Sz = DL.getTypeSizeInBits(Ty0);
// FIXME: Register size should be a parameter to this function, so we can
// try different vectorization factors.
- unsigned VF = MinVecRegSize / Sz;
+ unsigned Sz = R.getVectorElementSize(I0);
+ unsigned VF = R.getMinVecRegSize() / Sz;
for (Value *V : VL) {
Type *Ty = V->getType();
@@ -3513,6 +3806,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
Value *ReorderedOps[] = { Ops[1], Ops[0] };
R.buildTree(ReorderedOps, None);
}
+ R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
if (Cost < -SLPCostThreshold) {
@@ -3529,15 +3823,16 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
unsigned VecIdx = 0;
for (auto &V : BuildVectorSlice) {
- IRBuilder<true, NoFolder> Builder(
- InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter));
- InsertElementInst *IE = cast<InsertElementInst>(V);
+ IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+ ++BasicBlock::iterator(InsertAfter));
+ Instruction *I = cast<Instruction>(V);
+ assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
VectorizedRoot, Builder.getInt32(VecIdx++)));
- IE->setOperand(1, Extract);
- IE->removeFromParent();
- IE->insertAfter(Extract);
- InsertAfter = IE;
+ I->setOperand(1, Extract);
+ I->removeFromParent();
+ I->insertAfter(Extract);
+ InsertAfter = I;
}
}
// Move to the next bundle.
@@ -3549,7 +3844,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
return Changed;
}
-bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
+bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
if (!V)
return false;
@@ -3662,9 +3957,14 @@ public:
/// The width of one full horizontal reduction operation.
unsigned ReduxWidth;
- HorizontalReduction()
- : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
- ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0) {}
+ /// Minimal width of available vector registers. It's used to determine
+ /// ReduxWidth.
+ unsigned MinVecRegSize;
+
+ HorizontalReduction(unsigned MinVecRegSize)
+ : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
+ ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0),
+ MinVecRegSize(MinVecRegSize) {}
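+
+  // An illustrative sketch (assuming ReduxWidth is the element count that
+  // fills one minimum-width register): with MinVecRegSize == 128 and i32
+  // reduced values,
+  //
+  //   ReduxWidth = MinVecRegSize / 32 == 4
+  //
+  // i.e. one <4 x i32> reduction step per iteration.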
/// \brief Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
@@ -3779,6 +4079,7 @@ public:
for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
+ V.computeMinimumValueSizes();
// Estimate cost.
int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
@@ -3928,6 +4229,25 @@ static bool findBuildVector(InsertElementInst *FirstInsertElem,
return false;
}
+/// \brief Like findBuildVector, but looks backwards for the construction of
+/// an aggregate.
+///
+/// \return true if it matches.
+static bool findBuildAggregate(InsertValueInst *IV,
+ SmallVectorImpl<Value *> &BuildVector,
+ SmallVectorImpl<Value *> &BuildVectorOpds) {
+ if (!IV->hasOneUse())
+ return false;
+ Value *V = IV->getAggregateOperand();
+ if (!isa<UndefValue>(V)) {
+ InsertValueInst *I = dyn_cast<InsertValueInst>(V);
+ if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
+ return false;
+ }
+ BuildVector.push_back(IV);
+ BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+ return true;
+}
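+// For example (an illustrative IR shape): calling this on the final
+// insertvalue of the chain
+//
+//   %1 = insertvalue {float, float} undef, float %x, 0
+//   %2 = insertvalue {float, float} %1, float %y, 1
+//
+// succeeds, leaving {%1, %2} in BuildVector and {%x, %y} in BuildVectorOpds.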
+
static bool PhiTypeSorterFunc(Value *V, Value *V2) {
return V->getType() < V2->getType();
}
@@ -3991,11 +4311,12 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
/// \returns true if a horizontal reduction was matched and reduced.
/// \returns false if a horizontal reduction was not matched.
static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
- BoUpSLP &R, TargetTransformInfo *TTI) {
+ BoUpSLP &R, TargetTransformInfo *TTI,
+ unsigned MinRegSize) {
if (!ShouldVectorizeHor)
return false;
- HorizontalReduction HorRdx;
+ HorizontalReduction HorRdx(MinRegSize);
if (!HorRdx.matchAssociativeReduction(P, BI))
return false;
@@ -4008,7 +4329,7 @@ static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
return HorRdx.tryToReduce(R, TTI);
}
-bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
+bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
SmallSet<Value *, 16> VisitedInstrs;
@@ -4083,7 +4404,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
// Try to match and vectorize a horizontal reduction.
- if (canMatchHorizontalReduction(P, BI, R, TTI)) {
+ if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
Changed = true;
it = BB->begin();
e = BB->end();
@@ -4110,7 +4431,8 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (StoreInst *SI = dyn_cast<StoreInst>(it))
if (BinaryOperator *BinOp =
dyn_cast<BinaryOperator>(SI->getValueOperand())) {
- if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI) ||
+ if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
+ R.getMinVecRegSize()) ||
tryToVectorize(BinOp, R)) {
Changed = true;
it = BB->begin();
@@ -4178,16 +4500,121 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
}
+
+ // Try to vectorize trees that start at insertvalue instructions feeding into
+ // a store.
+ if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+ if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
+ SmallVector<Value *, 16> BuildVector;
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
+ continue;
+
+ DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
+ if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ }
+ continue;
+ }
+ }
+ }
}
return Changed;
}
-bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
+bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
+ auto Changed = false;
+ for (auto &Entry : GEPs) {
+
+ // If the getelementptr list has fewer than two elements, there's nothing
+ // to do.
+ if (Entry.second.size() < 2)
+ continue;
+
+ DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
+ << Entry.second.size() << ".\n");
+
+ // We process the getelementptr list in chunks of 16 (like we do for
+    // stores) to minimize compile time.
+ for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
+ auto Len = std::min<unsigned>(BE - BI, 16);
+ auto GEPList = makeArrayRef(&Entry.second[BI], Len);
+
+      // Initialize a set of candidate getelementptrs. Note that we use a
+ // SetVector here to preserve program order. If the index computations
+ // are vectorizable and begin with loads, we want to minimize the chance
+ // of having to reorder them later.
+ SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
+
+ // Some of the candidates may have already been vectorized after we
+ // initially collected them. If so, the WeakVHs will have nullified the
+ // values, so remove them from the set of candidates.
+ Candidates.remove(nullptr);
+
+ // Remove from the set of candidates all pairs of getelementptrs with
+ // constant differences. Such getelementptrs are likely not good
+ // candidates for vectorization in a bottom-up phase since one can be
+ // computed from the other. We also ensure all candidate getelementptr
+ // indices are unique.
+ for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
+ auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
+ if (!Candidates.count(GEPI))
+ continue;
+ auto *SCEVI = SE->getSCEV(GEPList[I]);
+ for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
+ auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
+ auto *SCEVJ = SE->getSCEV(GEPList[J]);
+ if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
+ Candidates.remove(GEPList[I]);
+ Candidates.remove(GEPList[J]);
+ } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
+ Candidates.remove(GEPList[J]);
+ }
+ }
+ }
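+      // For instance (an assumed pair): "gep %a, %i" and "gep %a, %j" with
+      // %j = add %i, 1 have SCEVs that differ by a constant, so both are
+      // removed above; the second address is cheaper to compute from the
+      // first than to re-vectorize.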
+
+ // We break out of the above computation as soon as we know there are
+ // fewer than two candidates remaining.
+ if (Candidates.size() < 2)
+ continue;
+
+ // Add the single, non-constant index of each candidate to the bundle. We
+ // ensured the indices met these constraints when we originally collected
+ // the getelementptrs.
+ SmallVector<Value *, 16> Bundle(Candidates.size());
+ auto BundleIndex = 0u;
+ for (auto *V : Candidates) {
+ auto *GEP = cast<GetElementPtrInst>(V);
+ auto *GEPIdx = GEP->idx_begin()->get();
+        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
+ Bundle[BundleIndex++] = GEPIdx;
+ }
+
+      // Try to vectorize the indices. We are currently only interested in
+ // gather-like cases of the form:
+ //
+ // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
+ //
+ // where the loads of "a", the loads of "b", and the subtractions can be
+ // performed in parallel. It's likely that detecting this pattern in a
+ // bottom-up phase will be simpler and less costly than building a
+ // full-blown top-down phase beginning at the consecutive loads.
+ Changed |= tryToVectorizeList(Bundle, R);
+ }
+ }
+ return Changed;
+}
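+// A sketch of the intended outcome (illustrative operands): scalar index
+// computations such as
+//
+//   %d0 = sub i64 %a0, %b0
+//   %d1 = sub i64 %a1, %b1
+//
+// become one vector subtract whose lanes are extracted back into the
+// scalar getelementptrs:
+//
+//   %vd = sub <2 x i64> %va, %vb
+//   %d0 = extractelement <2 x i64> %vd, i32 0
+//   %d1 = extractelement <2 x i64> %vd, i32 1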
+
+bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
bool Changed = false;
// Attempt to sort and vectorize each of the store-groups.
- for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
- it != e; ++it) {
+ for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
+ ++it) {
if (it->second.size() < 2)
continue;
@@ -4207,8 +4634,6 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
return Changed;
}
-} // end anonymous namespace
-
char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
@@ -4217,6 +4642,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
namespace llvm {