summaryrefslogtreecommitdiff
path: root/lib/Transforms/Vectorize/SLPVectorizer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp102
1 files changed, 65 insertions, 37 deletions
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7c4c279dcf4d..7bac407e77e9 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -69,8 +69,13 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore(
cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store"));
+static cl::opt<int>
+MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
+
namespace {
+// FIXME: Set this via cl::opt to allow overriding.
static const unsigned MinVecRegSize = 128;
static const unsigned RecursionMaxDepth = 12;
@@ -2136,9 +2141,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
// Prepare the operand vector.
- for (unsigned j = 0; j < E->Scalars.size(); ++j)
- Operands.push_back(cast<PHINode>(E->Scalars[j])->
- getIncomingValueForBlock(IBB));
+ for (Value *V : E->Scalars)
+ Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
@@ -2172,8 +2176,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::FPTrunc:
case Instruction::BitCast: {
ValueList INVL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+ for (Value *V : E->Scalars)
+ INVL.push_back(cast<Instruction>(V)->getOperand(0));
setInsertPointAfterBundle(E->Scalars);
@@ -2191,9 +2195,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::FCmp:
case Instruction::ICmp: {
ValueList LHSV, RHSV;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ for (Value *V : E->Scalars) {
+ LHSV.push_back(cast<Instruction>(V)->getOperand(0));
+ RHSV.push_back(cast<Instruction>(V)->getOperand(1));
}
setInsertPointAfterBundle(E->Scalars);
@@ -2217,10 +2221,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::Select: {
ValueList TrueVec, FalseVec, CondVec;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
- FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
+ for (Value *V : E->Scalars) {
+ CondVec.push_back(cast<Instruction>(V)->getOperand(0));
+ TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
+ FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
}
setInsertPointAfterBundle(E->Scalars);
@@ -2259,9 +2263,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
else
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ for (Value *V : E->Scalars) {
+ LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
+ RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
}
setInsertPointAfterBundle(E->Scalars);
@@ -2322,8 +2326,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
unsigned AS = SI->getPointerAddressSpace();
ValueList ValueOp;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
+ for (Value *V : E->Scalars)
+ ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
setInsertPointAfterBundle(E->Scalars);
@@ -2351,8 +2355,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E->Scalars);
ValueList Op0VL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0));
+ for (Value *V : E->Scalars)
+ Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
Value *Op0 = vectorizeTree(Op0VL);
@@ -2360,8 +2364,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
++j) {
ValueList OpVL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j));
+ for (Value *V : E->Scalars)
+ OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
Value *OpVec = vectorizeTree(OpVL);
OpVecs.push_back(OpVec);
@@ -2397,8 +2401,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
OpVecs.push_back(CEI->getArgOperand(j));
continue;
}
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- CallInst *CEI = cast<CallInst>(E->Scalars[i]);
+ for (Value *V : E->Scalars) {
+ CallInst *CEI = cast<CallInst>(V);
OpVL.push_back(CEI->getArgOperand(j));
}
@@ -3089,6 +3093,17 @@ struct SLPVectorizer : public FunctionPass {
if (!TTI->getNumberOfRegisters(true))
return false;
+ // Use the vector register size specified by the target unless overridden
+ // by a command-line option.
+ // TODO: It would be better to limit the vectorization factor based on
+ // data type rather than just register size. For example, x86 AVX has
+ // 256-bit registers, but it does not support integer operations
+ // at that width (that requires AVX2).
+ if (MaxVectorRegSizeOption.getNumOccurrences())
+ MaxVecRegSize = MaxVectorRegSizeOption;
+ else
+ MaxVecRegSize = TTI->getRegisterBitWidth(true);
+
// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
@@ -3166,12 +3181,13 @@ private:
bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
- BoUpSLP &R);
+ BoUpSLP &R, unsigned VecRegSize);
bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
BoUpSLP &R);
private:
StoreListMap StoreRefs;
+ unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
};
/// \brief Check that the Values in the slice in VL array are still existent in
@@ -3186,14 +3202,15 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
}
bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
- int CostThreshold, BoUpSLP &R) {
+ int CostThreshold, BoUpSLP &R,
+ unsigned VecRegSize) {
unsigned ChainLen = Chain.size();
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
<< "\n");
Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
unsigned Sz = DL.getTypeSizeInBits(StoreTy);
- unsigned VF = MinVecRegSize / Sz;
+ unsigned VF = VecRegSize / Sz;
if (!isPowerOf2_32(Sz) || VF < 2)
return false;
@@ -3277,12 +3294,16 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
I = ConsecutiveChain[I];
}
- bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
-
- // Mark the vectorized stores so that we don't vectorize them again.
- if (Vectorized)
- VectorizedStores.insert(Operands.begin(), Operands.end());
- Changed |= Vectorized;
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
+ for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+ if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
+ // Mark the vectorized stores so that we don't vectorize them again.
+ VectorizedStores.insert(Operands.begin(), Operands.end());
+ Changed = true;
+ break;
+ }
+ }
}
return Changed;
@@ -3293,8 +3314,8 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
unsigned count = 0;
StoreRefs.clear();
const DataLayout &DL = BB->getModule()->getDataLayout();
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- StoreInst *SI = dyn_cast<StoreInst>(it);
+ for (Instruction &I : *BB) {
+ StoreInst *SI = dyn_cast<StoreInst>(&I);
if (!SI)
continue;
@@ -3342,13 +3363,15 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
Type *Ty0 = I0->getType();
unsigned Sz = DL.getTypeSizeInBits(Ty0);
+ // FIXME: Register size should be a parameter to this function, so we can
+ // try different vectorization factors.
unsigned VF = MinVecRegSize / Sz;
- for (int i = 0, e = VL.size(); i < e; ++i) {
- Type *Ty = VL[i]->getType();
+ for (Value *V : VL) {
+ Type *Ty = V->getType();
if (!isValidElementType(Ty))
return false;
- Instruction *Inst = dyn_cast<Instruction>(VL[i]);
+ Instruction *Inst = dyn_cast<Instruction>(V);
if (!Inst || Inst->getOpcode() != Opcode0)
return false;
}
@@ -3571,6 +3594,8 @@ public:
const DataLayout &DL = B->getModule()->getDataLayout();
ReductionOpcode = B->getOpcode();
ReducedValueOpcode = 0;
+ // FIXME: Register size should be a parameter to this function, so we can
+ // try different vectorization factors.
ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
ReductionRoot = B;
ReductionPHI = Phi;
@@ -3997,6 +4022,9 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
<< it->second.size() << ".\n");
// Process the stores in chunks of 16.
+ // TODO: The limit of 16 inhibits greater vectorization factors.
+ // For example, AVX2 supports v32i8. Increasing this limit, however,
+ // may cause a significant compile-time increase.
for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
unsigned Len = std::min<unsigned>(CE - CI, 16);
Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),