author     Dimitry Andric <dim@FreeBSD.org>  2023-02-11 12:38:04 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2023-02-11 12:38:11 +0000
commit     e3b557809604d036af6e00c60f012c2025b59a5e (patch)
tree       8a11ba2269a3b669601e2fd41145b174008f4da8 /llvm/lib/Transforms/Vectorize/VectorCombine.cpp
parent     08e8dd7b9db7bb4a9de26d44c1cbfd24e869c014 (diff)
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/VectorCombine.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 339
1 file changed, 270 insertions(+), 69 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index a38936644bd3..2e489757ebc1 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -30,6 +30,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
+#include <numeric>
#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -64,9 +65,9 @@ class VectorCombine {
public:
VectorCombine(Function &F, const TargetTransformInfo &TTI,
const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
- bool ScalarizationOnly)
+ bool TryEarlyFoldsOnly)
: F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC),
- ScalarizationOnly(ScalarizationOnly) {}
+ TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
bool run();
@@ -78,13 +79,17 @@ private:
AAResults &AA;
AssumptionCache &AC;
- /// If true only perform scalarization combines and do not introduce new
+ /// If true, only perform beneficial early IR transforms. Do not introduce new
/// vector operations.
- bool ScalarizationOnly;
+ bool TryEarlyFoldsOnly;
InstructionWorklist Worklist;
+ // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
+ // parameter. That should be updated to specific sub-classes because the
+ // run loop was changed to dispatch on opcode.
bool vectorizeLoadInsert(Instruction &I);
+ bool widenSubvectorLoad(Instruction &I);
ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
ExtractElementInst *Ext1,
unsigned PreferredExtractIndex) const;
@@ -97,6 +102,7 @@ private:
void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
Instruction &I);
bool foldExtractExtract(Instruction &I);
+ bool foldInsExtFNeg(Instruction &I);
bool foldBitcastShuf(Instruction &I);
bool scalarizeBinopOrCmp(Instruction &I);
bool foldExtractedCmps(Instruction &I);
@@ -125,12 +131,32 @@ private:
};
} // namespace
+static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
+ // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
+ // The widened load may load data from dirty regions or create data races
+ // non-existent in the source.
+ if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
+ Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
+ mustSuppressSpeculation(*Load))
+ return false;
+
+ // We are potentially transforming byte-sized (8-bit) memory accesses, so make
+ // sure we have all of our type-based constraints in place for this target.
+ Type *ScalarTy = Load->getType()->getScalarType();
+ uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
+ unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
+ if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
+ ScalarSize % 8 != 0)
+ return false;
+
+ return true;
+}
+
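Annotation, not part of the commit: the new canWidenLoad() helper centralizes the legality checks shared by vectorizeLoadInsert() and the new widenSubvectorLoad() below. A minimal worked example of its type constraints, assuming a target that reports a 128-bit minimum vector register width:

    // Scalar type i32: ScalarSize = 32, MinVectorSize = 128.
    // 128 % 32 == 0 and 32 % 8 == 0, so the load is a candidate for widening
    // to a <4 x i32> access (still subject to the dereferenceability and cost
    // checks performed by the callers).
    // Scalar type i1:  ScalarSize = 1, and 1 % 8 != 0, so widening is rejected.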
bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// Match insert into fixed vector of scalar value.
// TODO: Handle non-zero insert index.
- auto *Ty = dyn_cast<FixedVectorType>(I.getType());
Value *Scalar;
- if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
+ if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
!Scalar->hasOneUse())
return false;
@@ -140,40 +166,28 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
if (!HasExtract)
X = Scalar;
- // Match source value as load of scalar or vector.
- // Do not vectorize scalar load (widening) if atomic/volatile or under
- // asan/hwasan/memtag/tsan. The widened load may load data from dirty regions
- // or create data races non-existent in the source.
auto *Load = dyn_cast<LoadInst>(X);
- if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
- Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
- mustSuppressSpeculation(*Load))
+ if (!canWidenLoad(Load, TTI))
return false;
- const DataLayout &DL = I.getModule()->getDataLayout();
- Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
- assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
-
- unsigned AS = Load->getPointerAddressSpace();
-
- // We are potentially transforming byte-sized (8-bit) memory accesses, so make
- // sure we have all of our type-based constraints in place for this target.
Type *ScalarTy = Scalar->getType();
uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
- if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
- ScalarSize % 8 != 0)
- return false;
// Check safety of replacing the scalar load with a larger vector load.
// We use minimal alignment (maximum flexibility) because we only care about
// the dereferenceable region. When calculating cost and creating a new op,
// we may use a larger value based on alignment attributes.
+ const DataLayout &DL = I.getModule()->getDataLayout();
+ Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
+ assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
+
unsigned MinVecNumElts = MinVectorSize / ScalarSize;
auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
unsigned OffsetEltIndex = 0;
Align Alignment = Load->getAlign();
- if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) {
+ if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC,
+ &DT)) {
// It is not safe to load directly from the pointer, but we can still peek
// through gep offsets and check if it is safe to load from a base address with
// updated alignment. If it is, we can shuffle the element(s) into place
@@ -198,7 +212,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
if (OffsetEltIndex >= MinVecNumElts)
return false;
- if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT))
+ if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC,
+ &DT))
return false;
// Update alignment with offset value. Note that the offset could be negated
@@ -211,11 +226,14 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// Use the greater of the alignment on the load or its source pointer.
Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
Type *LoadTy = Load->getType();
+ unsigned AS = Load->getPointerAddressSpace();
InstructionCost OldCost =
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
- OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
- /* Insert */ true, HasExtract);
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ OldCost +=
+ TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
+ /* Insert */ true, HasExtract, CostKind);
// New pattern: load VecPtr
InstructionCost NewCost =
@@ -227,6 +245,7 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// We assume this operation has no cost in codegen if there was no offset.
// Note that we could use freeze to avoid poison problems, but then we might
// still need a shuffle to change the vector size.
+ auto *Ty = cast<FixedVectorType>(I.getType());
unsigned OutputNumElts = Ty->getNumElements();
SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
@@ -252,6 +271,66 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
return true;
}
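Annotation, not part of the commit: the pattern handled by vectorizeLoadInsert(), sketched in the file's usual comment notation. The widened type is assumed to be <4 x float> (128-bit minimum vector width) and the full 16 bytes at %p are assumed dereferenceable and cheap to load:

    //   %s = load float, ptr %p
    //   %v = insertelement <4 x float> undef, float %s, i64 0
    //   -->
    //   %ld = load <4 x float>, ptr %p
    //   %v  = shufflevector <4 x float> %ld, <4 x float> poison,
    //                       <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>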
+/// If we are loading a vector and then inserting it into a larger vector with
+/// undefined elements, try to load the larger vector and eliminate the insert.
+/// This removes a shuffle in IR and may allow combining of other loaded values.
+bool VectorCombine::widenSubvectorLoad(Instruction &I) {
+ // Match subvector insert of fixed vector.
+ auto *Shuf = cast<ShuffleVectorInst>(&I);
+ if (!Shuf->isIdentityWithPadding())
+ return false;
+
+ // Allow a non-canonical shuffle mask that is choosing elements from op1.
+ unsigned NumOpElts =
+ cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
+ unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
+ return M >= (int)(NumOpElts);
+ });
+
+ auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
+ if (!canWidenLoad(Load, TTI))
+ return false;
+
+ // We use minimal alignment (maximum flexibility) because we only care about
+ // the dereferenceable region. When calculating cost and creating a new op,
+ // we may use a larger value based on alignment attributes.
+ auto *Ty = cast<FixedVectorType>(I.getType());
+ const DataLayout &DL = I.getModule()->getDataLayout();
+ Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
+ assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
+ Align Alignment = Load->getAlign();
+ if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), DL, Load, &AC, &DT))
+ return false;
+
+ Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
+ Type *LoadTy = Load->getType();
+ unsigned AS = Load->getPointerAddressSpace();
+
+ // Original pattern: insert_subvector (load PtrOp)
+ // This conservatively assumes that the cost of a subvector insert into an
+ // undef value is 0. We could add that cost if the cost model accurately
+ // reflects the real cost of that operation.
+ InstructionCost OldCost =
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
+
+ // New pattern: load PtrOp
+ InstructionCost NewCost =
+ TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS);
+
+ // We can aggressively convert to the vector form because the backend can
+ // invert this transform if it does not result in a performance win.
+ if (OldCost < NewCost || !NewCost.isValid())
+ return false;
+
+ IRBuilder<> Builder(Load);
+ Value *CastedPtr =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Ty->getPointerTo(AS));
+ Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
+ replaceValue(I, *VecLd);
+ ++NumVecLoad;
+ return true;
+}
+
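Annotation, not part of the commit: the new widenSubvectorLoad() fold in the same notation, assuming the shuffle is an identity-with-padding mask and the wider region is dereferenceable. The subvector insert disappears entirely; only the wider load remains:

    //   %small = load <2 x float>, ptr %p
    //   %wide  = shufflevector <2 x float> %small, <2 x float> poison,
    //                          <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
    //   -->
    //   %wide  = load <4 x float>, ptr %p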
/// Determine which, if any, of the inputs should be replaced by a shuffle
/// followed by extract from a different index.
ExtractElementInst *VectorCombine::getShuffleExtract(
@@ -269,11 +348,12 @@ ExtractElementInst *VectorCombine::getShuffleExtract(
return nullptr;
Type *VecTy = Ext0->getVectorOperand()->getType();
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
InstructionCost Cost0 =
- TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
+ TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
InstructionCost Cost1 =
- TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+ TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
// If both costs are invalid no shuffle is needed
if (!Cost0.isValid() && !Cost1.isValid())
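Annotation, not part of the commit: this hunk, and several later ones, migrate the extract/insert cost queries to the TTI::getVectorInstrCost overloads that take an explicit TTI::TargetCostKind, using the instruction-based form where the concrete instruction is available. The two call forms as they appear after this change (names taken from the surrounding hunks):

    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    // Opcode-based form, used where no insert/extract instruction exists yet:
    InstructionCost InsertCost = TTI.getVectorInstrCost(
        Instruction::InsertElement, VecTy, CostKind, Index);
    // Instruction-based form, which lets the target cost the concrete instruction:
    InstructionCost Cost0 =
        TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);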
@@ -336,11 +416,12 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
// both sequences.
unsigned Ext0Index = Ext0IndexC->getZExtValue();
unsigned Ext1Index = Ext1IndexC->getZExtValue();
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Extract0Cost =
- TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
+ TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
InstructionCost Extract1Cost =
- TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index);
+ TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
// A more expensive extract will always be replaced by a splat shuffle.
// For example, if Ext0 is more expensive:
@@ -533,6 +614,69 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
return true;
}
+/// Try to replace an extract + scalar fneg + insert with a vector fneg +
+/// shuffle.
+bool VectorCombine::foldInsExtFNeg(Instruction &I) {
+ // Match an insert (op (extract)) pattern.
+ Value *DestVec;
+ uint64_t Index;
+ Instruction *FNeg;
+ if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
+ m_ConstantInt(Index))))
+ return false;
+
+ // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
+ Value *SrcVec;
+ Instruction *Extract;
+ if (!match(FNeg, m_FNeg(m_CombineAnd(
+ m_Instruction(Extract),
+ m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
+ return false;
+
+ // TODO: We could handle this with a length-changing shuffle.
+ auto *VecTy = cast<FixedVectorType>(I.getType());
+ if (SrcVec->getType() != VecTy)
+ return false;
+
+ // Ignore bogus insert/extract index.
+ unsigned NumElts = VecTy->getNumElements();
+ if (Index >= NumElts)
+ return false;
+
+ // We are inserting the negated element into the same lane that we extracted
+ // from. This is equivalent to a select-shuffle that chooses all but the
+ // negated element from the destination vector.
+ SmallVector<int> Mask(NumElts);
+ std::iota(Mask.begin(), Mask.end(), 0);
+ Mask[Index] = Index + NumElts;
+
+ Type *ScalarTy = VecTy->getScalarType();
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost OldCost =
+ TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
+ TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
+
+ // If the extract has one use, it will be eliminated, so count it in the
+ // original cost. If it has more than one use, ignore the cost because it will
+ // be the same before/after.
+ if (Extract->hasOneUse())
+ OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
+
+ InstructionCost NewCost =
+ TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);
+
+ if (NewCost > OldCost)
+ return false;
+
+ // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
+ // shuffle DestVec, (fneg SrcVec), Mask
+ Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
+ Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
+ replaceValue(I, *Shuf);
+ return true;
+}
+
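Annotation, not part of the commit: the new foldInsExtFNeg() transform, expanding the comment above into a concrete example for a <4 x float> vector and Index = 2:

    //   %e = extractelement <4 x float> %src, i64 2
    //   %n = fneg float %e
    //   %i = insertelement <4 x float> %dst, float %n, i64 2
    //   -->
    //   %vneg = fneg <4 x float> %src
    //   %i    = shufflevector <4 x float> %dst, <4 x float> %vneg,
    //                         <4 x i32> <i32 0, i32 1, i32 6, i32 3>
    // The select mask keeps every lane of %dst except lane 2, which is taken
    // from the negated source (mask element Index + NumElts = 2 + 4 = 6).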
/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
@@ -548,11 +692,11 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
// mask for scalable type is a splat or not.
// 2) Disallow non-vector casts and length-changing shuffles.
// TODO: We could allow any shuffle.
- auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
- if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy)
+ if (!SrcTy || I.getOperand(0)->getType() != SrcTy)
return false;
+ auto *DestTy = cast<FixedVectorType>(I.getType());
unsigned DestNumElts = DestTy->getNumElements();
unsigned SrcNumElts = SrcTy->getNumElements();
SmallVector<int, 16> NewMask;
@@ -664,8 +808,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
// Get cost estimate for the insert element. This cost will factor into
// both sequences.
- InstructionCost InsertCost =
- TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost InsertCost = TTI.getVectorInstrCost(
+ Instruction::InsertElement, VecTy, CostKind, Index);
InstructionCost OldCost =
(IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
InstructionCost NewCost = ScalarOpCost + InsertCost +
@@ -754,9 +899,10 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
if (!VecTy)
return false;
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost OldCost =
- TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
- OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+ TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
+ OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
OldCost +=
TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
CmpInst::makeCmpResultType(I0->getType()), Pred) *
@@ -776,7 +922,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
ShufMask);
NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
- NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);
+ NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
// Aggressively form vector ops if the cost is equal because the transform
// may enable further optimization.
@@ -811,6 +957,7 @@ static bool isMemModifiedBetween(BasicBlock::iterator Begin,
});
}
+namespace {
/// Helper class to indicate whether a vector index can be safely scalarized and
/// if a freeze needs to be inserted.
class ScalarizationResult {
@@ -865,6 +1012,7 @@ public:
ToFreeze = nullptr;
}
};
+} // namespace
/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
/// Idx. \p Idx must access a valid vector element.
@@ -928,8 +1076,8 @@ static Align computeAlignmentAfterScalarization(Align VectorAlignment,
// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
// store i32 %b, i32* %1
bool VectorCombine::foldSingleElementStore(Instruction &I) {
- StoreInst *SI = dyn_cast<StoreInst>(&I);
- if (!SI || !SI->isSimple() ||
+ auto *SI = cast<StoreInst>(&I);
+ if (!SI->isSimple() ||
!isa<FixedVectorType>(SI->getValueOperand()->getType()))
return false;
@@ -985,17 +1133,14 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
if (!match(&I, m_Load(m_Value(Ptr))))
return false;
+ auto *FixedVT = cast<FixedVectorType>(I.getType());
auto *LI = cast<LoadInst>(&I);
const DataLayout &DL = I.getModule()->getDataLayout();
- if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(LI->getType()))
- return false;
-
- auto *FixedVT = dyn_cast<FixedVectorType>(LI->getType());
- if (!FixedVT)
+ if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(FixedVT))
return false;
InstructionCost OriginalCost =
- TTI.getMemoryOpCost(Instruction::Load, LI->getType(), LI->getAlign(),
+ TTI.getMemoryOpCost(Instruction::Load, FixedVT, LI->getAlign(),
LI->getPointerAddressSpace());
InstructionCost ScalarizedCost = 0;
@@ -1034,8 +1179,9 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
}
auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
OriginalCost +=
- TTI.getVectorInstrCost(Instruction::ExtractElement, LI->getType(),
+ TTI.getVectorInstrCost(Instruction::ExtractElement, FixedVT, CostKind,
Index ? Index->getZExtValue() : -1);
ScalarizedCost +=
TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(),
@@ -1070,10 +1216,7 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
/// Try to convert "shuffle (binop), (binop)" with a shared binop operand into
/// "binop (shuffle), (shuffle)".
bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
- auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
- if (!VecTy)
- return false;
-
+ auto *VecTy = cast<FixedVectorType>(I.getType());
BinaryOperator *B0, *B1;
ArrayRef<int> Mask;
if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)),
@@ -1244,15 +1387,14 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
/// architectures with no obvious "select" shuffle, this can reduce the total
/// number of operations if the target reports them as cheaper.
bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
- auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
- auto *VT = dyn_cast<FixedVectorType>(I.getType());
- if (!SVI || !VT)
- return false;
+ auto *SVI = cast<ShuffleVectorInst>(&I);
+ auto *VT = cast<FixedVectorType>(I.getType());
auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
VT != Op0->getType())
return false;
+
auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
@@ -1300,7 +1442,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
// cost calculations.
if (!FromReduction) {
for (ShuffleVectorInst *SV : Shuffles) {
- for (auto U : SV->users()) {
+ for (auto *U : SV->users()) {
ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
Shuffles.push_back(SSV);
@@ -1569,19 +1711,78 @@ bool VectorCombine::run() {
bool MadeChange = false;
auto FoldInst = [this, &MadeChange](Instruction &I) {
Builder.SetInsertPoint(&I);
- if (!ScalarizationOnly) {
- MadeChange |= vectorizeLoadInsert(I);
- MadeChange |= foldExtractExtract(I);
- MadeChange |= foldBitcastShuf(I);
- MadeChange |= foldExtractedCmps(I);
- MadeChange |= foldShuffleOfBinops(I);
- MadeChange |= foldShuffleFromReductions(I);
- MadeChange |= foldSelectShuffle(I);
+ bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
+ auto Opcode = I.getOpcode();
+
+ // These folds should be beneficial regardless of when this pass is run
+ // in the optimization pipeline.
+ // The type checking is for run-time efficiency. We can avoid wasting time
+ // dispatching to folding functions if there's no chance of matching.
+ if (IsFixedVectorType) {
+ switch (Opcode) {
+ case Instruction::InsertElement:
+ MadeChange |= vectorizeLoadInsert(I);
+ break;
+ case Instruction::ShuffleVector:
+ MadeChange |= widenSubvectorLoad(I);
+ break;
+ case Instruction::Load:
+ MadeChange |= scalarizeLoadExtract(I);
+ break;
+ default:
+ break;
+ }
+ }
+
+ // This transform works with scalable and fixed vectors
+ // TODO: Identify and allow other scalable transforms
+ if (isa<VectorType>(I.getType()))
+ MadeChange |= scalarizeBinopOrCmp(I);
+
+ if (Opcode == Instruction::Store)
+ MadeChange |= foldSingleElementStore(I);
+
+
+ // If this is an early pipeline invocation of this pass, we are done.
+ if (TryEarlyFoldsOnly)
+ return;
+
+ // Otherwise, try folds that improve codegen but may interfere with
+ // early IR canonicalizations.
+ // The type checking is for run-time efficiency. We can avoid wasting time
+ // dispatching to folding functions if there's no chance of matching.
+ if (IsFixedVectorType) {
+ switch (Opcode) {
+ case Instruction::InsertElement:
+ MadeChange |= foldInsExtFNeg(I);
+ break;
+ case Instruction::ShuffleVector:
+ MadeChange |= foldShuffleOfBinops(I);
+ MadeChange |= foldSelectShuffle(I);
+ break;
+ case Instruction::BitCast:
+ MadeChange |= foldBitcastShuf(I);
+ break;
+ }
+ } else {
+ switch (Opcode) {
+ case Instruction::Call:
+ MadeChange |= foldShuffleFromReductions(I);
+ break;
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ MadeChange |= foldExtractExtract(I);
+ break;
+ default:
+ if (Instruction::isBinaryOp(Opcode)) {
+ MadeChange |= foldExtractExtract(I);
+ MadeChange |= foldExtractedCmps(I);
+ }
+ break;
+ }
}
- MadeChange |= scalarizeBinopOrCmp(I);
- MadeChange |= scalarizeLoadExtract(I);
- MadeChange |= foldSingleElementStore(I);
};
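Annotation, not part of the commit: with this restructuring, the early-pipeline invocation (TryEarlyFoldsOnly == true) runs only the folds considered universally beneficial (load widening, load/binop scalarization, and single-element store folding), while the codegen-oriented shuffle folds run only in the late invocation. A sketch of how the flag is expected to be selected when the pass is scheduled; the PassBuilder call sites and the FunctionPassManager name FPM are assumptions, since they are outside this diff:

    // Early in the function simplification pipeline:
    FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true));
    // Late in the optimization pipeline, shortly before codegen:
    FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/false));

The full set of folds can also be exercised directly with "opt -passes=vector-combine".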
+
for (BasicBlock &BB : F) {
// Ignore unreachable basic blocks.
if (!DT.isReachableFromEntry(&BB))
@@ -1664,7 +1865,7 @@ PreservedAnalyses VectorCombinePass::run(Function &F,
TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
AAResults &AA = FAM.getResult<AAManager>(F);
- VectorCombine Combiner(F, TTI, DT, AA, AC, ScalarizationOnly);
+ VectorCombine Combiner(F, TTI, DT, AA, AC, TryEarlyFoldsOnly);
if (!Combiner.run())
return PreservedAnalyses::all();
PreservedAnalyses PA;