author     Dimitry Andric <dim@FreeBSD.org>  2023-02-11 12:38:04 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2023-02-11 12:38:11 +0000
commit     e3b557809604d036af6e00c60f012c2025b59a5e (patch)
tree       8a11ba2269a3b669601e2fd41145b174008f4da8 /llvm/lib/Transforms/Vectorize/VectorCombine.cpp
parent     08e8dd7b9db7bb4a9de26d44c1cbfd24e869c014 (diff)
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/VectorCombine.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 339
1 file changed, 270 insertions(+), 69 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index a38936644bd3..2e489757ebc1 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -30,6 +30,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
+#include <numeric>
#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -64,9 +65,9 @@ class VectorCombine {
public:
VectorCombine(Function &F, const TargetTransformInfo &TTI,
const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
- bool ScalarizationOnly)
+ bool TryEarlyFoldsOnly)
: F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC),
- ScalarizationOnly(ScalarizationOnly) {}
+ TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
bool run();
@@ -78,13 +79,17 @@ private:
AAResults &AA;
AssumptionCache &AC;
- /// If true only perform scalarization combines and do not introduce new
+ /// If true, only perform beneficial early IR transforms. Do not introduce new
/// vector operations.
- bool ScalarizationOnly;
+ bool TryEarlyFoldsOnly;
InstructionWorklist Worklist;
+ // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
+ // parameter. That should be updated to specific sub-classes because the
+ // run loop was changed to dispatch on opcode.
bool vectorizeLoadInsert(Instruction &I);
+ bool widenSubvectorLoad(Instruction &I);
ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
ExtractElementInst *Ext1,
unsigned PreferredExtractIndex) const;
@@ -97,6 +102,7 @@ private:
void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
Instruction &I);
bool foldExtractExtract(Instruction &I);
+ bool foldInsExtFNeg(Instruction &I);
bool foldBitcastShuf(Instruction &I);
bool scalarizeBinopOrCmp(Instruction &I);
bool foldExtractedCmps(Instruction &I);
@@ -125,12 +131,32 @@ private:
};
} // namespace
+static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
+ // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
+ // The widened load may load data from dirty regions or create data races
+ // non-existent in the source.
+ if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
+ Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
+ mustSuppressSpeculation(*Load))
+ return false;
+
+ // We are potentially transforming byte-sized (8-bit) memory accesses, so make
+ // sure we have all of our type-based constraints in place for this target.
+ Type *ScalarTy = Load->getType()->getScalarType();
+ uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
+ unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
+ if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
+ ScalarSize % 8 != 0)
+ return false;
+
+ return true;
+}
+
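Annotation, not part of the commit: the new canWidenLoad() helper centralizes the legality checks shared by vectorizeLoadInsert() and the new widenSubvectorLoad() below. A minimal worked example of its type constraints, assuming a target that reports a 128-bit minimum vector register width:

    // Scalar type i32: ScalarSize = 32, MinVectorSize = 128.
    // 128 % 32 == 0 and 32 % 8 == 0, so the load is a candidate for widening
    // to a <4 x i32> access (still subject to the dereferenceability and cost
    // checks performed by the callers).
    // Scalar type i1:  ScalarSize = 1, and 1 % 8 != 0, so widening is rejected.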
bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// Match insert into fixed vector of scalar value.
// TODO: Handle non-zero insert index.
- auto *Ty = dyn_cast<FixedVectorType>(I.getType());
Value *Scalar;
- if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
+ if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
!Scalar->hasOneUse())
return false;
@@ -140,40 +166,28 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
if (!HasExtract)
X = Scalar;
- // Match source value as load of scalar or vector.
- // Do not vectorize scalar load (widening) if atomic/volatile or under
- // asan/hwasan/memtag/tsan. The widened load may load data from dirty regions
- // or create data races non-existent in the source.
auto *Load = dyn_cast<LoadInst>(X);
- if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
- Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
- mustSuppressSpeculation(*Load))
+ if (!canWidenLoad(Load, TTI))
return false;
- const DataLayout &DL = I.getModule()->getDataLayout();
- Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
- assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
-
- unsigned AS = Load->getPointerAddressSpace();
-
- // We are potentially transforming byte-sized (8-bit) memory accesses, so make
- // sure we have all of our type-based constraints in place for this target.
Type *ScalarTy = Scalar->getType();
uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
- if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
- ScalarSize % 8 != 0)
- return false;
// Check safety of replacing the scalar load with a larger vector load.
// We use minimal alignment (maximum flexibility) because we only care about
// the dereferenceable region. When calculating cost and creating a new op,
// we may use a larger value based on alignment attributes.
+ const DataLayout &DL = I.getModule()->getDataLayout();
+ Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
+ assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
+
unsigned MinVecNumElts = MinVectorSize / ScalarSize;
auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
unsigned OffsetEltIndex = 0;
Align Alignment = Load->getAlign();
- if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) {
+ if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC,
+ &DT)) {
// It is not safe to load directly from the pointer, but we can still peek
// through gep offsets and check if it is safe to load from a base address with
// updated alignment. If it is, we can shuffle the element(s) into place
@@ -198,7 +212,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
if (OffsetEltIndex >= MinVecNumElts)
return false;
- if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT))
+ if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC,
+ &DT))
return false;
// Update alignment with offset value. Note that the offset could be negated
@@ -211,11 +226,14 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// Use the greater of the alignment on the load or its source pointer.
Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
Type *LoadTy = Load->getType();
+ unsigned AS = Load->getPointerAddressSpace();
InstructionCost OldCost =
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
- OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
- /* Insert */ true, HasExtract);
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ OldCost +=
+ TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
+ /* Insert */ true, HasExtract, CostKind);
// New pattern: load VecPtr
InstructionCost NewCost =
@@ -227,6 +245,7 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// We assume this operation has no cost in codegen if there was no offset.
// Note that we could use freeze to avoid poison problems, but then we might
// still need a shuffle to change the vector size.
+ auto *Ty = cast<FixedVectorType>(I.getType());
unsigned OutputNumElts = Ty->getNumElements();
SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
@@ -252,6 +271,66 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
return true;
}
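Annotation, not part of the commit: the pattern handled by vectorizeLoadInsert(), sketched in the file's usual comment notation. The widened type is assumed to be <4 x float> (128-bit minimum vector width) and the full 16 bytes at %p are assumed dereferenceable and cheap to load:

    //   %s = load float, ptr %p
    //   %v = insertelement <4 x float> undef, float %s, i64 0
    //   -->
    //   %ld = load <4 x float>, ptr %p
    //   %v  = shufflevector <4 x float> %ld, <4 x float> poison,
    //                       <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>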
+/// If we are loading a vector and then inserting it into a larger vector with
+/// undefined elements, try to load the larger vector and eliminate the insert.
+/// This removes a shuffle in IR and may allow combining of other loaded values.
+bool VectorCombine::widenSubvectorLoad(Instruction &I) {
+ // Match subvector insert of fixed vector.
+ auto *Shuf = cast<ShuffleVectorInst>(&I);
+ if (!Shuf->isIdentityWithPadding())
+ return false;
+
+ // Allow a non-canonical shuffle mask that is choosing elements from op1.
+ unsigned NumOpElts =
+ cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
+ unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
+ return M >= (int)(NumOpElts);
+ });
+
+ auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
+ if (!canWidenLoad(Load, TTI))
+ return false;
+
+ // We use minimal alignment (maximum flexibility) because we only care about
+ // the dereferenceable region. When calculating cost and creating a new op,
+ // we may use a larger value based on alignment attributes.
+ auto *Ty = cast<FixedVectorType>(I.getType());
+ const DataLayout &DL = I.getModule()->getDataLayout();
+ Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
+ assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
+ Align Alignment = Load->getAlign();
+ if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), DL, Load, &AC, &DT))
+ return false;
+
+ Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
+ Type *LoadTy = Load->getType();
+ unsigned AS = Load->getPointerAddressSpace();
+
+ // Original pattern: insert_subvector (load PtrOp)
+ // This conservatively assumes that the cost of a subvector insert into an
+ // undef value is 0. We could add that cost if the cost model accurately
+ // reflects the real cost of that operation.
+ InstructionCost OldCost =
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
+
+ // New pattern: load PtrOp
+ InstructionCost NewCost =
+ TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS);
+
+ // We can aggressively convert to the vector form because the backend can
+ // invert this transform if it does not result in a performance win.
+ if (OldCost < NewCost || !NewCost.isValid())
+ return false;
+
+ IRBuilder<> Builder(Load);
+ Value *CastedPtr =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Ty->getPointerTo(AS));
+ Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
+ replaceValue(I, *VecLd);
+ ++NumVecLoad;
+ return true;
+}
+
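Annotation, not part of the commit: the new widenSubvectorLoad() fold in the same notation, assuming the shuffle is an identity-with-padding mask and the wider region is dereferenceable. The subvector insert disappears entirely; only the wider load remains:

    //   %small = load <2 x float>, ptr %p
    //   %wide  = shufflevector <2 x float> %small, <2 x float> poison,
    //                          <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
    //   -->
    //   %wide  = load <4 x float>, ptr %p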
/// Determine which, if any, of the inputs should be replaced by a shuffle
/// followed by extract from a different index.
ExtractElementInst *VectorCombine::getShuffleExtract(
@@ -269,11 +348,12 @@ ExtractElementInst *VectorCombine::getShuffleExtract(
return nullptr;
Type *VecTy = Ext0->getVectorOperand()->getType();
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
InstructionCost Cost0 =
- TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
+ TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
InstructionCost Cost1 =
- TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+ TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
// If both costs are invalid no shuffle is needed
if (!Cost0.isValid() && !Cost1.isValid())
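Annotation, not part of the commit: this hunk, and several later ones, migrate the extract/insert cost queries to the TTI::getVectorInstrCost overloads that take an explicit TTI::TargetCostKind, using the instruction-based form where the concrete instruction is available. The two call forms as they appear after this change (names taken from the surrounding hunks):

    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    // Opcode-based form, used where no insert/extract instruction exists yet:
    InstructionCost InsertCost = TTI.getVectorInstrCost(
        Instruction::InsertElement, VecTy, CostKind, Index);
    // Instruction-based form, which lets the target cost the concrete instruction:
    InstructionCost Cost0 =
        TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);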
@@ -336,11 +416,12 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
// both sequences.
unsigned Ext0Index = Ext0IndexC->getZExtValue();
unsigned Ext1Index = Ext1IndexC->getZExtValue();
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Extract0Cost =
- TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
+ TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
InstructionCost Extract1Cost =
- TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index);
+ TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
// A more expensive extract will always be replaced by a splat shuffle.
// For example, if Ext0 is more expensive:
@@ -533,6 +614,69 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
return true;
}
+/// Try to replace an extract + scalar fneg + insert with a vector fneg +
+/// shuffle.
+bool VectorCombine::foldInsExtFNeg(Instruction &I) {
+ // Match an insert (op (extract)) pattern.
+ Value *DestVec;
+ uint64_t Index;
+ Instruction *FNeg;
+ if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
+ m_ConstantInt(Index))))
+ return false;
+
+ // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
+ Value *SrcVec;
+ Instruction *Extract;
+ if (!match(FNeg, m_FNeg(m_CombineAnd(
+ m_Instruction(Extract),
+ m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
+ return false;
+
+ // TODO: We could handle this with a length-changing shuffle.
+ auto *VecTy = cast<FixedVectorType>(I.getType());
+ if (SrcVec->getType() != VecTy)
+ return false;
+
+ // Ignore bogus insert/extract index.
+ unsigned NumElts = VecTy->getNumElements();
+ if (Index >= NumElts)
+ return false;
+
+ // We are inserting the negated element into the same lane that we extracted
+ // from. This is equivalent to a select-shuffle that chooses all but the
+ // negated element from the destination vector.
+ SmallVector<int> Mask(NumElts);
+ std::iota(Mask.begin(), Mask.end(), 0);
+ Mask[Index] = Index + NumElts;
+
+ Type *ScalarTy = VecTy->getScalarType();
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost OldCost =
+ TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
+ TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
+
+ // If the extract has one use, it will be eliminated, so count it in the
+ // original cost. If it has more than one use, ignore the cost because it will
+ // be the same before/after.
+ if (Extract->hasOneUse())
+ OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
+
+ InstructionCost NewCost =
+ TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);
+
+ if (NewCost > OldCost)
+ return false;
+
+ // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
+ // shuffle DestVec, (fneg SrcVec), Mask
+ Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
+ Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
+ replaceValue(I, *Shuf);
+ return true;
+}
+
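Annotation, not part of the commit: the new foldInsExtFNeg() transform, expanding the comment above into a concrete example for a <4 x float> vector and Index = 2:

    //   %e = extractelement <4 x float> %src, i64 2
    //   %n = fneg float %e
    //   %i = insertelement <4 x float> %dst, float %n, i64 2
    //   -->
    //   %vneg = fneg <4 x float> %src
    //   %i    = shufflevector <4 x float> %dst, <4 x float> %vneg,
    //                         <4 x i32> <i32 0, i32 1, i32 6, i32 3>
    // The select mask keeps every lane of %dst except lane 2, which is taken
    // from the negated source (mask element Index + NumElts = 2 + 4 = 6).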
/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
@@ -548,11 +692,11 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
// mask for scalable type is a splat or not.
// 2) Disallow non-vector casts and length-changing shuffles.
// TODO: We could allow any shuffle.
- auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
- if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy)
+ if (!SrcTy || I.getOperand(0)->getType() != SrcTy)
return false;
+ auto *DestTy = cast<FixedVectorType>(I.getType());
unsigned DestNumElts = DestTy->getNumElements();
unsigned SrcNumElts = SrcTy->getNumElements();
SmallVector<int, 16> NewMask;
@@ -664,8 +808,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
// Get cost estimate for the insert element. This cost will factor into
// both sequences.
- InstructionCost InsertCost =
- TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost InsertCost = TTI.getVectorInstrCost(
+ Instruction::InsertElement, VecTy, CostKind, Index);
InstructionCost OldCost =
(IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
InstructionCost NewCost = ScalarOpCost + InsertCost +
@@ -754,9 +899,10 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
if (!VecTy)
return false;
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost OldCost =
- TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
- OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+ TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
+ OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
OldCost +=
TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
CmpInst::makeCmpResultType(I0->getType()), Pred) *
@@ -776,7 +922,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
ShufMask);
NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
- NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);
+ NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
// Aggressively form vector ops if the cost is equal because the transform
// may enable further optimization.
@@ -811,6 +957,7 @@ static bool isMemModifiedBetween(BasicBlock::iterator Begin,
});
}
+namespace {
/// Helper class to indicate whether a vector index can be safely scalarized and
/// if a freeze needs to be inserted.
class ScalarizationResult {
@@ -865,6 +1012,7 @@ public:
ToFreeze = nullptr;
}
};
+} // namespace
/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
/// Idx. \p Idx must access a valid vector element.
@@ -928,8 +1076,8 @@ static Align computeAlignmentAfterScalarization(Align VectorAlignment,
// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
// store i32 %b, i32* %1
bool VectorCombine::foldSingleElementStore(Instruction &I) {
- StoreInst *SI = dyn_cast<StoreInst>(&I);
- if (!SI || !SI->isSimple() ||
+ auto *SI = cast<StoreInst>(&I);
+ if (!SI->isSimple() ||
!isa<FixedVectorType>(SI->getValueOperand()->getType()))
return false;
@@ -985,17 +1133,14 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
if (!match(&I, m_Load(m_Value(Ptr))))
return false;
+ auto *FixedVT = cast<FixedVectorType>(I.getType());
auto *LI = cast<LoadInst>(&I);
const DataLayout &DL = I.getModule()->getDataLayout();
- if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(LI->getType()))
- return false;
-
- auto *FixedVT = dyn_cast<FixedVectorType>(LI->getType());
- if (!FixedVT)
+ if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(FixedVT))
return false;
InstructionCost OriginalCost =
- TTI.getMemoryOpCost(Instruction::Load, LI->getType(), LI->getAlign(),
+ TTI.getMemoryOpCost(Instruction::Load, FixedVT, LI->getAlign(),
LI->getPointerAddressSpace());
InstructionCost ScalarizedCost = 0;
@@ -1034,8 +1179,9 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
}
auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
OriginalCost +=
- TTI.getVectorInstrCost(Instruction::ExtractElement, LI->getType(),
+ TTI.getVectorInstrCost(Instruction::ExtractElement, FixedVT, CostKind,
Index ? Index->getZExtValue() : -1);
ScalarizedCost +=
TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(),
@@ -1070,10 +1216,7 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
/// Try to convert "shuffle (binop), (binop)" with a shared binop operand into
/// "binop (shuffle), (shuffle)".
bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
- auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
- if (!VecTy)
- return false;
-
+ auto *VecTy = cast<FixedVectorType>(I.getType());
BinaryOperator *B0, *B1;
ArrayRef<int> Mask;
if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)),
@@ -1244,15 +1387,14 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
/// architectures with no obvious "select" shuffle, this can reduce the total
/// number of operations if the target reports them as cheaper.
bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
- auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
- auto *VT = dyn_cast<FixedVectorType>(I.getType());
- if (!SVI || !VT)
- return false;
+ auto *SVI = cast<ShuffleVectorInst>(&I);
+ auto *VT = cast<FixedVectorType>(I.getType());
auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
VT != Op0->getType())
return false;
+
auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
@@ -1300,7 +1442,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
// cost calculations.
if (!FromReduction) {
for (ShuffleVectorInst *SV : Shuffles) {
- for (auto U : SV->users()) {
+ for (auto *U : SV->users()) {
ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
Shuffles.push_back(SSV);
@@ -1569,19 +1711,78 @@ bool VectorCombine::run() {
bool MadeChange = false;
auto FoldInst = [this, &MadeChange](Instruction &I) {
Builder.SetInsertPoint(&I);
- if (!ScalarizationOnly) {
- MadeChange |= vectorizeLoadInsert(I);
- MadeChange |= foldExtractExtract(I);
- MadeChange |= foldBitcastShuf(I);
- MadeChange |= foldExtractedCmps(I);
- MadeChange |= foldShuffleOfBinops(I);
- MadeChange |= foldShuffleFromReductions(I);
- MadeChange |= foldSelectShuffle(I);
+ bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
+ auto Opcode = I.getOpcode();
+
+ // These folds should be beneficial regardless of when this pass is run
+ // in the optimization pipeline.
+ // The type checking is for run-time efficiency. We can avoid wasting time
+ // dispatching to folding functions if there's no chance of matching.
+ if (IsFixedVectorType) {
+ switch (Opcode) {
+ case Instruction::InsertElement:
+ MadeChange |= vectorizeLoadInsert(I);
+ break;
+ case Instruction::ShuffleVector:
+ MadeChange |= widenSubvectorLoad(I);
+ break;
+ case Instruction::Load:
+ MadeChange |= scalarizeLoadExtract(I);
+ break;
+ default:
+ break;
+ }
+ }
+
+ // This transform works with scalable and fixed vectors
+ // TODO: Identify and allow other scalable transforms
+ if (isa<VectorType>(I.getType()))
+ MadeChange |= scalarizeBinopOrCmp(I);
+
+ if (Opcode == Instruction::Store)
+ MadeChange |= foldSingleElementStore(I);
+
+
+ // If this is an early pipeline invocation of this pass, we are done.
+ if (TryEarlyFoldsOnly)
+ return;
+
+ // Otherwise, try folds that improve codegen but may interfere with
+ // early IR canonicalizations.
+ // The type checking is for run-time efficiency. We can avoid wasting time
+ // dispatching to folding functions if there's no chance of matching.
+ if (IsFixedVectorType) {
+ switch (Opcode) {
+ case Instruction::InsertElement:
+ MadeChange |= foldInsExtFNeg(I);
+ break;
+ case Instruction::ShuffleVector:
+ MadeChange |= foldShuffleOfBinops(I);
+ MadeChange |= foldSelectShuffle(I);
+ break;
+ case Instruction::BitCast:
+ MadeChange |= foldBitcastShuf(I);
+ break;
+ }
+ } else {
+ switch (Opcode) {
+ case Instruction::Call:
+ MadeChange |= foldShuffleFromReductions(I);
+ break;
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ MadeChange |= foldExtractExtract(I);
+ break;
+ default:
+ if (Instruction::isBinaryOp(Opcode)) {
+ MadeChange |= foldExtractExtract(I);
+ MadeChange |= foldExtractedCmps(I);
+ }
+ break;
+ }
}
- MadeChange |= scalarizeBinopOrCmp(I);
- MadeChange |= scalarizeLoadExtract(I);
- MadeChange |= foldSingleElementStore(I);
};
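Annotation, not part of the commit: with this restructuring, the early-pipeline invocation (TryEarlyFoldsOnly == true) runs only the folds considered universally beneficial (load widening, load/binop scalarization, and single-element store folding), while the codegen-oriented shuffle folds run only in the late invocation. A sketch of how the flag is expected to be selected when the pass is scheduled; the PassBuilder call sites and the FunctionPassManager name FPM are assumptions, since they are outside this diff:

    // Early in the function simplification pipeline:
    FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true));
    // Late in the optimization pipeline, shortly before codegen:
    FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/false));

The full set of folds can also be exercised directly with "opt -passes=vector-combine".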
+
for (BasicBlock &BB : F) {
// Ignore unreachable basic blocks.
if (!DT.isReachableFromEntry(&BB))
@@ -1664,7 +1865,7 @@ PreservedAnalyses VectorCombinePass::run(Function &F,
TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
AAResults &AA = FAM.getResult<AAManager>(F);
- VectorCombine Combiner(F, TTI, DT, AA, AC, ScalarizationOnly);
+ VectorCombine Combiner(F, TTI, DT, AA, AC, TryEarlyFoldsOnly);
if (!Combiner.run())
return PreservedAnalyses::all();
PreservedAnalyses PA;