From e3b557809604d036af6e00c60f012c2025b59a5e Mon Sep 17 00:00:00 2001
From: Dimitry Andric
Date: Sat, 11 Feb 2023 13:38:04 +0100
Subject: Vendor import of llvm-project main llvmorg-16-init-18548-gb0daacf58f41,
 the last commit before the upstream release/17.x branch was created.

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 339 +++++++++++++++++++-----
 1 file changed, 270 insertions(+), 69 deletions(-)

(limited to 'llvm/lib/Transforms/Vectorize/VectorCombine.cpp')

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index a38936644bd3..2e489757ebc1 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -30,6 +30,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Vectorize.h"
+#include <numeric>
 
 #define DEBUG_TYPE "vector-combine"
 #include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -64,9 +65,9 @@ class VectorCombine {
 public:
   VectorCombine(Function &F, const TargetTransformInfo &TTI,
                 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
-                bool ScalarizationOnly)
+                bool TryEarlyFoldsOnly)
       : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC),
-        ScalarizationOnly(ScalarizationOnly) {}
+        TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
 
   bool run();
 
@@ -78,13 +79,17 @@ private:
   AAResults &AA;
   AssumptionCache &AC;
 
-  /// If true only perform scalarization combines and do not introduce new
+  /// If true, only perform beneficial early IR transforms. Do not introduce new
   /// vector operations.
-  bool ScalarizationOnly;
+  bool TryEarlyFoldsOnly;
 
   InstructionWorklist Worklist;
 
+  // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
+  // parameter. That should be updated to specific sub-classes because the
+  // run loop was changed to dispatch on opcode.
   bool vectorizeLoadInsert(Instruction &I);
+  bool widenSubvectorLoad(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                         ExtractElementInst *Ext1,
                                         unsigned PreferredExtractIndex) const;
@@ -97,6 +102,7 @@ private:
   void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                        Instruction &I);
   bool foldExtractExtract(Instruction &I);
+  bool foldInsExtFNeg(Instruction &I);
   bool foldBitcastShuf(Instruction &I);
   bool scalarizeBinopOrCmp(Instruction &I);
   bool foldExtractedCmps(Instruction &I);
@@ -125,12 +131,32 @@ private:
 };
 } // namespace
 
+static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
+  // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
+  // The widened load may load data from dirty regions or create data races
+  // non-existent in the source.
+  if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
+      Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
+      mustSuppressSpeculation(*Load))
+    return false;
+
+  // We are potentially transforming byte-sized (8-bit) memory accesses, so make
+  // sure we have all of our type-based constraints in place for this target.
+  Type *ScalarTy = Load->getType()->getScalarType();
+  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
+  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
+  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
+      ScalarSize % 8 != 0)
+    return false;
+
+  return true;
+}
+
 bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // Match insert into fixed vector of scalar value.
   // TODO: Handle non-zero insert index.
-  auto *Ty = dyn_cast<FixedVectorType>(I.getType());
   Value *Scalar;
-  if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
+  if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
       !Scalar->hasOneUse())
     return false;
 
@@ -140,40 +166,28 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   if (!HasExtract)
     X = Scalar;
 
-  // Match source value as load of scalar or vector.
-  // Do not vectorize scalar load (widening) if atomic/volatile or under
-  // asan/hwasan/memtag/tsan. The widened load may load data from dirty regions
-  // or create data races non-existent in the source.
   auto *Load = dyn_cast<LoadInst>(X);
-  if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
-      Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
-      mustSuppressSpeculation(*Load))
+  if (!canWidenLoad(Load, TTI))
     return false;
 
-  const DataLayout &DL = I.getModule()->getDataLayout();
-  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
-  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
-
-  unsigned AS = Load->getPointerAddressSpace();
-
-  // We are potentially transforming byte-sized (8-bit) memory accesses, so make
-  // sure we have all of our type-based constraints in place for this target.
   Type *ScalarTy = Scalar->getType();
   uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
   unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
-  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
-      ScalarSize % 8 != 0)
-    return false;
 
   // Check safety of replacing the scalar load with a larger vector load.
   // We use minimal alignment (maximum flexibility) because we only care about
   // the dereferenceable region. When calculating cost and creating a new op,
   // we may use a larger value based on alignment attributes.
+  const DataLayout &DL = I.getModule()->getDataLayout();
+  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
+  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
+
   unsigned MinVecNumElts = MinVectorSize / ScalarSize;
   auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
   unsigned OffsetEltIndex = 0;
   Align Alignment = Load->getAlign();
-  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) {
+  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC,
+                                   &DT)) {
     // It is not safe to load directly from the pointer, but we can still peek
     // through gep offsets and check if it safe to load from a base address with
     // updated alignment. If it is, we can shuffle the element(s) into place
@@ -198,7 +212,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   if (OffsetEltIndex >= MinVecNumElts)
     return false;
 
-  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT))
+  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC,
+                                   &DT))
     return false;
 
   // Update alignment with offset value. Note that the offset could be negated
@@ -211,11 +226,14 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // Use the greater of the alignment on the load or its source pointer.
   Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
   Type *LoadTy = Load->getType();
+  unsigned AS = Load->getPointerAddressSpace();
   InstructionCost OldCost =
       TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
   APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
-  OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
-                                          /* Insert */ true, HasExtract);
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  OldCost +=
+      TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
+                                   /* Insert */ true, HasExtract, CostKind);
 
   // New pattern: load VecPtr
   InstructionCost NewCost =
@@ -227,6 +245,7 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // We assume this operation has no cost in codegen if there was no offset.
   // Note that we could use freeze to avoid poison problems, but then we might
   // still need a shuffle to change the vector size.
+  auto *Ty = cast<FixedVectorType>(I.getType());
   unsigned OutputNumElts = Ty->getNumElements();
   SmallVector<int> Mask(OutputNumElts, UndefMaskElem);
   assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
@@ -252,6 +271,66 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   return true;
 }
 
+/// If we are loading a vector and then inserting it into a larger vector with
+/// undefined elements, try to load the larger vector and eliminate the insert.
+/// This removes a shuffle in IR and may allow combining of other loaded values.
+bool VectorCombine::widenSubvectorLoad(Instruction &I) {
+  // Match subvector insert of fixed vector.
+  auto *Shuf = cast<ShuffleVectorInst>(&I);
+  if (!Shuf->isIdentityWithPadding())
+    return false;
+
+  // Allow a non-canonical shuffle mask that is choosing elements from op1.
+  unsigned NumOpElts =
+      cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
+  unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
+    return M >= (int)(NumOpElts);
+  });
+
+  auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
+  if (!canWidenLoad(Load, TTI))
+    return false;
+
+  // We use minimal alignment (maximum flexibility) because we only care about
+  // the dereferenceable region. When calculating cost and creating a new op,
+  // we may use a larger value based on alignment attributes.
+  auto *Ty = cast<FixedVectorType>(I.getType());
+  const DataLayout &DL = I.getModule()->getDataLayout();
+  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
+  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
+  Align Alignment = Load->getAlign();
+  if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), DL, Load, &AC, &DT))
+    return false;
+
+  Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
+  Type *LoadTy = Load->getType();
+  unsigned AS = Load->getPointerAddressSpace();
+
+  // Original pattern: insert_subvector (load PtrOp)
+  // This conservatively assumes that the cost of a subvector insert into an
+  // undef value is 0. We could add that cost if the cost model accurately
+  // reflects the real cost of that operation.
+  InstructionCost OldCost =
+      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
+
+  // New pattern: load PtrOp
+  InstructionCost NewCost =
+      TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS);
+
+  // We can aggressively convert to the vector form because the backend can
+  // invert this transform if it does not result in a performance win.
+  if (OldCost < NewCost || !NewCost.isValid())
+    return false;
+
+  IRBuilder<> Builder(Load);
+  Value *CastedPtr =
+      Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Ty->getPointerTo(AS));
+  Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
+  replaceValue(I, *VecLd);
+  ++NumVecLoad;
+  return true;
+}
+
 /// Determine which, if any, of the inputs should be replaced by a shuffle
 /// followed by extract from a different index.
 ExtractElementInst *VectorCombine::getShuffleExtract(
@@ -269,11 +348,12 @@ ExtractElementInst *VectorCombine::getShuffleExtract(
     return nullptr;
 
   Type *VecTy = Ext0->getVectorOperand()->getType();
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
   InstructionCost Cost0 =
-      TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
+      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
   InstructionCost Cost1 =
-      TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+      TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
 
   // If both costs are invalid no shuffle is needed
   if (!Cost0.isValid() && !Cost1.isValid())
@@ -336,11 +416,12 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
   // both sequences.
   unsigned Ext0Index = Ext0IndexC->getZExtValue();
   unsigned Ext1Index = Ext1IndexC->getZExtValue();
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
   InstructionCost Extract0Cost =
-      TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
+      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
   InstructionCost Extract1Cost =
-      TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index);
+      TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
 
   // A more expensive extract will always be replaced by a splat shuffle.
   // For example, if Ext0 is more expensive:
@@ -533,6 +614,69 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
   return true;
 }
 
+/// Try to replace an extract + scalar fneg + insert with a vector fneg +
+/// shuffle.
+bool VectorCombine::foldInsExtFNeg(Instruction &I) {
+  // Match an insert (op (extract)) pattern.
+  Value *DestVec;
+  uint64_t Index;
+  Instruction *FNeg;
+  if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
+                             m_ConstantInt(Index))))
+    return false;
+
+  // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
+  Value *SrcVec;
+  Instruction *Extract;
+  if (!match(FNeg, m_FNeg(m_CombineAnd(
+                       m_Instruction(Extract),
+                       m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
+    return false;
+
+  // TODO: We could handle this with a length-changing shuffle.
+  auto *VecTy = cast<FixedVectorType>(I.getType());
+  if (SrcVec->getType() != VecTy)
+    return false;
+
+  // Ignore bogus insert/extract index.
+  unsigned NumElts = VecTy->getNumElements();
+  if (Index >= NumElts)
+    return false;
+
+  // We are inserting the negated element into the same lane that we extracted
+  // from. This is equivalent to a select-shuffle that chooses all but the
+  // negated element from the destination vector.
+  SmallVector<int> Mask(NumElts);
+  std::iota(Mask.begin(), Mask.end(), 0);
+  Mask[Index] = Index + NumElts;
+
+  Type *ScalarTy = VecTy->getScalarType();
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost OldCost =
+      TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
+      TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
+
+  // If the extract has one use, it will be eliminated, so count it in the
+  // original cost. If it has more than one use, ignore the cost because it will
+  // be the same before/after.
+  if (Extract->hasOneUse())
+    OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
+
+  InstructionCost NewCost =
+      TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) +
+      TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);
+
+  if (NewCost > OldCost)
+    return false;
+
+  // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
+  // shuffle DestVec, (fneg SrcVec), Mask
+  Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
+  Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
+  replaceValue(I, *Shuf);
+  return true;
+}
+
 /// If this is a bitcast of a shuffle, try to bitcast the source vector to the
 /// destination type followed by shuffle. This can enable further transforms by
 /// moving bitcasts or shuffles together.
@@ -548,11 +692,11 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
   //    mask for scalable type is a splat or not.
   // 2) Disallow non-vector casts and length-changing shuffles.
   // TODO: We could allow any shuffle.
-  auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
   auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
-  if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy)
+  if (!SrcTy || I.getOperand(0)->getType() != SrcTy)
     return false;
 
+  auto *DestTy = cast<FixedVectorType>(I.getType());
   unsigned DestNumElts = DestTy->getNumElements();
   unsigned SrcNumElts = SrcTy->getNumElements();
   SmallVector<int> NewMask;
@@ -664,8 +808,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
 
   // Get cost estimate for the insert element. This cost will factor into
   // both sequences.
-  InstructionCost InsertCost =
-      TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost InsertCost = TTI.getVectorInstrCost(
+      Instruction::InsertElement, VecTy, CostKind, Index);
   InstructionCost OldCost =
       (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
   InstructionCost NewCost = ScalarOpCost + InsertCost +
@@ -754,9 +899,10 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
   if (!VecTy)
     return false;
 
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost OldCost =
-      TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
-  OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
+  OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
   OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
                                     CmpInst::makeCmpResultType(I0->getType()),
                                     Pred) *
@@ -776,7 +922,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
   NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
                                 ShufMask);
   NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
-  NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);
+  NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
 
   // Aggressively form vector ops if the cost is equal because the transform
   // may enable further optimization.
@@ -811,6 +957,7 @@ static bool isMemModifiedBetween(BasicBlock::iterator Begin,
   });
 }
 
+namespace {
 /// Helper class to indicate whether a vector index can be safely scalarized and
 /// if a freeze needs to be inserted.
 class ScalarizationResult {
@@ -865,6 +1012,7 @@ public:
     ToFreeze = nullptr;
   }
 };
+} // namespace
 
 /// Check if it is legal to scalarize a memory access to \p VecTy at index \p
 /// Idx. \p Idx must access a valid vector element.
@@ -928,8 +1076,8 @@ static Align computeAlignmentAfterScalarization(Align VectorAlignment,
 //   %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
 //   store i32 %b, i32* %1
 bool VectorCombine::foldSingleElementStore(Instruction &I) {
-  StoreInst *SI = dyn_cast<StoreInst>(&I);
-  if (!SI || !SI->isSimple() ||
+  auto *SI = cast<StoreInst>(&I);
+  if (!SI->isSimple() ||
       !isa<FixedVectorType>(SI->getValueOperand()->getType()))
     return false;
 
@@ -985,17 +1133,14 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
   if (!match(&I, m_Load(m_Value(Ptr))))
     return false;
 
+  auto *FixedVT = cast<FixedVectorType>(I.getType());
   auto *LI = cast<LoadInst>(&I);
   const DataLayout &DL = I.getModule()->getDataLayout();
-  if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(LI->getType()))
-    return false;
-
-  auto *FixedVT = dyn_cast<FixedVectorType>(LI->getType());
-  if (!FixedVT)
+  if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(FixedVT))
     return false;
 
   InstructionCost OriginalCost =
-      TTI.getMemoryOpCost(Instruction::Load, LI->getType(), LI->getAlign(),
+      TTI.getMemoryOpCost(Instruction::Load, FixedVT, LI->getAlign(),
                           LI->getPointerAddressSpace());
   InstructionCost ScalarizedCost = 0;
 
@@ -1034,8 +1179,9 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
     }
 
     auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
+    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     OriginalCost +=
-        TTI.getVectorInstrCost(Instruction::ExtractElement, LI->getType(),
+        TTI.getVectorInstrCost(Instruction::ExtractElement, FixedVT, CostKind,
                                Index ? Index->getZExtValue() : -1);
     ScalarizedCost += TTI.getMemoryOpCost(Instruction::Load,
                                           FixedVT->getElementType(),
@@ -1070,10 +1216,7 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
 /// Try to convert "shuffle (binop), (binop)" with a shared binop operand into
 /// "binop (shuffle), (shuffle)".
 bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
-  auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
-  if (!VecTy)
-    return false;
-
+  auto *VecTy = cast<FixedVectorType>(I.getType());
   BinaryOperator *B0, *B1;
   ArrayRef<int> Mask;
   if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)),
@@ -1244,15 +1387,14 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
 /// architectures with no obvious "select" shuffle, this can reduce the total
 /// number of operations if the target reports them as cheaper.
 bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
-  auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
-  auto *VT = dyn_cast<FixedVectorType>(I.getType());
-  if (!SVI || !VT)
-    return false;
+  auto *SVI = cast<ShuffleVectorInst>(&I);
+  auto *VT = cast<FixedVectorType>(I.getType());
   auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
   auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
   if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
      VT != Op0->getType())
     return false;
+
   auto *SVI0A = dyn_cast<ShuffleVectorInst>(Op0->getOperand(0));
   auto *SVI0B = dyn_cast<ShuffleVectorInst>(Op0->getOperand(1));
   auto *SVI1A = dyn_cast<ShuffleVectorInst>(Op1->getOperand(0));
   auto *SVI1B = dyn_cast<ShuffleVectorInst>(Op1->getOperand(1));
@@ -1300,7 +1442,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
   // cost calculations.
   if (!FromReduction) {
     for (ShuffleVectorInst *SV : Shuffles) {
-      for (auto U : SV->users()) {
+      for (auto *U : SV->users()) {
         ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
         if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
           Shuffles.push_back(SSV);
@@ -1569,19 +1711,78 @@ bool VectorCombine::run() {
   bool MadeChange = false;
   auto FoldInst = [this, &MadeChange](Instruction &I) {
     Builder.SetInsertPoint(&I);
-    if (!ScalarizationOnly) {
-      MadeChange |= vectorizeLoadInsert(I);
-      MadeChange |= foldExtractExtract(I);
-      MadeChange |= foldBitcastShuf(I);
-      MadeChange |= foldExtractedCmps(I);
-      MadeChange |= foldShuffleOfBinops(I);
-      MadeChange |= foldShuffleFromReductions(I);
-      MadeChange |= foldSelectShuffle(I);
+    bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
+    auto Opcode = I.getOpcode();
+
+    // These folds should be beneficial regardless of when this pass is run
+    // in the optimization pipeline.
+    // The type checking is for run-time efficiency. We can avoid wasting time
+    // dispatching to folding functions if there's no chance of matching.
+    if (IsFixedVectorType) {
+      switch (Opcode) {
+      case Instruction::InsertElement:
+        MadeChange |= vectorizeLoadInsert(I);
+        break;
+      case Instruction::ShuffleVector:
+        MadeChange |= widenSubvectorLoad(I);
+        break;
+      case Instruction::Load:
+        MadeChange |= scalarizeLoadExtract(I);
+        break;
+      default:
+        break;
+      }
+    }
+
+    // This transform works with scalable and fixed vectors
+    // TODO: Identify and allow other scalable transforms
+    if (isa<VectorType>(I.getType()))
+      MadeChange |= scalarizeBinopOrCmp(I);
+
+    if (Opcode == Instruction::Store)
+      MadeChange |= foldSingleElementStore(I);
+
+
+    // If this is an early pipeline invocation of this pass, we are done.
+    if (TryEarlyFoldsOnly)
+      return;
+
+    // Otherwise, try folds that improve codegen but may interfere with
+    // early IR canonicalizations.
+    // The type checking is for run-time efficiency. We can avoid wasting time
+    // dispatching to folding functions if there's no chance of matching.
+    if (IsFixedVectorType) {
+      switch (Opcode) {
+      case Instruction::InsertElement:
+        MadeChange |= foldInsExtFNeg(I);
+        break;
+      case Instruction::ShuffleVector:
+        MadeChange |= foldShuffleOfBinops(I);
+        MadeChange |= foldSelectShuffle(I);
+        break;
+      case Instruction::BitCast:
+        MadeChange |= foldBitcastShuf(I);
+        break;
+      }
+    } else {
+      switch (Opcode) {
+      case Instruction::Call:
+        MadeChange |= foldShuffleFromReductions(I);
+        break;
+      case Instruction::ICmp:
+      case Instruction::FCmp:
+        MadeChange |= foldExtractExtract(I);
+        break;
+      default:
+        if (Instruction::isBinaryOp(Opcode)) {
+          MadeChange |= foldExtractExtract(I);
+          MadeChange |= foldExtractedCmps(I);
+        }
+        break;
+      }
     }
-    MadeChange |= scalarizeBinopOrCmp(I);
-    MadeChange |= scalarizeLoadExtract(I);
-    MadeChange |= foldSingleElementStore(I);
   };
+
   for (BasicBlock &BB : F) {
     // Ignore unreachable basic blocks.
     if (!DT.isReachableFromEntry(&BB))
@@ -1664,7 +1865,7 @@ PreservedAnalyses VectorCombinePass::run(Function &F,
   TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
   DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
   AAResults &AA = FAM.getResult<AAManager>(F);
-  VectorCombine Combiner(F, TTI, DT, AA, AC, ScalarizationOnly);
+  VectorCombine Combiner(F, TTI, DT, AA, AC, TryEarlyFoldsOnly);
   if (!Combiner.run())
     return PreservedAnalyses::all();
   PreservedAnalyses PA;
-- 
cgit v1.2.3