author     Dimitry Andric <dim@FreeBSD.org>   2019-10-23 17:51:42 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2019-10-23 17:51:42 +0000
commit     1d5ae1026e831016fc29fd927877c86af904481f
tree       2cdfd12620fcfa5d9e4a0389f85368e8e36f63f9 /lib/Transforms/Vectorize
parent     e6d1592492a3a379186bfb02bd0f4eda0669c0d5
Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--  lib/Transforms/Vectorize/LoadStoreVectorizer.cpp        |  26
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorizationLegality.cpp  | 186
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorizationPlanner.h     |   4
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp              | 738
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp              | 820
-rw-r--r--  lib/Transforms/Vectorize/VPlan.cpp                      |  19
-rw-r--r--  lib/Transforms/Vectorize/VPlan.h                        |   4
-rw-r--r--  lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp        |   2
-rw-r--r--  lib/Transforms/Vectorize/VPlanSLP.cpp                   |  13
9 files changed, 1143 insertions, 669 deletions
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 4273080ddd91..f44976c723ec 100644 --- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -147,7 +147,7 @@ private: static const unsigned MaxDepth = 3; bool isConsecutiveAccess(Value *A, Value *B); - bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta, + bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta, unsigned Depth = 0) const; bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta, unsigned Depth) const; @@ -336,14 +336,29 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { } bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, - const APInt &PtrDelta, - unsigned Depth) const { + APInt PtrDelta, unsigned Depth) const { unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType()); APInt OffsetA(PtrBitWidth, 0); APInt OffsetB(PtrBitWidth, 0); PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); + + if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) + return false; + + // In case if we have to shrink the pointer + // stripAndAccumulateInBoundsConstantOffsets should properly handle a + // possible overflow and the value should fit into a smallest data type + // used in the cast/gep chain. + assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth && + OffsetB.getMinSignedBits() <= NewPtrBitWidth); + + OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); + OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); + PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth); + APInt OffsetDelta = OffsetB - OffsetA; // Check if they are based on the same pointer. That makes the offsets @@ -650,7 +665,7 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { // We can ignore the alias if the we have a load store pair and the load // is known to be invariant. The load cannot be clobbered by the store. auto IsInvariantLoad = [](const LoadInst *LI) -> bool { - return LI->getMetadata(LLVMContext::MD_invariant_load); + return LI->hasMetadata(LLVMContext::MD_invariant_load); }; // We can ignore the alias as long as the load comes before the store, @@ -1077,7 +1092,7 @@ bool Vectorizer::vectorizeLoadChain( LoadInst *L0 = cast<LoadInst>(Chain[0]); // If the vector has an int element, default to int for the whole load. - Type *LoadTy; + Type *LoadTy = nullptr; for (const auto &V : Chain) { LoadTy = cast<LoadInst>(V)->getType(); if (LoadTy->isIntOrIntVectorTy()) @@ -1089,6 +1104,7 @@ bool Vectorizer::vectorizeLoadChain( break; } } + assert(LoadTy && "Can't determine LoadInst type from chain"); unsigned Sz = DL.getTypeSizeInBits(LoadTy); unsigned AS = L0->getPointerAddressSpace(); diff --git a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 6ef8dc2d3cd7..f43842be5357 100644 --- a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -13,7 +13,10 @@ // pass. It should be easy to create an analysis pass around it if there // is a need (but D45420 needs to happen first). 
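The LoadStoreVectorizer hunk above changes areConsecutivePointers to take PtrDelta by value so the delta can be narrowed once both pointers have been stripped to a (possibly smaller) index width. A standalone sketch of that shape, in plain C++ with illustrative names standing in for the LLVM APInt machinery:

    #include <cstdint>

    struct StrippedPtr {
      const void *Base;   // underlying object after stripping casts/GEPs
      int64_t Offset;     // accumulated constant byte offset
      unsigned BitWidth;  // index width of the stripped pointer type (1..64)
    };

    // Sign-extend V from the low Bits bits, mimicking what
    // APInt::sextOrTrunc does for narrow widths in the patch.
    static int64_t signExtendFrom(int64_t V, unsigned Bits) {
      uint64_t Mask = (Bits >= 64) ? ~0ULL : ((1ULL << Bits) - 1);
      uint64_t U = static_cast<uint64_t>(V) & Mask;
      uint64_t Sign = 1ULL << (Bits - 1);
      return static_cast<int64_t>((U ^ Sign) - Sign);
    }

    bool areConsecutiveSketch(StrippedPtr A, StrippedPtr B, int64_t PtrDelta) {
      if (A.BitWidth != B.BitWidth)
        return false; // mixed index widths: bail out, as the patch does
      int64_t OffA = signExtendFrom(A.Offset, A.BitWidth);
      int64_t OffB = signExtendFrom(B.Offset, B.BitWidth);
      int64_t Delta = signExtendFrom(PtrDelta, A.BitWidth);
      // Same underlying object: consecutiveness reduces to offset arithmetic.
      return A.Base == B.Base && OffB - OffA == Delta;
    }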
// +#include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IntrinsicInst.h" @@ -47,38 +50,6 @@ static const unsigned MaxInterleaveFactor = 16; namespace llvm { -#ifndef NDEBUG -static void debugVectorizationFailure(const StringRef DebugMsg, - Instruction *I) { - dbgs() << "LV: Not vectorizing: " << DebugMsg; - if (I != nullptr) - dbgs() << " " << *I; - else - dbgs() << '.'; - dbgs() << '\n'; -} -#endif - -OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName, - StringRef RemarkName, - Loop *TheLoop, - Instruction *I) { - Value *CodeRegion = TheLoop->getHeader(); - DebugLoc DL = TheLoop->getStartLoc(); - - if (I) { - CodeRegion = I->getParent(); - // If there is no debug location attached to the instruction, revert back to - // using the loop's. - if (I->getDebugLoc()) - DL = I->getDebugLoc(); - } - - OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); - R << "loop not vectorized: "; - return R; -} - bool LoopVectorizeHints::Hint::validate(unsigned Val) { switch (Kind) { case HK_WIDTH: @@ -88,6 +59,7 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { case HK_FORCE: return (Val <= 1); case HK_ISVECTORIZED: + case HK_PREDICATE: return (Val == 0 || Val == 1); } return false; @@ -99,7 +71,9 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), - IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) { + IsVectorized("isvectorized", 0, HK_ISVECTORIZED), + Predicate("vectorize.predicate.enable", 0, HK_PREDICATE), TheLoop(L), + ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); @@ -250,7 +224,7 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { return; unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized}; + Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate}; for (auto H : Hints) { if (Name == H->Name) { if (H->validate(Val)) @@ -435,7 +409,8 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { const ValueToValueMap &Strides = getSymbolicStrides() ? 
*getSymbolicStrides() : ValueToValueMap(); - int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false); + bool CanAddPredicate = !TheLoop->getHeader()->getParent()->hasOptSize(); + int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false); if (Stride == 1 || Stride == -1) return Stride; return 0; @@ -445,14 +420,6 @@ bool LoopVectorizationLegality::isUniform(Value *V) { return LAI->isUniform(V); } -void LoopVectorizationLegality::reportVectorizationFailure( - const StringRef DebugMsg, const StringRef OREMsg, - const StringRef ORETag, Instruction *I) const { - LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); - ORE->emit(createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), - ORETag, TheLoop, I) << OREMsg); -} - bool LoopVectorizationLegality::canVectorizeOuterLoop() { assert(!TheLoop->empty() && "We are not vectorizing an outer loop."); // Store the result and return it at the end instead of exiting early, in case @@ -467,7 +434,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { if (!Br) { reportVectorizationFailure("Unsupported basic block terminator", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -486,7 +453,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { !LI->isLoopHeader(Br->getSuccessor(1))) { reportVectorizationFailure("Unsupported conditional branch", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -500,7 +467,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { TheLoop /*context outer loop*/)) { reportVectorizationFailure("Outer loop contains divergent loops", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -511,7 +478,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { if (!setupOuterLoopInductions()) { reportVectorizationFailure("Unsupported outer loop Phi(s)", "Unsupported outer loop Phi(s)", - "UnsupportedPhi"); + "UnsupportedPhi", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -618,7 +585,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { !PhiTy->isPointerTy()) { reportVectorizationFailure("Found a non-int non-pointer PHI", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); return false; } @@ -631,6 +598,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Unsafe cyclic dependencies with header phis are identified during // legalization for reduction, induction and first order // recurrences. 
+ AllowedExit.insert(&I); continue; } @@ -638,7 +606,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (Phi->getNumIncomingValues() != 2) { reportVectorizationFailure("Found an invalid PHI", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood", Phi); + "CFGNotUnderstood", ORE, TheLoop, Phi); return false; } @@ -690,7 +658,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { reportVectorizationFailure("Found an unidentified PHI", "value that could not be identified as " "reduction is used outside the loop", - "NonReductionValueUsedOutsideLoop", Phi); + "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi); return false; } // end of PHI handling @@ -721,11 +689,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { "library call cannot be vectorized. " "Try compiling with -fno-math-errno, -ffast-math, " "or similar flags", - "CantVectorizeLibcall", CI); + "CantVectorizeLibcall", ORE, TheLoop, CI); } else { reportVectorizationFailure("Found a non-intrinsic callsite", "call instruction cannot be vectorized", - "CantVectorizeLibcall", CI); + "CantVectorizeLibcall", ORE, TheLoop, CI); } return false; } @@ -740,7 +708,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) { reportVectorizationFailure("Found unvectorizable intrinsic", "intrinsic instruction cannot be vectorized", - "CantVectorizeIntrinsic", CI); + "CantVectorizeIntrinsic", ORE, TheLoop, CI); return false; } } @@ -753,7 +721,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { isa<ExtractElementInst>(I)) { reportVectorizationFailure("Found unvectorizable type", "instruction return type cannot be vectorized", - "CantVectorizeInstructionReturnType", &I); + "CantVectorizeInstructionReturnType", ORE, TheLoop, &I); return false; } @@ -763,7 +731,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!VectorType::isValidElementType(T)) { reportVectorizationFailure("Store instruction cannot be vectorized", "store instruction cannot be vectorized", - "CantVectorizeStore", ST); + "CantVectorizeStore", ORE, TheLoop, ST); return false; } @@ -773,12 +741,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Arbitrarily try a vector of 2 elements. Type *VecTy = VectorType::get(T, /*NumElements=*/2); assert(VecTy && "did not find vectorized version of stored type"); - unsigned Alignment = getLoadStoreAlignment(ST); - if (!TTI->isLegalNTStore(VecTy, Alignment)) { + const MaybeAlign Alignment = getLoadStoreAlignment(ST); + assert(Alignment && "Alignment should be set"); + if (!TTI->isLegalNTStore(VecTy, *Alignment)) { reportVectorizationFailure( "nontemporal store instruction cannot be vectorized", "nontemporal store instruction cannot be vectorized", - "CantVectorizeNontemporalStore", ST); + "CantVectorizeNontemporalStore", ORE, TheLoop, ST); return false; } } @@ -789,12 +758,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // supported on the target (arbitrarily try a vector of 2 elements). 
Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2); assert(VecTy && "did not find vectorized version of load type"); - unsigned Alignment = getLoadStoreAlignment(LD); - if (!TTI->isLegalNTLoad(VecTy, Alignment)) { + const MaybeAlign Alignment = getLoadStoreAlignment(LD); + assert(Alignment && "Alignment should be set"); + if (!TTI->isLegalNTLoad(VecTy, *Alignment)) { reportVectorizationFailure( "nontemporal load instruction cannot be vectorized", "nontemporal load instruction cannot be vectorized", - "CantVectorizeNontemporalLoad", LD); + "CantVectorizeNontemporalLoad", ORE, TheLoop, LD); return false; } } @@ -823,7 +793,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } reportVectorizationFailure("Value cannot be used outside the loop", "value cannot be used outside the loop", - "ValueUsedOutsideLoop", &I); + "ValueUsedOutsideLoop", ORE, TheLoop, &I); return false; } } // next instr. @@ -833,12 +803,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (Inductions.empty()) { reportVectorizationFailure("Did not find one integer induction var", "loop induction variable could not be identified", - "NoInductionVariable"); + "NoInductionVariable", ORE, TheLoop); return false; } else if (!WidestIndTy) { reportVectorizationFailure("Did not find one integer induction var", "integer loop induction variable could not be identified", - "NoIntegerInductionVariable"); + "NoIntegerInductionVariable", ORE, TheLoop); return false; } else { LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); @@ -869,7 +839,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { reportVectorizationFailure("Stores to a uniform address", "write to a loop invariant address could not be vectorized", - "CantVectorizeStoreToLoopInvariantAddress"); + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); return false; } Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); @@ -905,7 +875,7 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { } bool LoopVectorizationLegality::blockCanBePredicated( - BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) { + BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, bool PreserveGuards) { const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); for (Instruction &I : *BB) { @@ -924,7 +894,7 @@ bool LoopVectorizationLegality::blockCanBePredicated( // !llvm.mem.parallel_loop_access implies if-conversion safety. // Otherwise, record that the load needs (real or emulated) masking // and let the cost model decide. - if (!IsAnnotatedParallel) + if (!IsAnnotatedParallel || PreserveGuards) MaskedOp.insert(LI); continue; } @@ -953,23 +923,41 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) { reportVectorizationFailure("If-conversion is disabled", "if-conversion is disabled", - "IfConversionDisabled"); + "IfConversionDisabled", + ORE, TheLoop); return false; } assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); - // A list of pointers that we can safely read and write to. + // A list of pointers which are known to be dereferenceable within scope of + // the loop body for each iteration of the loop which executes. That is, + // the memory pointed to can be dereferenced (with the access size implied by + // the value's type) unconditionally within the loop header without + // introducing a new fault. SmallPtrSet<Value *, 8> SafePointes; // Collect safe addresses. 
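The rewritten collection loop that follows distinguishes two cases: in unconditionally executed blocks every accessed pointer is safe, while in predicated blocks only loads proven dereferenceable for the whole loop qualify. A condensed sketch of that control flow, with hypothetical stand-in types (DerefInLoop abstracts isDereferenceableAndAlignedInLoop):

    #include <set>
    #include <vector>

    struct Inst { const void *Ptr = nullptr; bool IsLoad = false; };
    struct Block { bool NeedsPredication; std::vector<Inst *> Insts; };

    std::set<const void *>
    collectSafePointers(const std::vector<Block> &Blocks,
                        bool (*DerefInLoop)(const Inst *)) {
      std::set<const void *> Safe;
      for (const Block &BB : Blocks) {
        if (!BB.NeedsPredication) {
          // Unconditionally executed: every accessed pointer is safe as-is.
          for (const Inst *I : BB.Insts)
            if (I->Ptr)
              Safe.insert(I->Ptr);
          continue;
        }
        // Predicated block: only loads proven dereferenceable across the
        // whole loop may be speculated; stores stay out due to concurrency.
        for (const Inst *I : BB.Insts)
          if (I->IsLoad && DerefInLoop(I))
            Safe.insert(I->Ptr);
      }
      return Safe;
    }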
for (BasicBlock *BB : TheLoop->blocks()) { - if (blockNeedsPredication(BB)) + if (!blockNeedsPredication(BB)) { + for (Instruction &I : *BB) + if (auto *Ptr = getLoadStorePointerOperand(&I)) + SafePointes.insert(Ptr); continue; + } - for (Instruction &I : *BB) - if (auto *Ptr = getLoadStorePointerOperand(&I)) - SafePointes.insert(Ptr); + // For a block which requires predication, a address may be safe to access + // in the loop w/o predication if we can prove dereferenceability facts + // sufficient to ensure it'll never fault within the loop. For the moment, + // we restrict this to loads; stores are more complicated due to + // concurrency restrictions. + ScalarEvolution &SE = *PSE.getSE(); + for (Instruction &I : *BB) { + LoadInst *LI = dyn_cast<LoadInst>(&I); + if (LI && !mustSuppressSpeculation(*LI) && + isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT)) + SafePointes.insert(LI->getPointerOperand()); + } } // Collect the blocks that need predication. @@ -979,7 +967,8 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!isa<BranchInst>(BB->getTerminator())) { reportVectorizationFailure("Loop contains a switch statement", "loop contains a switch statement", - "LoopContainsSwitch", BB->getTerminator()); + "LoopContainsSwitch", ORE, TheLoop, + BB->getTerminator()); return false; } @@ -989,14 +978,16 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", - "NoCFGForSelect", BB->getTerminator()); + "NoCFGForSelect", ORE, TheLoop, + BB->getTerminator()); return false; } } else if (BB != Header && !canIfConvertPHINodes(BB)) { reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", - "NoCFGForSelect", BB->getTerminator()); + "NoCFGForSelect", ORE, TheLoop, + BB->getTerminator()); return false; } } @@ -1026,7 +1017,7 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, if (!Lp->getLoopPreheader()) { reportVectorizationFailure("Loop doesn't have a legal pre-header", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1037,7 +1028,7 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, if (Lp->getNumBackEdges() != 1) { reportVectorizationFailure("The loop must have a single backedge", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1048,7 +1039,7 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, if (!Lp->getExitingBlock()) { reportVectorizationFailure("The loop must have an exiting block", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1061,7 +1052,7 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, if (Lp->getExitingBlock() != Lp->getLoopLatch()) { reportVectorizationFailure("The exiting block is not the loop latch", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1124,7 +1115,8 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { if (!canVectorizeOuterLoop()) { reportVectorizationFailure("Unsupported outer loop", "unsupported outer loop", - 
"UnsupportedOuterLoop"); + "UnsupportedOuterLoop", + ORE, TheLoop); // TODO: Implement DoExtraAnalysis when subsequent legal checks support // outer loops. return false; @@ -1176,7 +1168,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { reportVectorizationFailure("Too many SCEV checks needed", "Too many SCEV assumptions need to be made and checked at runtime", - "TooManySCEVRunTimeChecks"); + "TooManySCEVRunTimeChecks", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1190,7 +1182,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { return Result; } -bool LoopVectorizationLegality::canFoldTailByMasking() { +bool LoopVectorizationLegality::prepareToFoldTailByMasking() { LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n"); @@ -1199,22 +1191,21 @@ bool LoopVectorizationLegality::canFoldTailByMasking() { "No primary induction, cannot fold tail by masking", "Missing a primary induction variable in the loop, which is " "needed in order to fold tail by masking as required.", - "NoPrimaryInduction"); + "NoPrimaryInduction", ORE, TheLoop); return false; } - // TODO: handle reductions when tail is folded by masking. - if (!Reductions.empty()) { - reportVectorizationFailure( - "Loop has reductions, cannot fold tail by masking", - "Cannot fold tail by masking in the presence of reductions.", - "ReductionFoldingTailByMasking"); - return false; - } + SmallPtrSet<const Value *, 8> ReductionLiveOuts; - // TODO: handle outside users when tail is folded by masking. + for (auto &Reduction : *getReductionVars()) + ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr()); + + // TODO: handle non-reduction outside users when tail is folded by masking. for (auto *AE : AllowedExit) { - // Check that all users of allowed exit values are inside the loop. + // Check that all users of allowed exit values are inside the loop or + // are the live-out of a reduction. + if (ReductionLiveOuts.count(AE)) + continue; for (User *U : AE->users()) { Instruction *UI = cast<Instruction>(U); if (TheLoop->contains(UI)) @@ -1222,7 +1213,7 @@ bool LoopVectorizationLegality::canFoldTailByMasking() { reportVectorizationFailure( "Cannot fold tail by masking, loop has an outside user for", "Cannot fold tail by masking in the presence of live outs.", - "LiveOutFoldingTailByMasking", UI); + "LiveOutFoldingTailByMasking", ORE, TheLoop, UI); return false; } } @@ -1233,11 +1224,12 @@ bool LoopVectorizationLegality::canFoldTailByMasking() { // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. 
for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers)) { + if (!blockCanBePredicated(BB, SafePointers, /* MaskAllLoads= */ true)) { reportVectorizationFailure( "Cannot fold tail by masking as required", "control flow cannot be substituted for a select", - "NoCFGForSelect", BB->getTerminator()); + "NoCFGForSelect", ORE, TheLoop, + BB->getTerminator()); return false; } } diff --git a/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 97077cce83e3..a5e85f27fabf 100644 --- a/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -228,11 +228,11 @@ public: /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional<VectorizationFactor> plan(bool OptForSize, unsigned UserVF); + Optional<VectorizationFactor> plan(unsigned UserVF); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF); + VectorizationFactor planInVPlanNativePath(unsigned UserVF); /// Finalize the best decision and dispose of all other VPlans. void setBestPlan(unsigned VF, unsigned UF); diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 46265e3f3e13..8f0bf70f873c 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -177,6 +177,14 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); +// Indicates that an epilogue is undesired, predication is preferred. +// This means that the vectorizer will try to fold the loop-tail (epilogue) +// into the loop and predicate the loop body accordingly. +static cl::opt<bool> PreferPredicateOverEpilog( + "prefer-predicate-over-epilog", cl::init(false), cl::Hidden, + cl::desc("Indicate that an epilogue is undesired, predication should be " + "used instead.")); + static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -347,6 +355,29 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { : ConstantFP::get(Ty, C); } +/// Returns "best known" trip count for the specified loop \p L as defined by +/// the following procedure: +/// 1) Returns exact trip count if it is known. +/// 2) Returns expected trip count according to profile data if any. +/// 3) Returns upper bound estimate if it is known. +/// 4) Returns None if all of the above failed. +static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { + // Check if exact trip count is known. + if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) + return ExpectedTC; + + // Check if there is an expected trip count available from profile data. + if (LoopVectorizeWithBlockFrequency) + if (auto EstimatedTC = getLoopEstimatedTripCount(L)) + return EstimatedTC; + + // Check if upper bound estimate is known. 
+ if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) + return ExpectedTC; + + return None; +} + namespace llvm { /// InnerLoopVectorizer vectorizes loops which contain only one basic @@ -795,6 +826,59 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) B.SetCurrentDebugLocation(DebugLoc()); } +/// Write a record \p DebugMsg about vectorization failure to the debug +/// output stream. If \p I is passed, it is an instruction that prevents +/// vectorization. +#ifndef NDEBUG +static void debugVectorizationFailure(const StringRef DebugMsg, + Instruction *I) { + dbgs() << "LV: Not vectorizing: " << DebugMsg; + if (I != nullptr) + dbgs() << " " << *I; + else + dbgs() << '.'; + dbgs() << '\n'; +} +#endif + +/// Create an analysis remark that explains why vectorization failed +/// +/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p +/// RemarkName is the identifier for the remark. If \p I is passed it is an +/// instruction that prevents vectorization. Otherwise \p TheLoop is used for +/// the location of the remark. \return the remark object that can be +/// streamed to. +static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, + StringRef RemarkName, Loop *TheLoop, Instruction *I) { + Value *CodeRegion = TheLoop->getHeader(); + DebugLoc DL = TheLoop->getStartLoc(); + + if (I) { + CodeRegion = I->getParent(); + // If there is no debug location attached to the instruction, revert back to + // using the loop's. + if (I->getDebugLoc()) + DL = I->getDebugLoc(); + } + + OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); + R << "loop not vectorized: "; + return R; +} + +namespace llvm { + +void reportVectorizationFailure(const StringRef DebugMsg, + const StringRef OREMsg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { + LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); + LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); + ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), + ORETag, TheLoop, I) << OREMsg); +} + +} // end namespace llvm + #ifndef NDEBUG /// \return string containing a file name and a line # for the given loop. static std::string getDebugLocString(const Loop *L) { @@ -836,6 +920,26 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, namespace llvm { +// Loop vectorization cost-model hints how the scalar epilogue loop should be +// lowered. +enum ScalarEpilogueLowering { + + // The default: allowing scalar epilogues. + CM_ScalarEpilogueAllowed, + + // Vectorization with OptForSize: don't allow epilogues. + CM_ScalarEpilogueNotAllowedOptSize, + + // A special case of vectorisation with OptForSize: loops with a very small + // trip count are considered for vectorization under OptForSize, thereby + // making sure the cost of their loop body is dominant, free of runtime + // guards and scalar iteration overheads. + CM_ScalarEpilogueNotAllowedLowTripLoop, + + // Loop hint predicate indicating an epilogue is undesired. + CM_ScalarEpilogueNotNeededUsePredicate +}; + /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. /// In many cases vectorization is not profitable. This can happen because of @@ -845,20 +949,26 @@ namespace llvm { /// different operations. 
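The doc comment on the new getSmallBestKnownTC helper describes a strict preference order; here is a minimal standalone rendering of that chain, with std::optional in place of llvm::Optional and the three queries abstracted as callables (names are illustrative only):

    #include <functional>
    #include <optional>

    std::optional<unsigned>
    smallBestKnownTC(std::function<unsigned()> ExactTC,  // returns 0 if unknown
                     std::function<std::optional<unsigned>()> ProfileTC,
                     std::function<unsigned()> MaxTC,    // returns 0 if unknown
                     bool UseProfile) {
      if (unsigned TC = ExactTC())
        return TC;                     // 1) exact trip count
      if (UseProfile)
        if (auto TC = ProfileTC())
          return TC;                   // 2) profile-based estimate
      if (unsigned TC = MaxTC())
        return TC;                     // 3) known upper bound
      return std::nullopt;             // 4) nothing known
    }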
class LoopVectorizationCostModel { public: - LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, - LoopInfo *LI, LoopVectorizationLegality *Legal, + LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, + PredicatedScalarEvolution &PSE, LoopInfo *LI, + LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI) - : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), - AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} + : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), + TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), + Hints(Hints), InterleaveInfo(IAI) {} /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional<unsigned> computeMaxVF(bool OptForSize); + Optional<unsigned> computeMaxVF(); + + /// \return True if runtime checks are required for vectorization, and false + /// otherwise. + bool runtimeChecksRequired(); /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to MaxVF. If UserVF is not ZERO @@ -881,8 +991,7 @@ public: /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(bool OptForSize, unsigned VF, - unsigned LoopCost); + unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -897,10 +1006,11 @@ public: /// of a loop. struct RegisterUsage { /// Holds the number of loop invariant values that are used in the loop. - unsigned LoopInvariantRegs; - + /// The key is ClassID of target-provided register class. + SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; /// Holds the maximum number of concurrent live intervals in the loop. - unsigned MaxLocalUsers; + /// The key is ClassID of target-provided register class. + SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; }; /// \return Returns information about the register usages of the loop for the @@ -1080,14 +1190,16 @@ public: /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedStore(Type *DataType, Value *Ptr) { - return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType); + bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + return Legal->isConsecutivePtr(Ptr) && + TTI.isLegalMaskedStore(DataType, Alignment); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. 
- bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { - return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType); + bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + return Legal->isConsecutivePtr(Ptr) && + TTI.isLegalMaskedLoad(DataType, Alignment); } /// Returns true if the target machine supports masked scatter operation @@ -1157,11 +1269,14 @@ public: /// to handle accesses with gaps, and there is nothing preventing us from /// creating a scalar epilogue. bool requiresScalarEpilogue() const { - return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue(); + return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue(); } - /// Returns true if a scalar epilogue is not allowed due to optsize. - bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; } + /// Returns true if a scalar epilogue is not allowed due to optsize or a + /// loop hint annotation. + bool isScalarEpilogueAllowed() const { + return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; + } /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } @@ -1187,7 +1302,7 @@ private: /// \return An upper bound for the vectorization factor, larger than zero. /// One is returned if vectorization should best be avoided due to cost. - unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount); + unsigned computeFeasibleMaxVF(unsigned ConstTripCount); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -1246,15 +1361,6 @@ private: /// should be used. bool useEmulatedMaskMemRefHack(Instruction *I); - /// Create an analysis remark that explains why vectorization failed - /// - /// \p RemarkName is the identifier for the remark. \return the remark object - /// that can be streamed to. - OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) { - return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), - RemarkName, TheLoop); - } - /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated /// to this type. @@ -1270,13 +1376,13 @@ private: SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; /// Records whether it is allowed to have the original scalar loop execute at - /// least once. This may be needed as a fallback loop in case runtime + /// least once. This may be needed as a fallback loop in case runtime /// aliasing/dependence checks fail, or to handle the tail/remainder /// iterations when the trip count is unknown or doesn't divide by the VF, /// or as a peel-loop to handle gaps in interleave-groups. /// Under optsize and when the trip count is very small we don't allow any /// iterations to execute in the scalar loop. - bool IsScalarEpilogueAllowed = true; + ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. bool FoldTailByMasking = false; @@ -1496,7 +1602,7 @@ struct LoopVectorize : public FunctionPass { auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); @@ -2253,12 +2359,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Type *ScalarDataTy = getMemInstValueType(Instr); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = getLoadStorePointerOperand(Instr); - unsigned Alignment = getLoadStoreAlignment(Instr); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. const DataLayout &DL = Instr->getModule()->getDataLayout(); - if (!Alignment) - Alignment = DL.getABITypeAlignment(ScalarDataTy); + const Align Alignment = + DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); unsigned AddressSpace = getLoadStoreAddressSpace(Instr); // Determine if the pointer operand of the access is either consecutive or @@ -2322,8 +2427,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; Value *VectorGep = getOrCreateVectorValue(Ptr, Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, + Alignment.value(), MaskPart); } else { if (Reverse) { // If we store to reverse consecutive memory locations, then we need @@ -2334,10 +2439,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, } auto *VecPtr = CreateVecPtr(Part, Ptr); if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, - Mask[Part]); + NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, + Alignment.value(), Mask[Part]); else - NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + NewSI = + Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); } addMetadata(NewSI, SI); } @@ -2352,18 +2458,18 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; Value *VectorGep = getOrCreateVectorValue(Ptr, Part); - NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, + NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart, nullptr, "wide.masked.gather"); addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, Ptr); if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], + NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part], UndefValue::get(DataTy), "wide.masked.load"); else - NewLI = - Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(), + "wide.load"); // Add metadata to the load, but setVectorValue to the reverse shuffle. addMetadata(NewLI, LI); @@ -2615,8 +2721,9 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { if (C->isZero()) return; - assert(!Cost->foldTailByMasking() && - "Cannot SCEV check stride or overflow when folding tail"); + assert(!BB->getParent()->hasOptSize() && + "Cannot SCEV check stride or overflow when optimizing for size"); + // Create a new block containing the stride check. 
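The alignment handling above replaces the old "0 means ABI alignment" convention with an explicit optional that is resolved once, up front. A sketch of the fallback, with std::optional standing in for llvm::MaybeAlign:

    #include <cstdint>
    #include <optional>

    using Align = uint64_t; // power-of-two byte alignment

    // Stand-in for the value-or-ABI fallback used here: prefer the alignment
    // recorded on the instruction, else the type's ABI alignment.
    Align valueOrABITypeAlignment(std::optional<Align> InstAlign,
                                  Align ABITypeAlign) {
      return InstAlign ? *InstAlign : ABITypeAlign;
    }
    // e.g. valueOrABITypeAlignment(std::nullopt, 4) == 4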
BB->setName("vector.scevcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); @@ -2649,7 +2756,20 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { if (!MemRuntimeCheck) return; - assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail"); + if (BB->getParent()->hasOptSize()) { + assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && + "Cannot emit memory checks when optimizing for size, unless forced " + "to vectorize."); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", + L->getStartLoc(), L->getHeader()) + << "Code-size may be reduced by not forcing " + "vectorization, or by source-code modifications " + "eliminating the need for runtime checks " + "(e.g., adding 'restrict')."; + }); + } + // Create a new block containing the memory check. BB->setName("vector.memcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); @@ -2666,7 +2786,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. - LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, + LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -3598,6 +3718,26 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { setDebugLocFromInst(Builder, LoopExitInst); + // If tail is folded by masking, the vector value to leave the loop should be + // a Select choosing between the vectorized LoopExitInst and vectorized Phi, + // instead of the former. + if (Cost->foldTailByMasking()) { + for (unsigned Part = 0; Part < UF; ++Part) { + Value *VecLoopExitInst = + VectorLoopValueMap.getVectorValue(LoopExitInst, Part); + Value *Sel = nullptr; + for (User *U : VecLoopExitInst->users()) { + if (isa<SelectInst>(U)) { + assert(!Sel && "Reduction exit feeding two selects"); + Sel = U; + } else + assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); + } + assert(Sel && "Reduction exit feeds no select"); + VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); + } + } + // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. @@ -4064,7 +4204,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { case Instruction::FCmp: { // Widen compares. Generate vector compares. bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = dyn_cast<CmpInst>(&I); + auto *Cmp = cast<CmpInst>(&I); setDebugLocFromInst(Builder, Cmp); for (unsigned Part = 0; Part < UF; ++Part) { Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); @@ -4097,7 +4237,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - auto *CI = dyn_cast<CastInst>(&I); + auto *CI = cast<CastInst>(&I); setDebugLocFromInst(Builder, CI); /// Vectorize casts. @@ -4421,9 +4561,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne "Widening decision should be ready at this moment"); return WideningDecision == CM_Scalarize; } + const MaybeAlign Alignment = getLoadStoreAlignment(I); return isa<LoadInst>(I) ? 
- !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty)) - : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty)); + !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty)) + : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty)); } case Instruction::UDiv: case Instruction::SDiv: @@ -4452,10 +4593,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, // Check if masking is required. // A Group may need masking for one of two reasons: it resides in a block that // needs predication, or it was decided to use masking to deal with gaps. - bool PredicatedAccessRequiresMasking = + bool PredicatedAccessRequiresMasking = Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); - bool AccessWithGapsRequiresMasking = - Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + bool AccessWithGapsRequiresMasking = + Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) return true; @@ -4466,8 +4607,9 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, "Masked interleave-groups for predicated accesses are not enabled."); auto *Ty = getMemInstValueType(I); - return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty) - : TTI.isLegalMaskedStore(Ty); + const MaybeAlign Alignment = getLoadStoreAlignment(I); + return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) + : TTI.isLegalMaskedStore(Ty, Alignment); } bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, @@ -4675,82 +4817,96 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } -Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { - if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { - // TODO: It may by useful to do since it's still likely to be dynamically - // uniform if the target can skip. - LLVM_DEBUG( - dbgs() << "LV: Not inserting runtime ptr check for divergent target"); - - ORE->emit( - createMissedAnalysis("CantVersionLoopWithDivergentTarget") - << "runtime pointer checks needed. Not enabled for divergent target"); - - return None; - } - - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize. - return computeFeasibleMaxVF(OptForSize, TC); +bool LoopVectorizationCostModel::runtimeChecksRequired() { + LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); if (Legal->getRuntimePointerChecking()->Need) { - ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") - << "runtime pointer checks needed. Enable vectorization of this " - "loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os/-Oz"); - LLVM_DEBUG( - dbgs() - << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); - return None; + reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", + "runtime pointer checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz", + "CantVersionLoopWithOptForSize", ORE, TheLoop); + return true; } if (!PSE.getUnionPredicate().getPredicates().empty()) { - ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") - << "runtime SCEV checks needed. 
Enable vectorization of this " - "loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os/-Oz"); - LLVM_DEBUG( - dbgs() - << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n"); - return None; + reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", + "runtime SCEV checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz", + "CantVersionLoopWithOptForSize", ORE, TheLoop); + return true; } // FIXME: Avoid specializing for stride==1 instead of bailing out. if (!Legal->getLAI()->getSymbolicStrides().empty()) { - ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") - << "runtime stride == 1 checks needed. Enable vectorization of " - "this loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os/-Oz"); - LLVM_DEBUG( - dbgs() - << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n"); + reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", + "runtime stride == 1 checks needed. Enable vectorization of " + "this loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz", + "CantVersionLoopWithOptForSize", ORE, TheLoop); + return true; + } + + return false; +} + +Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { + if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { + // TODO: It may by useful to do since it's still likely to be dynamically + // uniform if the target can skip. + reportVectorizationFailure( + "Not inserting runtime ptr check for divergent target", + "runtime pointer checks needed. Not enabled for divergent target", + "CantVersionLoopWithDivergentTarget", ORE, TheLoop); return None; } - // If we optimize the program for size, avoid creating the tail loop. + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - if (TC == 1) { - ORE->emit(createMissedAnalysis("SingleIterationLoop") - << "loop trip count is one, irrelevant for vectorization"); - LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n"); + reportVectorizationFailure("Single iteration (non) loop", + "loop trip count is one, irrelevant for vectorization", + "SingleIterationLoop", ORE, TheLoop); return None; } - // Record that scalar epilogue is not allowed. - LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + switch (ScalarEpilogueStatus) { + case CM_ScalarEpilogueAllowed: + return computeFeasibleMaxVF(TC); + case CM_ScalarEpilogueNotNeededUsePredicate: + LLVM_DEBUG( + dbgs() << "LV: vector predicate hint/switch found.\n" + << "LV: Not allowing scalar epilogue, creating predicated " + << "vector loop.\n"); + break; + case CM_ScalarEpilogueNotAllowedLowTripLoop: + // fallthrough as a special case of OptForSize + case CM_ScalarEpilogueNotAllowedOptSize: + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) + LLVM_DEBUG( + dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + else + LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " + << "count.\n"); + + // Bail if runtime checks are required, which are not good when optimising + // for size. + if (runtimeChecksRequired()) + return None; + break; + } - IsScalarEpilogueAllowed = !OptForSize; + // Now try the tail folding - // We don't create an epilogue when optimizing for size. 
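The new ScalarEpilogueLowering switch above folds what used to be a single OptForSize flag into four explicit states. A condensed sketch of the decision it encodes, with illustrative names (the real code also invalidates interleave groups and then attempts tail folding):

    #include <optional>

    enum class ScalarEpilogueLowering {
      Allowed,
      NotAllowedOptSize,
      NotAllowedLowTripLoop,
      NotNeededUsePredicate,
    };

    // true -> keep a scalar remainder loop; false -> try to fold the tail;
    // std::nullopt -> abandon vectorization up front.
    std::optional<bool> chooseEpiloguePath(ScalarEpilogueLowering S,
                                           bool RuntimeChecksRequired) {
      switch (S) {
      case ScalarEpilogueLowering::Allowed:
        return true;                   // plain epilogue path
      case ScalarEpilogueLowering::NotNeededUsePredicate:
        return false;                  // hint: predicate the vector body
      case ScalarEpilogueLowering::NotAllowedLowTripLoop:
      case ScalarEpilogueLowering::NotAllowedOptSize:
        if (RuntimeChecksRequired)     // checks bloat code size: give up
          return std::nullopt;
        return false;                  // otherwise try folding the tail
      }
      return std::nullopt;             // unreachable; keeps compilers happy
    }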
// Invalidate interleave groups that require an epilogue if we can't mask // the interleave-group. - if (!useMaskedInterleavedAccesses(TTI)) + if (!useMaskedInterleavedAccesses(TTI)) InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); - unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC); - + unsigned MaxVF = computeFeasibleMaxVF(TC); if (TC > 0 && TC % MaxVF == 0) { + // Accept MaxVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); return MaxVF; } @@ -4759,28 +4915,30 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { // found modulo the vectorization factor is not zero, try to fold the tail // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. - if (Legal->canFoldTailByMasking()) { + if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; return MaxVF; } if (TC == 0) { - ORE->emit( - createMissedAnalysis("UnknownLoopCountComplexCFG") - << "unable to calculate the loop count due to complex control flow"); + reportVectorizationFailure( + "Unable to calculate the loop count due to complex control flow", + "unable to calculate the loop count due to complex control flow", + "UnknownLoopCountComplexCFG", ORE, TheLoop); return None; } - ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize") - << "cannot optimize for size and vectorize at the same time. " - "Enable vectorization of this loop with '#pragma clang loop " - "vectorize(enable)' when compiling with -Os/-Oz"); + reportVectorizationFailure( + "Cannot optimize for size and vectorize at the same time.", + "cannot optimize for size and vectorize at the same time. " + "Enable vectorization of this loop with '#pragma clang loop " + "vectorize(enable)' when compiling with -Os/-Oz", + "NoTailLoopWithOptForSize", ORE, TheLoop); return None; } unsigned -LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, - unsigned ConstTripCount) { +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -4818,8 +4976,8 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, } unsigned MaxVF = MaxVectorSize; - if (TTI.shouldMaximizeVectorBandwidth(OptForSize) || - (MaximizeBandwidth && !OptForSize)) { + if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || + (MaximizeBandwidth && isScalarEpilogueAllowed())) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). SmallVector<unsigned, 8> VFs; @@ -4832,9 +4990,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, // Select the largest VF which doesn't require more registers than existing // ones. 
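The VF-selection loop that follows now checks a per-register-class budget instead of a single register count. A sketch of that shape in plain C++, where NumRegs abstracts the target's per-class register count (all names illustrative):

    #include <map>
    #include <vector>

    using RegClass = unsigned;
    using RegUsageMap = std::map<RegClass, unsigned>;

    // Pick the largest candidate VF whose peak usage fits the budget for
    // every register class; fall back to the default otherwise.
    unsigned selectMaxVF(const std::vector<unsigned> &VFs,
                         const std::vector<RegUsageMap> &Usage,
                         unsigned (*NumRegs)(RegClass), unsigned Fallback) {
      for (int i = static_cast<int>(VFs.size()) - 1; i >= 0; --i) {
        bool Fits = true;
        for (const auto &P : Usage[i])
          if (P.second > NumRegs(P.first))
            Fits = false;
        if (Fits)
          return VFs[i];
      }
      return Fallback;
    }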
- unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); for (int i = RUs.size() - 1; i >= 0; --i) { - if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { + bool Selected = true; + for (auto& pair : RUs[i].MaxLocalUsers) { + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); + if (pair.second > TargetNumRegisters) + Selected = false; + } + if (Selected) { MaxVF = VFs[i]; break; } @@ -4886,10 +5049,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { } if (!EnableCondStoresVectorization && NumPredStores) { - ORE->emit(createMissedAnalysis("ConditionalStore") - << "store that is conditionally executed prevents vectorization"); - LLVM_DEBUG( - dbgs() << "LV: No vectorization. There are conditional stores.\n"); + reportVectorizationFailure("There are conditional stores.", + "store that is conditionally executed prevents vectorization", + "ConditionalStore", ORE, TheLoop); Width = 1; Cost = ScalarCost; } @@ -4958,8 +5120,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { return {MinWidth, MaxWidth}; } -unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, - unsigned VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -4975,8 +5136,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - // When we optimize for size, we don't interleave. - if (OptForSize) + if (!isScalarEpilogueAllowed()) return 1; // We used the distance for the interleave count. @@ -4988,22 +5148,12 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, if (TC > 1 && TC < TinyTripCountInterleaveThreshold) return 1; - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); - LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters - << " registers\n"); - - if (VF == 1) { - if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumScalarRegs; - } else { - if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumVectorRegs; - } - RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. - R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); + for (auto& pair : R.MaxLocalUsers) { + pair.second = std::max(pair.second, 1U); + } // We calculate the interleave count using the following formula. // Subtract the number of loop invariants from the number of available @@ -5016,13 +5166,35 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // We also want power of two interleave counts to ensure that the induction // variable of the vector loop wraps to zero, when tail is folded by masking; // this currently happens when OptForSize, in which case IC is set to 1 above. - unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / - R.MaxLocalUsers); + unsigned IC = UINT_MAX; - // Don't count the induction variable as interleaved. 
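The interleave-count formula being generalized here is now evaluated once per register class and the minimum is taken. A worked sketch of the per-class bound, assuming MaxLocalUsers has already been clamped to at least 1 as in the patch:

    #include <algorithm>

    unsigned powerOf2Floor(unsigned X) {
      unsigned R = 1;
      while (R * 2 <= X && R * 2 != 0)
        R *= 2;
      return X ? R : 0;
    }

    // With the induction-variable heuristic on, one register and one local
    // user are reserved for the induction variable.
    unsigned classIC(unsigned TargetNumRegisters, unsigned LoopInvariantRegs,
                     unsigned MaxLocalUsers, bool IndVarHeur) {
      if (IndVarHeur)
        return powerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                             std::max(1u, MaxLocalUsers - 1));
      return powerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                           MaxLocalUsers);
    }
    // e.g. 32 registers, 2 invariants, 5 local users, heuristic on:
    // floor((32 - 2 - 1) / 4) = 7, rounded down to IC = 4.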
- if (EnableIndVarRegisterHeur) - IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / - std::max(1U, (R.MaxLocalUsers - 1))); + for (auto& pair : R.MaxLocalUsers) { + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); + LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters + << " registers of " + << TTI.getRegisterClassName(pair.first) << " register class\n"); + if (VF == 1) { + if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumScalarRegs; + } else { + if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumVectorRegs; + } + unsigned MaxLocalUsers = pair.second; + unsigned LoopInvariantRegs = 0; + if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) + LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; + + unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); + // Don't count the induction variable as interleaved. + if (EnableIndVarRegisterHeur) { + TmpIC = + PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / + std::max(1U, (MaxLocalUsers - 1))); + } + + IC = std::min(IC, TmpIC); + } // Clamp the interleave ranges to reasonable counts. unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); @@ -5036,6 +5208,14 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } + // If the trip count is constant, limit the interleave count to be less than + // the trip count divided by VF. + if (TC > 0) { + assert(TC >= VF && "VF exceeds trip count?"); + if ((TC / VF) < MaxInterleaveCount) + MaxInterleaveCount = (TC / VF); + } + // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) @@ -5044,7 +5224,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, assert(LoopCost && "Non-zero loop cost expected"); // Clamp the calculated IC to be between the 1 and the max interleave count - // that the target allows. + // that the target and trip count allows. if (IC > MaxInterleaveCount) IC = MaxInterleaveCount; else if (IC < 1) @@ -5196,7 +5376,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { const DataLayout &DL = TheFunction->getParent()->getDataLayout(); SmallVector<RegisterUsage, 8> RUs(VFs.size()); - SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); + SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); @@ -5226,21 +5406,45 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { // For each VF find the maximum usage of registers. for (unsigned j = 0, e = VFs.size(); j < e; ++j) { + // Count the number of live intervals. + SmallMapVector<unsigned, unsigned, 4> RegUsage; + if (VFs[j] == 1) { - MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); - continue; + for (auto Inst : OpenIntervals) { + unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = 1; + else + RegUsage[ClassID] += 1; + } + } else { + collectUniformsAndScalars(VFs[j]); + for (auto Inst : OpenIntervals) { + // Skip ignored values for VF > 1. 
+ if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) + continue; + if (isScalarAfterVectorization(Inst, VFs[j])) { + unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = 1; + else + RegUsage[ClassID] += 1; + } else { + unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); + else + RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); + } + } } - collectUniformsAndScalars(VFs[j]); - // Count the number of live intervals. - unsigned RegUsage = 0; - for (auto Inst : OpenIntervals) { - // Skip ignored values for VF > 1. - if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() || - isScalarAfterVectorization(Inst, VFs[j])) - continue; - RegUsage += GetRegUsage(Inst->getType(), VFs[j]); + + for (auto& pair : RegUsage) { + if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) + MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); + else + MaxUsages[j][pair.first] = pair.second; } - MaxUsages[j] = std::max(MaxUsages[j], RegUsage); } LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " @@ -5251,18 +5455,34 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { } for (unsigned i = 0, e = VFs.size(); i < e; ++i) { - unsigned Invariant = 0; - if (VFs[i] == 1) - Invariant = LoopInvariants.size(); - else { - for (auto Inst : LoopInvariants) - Invariant += GetRegUsage(Inst->getType(), VFs[i]); + SmallMapVector<unsigned, unsigned, 4> Invariant; + + for (auto Inst : LoopInvariants) { + unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); + if (Invariant.find(ClassID) == Invariant.end()) + Invariant[ClassID] = Usage; + else + Invariant[ClassID] += Usage; } - LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); - LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); - LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant - << '\n'); + LLVM_DEBUG({ + dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; + dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() + << " item\n"; + for (const auto &pair : MaxUsages[i]) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() + << " item\n"; + for (const auto &pair : Invariant) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + }); RU.LoopInvariantRegs = Invariant; RU.MaxLocalUsers = MaxUsages[i]; @@ -5511,7 +5731,6 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); - unsigned Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); Value *Ptr = getLoadStorePointerOperand(I); Type *PtrTy = ToVectorTy(Ptr->getType(), VF); @@ -5525,9 +5744,9 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. 
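The register-usage bookkeeping above is one small pattern repeated for RegUsage, MaxUsages and Invariant: accumulate counts keyed by register class, then merge with a running per-class maximum. A sketch of just that pattern, with std::map standing in for SmallMapVector:

#include <algorithm>
#include <map>

using ClassUsage = std::map<unsigned, unsigned>; // register class ID -> count

static void addUsage(ClassUsage &U, unsigned ClassID, unsigned N) {
  U[ClassID] += N; // operator[] value-initializes the count on first use
}

static void mergeMax(ClassUsage &Max, const ClassUsage &U) {
  for (const auto &P : U)
    Max[P.first] = std::max(Max[P.first], P.second);
}

Keeping the classes separate is the point of the change: a value that is scalar after vectorization is charged to the scalar register class even when VF > 1, which the old single-counter scheme could not express.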
- Cost += VF * - TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, - AS); + const MaybeAlign Alignment = getLoadStoreAlignment(I); + Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), + Alignment ? Alignment->value() : 0, AS); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. @@ -5552,18 +5771,20 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getLoadStoreAddressSpace(I); int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Stride should be 1 or -1 for consecutive memory access"); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned Cost = 0; if (Legal->isMaskRequired(I)) - Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, + Alignment ? Alignment->value() : 0, AS); else - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, + Alignment ? Alignment->value() : 0, AS, I); bool Reverse = ConsecutiveStride < 0; if (Reverse) @@ -5575,33 +5796,37 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); if (isa<LoadInst>(I)) { return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + + TTI.getMemoryOpCost(Instruction::Load, ValTy, + Alignment ? Alignment->value() : 0, AS) + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); } StoreInst *SI = cast<StoreInst>(I); bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + - (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost( - Instruction::ExtractElement, - VectorTy, VF - 1)); + TTI.getMemoryOpCost(Instruction::Store, ValTy, + Alignment ? Alignment->value() : 0, AS) + + (isLoopInvariantStoreValue + ? 0 + : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, + VF - 1)); } unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); Value *Ptr = getLoadStorePointerOperand(I); return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, - Legal->isMaskRequired(I), Alignment); + Legal->isMaskRequired(I), + Alignment ? Alignment->value() : 0); } unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, @@ -5626,8 +5851,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, } // Calculate the cost of the whole interleaved group. 
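Every cost call in the hunks above bridges the new MaybeAlign type back to the older unsigned-based TTI hooks with the same expression. A sketch of that idiom, using std::optional as a stand-in for llvm::MaybeAlign:

#include <cstdint>
#include <optional>

struct Align {
  uint64_t V;
  uint64_t value() const { return V; }
};
using MaybeAlign = std::optional<Align>; // empty means "alignment unknown"

// The legacy interfaces encode "unknown" as 0, so the bridge is one conditional.
static uint64_t toLegacyAlign(const MaybeAlign &A) {
  return A ? A->value() : 0;
}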
- bool UseMaskForGaps = - Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + bool UseMaskForGaps = + Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); unsigned Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); @@ -5648,11 +5873,12 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, // moment. if (VF == 1) { Type *ValTy = getMemInstValueType(I); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); + TTI.getMemoryOpCost(I->getOpcode(), ValTy, + Alignment ? Alignment->value() : 0, AS, I); } return getWideningCost(I, VF); } @@ -6167,8 +6393,7 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, - unsigned UserVF) { +LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { unsigned VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. @@ -6207,10 +6432,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, return VectorizationFactor::Disabled(); } -Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize, - unsigned UserVF) { +Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { assert(OrigLoop->empty() && "Inner loop expected."); - Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize); + Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -6840,8 +7064,15 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, // If the tail is to be folded by masking, the primary induction variable // needs to be represented in VPlan for it to model early-exit masking. - if (CM.foldTailByMasking()) + // Also, both the Phi and the live-out instruction of each reduction are + // required in order to introduce a select between them in VPlan. + if (CM.foldTailByMasking()) { NeedDef.insert(Legal->getPrimaryInduction()); + for (auto &Reduction : *Legal->getReductionVars()) { + NeedDef.insert(Reduction.first); + NeedDef.insert(Reduction.second.getLoopExitInstr()); + } + } // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. For @@ -6873,7 +7104,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // Create a dummy pre-entry VPBasicBlock to start building the VPlan. VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); - auto Plan = llvm::make_unique<VPlan>(VPBB); + auto Plan = std::make_unique<VPlan>(VPBB); VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); // Represent values that will have defs inside VPlan. @@ -6968,6 +7199,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(PreEntry, Entry); delete PreEntry; + // Finally, if tail is folded by masking, introduce selects between the phi + // and the live-out instruction of each reduction, at the end of the latch. 
+ if (CM.foldTailByMasking()) { + Builder.setInsertPoint(VPBB); + auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); + for (auto &Reduction : *Legal->getReductionVars()) { + VPValue *Phi = Plan->getVPValue(Reduction.first); + VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); + Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); + } + } + std::string PlanName; raw_string_ostream RSO(PlanName); unsigned VF = Range.Start; @@ -6993,7 +7236,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan - auto Plan = llvm::make_unique<VPlan>(); + auto Plan = std::make_unique<VPlan>(); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); @@ -7199,6 +7442,20 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); } +static ScalarEpilogueLowering +getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { + ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; + if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && + (F->hasOptSize() || + llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) + SEL = CM_ScalarEpilogueNotAllowedOptSize; + else if (PreferPredicateOverEpilog || Hints.getPredicate()) + SEL = CM_ScalarEpilogueNotNeededUsePredicate; + + return SEL; +} + // Process the loop in the VPlan-native vectorization path. This path builds // VPlan upfront in the vectorization pipeline, which allows to apply // VPlan-to-VPlan transformations from the very beginning without modifying the @@ -7213,7 +7470,9 @@ static bool processLoopInVPlanNativePath( assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, + ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + + LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an @@ -7223,15 +7482,8 @@ static bool processLoopInVPlanNativePath( // Get user vectorization factor. const unsigned UserVF = Hints.getWidth(); - // Check the function attributes and profiles to find out if this function - // should be optimized for size. - bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); - // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF); + const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. @@ -7310,10 +7562,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the function attributes and profiles to find out if this function // should be optimized for size. 
- bool OptForSize =
- Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- (F->hasOptSize() ||
- llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
 
 // Entrance to the VPlan-native vectorization path. Outer loops are processed
 // here. They may require CFG and instruction level transformations before
@@ -7325,36 +7574,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 ORE, BFI, PSI, Hints);
 
 assert(L->empty() && "Inner loop expected.");
+
 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
 // count by optimizing for size, to minimize overheads.
- // Prefer constant trip counts over profile data, over upper bound estimate.
- unsigned ExpectedTC = 0;
- bool HasExpectedTC = false;
- if (const SCEVConstant *ConstExits =
- dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
- const APInt &ExitsCount = ConstExits->getAPInt();
- // We are interested in small values for ExpectedTC. Skip over those that
- // can't fit an unsigned.
- if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
- ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
- HasExpectedTC = true;
- }
- }
- // ExpectedTC may be large because it's bound by a variable. Check
- // profiling information to validate we should vectorize.
- if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
- auto EstimatedTC = getLoopEstimatedTripCount(L);
- if (EstimatedTC) {
- ExpectedTC = *EstimatedTC;
- HasExpectedTC = true;
- }
- }
- if (!HasExpectedTC) {
- ExpectedTC = SE->getSmallConstantMaxTripCount(L);
- HasExpectedTC = (ExpectedTC > 0);
- }
-
- if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
+ auto ExpectedTC = getSmallBestKnownTC(*SE, L);
+ if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
 << "This loop is worth vectorizing only if no scalar "
 << "iteration overheads are incurred.");
@@ -7362,10 +7586,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
 else {
 LLVM_DEBUG(dbgs() << "\n");
- // Loops with a very small trip count are considered for vectorization
- // under OptForSize, thereby making sure the cost of their loop body is
- // dominant, free of runtime guards and scalar iteration overheads.
- OptForSize = true;
+ SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
 }
 }
 
@@ -7374,11 +7595,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 // an integer loop and the vector instructions selected are purely integer
 // vector instructions?
 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
- LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
- "attribute is used.\n");
- ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
- "NoImplicitFloat", L)
- << "loop not vectorized due to NoImplicitFloat attribute");
+ reportVectorizationFailure(
+ "Can't vectorize when the NoImplicitFloat attribute is used",
+ "loop not vectorized due to NoImplicitFloat attribute",
+ "NoImplicitFloat", ORE, L);
 Hints.emitRemarkWithHints();
 return false;
 }
@@ -7389,11 +7609,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 // additional fp-math flags can help.
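The block removed above is effectively the specification for the new getSmallBestKnownTC helper: prefer an exact constant trip count, then a profile-based estimate, then SCEV's constant upper bound. A sketch of that selection order; the three parameters stand in for the SCEV and profile queries, whose bodies are not shown in this hunk:

#include <optional>

static std::optional<unsigned>
smallBestKnownTC(std::optional<unsigned> ConstantTC, // exact trip count
                 std::optional<unsigned> ProfileTC,  // estimate from profile data
                 unsigned MaxTC) {                   // SCEV upper bound, 0 if none
  if (ConstantTC)
    return ConstantTC;
  if (ProfileTC)
    return ProfileTC;
  if (MaxTC > 0)
    return MaxTC;
  return std::nullopt;
}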
if (Hints.isPotentiallyUnsafe() && TTI->isFPVectorizationPotentiallyUnsafe()) { - LLVM_DEBUG( - dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); - ORE->emit( - createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) - << "loop not vectorized due to unsafe FP support."); + reportVectorizationFailure( + "Potentially unsafe FP op prevents vectorization", + "loop not vectorized due to unsafe FP support.", + "UnsafeFP", ORE, L); Hints.emitRemarkWithHints(); return false; } @@ -7411,8 +7630,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { } // Use the cost model. - LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints, IAI); + LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, + F, &Hints, IAI); CM.collectValuesToIgnore(); // Use the planner for vectorization. @@ -7422,7 +7641,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF); + Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -7431,7 +7650,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. - IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); + IC = CM.selectInterleaveCount(VF.Width, VF.Cost); } // Identify the diagnostic messages that should be produced. @@ -7609,7 +7828,8 @@ bool LoopVectorizePass::runImpl( // The second condition is necessary because, even if the target has no // vector registers, loop vectorization may still enable scalar // interleaving. - if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && + TTI->getMaxInterleaveFactor(1) < 2) return false; bool Changed = false; diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 27a86c0bca91..974eff9974d9 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -194,10 +194,13 @@ static bool allSameBlock(ArrayRef<Value *> VL) { return true; } -/// \returns True if all of the values in \p VL are constants. +/// \returns True if all of the values in \p VL are constants (but not +/// globals/constant expressions). static bool allConstant(ArrayRef<Value *> VL) { + // Constant expressions and globals can't be vectorized like normal integer/FP + // constants. for (Value *i : VL) - if (!isa<Constant>(i)) + if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i)) return false; return true; } @@ -486,6 +489,7 @@ namespace slpvectorizer { /// Bottom Up SLP Vectorizer. class BoUpSLP { struct TreeEntry; + struct ScheduleData; public: using ValueList = SmallVector<Value *, 8>; @@ -614,6 +618,15 @@ public: /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable() const; + /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values + /// can be load combined in the backend. Load combining may not be allowed in + /// the IR optimizer, so we do not want to alter the pattern. For example, + /// partially transforming a scalar bswap() pattern into vector code is + /// effectively impossible for the backend to undo. + /// TODO: If load combining is allowed in the IR optimizer, this analysis + /// may not be necessary. 
+ bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const; + OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -1117,6 +1130,14 @@ public: #endif }; + /// Checks if the instruction is marked for deletion. + bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } + + /// Marks values operands for later deletion by replacing them with Undefs. + void eraseInstructions(ArrayRef<Value *> AV); + + ~BoUpSLP(); + private: /// Checks if all users of \p I are the part of the vectorization tree. bool areAllUsersVectorized(Instruction *I) const; @@ -1153,8 +1174,7 @@ private: /// Set the Builder insert point to one after the last instruction in /// the bundle - void setInsertPointAfterBundle(ArrayRef<Value *> VL, - const InstructionsState &S); + void setInsertPointAfterBundle(TreeEntry *E); /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); @@ -1220,27 +1240,37 @@ private: /// reordering of operands during buildTree_rec() and vectorizeTree(). SmallVector<ValueList, 2> Operands; + /// The main/alternate instruction. + Instruction *MainOp = nullptr; + Instruction *AltOp = nullptr; + public: /// Set this bundle's \p OpIdx'th operand to \p OpVL. - void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL, - ArrayRef<unsigned> ReuseShuffleIndices) { + void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) { if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); assert(Operands[OpIdx].size() == 0 && "Already resized?"); Operands[OpIdx].resize(Scalars.size()); for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) - Operands[OpIdx][Lane] = (!ReuseShuffleIndices.empty()) - ? OpVL[ReuseShuffleIndices[Lane]] - : OpVL[Lane]; - } - - /// If there is a user TreeEntry, then set its operand. - void trySetUserTEOperand(const EdgeInfo &UserTreeIdx, - ArrayRef<Value *> OpVL, - ArrayRef<unsigned> ReuseShuffleIndices) { - if (UserTreeIdx.UserTE) - UserTreeIdx.UserTE->setOperand(UserTreeIdx.EdgeIdx, OpVL, - ReuseShuffleIndices); + Operands[OpIdx][Lane] = OpVL[Lane]; + } + + /// Set the operands of this bundle in their original order. + void setOperandsInOrder() { + assert(Operands.empty() && "Already initialized?"); + auto *I0 = cast<Instruction>(Scalars[0]); + Operands.resize(I0->getNumOperands()); + unsigned NumLanes = Scalars.size(); + for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); + OpIdx != NumOperands; ++OpIdx) { + Operands[OpIdx].resize(NumLanes); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + auto *I = cast<Instruction>(Scalars[Lane]); + assert(I->getNumOperands() == NumOperands && + "Expected same number of operands"); + Operands[OpIdx][Lane] = I->getOperand(OpIdx); + } + } } /// \returns the \p OpIdx operand of this TreeEntry. @@ -1249,6 +1279,9 @@ private: return Operands[OpIdx]; } + /// \returns the number of operands. + unsigned getNumOperands() const { return Operands.size(); } + /// \return the single \p OpIdx operand. Value *getSingleOperand(unsigned OpIdx) const { assert(OpIdx < Operands.size() && "Off bounds"); @@ -1256,6 +1289,58 @@ private: return Operands[OpIdx][0]; } + /// Some of the instructions in the list have alternate opcodes. 
+ bool isAltShuffle() const { + return getOpcode() != getAltOpcode(); + } + + bool isOpcodeOrAlt(Instruction *I) const { + unsigned CheckedOpcode = I->getOpcode(); + return (getOpcode() == CheckedOpcode || + getAltOpcode() == CheckedOpcode); + } + + /// Chooses the correct key for scheduling data. If \p Op has the same (or + /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is + /// \p OpValue. + Value *isOneOf(Value *Op) const { + auto *I = dyn_cast<Instruction>(Op); + if (I && isOpcodeOrAlt(I)) + return Op; + return MainOp; + } + + void setOperations(const InstructionsState &S) { + MainOp = S.MainOp; + AltOp = S.AltOp; + } + + Instruction *getMainOp() const { + return MainOp; + } + + Instruction *getAltOp() const { + return AltOp; + } + + /// The main/alternate opcodes for the list of instructions. + unsigned getOpcode() const { + return MainOp ? MainOp->getOpcode() : 0; + } + + unsigned getAltOpcode() const { + return AltOp ? AltOp->getOpcode() : 0; + } + + /// Update operations state of this entry if reorder occurred. + bool updateStateIfReorder() { + if (ReorderIndices.empty()) + return false; + InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front()); + setOperations(S); + return true; + } + #ifndef NDEBUG /// Debug printer. LLVM_DUMP_METHOD void dump() const { @@ -1269,6 +1354,8 @@ private: for (Value *V : Scalars) dbgs().indent(2) << *V << "\n"; dbgs() << "NeedToGather: " << NeedToGather << "\n"; + dbgs() << "MainOp: " << *MainOp << "\n"; + dbgs() << "AltOp: " << *AltOp << "\n"; dbgs() << "VectorizedValue: "; if (VectorizedValue) dbgs() << *VectorizedValue; @@ -1279,12 +1366,12 @@ private: if (ReuseShuffleIndices.empty()) dbgs() << "Emtpy"; else - for (unsigned Idx : ReuseShuffleIndices) - dbgs() << Idx << ", "; + for (unsigned ReuseIdx : ReuseShuffleIndices) + dbgs() << ReuseIdx << ", "; dbgs() << "\n"; dbgs() << "ReorderIndices: "; - for (unsigned Idx : ReorderIndices) - dbgs() << Idx << ", "; + for (unsigned ReorderIdx : ReorderIndices) + dbgs() << ReorderIdx << ", "; dbgs() << "\n"; dbgs() << "UserTreeIndices: "; for (const auto &EInfo : UserTreeIndices) @@ -1295,11 +1382,13 @@ private: }; /// Create a new VectorizableTree entry. - TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle, + const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef<unsigned> ReuseShuffleIndices = None, ArrayRef<unsigned> ReorderIndices = None) { - VectorizableTree.push_back(llvm::make_unique<TreeEntry>(VectorizableTree)); + bool Vectorized = (bool)Bundle; + VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); @@ -1307,11 +1396,22 @@ private: Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; + Last->setOperations(S); if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = Last->Idx; - } + ScalarToTreeEntry[VL[i]] = Last; + } + // Update the scheduler bundle to point to this TreeEntry. 
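The MainOp/AltOp accessors above encode the "alternate shuffle" idea: a bundle may mix exactly two opcodes, and any other opcode disqualifies a lane. A toy illustration of the classification, with a plain enum standing in for LLVM instructions:

enum class Op { Add, Sub, Mul };

struct TinyEntry {
  Op Main, Alt;
  bool isAltShuffle() const { return Main != Alt; }
  bool isOpcodeOrAlt(Op O) const { return O == Main || O == Alt; }
};

// A bundle like {Add, Sub, Add, Sub} gets Main = Add and Alt = Sub: it is
// vectorized as one vector add, one vector sub, and a shuffle that selects
// the even lanes from the add and the odd lanes from the sub.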
+ unsigned Lane = 0; + for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember; + BundleMember = BundleMember->NextInBundle) { + BundleMember->TE = Last; + BundleMember->Lane = Lane; + ++Lane; + } + assert((!Bundle.getValue() || Lane == VL.size()) && + "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); } @@ -1319,7 +1419,6 @@ private: if (UserTreeIdx.UserTE) Last->UserTreeIndices.push_back(UserTreeIdx); - Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices); return Last; } @@ -1340,19 +1439,19 @@ private: TreeEntry *getTreeEntry(Value *V) { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) - return VectorizableTree[I->second].get(); + return I->second; return nullptr; } const TreeEntry *getTreeEntry(Value *V) const { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) - return VectorizableTree[I->second].get(); + return I->second; return nullptr; } /// Maps a specific scalar to its tree entry. - SmallDenseMap<Value*, int> ScalarToTreeEntry; + SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -1408,15 +1507,14 @@ private: /// This is required to ensure that there are no incorrect collisions in the /// AliasCache, which can happen if a new instruction is allocated at the /// same address as a previously deleted instruction. - void eraseInstruction(Instruction *I) { - I->removeFromParent(); - I->dropAllReferences(); - DeletedInstructions.emplace_back(I); + void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) { + auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first; + It->getSecond() = It->getSecond() && ReplaceOpsWithUndef; } /// Temporary store for deleted instructions. Instructions will be deleted /// eventually when the BoUpSLP is destructed. - SmallVector<unique_value, 8> DeletedInstructions; + DenseMap<Instruction *, bool> DeletedInstructions; /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). External User @@ -1453,6 +1551,8 @@ private: UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); OpValue = OpVal; + TE = nullptr; + Lane = -1; } /// Returns true if the dependency information has been calculated. @@ -1559,6 +1659,12 @@ private: /// Opcode of the current instruction in the schedule data. Value *OpValue = nullptr; + + /// The TreeEntry that this instruction corresponds to. + TreeEntry *TE = nullptr; + + /// The lane of this node in the TreeEntry. + int Lane = -1; }; #ifndef NDEBUG @@ -1633,10 +1739,9 @@ private: continue; } // Handle the def-use chain dependencies. - for (Use &U : BundleMember->Inst->operands()) { - auto *I = dyn_cast<Instruction>(U.get()); - if (!I) - continue; + + // Decrement the unscheduled counter and insert to ready list if ready. + auto &&DecrUnsched = [this, &ReadyList](Instruction *I) { doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { if (OpDef && OpDef->hasValidDependencies() && OpDef->incrementUnscheduledDeps(-1) == 0) { @@ -1651,6 +1756,24 @@ private: << "SLP: gets ready (def): " << *DepBundle << "\n"); } }); + }; + + // If BundleMember is a vector bundle, its operands may have been + // reordered duiring buildTree(). We therefore need to get its operands + // through the TreeEntry. 
+ if (TreeEntry *TE = BundleMember->TE) { + int Lane = BundleMember->Lane; + assert(Lane >= 0 && "Lane not set"); + for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); + OpIdx != NumOperands; ++OpIdx) + if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane])) + DecrUnsched(I); + } else { + // If BundleMember is a stand-alone instruction, no operand reordering + // has taken place, so we directly access its operands. + for (Use &U : BundleMember->Inst->operands()) + if (auto *I = dyn_cast<Instruction>(U.get())) + DecrUnsched(I); } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { @@ -1697,8 +1820,11 @@ private: /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. - bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, - const InstructionsState &S); + /// \returns the scheduling bundle. The returned Optional value is non-None + /// if \p VL is allowed to be scheduled. + Optional<ScheduleData *> + tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, + const InstructionsState &S); /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue); @@ -1945,6 +2071,30 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { } // end namespace llvm +BoUpSLP::~BoUpSLP() { + for (const auto &Pair : DeletedInstructions) { + // Replace operands of ignored instructions with Undefs in case if they were + // marked for deletion. + if (Pair.getSecond()) { + Value *Undef = UndefValue::get(Pair.getFirst()->getType()); + Pair.getFirst()->replaceAllUsesWith(Undef); + } + Pair.getFirst()->dropAllReferences(); + } + for (const auto &Pair : DeletedInstructions) { + assert(Pair.getFirst()->use_empty() && + "trying to erase instruction with users."); + Pair.getFirst()->eraseFromParent(); + } +} + +void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { + for (auto *V : AV) { + if (auto *I = dyn_cast<Instruction>(V)) + eraseInstruction(I, /*ReplaceWithUndef=*/true); + }; +} + void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ArrayRef<Value *> UserIgnoreLst) { ExtraValueToDebugLocsMap ExternallyUsedValues; @@ -2026,28 +2176,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. 
\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } @@ -2055,11 +2205,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // the same block. // Don't vectorize ephemeral values. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (EphValues.count(VL[i])) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + for (Value *V : VL) { + if (EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2069,7 +2219,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // Record the reuse of the tree node. FIXME, currently this is only used to @@ -2077,19 +2227,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, E->UserTreeIndices.push_back(UserTreeIdx); LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n"); - E->trySetUserTEOperand(UserTreeIdx, VL, None); return; } // Check that none of the instructions in the bundle are already in the tree. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - auto *I = dyn_cast<Instruction>(VL[i]); + for (Value *V : VL) { + auto *I = dyn_cast<Instruction>(V); if (!I) continue; if (getTreeEntry(I)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2097,10 +2246,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { + for (Value *V : VL) { + if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2114,7 +2263,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. 
LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } @@ -2128,13 +2277,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (Res.second) UniqueValues.emplace_back(V); } - if (UniqueValues.size() == VL.size()) { + size_t NumUniqueScalarValues = UniqueValues.size(); + if (NumUniqueScalarValues == VL.size()) { ReuseShuffleIndicies.clear(); } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { + if (NumUniqueScalarValues <= 1 || + !llvm::isPowerOf2_32(NumUniqueScalarValues)) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } VL = UniqueValues; @@ -2142,16 +2293,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto &BSRef = BlocksSchedules[BB]; if (!BSRef) - BSRef = llvm::make_unique<BlockScheduling>(BB); + BSRef = std::make_unique<BlockScheduling>(BB); BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this, S)) { + Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S); + if (!Bundle) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -2160,7 +2313,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, (unsigned) Instruction::ShuffleVector : S.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); + auto *PH = cast<PHINode>(VL0); // Check for terminator values (e.g. invoke). for (unsigned j = 0; j < VL.size(); ++j) @@ -2172,23 +2325,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = + newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); + // Keeps the reordered operands to avoid code duplication. + SmallVector<ValueList, 2> OperandsVec; for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
for (Value *j : VL) Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - - buildTree_rec(Operands, Depth + 1, {TE, i}); + TE->setOperand(i, Operands); + OperandsVec.push_back(Operands); } + for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) + buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx}); return; } case Instruction::ExtractValue: @@ -2198,13 +2357,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); + VectorizableTree.back()->setOperand(0, Op0); return; } if (!CurrentOrder.empty()) { @@ -2220,17 +2379,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); + VectorizableTree.back()->setOperand(0, Op0); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); BS.cancelScheduling(VL, VL0); return; } @@ -2246,7 +2407,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -2259,7 +2421,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto *L = cast<LoadInst>(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -2289,15 +2452,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, - ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. 
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, - ReuseShuffleIndicies, I->getFirst()); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, I->getFirst()); + TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } return; @@ -2306,7 +2472,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -2322,24 +2489,27 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - for (unsigned i = 0; i < VL.size(); ++i) { - Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); + for (Value *V : VL) { + Type *Ty = cast<Instruction>(V)->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
- for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2351,19 +2521,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); Type *ComparedTy = VL0->getOperand(0)->getType(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast<CmpInst>(VL[i]); + for (Value *V : VL) { + CmpInst *Cmp = cast<CmpInst>(V); if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); ValueList Left, Right; @@ -2384,7 +2556,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Right.push_back(RHS); } } - + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; @@ -2409,7 +2582,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::And: case Instruction::Or: case Instruction::Xor: { - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -2417,11 +2591,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; } + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -2434,11 +2611,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. - for (unsigned j = 0; j < VL.size(); ++j) { - if (cast<Instruction>(VL[j])->getNumOperands() != 2) { + for (Value *V : VL) { + if (cast<Instruction>(V)->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } @@ -2446,58 +2624,64 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // We can't combine several GEPs into one vector if they operate on // different types. 
Type *Ty0 = VL0->getOperand(0)->getType(); - for (unsigned j = 0; j < VL.size(); ++j) { - Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType(); + for (Value *V : VL) { + Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } // We don't combine GEPs with non-constant indexes. - for (unsigned j = 0; j < VL.size(); ++j) { - auto Op = cast<Instruction>(VL[j])->getOperand(1); + for (Value *V : VL) { + auto Op = cast<Instruction>(V)->getOperand(1); if (!isa<ConstantInt>(Op)) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); + TE->setOperandsInOrder(); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } case Instruction::Store: { - // Check if the stores are consecutive or of we need to swizzle them. + // Check if the stores are consecutive or if we need to swizzle them. 
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(0)); - + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(0)); + TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); return; } @@ -2509,7 +2693,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -2519,14 +2704,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned j = 0; j != NumArgs; ++j) if (hasVectorInstrinsicScalarOpd(ID, j)) ScalarArgs[j] = CI->getArgOperand(j); - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - CallInst *CI2 = dyn_cast<CallInst>(VL[i]); + for (Value *V : VL) { + CallInst *CI2 = dyn_cast<CallInst>(V); if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V << "\n"); return; } @@ -2537,7 +2723,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Value *A1J = CI2->getArgOperand(j); if (ScalarArgs[j] != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << ScalarArgs[j] << "!=" << A1J << "\n"); @@ -2551,19 +2738,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" - << *CI << "!=" << *VL[i] << '\n'); + << *CI << "!=" << *V << '\n'); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + TE->setOperandsInOrder(); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. 
- for (Value *j : VL) { - CallInst *CI2 = dyn_cast<CallInst>(j); + for (Value *V : VL) { + auto *CI2 = cast<CallInst>(V); Operands.push_back(CI2->getArgOperand(i)); } buildTree_rec(Operands, Depth + 1, {TE, i}); @@ -2575,27 +2765,32 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // then do not vectorize this instruction. if (!S.isAltShuffle()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; } + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2603,7 +2798,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -2738,7 +2934,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && + if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && allSameBlock(VL)) { Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { @@ -2761,11 +2957,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return ReuseShuffleCost + getGatherCost(VL); } - InstructionsState S = getSameOpcode(VL); - assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast<Instruction>(S.OpValue); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + Instruction *VL0 = E->getMainOp(); + unsigned ShuffleOrOp = + E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: return 0; @@ -2851,7 +3046,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } @@ -2864,7 +3059,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Check if the values are candidates to demote. 
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0); } return VecCost - ScalarCost; } @@ -2872,14 +3067,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. - int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, + int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::FNeg: @@ -2940,12 +3135,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { SmallVector<const Value *, 4> Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, + int VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); return ReuseShuffleCost + VecCost - ScalarCost; } @@ -3027,11 +3222,11 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->isAltShuffle() && + ((Instruction::isBinaryOp(E->getOpcode()) && + Instruction::isBinaryOp(E->getAltOpcode())) || + (Instruction::isCast(E->getOpcode()) && + Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { @@ -3046,25 +3241,25 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { I, TargetTransformInfo::TCK_RecipThroughput); } } - for (Value *i : VL) { - Instruction *I = cast<Instruction>(i); - assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + for (Value *V : VL) { + Instruction *I = cast<Instruction>(V); + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. 
int VecCost = 0; - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + if (Instruction::isBinaryOp(E->getOpcode())) { + VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy); + VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy); } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); + Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType(); + Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -3098,6 +3293,43 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { return true; } +bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const { + if (RdxOpcode != Instruction::Or) + return false; + + unsigned NumElts = VectorizableTree[0]->Scalars.size(); + Value *FirstReduced = VectorizableTree[0]->Scalars[0]; + + // Look past the reduction to find a source value. Arbitrarily follow the + // path through operand 0 of any 'or'. Also, peek through optional + // shift-left-by-constant. + Value *ZextLoad = FirstReduced; + while (match(ZextLoad, m_Or(m_Value(), m_Value())) || + match(ZextLoad, m_Shl(m_Value(), m_Constant()))) + ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0); + + // Check if the input to the reduction is an extended load. + Value *LoadPtr; + if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + return false; + + // Require that the total load bit width is a legal integer type. + // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. + // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. + Type *SrcTy = LoadPtr->getType()->getPointerElementType(); + unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; + LLVMContext &Context = FirstReduced->getContext(); + if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth))) + return false; + + // Everything matched - assume that we can fold the whole sequence using + // load combining. + LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of " + << *(cast<Instruction>(FirstReduced)) << "\n"); + + return true; +} + bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. @@ -3319,16 +3551,16 @@ void BoUpSLP::reorderInputsAccordingToOpcode( Right = Ops.getVL(1); } -void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, - const InstructionsState &S) { +void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. 
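The isLoadCombineReductionCandidate hook added above declines to vectorize an 'or' reduction whose leaves are zero-extended, optionally shifted loads, since the backend's load combining already folds that pattern into one wide load. A representative source pattern, written here as an illustrative example rather than taken from the patch:

    #include <cstdint>

    // On a target where i32 is a legal type, the backend folds this whole
    // 'or' reduction of shifted byte loads into a single 32-bit load, so
    // SLP-vectorizing the reduction would be a pessimization.
    uint32_t readLE32(const uint8_t *P) {
      return (uint32_t)P[0] | ((uint32_t)P[1] << 8) |
             ((uint32_t)P[2] << 16) | ((uint32_t)P[3] << 24);
    }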
- auto *Front = cast<Instruction>(S.OpValue); + auto *Front = E->getMainOp(); auto *BB = Front->getParent(); - assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { - auto *I = cast<Instruction>(V); - return !S.isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()), + [=](Value *V) -> bool { + auto *I = cast<Instruction>(V); + return !E->isOpcodeOrAlt(I) || I->getParent() == BB; + })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -3339,7 +3571,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); + BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -3365,14 +3597,15 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). if (!LastInst) { - SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end()); + SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I)) + if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I)) LastInst = &I; if (Bundle.empty()) break; } } + assert(LastInst && "Failed to find last instruction in bundle"); // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. @@ -3385,7 +3618,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { // Generate the 'InsertElement' instruction. for (unsigned i = 0; i < Ty->getNumElements(); ++i) { Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); - if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) { + if (auto *Insrt = dyn_cast<InsertElementInst>(Vec)) { GatherSeq.insert(Insrt); CSEBlocks.insert(Insrt->getParent()); @@ -3494,8 +3727,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast<Instruction>(S.OpValue); + Instruction *VL0 = E->getMainOp(); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -3504,7 +3736,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3518,11 +3750,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = + E->isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : E->getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); + auto *PH = cast<PHINode>(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); @@ -3577,7 +3809,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3612,7 +3844,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = NewV; return NewV; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3637,7 +3869,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *InVec = vectorizeTree(E->getOperand(0)); @@ -3646,7 +3878,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } - CastInst *CI = dyn_cast<CastInst>(VL0); + auto *CI = cast<CastInst>(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3658,7 +3890,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::FCmp: case Instruction::ICmp: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *L = vectorizeTree(E->getOperand(0)); Value *R = vectorizeTree(E->getOperand(1)); @@ -3670,7 +3902,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V; - if (S.getOpcode() == Instruction::FCmp) + if (E->getOpcode() == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3685,7 +3917,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Select: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Cond = vectorizeTree(E->getOperand(0)); Value *True = vectorizeTree(E->getOperand(1)); @@ -3706,7 +3938,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::FNeg: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Op = vectorizeTree(E->getOperand(0)); @@ -3716,7 +3948,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateUnOp( - static_cast<Instruction::UnaryOps>(S.getOpcode()), Op); + static_cast<Instruction::UnaryOps>(E->getOpcode()), Op); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); @@ -3748,7 +3980,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *LHS = vectorizeTree(E->getOperand(0)); Value *RHS = vectorizeTree(E->getOperand(1)); @@ -3759,7 +3991,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS); + static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, + RHS); propagateIRFlags(V, 
E->Scalars, VL0); if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); @@ -3776,12 +4009,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. - bool IsReorder = !E->ReorderIndices.empty(); - if (IsReorder) { - S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); - VL0 = cast<Instruction>(S.OpValue); - } - setInsertPointAfterBundle(E->Scalars, S); + bool IsReorder = E->updateStateIfReorder(); + if (IsReorder) + VL0 = E->getMainOp(); + setInsertPointAfterBundle(E); LoadInst *LI = cast<LoadInst>(VL0); Type *ScalarLoadTy = LI->getType(); @@ -3797,11 +4028,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (getTreeEntry(PO)) ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); - unsigned Alignment = LI->getAlignment(); + MaybeAlign Alignment = MaybeAlign(LI->getAlignment()); LI = Builder.CreateLoad(VecTy, VecPtr); - if (!Alignment) { - Alignment = DL->getABITypeAlignment(ScalarLoadTy); - } + if (!Alignment) + Alignment = MaybeAlign(DL->getABITypeAlignment(ScalarLoadTy)); LI->setAlignment(Alignment); Value *V = propagateMetadata(LI, E->Scalars); if (IsReorder) { @@ -3824,7 +4054,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *VecValue = vectorizeTree(E->getOperand(0)); Value *ScalarPtr = SI->getPointerOperand(); @@ -3840,7 +4070,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (!Alignment) Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); - ST->setAlignment(Alignment); + ST->setAlignment(Align(Alignment)); Value *V = propagateMetadata(ST, E->Scalars); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3851,7 +4081,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Op0 = vectorizeTree(E->getOperand(0)); @@ -3878,13 +4108,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::Call: { CallInst *CI = cast<CallInst>(VL0); - setInsertPointAfterBundle(E->Scalars, S); - Function *FI; + setInsertPointAfterBundle(E); + Intrinsic::ID IID = Intrinsic::not_intrinsic; - Value *ScalarArg = nullptr; - if (CI && (FI = CI->getCalledFunction())) { + if (Function *FI = CI->getCalledFunction()) IID = FI->getIntrinsicID(); - } + + Value *ScalarArg = nullptr; std::vector<Value *> OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { ValueList OpVL; @@ -3926,20 +4156,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->isAltShuffle() && + ((Instruction::isBinaryOp(E->getOpcode()) && + Instruction::isBinaryOp(E->getAltOpcode())) || + (Instruction::isCast(E->getOpcode()) && + Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); - Value *LHS, *RHS; - if (Instruction::isBinaryOp(S.getOpcode())) { - setInsertPointAfterBundle(E->Scalars, S); + Value *LHS = nullptr, *RHS = nullptr; + if (Instruction::isBinaryOp(E->getOpcode())) { + 
setInsertPointAfterBundle(E); LHS = vectorizeTree(E->getOperand(0)); RHS = vectorizeTree(E->getOperand(1)); } else { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); LHS = vectorizeTree(E->getOperand(0)); } @@ -3949,16 +4179,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V0, *V1; - if (Instruction::isBinaryOp(S.getOpcode())) { + if (Instruction::isBinaryOp(E->getOpcode())) { V0 = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS); + static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS); V1 = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.getAltOpcode()), LHS, RHS); + static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS); } else { V0 = Builder.CreateCast( - static_cast<Instruction::CastOps>(S.getOpcode()), LHS, VecTy); + static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy); V1 = Builder.CreateCast( - static_cast<Instruction::CastOps>(S.getAltOpcode()), LHS, VecTy); + static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); } // Create shuffle to take alternate operations from the vector. @@ -3969,8 +4199,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector<Constant *, 8> Mask(e); for (unsigned i = 0; i < e; ++i) { auto *OpInst = cast<Instruction>(E->Scalars[i]); - assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == S.getAltOpcode()) { + assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); + if (OpInst->getOpcode() == E->getAltOpcode()) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { @@ -4136,20 +4366,18 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; +#ifndef NDEBUG Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { -#ifndef NDEBUG for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - // It is legal to replace users in the ignorelist by undef. + // It is legal to delete users in the ignorelist. assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && - "Replacing out-of-tree value with undef"); + "Deleting out-of-tree value"); } -#endif - Value *Undef = UndefValue::get(Ty); - Scalar->replaceAllUsesWith(Undef); } +#endif LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); eraseInstruction(cast<Instruction>(Scalar)); } @@ -4165,7 +4393,7 @@ void BoUpSLP::optimizeGatherSequence() { << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. for (Instruction *I : GatherSeq) { - if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I)) + if (isDeleted(I)) continue; // Check if this block is inside a loop. @@ -4219,6 +4447,8 @@ void BoUpSLP::optimizeGatherSequence() { // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { Instruction *In = &*it++; + if (isDeleted(In)) + continue; if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) continue; @@ -4245,11 +4475,11 @@ void BoUpSLP::optimizeGatherSequence() { // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. 
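The function that comment describes changes shape in the next hunk: tryScheduleBundle now returns Optional<ScheduleData *> instead of bool, so buildTree_rec can pass the bundle straight into newTreeEntry. A self-contained sketch of the three-way result, with std::optional standing in for llvm::Optional and illustrative names:

    #include <optional>

    struct ScheduleData;

    // nullopt  => scheduling failed, the caller gathers instead;
    // nullptr  => nothing to schedule (e.g. PHIs), vectorization still fine;
    // non-null => the scheduled bundle to attach to the new TreeEntry.
    std::optional<ScheduleData *>
    tryScheduleOutcome(bool IsPhi, bool BundleReady, ScheduleData *Bundle) {
      if (IsPhi)
        return nullptr;
      if (!BundleReady)
        return std::nullopt;
      return Bundle;
    }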
-bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, - BoUpSLP *SLP, - const InstructionsState &S) { +Optional<BoUpSLP::ScheduleData *> +BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, + const InstructionsState &S) { if (isa<PHINode>(S.OpValue)) - return true; + return nullptr; // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; @@ -4262,7 +4492,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, // instructions of the bundle. for (Value *V : VL) { if (!extendSchedulingRegion(V, S)) - return false; + return None; } for (Value *V : VL) { @@ -4308,6 +4538,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, resetSchedule(); initialFillReadyList(ReadyInsts); } + assert(Bundle && "Failed to find schedule bundle"); LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " << BB->getName() << "\n"); @@ -4329,9 +4560,9 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, } if (!Bundle->isReady()) { cancelScheduling(VL, S.OpValue); - return false; + return None; } - return true; + return Bundle; } void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, @@ -4364,7 +4595,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { // Allocate a new ScheduleData for the instruction. if (ChunkPos >= ChunkSize) { - ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize)); + ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize)); ChunkPos = 0; } return &(ScheduleDataChunks.back()[ChunkPos++]); @@ -4977,7 +5208,7 @@ struct SLPVectorizer : public FunctionPass { auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -5052,7 +5283,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // If the target claims to have no vector registers don't attempt // vectorization. - if (!TTI->getNumberOfRegisters(true)) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) return false; // Don't vectorize when the attribute NoImplicitFloat is used. @@ -5100,19 +5331,6 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, return Changed; } -/// Check that the Values in the slice in VL array are still existent in -/// the WeakTrackingVH array. -/// Vectorization of part of the VL array may cause later values in the VL array -/// to become invalid. We track when this has happened in the WeakTrackingVH -/// array. 
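That doc comment belongs to hasValueBeenRAUWed, which the next hunk deletes outright: with scalars now erased through the vectorizer's own bookkeeping, the WeakTrackingVH snapshots give way to direct deletion queries. The replacement check (its lambda appears verbatim in the store-chain and list-vectorization hunks below; the wrapper name here is illustrative) amounts to:

    // True if an earlier vectorization round already erased any value in the
    // slice, in which case the slice must be skipped.
    static bool sliceHasDeletedValues(ArrayRef<Value *> Ops, BoUpSLP &R) {
      return llvm::any_of(Ops, [&R](Value *V) {
        auto *I = dyn_cast<Instruction>(V);
        return I && R.isDeleted(I);
      });
    }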
-static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, - ArrayRef<WeakTrackingVH> VH, unsigned SliceBegin, - unsigned SliceSize) { - VL = VL.slice(SliceBegin, SliceSize); - VH = VH.slice(SliceBegin, SliceSize); - return !std::equal(VL.begin(), VL.end(), VH.begin()); -} - bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, unsigned VecRegSize) { const unsigned ChainLen = Chain.size(); @@ -5124,20 +5342,20 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, if (!isPowerOf2_32(Sz) || VF < 2) return false; - // Keep track of values that were deleted by vectorizing in the loop below. - const SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end()); - bool Changed = false; // Look for profitable vectorizable trees at all offsets, starting at zero. for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) { + ArrayRef<Value *> Operands = Chain.slice(i, VF); // Check that a previous iteration of this loop did not delete the Value. - if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) + if (llvm::any_of(Operands, [&R](Value *V) { + auto *I = dyn_cast<Instruction>(V); + return I && R.isDeleted(I); + })) continue; LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i << "\n"); - ArrayRef<Value *> Operands = Chain.slice(i, VF); R.buildTree(Operands); if (R.isTreeTinyAndNotFullyVectorizable()) @@ -5329,12 +5547,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool CandidateFound = false; int MinCost = SLPCostThreshold; - // Keep track of values that were deleted by vectorizing in the loop below. - SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end()); - unsigned NextInst = 0, MaxInst = VL.size(); - for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; - VF /= 2) { + for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { // No actual vectorization should happen, if number of parts is the same as // provided vectorization factor (i.e. the scalar type is used for vector // code during codegen). @@ -5352,13 +5566,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) break; + ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); // Check that a previous iteration of this loop did not delete the Value. - if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth)) + if (llvm::any_of(Ops, [&R](Value *V) { + auto *I = dyn_cast<Instruction>(V); + return I && R.isDeleted(I); + })) continue; LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n"); - ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); R.buildTree(Ops); Optional<ArrayRef<unsigned>> Order = R.bestOrder(); @@ -5571,7 +5788,7 @@ class HorizontalReduction { Value *createOp(IRBuilder<> &Builder, const Twine &Name) const { assert(isVectorizable() && "Expected add|fadd or min/max reduction operation."); - Value *Cmp; + Value *Cmp = nullptr; switch (Kind) { case RK_Arithmetic: return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS, @@ -5579,23 +5796,23 @@ class HorizontalReduction { case RK_Min: Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS) : Builder.CreateFCmpOLT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_Max: Cmp = Opcode == Instruction::ICmp ? 
Builder.CreateICmpSGT(LHS, RHS) : Builder.CreateFCmpOGT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_UMin: assert(Opcode == Instruction::ICmp && "Expected integer types."); Cmp = Builder.CreateICmpULT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_UMax: assert(Opcode == Instruction::ICmp && "Expected integer types."); Cmp = Builder.CreateICmpUGT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_None: - llvm_unreachable("Unknown reduction operation."); + break; } - return Builder.CreateSelect(Cmp, LHS, RHS, Name); + llvm_unreachable("Unknown reduction operation."); } public: @@ -6203,6 +6420,8 @@ public: } if (V.isTreeTinyAndNotFullyVectorizable()) break; + if (V.isLoadCombineReductionCandidate(ReductionData.getOpcode())) + break; V.computeMinimumValueSizes(); @@ -6275,6 +6494,9 @@ public: } // Update users. ReductionRoot->replaceAllUsesWith(VectorizedTree); + // Mark all scalar reduction ops for deletion, they are replaced by the + // vector reductions. + V.eraseInstructions(IgnoreList); } return VectorizedTree != nullptr; } @@ -6323,7 +6545,7 @@ private: IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost; - int ScalarReduxCost; + int ScalarReduxCost = 0; switch (ReductionData.getKind()) { case RK_Arithmetic: ScalarReduxCost = @@ -6429,10 +6651,9 @@ static bool findBuildVector(InsertElementInst *LastInsertElem, /// \return true if it matches. static bool findBuildAggregate(InsertValueInst *IV, SmallVectorImpl<Value *> &BuildVectorOpds) { - Value *V; do { BuildVectorOpds.push_back(IV->getInsertedValueOperand()); - V = IV->getAggregateOperand(); + Value *V = IV->getAggregateOperand(); if (isa<UndefValue>(V)) break; IV = dyn_cast<InsertValueInst>(V); @@ -6530,18 +6751,13 @@ static bool tryToVectorizeHorReductionOrInstOperands( // horizontal reduction. // Interrupt the process if the Root instruction itself was vectorized or all // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. 
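Earlier in this hunk, HorizontalReduction::createOp was restructured so that each min/max kind builds its compare and returns the select directly, leaving llvm_unreachable only for RK_None. For signed-min the emitted shape reduces to the following sketch (the other kinds differ only in the predicate, or in using FCmp for floating point):

    static Value *emitSMin(IRBuilder<> &Builder, Value *LHS, Value *RHS,
                           const Twine &Name) {
      Value *Cmp = Builder.CreateICmpSLT(LHS, RHS); // signed less-than
      return Builder.CreateSelect(Cmp, LHS, RHS, Name);
    }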
- SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0}); + SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0}); SmallPtrSet<Value *, 8> VisitedInstrs; bool Res = false; while (!Stack.empty()) { - Value *V; + Instruction *Inst; unsigned Level; - std::tie(V, Level) = Stack.pop_back_val(); - if (!V) - continue; - auto *Inst = dyn_cast<Instruction>(V); - if (!Inst) - continue; + std::tie(Inst, Level) = Stack.pop_back_val(); auto *BI = dyn_cast<BinaryOperator>(Inst); auto *SI = dyn_cast<SelectInst>(Inst); if (BI || SI) { @@ -6582,8 +6798,8 @@ static bool tryToVectorizeHorReductionOrInstOperands( for (auto *Op : Inst->operand_values()) if (VisitedInstrs.insert(Op).second) if (auto *I = dyn_cast<Instruction>(Op)) - if (!isa<PHINode>(I) && I->getParent() == BB) - Stack.emplace_back(Op, Level); + if (!isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB) + Stack.emplace_back(I, Level); } return Res; } @@ -6652,11 +6868,10 @@ bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, } bool SLPVectorizerPass::vectorizeSimpleInstructions( - SmallVectorImpl<WeakVH> &Instructions, BasicBlock *BB, BoUpSLP &R) { + SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) { bool OpsChanged = false; - for (auto &VH : reverse(Instructions)) { - auto *I = dyn_cast_or_null<Instruction>(VH); - if (!I) + for (auto *I : reverse(Instructions)) { + if (R.isDeleted(I)) continue; if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); @@ -6685,7 +6900,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (!P) break; - if (!VisitedInstrs.count(P)) + if (!VisitedInstrs.count(P) && !R.isDeleted(P)) Incoming.push_back(P); } @@ -6729,9 +6944,12 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { VisitedInstrs.clear(); - SmallVector<WeakVH, 8> PostProcessInstructions; + SmallVector<Instruction *, 8> PostProcessInstructions; SmallDenseSet<Instruction *, 4> KeyNodes; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + // Skip instructions marked for the deletion. + if (R.isDeleted(&*it)) + continue; // We may go through BB multiple times so skip the one we have checked. if (!VisitedInstrs.insert(&*it).second) { if (it->use_empty() && KeyNodes.count(&*it) > 0 && @@ -6811,10 +7029,16 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length " << Entry.second.size() << ".\n"); - // We process the getelementptr list in chunks of 16 (like we do for - // stores) to minimize compile-time. - for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) { - auto Len = std::min<unsigned>(BE - BI, 16); + // Process the GEP list in chunks suitable for the target's supported + // vector size. If a vector register can't hold 1 element, we are done. + unsigned MaxVecRegSize = R.getMaxVecRegSize(); + unsigned EltSize = R.getVectorElementSize(Entry.second[0]); + if (MaxVecRegSize < EltSize) + continue; + + unsigned MaxElts = MaxVecRegSize / EltSize; + for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) { + auto Len = std::min<unsigned>(BE - BI, MaxElts); auto GEPList = makeArrayRef(&Entry.second[BI], Len); // Initialize a set a candidate getelementptrs. 
Note that we use a
@@ -6824,10 +7048,10 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
       SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
 
       // Some of the candidates may have already been vectorized after we
-      // initially collected them. If so, the WeakTrackingVHs will have
-      // nullified the
-      // values, so remove them from the set of candidates.
-      Candidates.remove(nullptr);
+      // initially collected them. If so, they are marked as deleted, so remove
+      // them from the set of candidates.
+      Candidates.remove_if(
+          [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); });
 
       // Remove from the set of candidates all pairs of getelementptrs with
       // constant differences. Such getelementptrs are likely not good
       // candidates for vectorization in a bottom-up phase since one can be
       // computed from the other. We also ensure all candidate getelementptr
       // indices are unique.
       for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
-        auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
+        auto *GEPI = GEPList[I];
         if (!Candidates.count(GEPI))
           continue;
         auto *SCEVI = SE->getSCEV(GEPList[I]);
         for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
-          auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
+          auto *GEPJ = GEPList[J];
           auto *SCEVJ = SE->getSCEV(GEPList[J]);
           if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
-            Candidates.remove(GEPList[I]);
-            Candidates.remove(GEPList[J]);
+            Candidates.remove(GEPI);
+            Candidates.remove(GEPJ);
           } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
-            Candidates.remove(GEPList[J]);
+            Candidates.remove(GEPJ);
           }
         }
       }
diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp
index 517d759d7bfc..4b80d1fb20aa 100644
--- a/lib/Transforms/Vectorize/VPlan.cpp
+++ b/lib/Transforms/Vectorize/VPlan.cpp
@@ -283,6 +283,12 @@ iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
   return getParent()->getRecipeList().erase(getIterator());
 }
 
+void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
+  InsertPos->getParent()->getRecipeList().splice(
+      std::next(InsertPos->getIterator()), getParent()->getRecipeList(),
+      getIterator());
+}
+
 void VPInstruction::generateInstruction(VPTransformState &State,
                                         unsigned Part) {
   IRBuilder<> &Builder = State.Builder;
@@ -309,6 +315,14 @@ void VPInstruction::generateInstruction(VPTransformState &State,
     State.set(this, V, Part);
     break;
   }
+  case Instruction::Select: {
+    Value *Cond = State.get(getOperand(0), Part);
+    Value *Op1 = State.get(getOperand(1), Part);
+    Value *Op2 = State.get(getOperand(2), Part);
+    Value *V = Builder.CreateSelect(Cond, Op1, Op2);
+    State.set(this, V, Part);
+    break;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -728,7 +742,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
         auto NewIGIter = Old2New.find(IG);
         if (NewIGIter == Old2New.end())
           Old2New[IG] = new InterleaveGroup<VPInstruction>(
-              IG->getFactor(), IG->isReverse(), IG->getAlignment());
+              IG->getFactor(), IG->isReverse(), Align(IG->getAlignment()));
 
         if (Inst == IG->getInsertPos())
           Old2New[IG]->setInsertPos(VPInst);
@@ -736,7 +750,8 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
         InterleaveGroupMap[VPInst] = Old2New[IG];
         InterleaveGroupMap[VPInst]->insertMember(
             VPInst, IG->getIndex(Inst),
-            IG->isReverse() ? (-1) * int(IG->getFactor()) : IG->getFactor());
+            Align(IG->isReverse() ? (-1) * int(IG->getFactor())
+                                  : IG->getFactor()));
       }
     } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
       visitRegion(Region, Old2New, IAI);
diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h
index 8a06412ad590..44d8a198f27e 100644
--- a/lib/Transforms/Vectorize/VPlan.h
+++ b/lib/Transforms/Vectorize/VPlan.h
@@ -615,6 +615,10 @@ public:
   /// the specified recipe.
   void insertBefore(VPRecipeBase *InsertPos);
 
+  /// Unlink this recipe from its current VPBasicBlock and insert it into
+  /// the VPBasicBlock that MovePos lives in, right after MovePos.
+  void moveAfter(VPRecipeBase *MovePos);
+
   /// This method unlinks 'this' from the containing basic block and deletes it.
   ///
   /// \returns an iterator pointing to the element after the erased one
diff --git a/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
index 7ed7d21b6caa..b22d3190d654 100644
--- a/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
+++ b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
@@ -21,7 +21,7 @@ void VPlanHCFGTransforms::VPInstructionsToVPRecipes(
     LoopVectorizationLegality::InductionList *Inductions,
     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
 
-  VPRegionBlock *TopRegion = dyn_cast<VPRegionBlock>(Plan->getEntry());
+  auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry());
   ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry());
 
   // Condition bit VPValues get deleted during transformation to VPRecipes.
diff --git a/lib/Transforms/Vectorize/VPlanSLP.cpp b/lib/Transforms/Vectorize/VPlanSLP.cpp
index e5ab24e52df6..9019ed15ec5f 100644
--- a/lib/Transforms/Vectorize/VPlanSLP.cpp
+++ b/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -346,11 +346,14 @@ SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
 
 void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
   dbgs() << " Ops: ";
-  for (auto Op : Values)
-    if (auto *Instr = cast_or_null<VPInstruction>(Op)->getUnderlyingInstr())
-      dbgs() << *Instr << " | ";
-    else
-      dbgs() << " nullptr | ";
+  for (auto Op : Values) {
+    if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
+      if (auto *Instr = VPInstr->getUnderlyingInstr()) {
+        dbgs() << *Instr << " | ";
+        continue;
+      }
+    dbgs() << " nullptr | ";
+  }
   dbgs() << "\n";
 }
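The VPRecipeBase::moveAfter added above is an intrusive-list splice, so moving a recipe between VPBasicBlocks is O(1) and never copies or recreates the recipe. A runnable, self-contained analogue with std::list standing in for the recipe lists of two blocks (all names illustrative):

    #include <iostream>
    #include <list>

    int main() {
      std::list<int> BlockA = {1, 2, 3}; // source block's recipe list
      std::list<int> BlockB = {10, 20};  // destination block's recipe list

      auto Recipe = std::next(BlockA.begin()); // the recipe to move ("2")
      auto MovePos = BlockB.begin();           // insert right after "10"

      // The moveAfter idiom: splice into MovePos's list, after MovePos.
      BlockB.splice(std::next(MovePos), BlockA, Recipe);

      for (int X : BlockB)
        std::cout << X << ' '; // prints: 10 2 20
      std::cout << '\n';
    }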