Diffstat (limited to 'llvm/lib/Transforms')
75 files changed, 3820 insertions, 1524 deletions
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp index abac3f801a22..4624b735bef8 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp @@ -475,12 +475,12 @@ void TruncInstCombine::ReduceExpressionDag(Type *SclTy) { // any of its operands, this way, when we get to the operand, we already // removed the instructions (from the expression dag) that uses it. CurrentTruncInst->eraseFromParent(); - for (auto I = InstInfoMap.rbegin(), E = InstInfoMap.rend(); I != E; ++I) { + for (auto &I : llvm::reverse(InstInfoMap)) { // We still need to check that the instruction has no users before we erase // it, because {SExt, ZExt}Inst Instruction might have other users that was // not reduced, in such case, we need to keep that instruction. - if (I->first->use_empty()) - I->first->eraseFromParent(); + if (I.first->use_empty()) + I.first->eraseFromParent(); } } diff --git a/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/llvm/lib/Transforms/CFGuard/CFGuard.cpp index 96c083a144b2..5fc5295969d0 100644 --- a/llvm/lib/Transforms/CFGuard/CFGuard.cpp +++ b/llvm/lib/Transforms/CFGuard/CFGuard.cpp @@ -165,6 +165,12 @@ void CFGuard::insertCFGuardCheck(CallBase *CB) { IRBuilder<> B(CB); Value *CalledOperand = CB->getCalledOperand(); + // If the indirect call is called within catchpad or cleanuppad, + // we need to copy "funclet" bundle of the call. + SmallVector<llvm::OperandBundleDef, 1> Bundles; + if (auto Bundle = CB->getOperandBundle(LLVMContext::OB_funclet)) + Bundles.push_back(OperandBundleDef(*Bundle)); + // Load the global symbol as a pointer to the check function. LoadInst *GuardCheckLoad = B.CreateLoad(GuardFnPtrType, GuardFnGlobal); @@ -172,7 +178,7 @@ void CFGuard::insertCFGuardCheck(CallBase *CB) { // even if the original CallBase is an Invoke or CallBr instruction. CallInst *GuardCheck = B.CreateCall(GuardFnType, GuardCheckLoad, - {B.CreateBitCast(CalledOperand, B.getInt8PtrTy())}); + {B.CreateBitCast(CalledOperand, B.getInt8PtrTy())}, Bundles); // Ensure that the first argument is passed in the correct register // (e.g. ECX on 32-bit X86 targets). diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index ac3d078714ce..a0d12865bd3a 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -1237,8 +1237,10 @@ namespace { struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> { using Base = PtrUseVisitor<AllocaUseVisitor>; AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT, - const CoroBeginInst &CB, const SuspendCrossingInfo &Checker) - : PtrUseVisitor(DL), DT(DT), CoroBegin(CB), Checker(Checker) {} + const CoroBeginInst &CB, const SuspendCrossingInfo &Checker, + bool ShouldUseLifetimeStartInfo) + : PtrUseVisitor(DL), DT(DT), CoroBegin(CB), Checker(Checker), + ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {} void visit(Instruction &I) { Users.insert(&I); @@ -1390,6 +1392,7 @@ private: SmallPtrSet<Instruction *, 4> Users{}; SmallPtrSet<IntrinsicInst *, 2> LifetimeStarts{}; bool MayWriteBeforeCoroBegin{false}; + bool ShouldUseLifetimeStartInfo{true}; mutable llvm::Optional<bool> ShouldLiveOnFrame{}; @@ -1398,7 +1401,7 @@ private: // more precise. We look at every pair of lifetime.start intrinsic and // every basic block that uses the pointer to see if they cross suspension // points. 
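// Editorial sketch (not part of the patch): the operand-bundle pattern the
// CFGuard change above relies on. Copying an existing "funclet" bundle onto a
// newly created call keeps that call inside the same EH funclet as the
// original call site. The helper name and arguments are illustrative.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

static CallInst *emitCallPreservingFunclet(IRBuilder<> &B, FunctionCallee Fn,
                                           Value *Arg, CallBase *OrigCB) {
  SmallVector<OperandBundleDef, 1> Bundles;
  if (auto Bundle = OrigCB->getOperandBundle(LLVMContext::OB_funclet))
    Bundles.push_back(OperandBundleDef(*Bundle));
  // CreateCall accepts the collected bundles; an empty list is a no-op.
  return B.CreateCall(Fn, {Arg}, Bundles);
}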
The uses cover both direct uses as well as indirect uses. - if (!LifetimeStarts.empty()) { + if (ShouldUseLifetimeStartInfo && !LifetimeStarts.empty()) { for (auto *I : Users) for (auto *S : LifetimeStarts) if (Checker.isDefinitionAcrossSuspend(*S, I)) @@ -2484,8 +2487,15 @@ static void collectFrameAllocas(Function &F, coro::Shape &Shape, continue; } DominatorTree DT(F); + // The code that uses lifetime.start intrinsic does not work for functions + // with loops without exit. Disable it on ABIs we know to generate such + // code. + bool ShouldUseLifetimeStartInfo = + (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon && + Shape.ABI != coro::ABI::RetconOnce); AllocaUseVisitor Visitor{F.getParent()->getDataLayout(), DT, - *Shape.CoroBegin, Checker}; + *Shape.CoroBegin, Checker, + ShouldUseLifetimeStartInfo}; Visitor.visitPtr(*AI); if (!Visitor.getShouldLiveOnFrame()) continue; @@ -2572,9 +2582,15 @@ void coro::salvageDebugInfo( DVI->setExpression(Expr); /// It makes no sense to move the dbg.value intrinsic. if (!isa<DbgValueInst>(DVI)) { - if (auto *InsertPt = dyn_cast<Instruction>(Storage)) + if (auto *II = dyn_cast<InvokeInst>(Storage)) + DVI->moveBefore(II->getNormalDest()->getFirstNonPHI()); + else if (auto *CBI = dyn_cast<CallBrInst>(Storage)) + DVI->moveBefore(CBI->getDefaultDest()->getFirstNonPHI()); + else if (auto *InsertPt = dyn_cast<Instruction>(Storage)) { + assert(!InsertPt->isTerminator() && + "Unimaged terminator that could return a storage."); DVI->moveAfter(InsertPt); - else if (isa<Argument>(Storage)) + } else if (isa<Argument>(Storage)) DVI->moveAfter(F->getEntryBlock().getFirstNonPHI()); } } @@ -2664,7 +2680,10 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { } } - sinkLifetimeStartMarkers(F, Shape, Checker); + if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon && + Shape.ABI != coro::ABI::RetconOnce) + sinkLifetimeStartMarkers(F, Shape, Checker); + if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty()) collectFrameAllocas(F, Shape, Checker, FrameData.Allocas); LLVM_DEBUG(dumpAllocas(FrameData.Allocas)); diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index fa1d92f439b8..12c1829524ef 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -280,6 +280,27 @@ static void replaceFallthroughCoroEnd(AnyCoroEndInst *End, BB->getTerminator()->eraseFromParent(); } +// Mark a coroutine as done, which implies that the coroutine is finished and +// never get resumed. +// +// In resume-switched ABI, the done state is represented by storing zero in +// ResumeFnAddr. +// +// NOTE: We couldn't omit the argument `FramePtr`. It is necessary because the +// pointer to the frame in splitted function is not stored in `Shape`. +static void markCoroutineAsDone(IRBuilder<> &Builder, const coro::Shape &Shape, + Value *FramePtr) { + assert( + Shape.ABI == coro::ABI::Switch && + "markCoroutineAsDone is only supported for Switch-Resumed ABI for now."); + auto *GepIndex = Builder.CreateStructGEP( + Shape.FrameTy, FramePtr, coro::Shape::SwitchFieldIndex::Resume, + "ResumeFn.addr"); + auto *NullPtr = ConstantPointerNull::get(cast<PointerType>( + Shape.FrameTy->getTypeAtIndex(coro::Shape::SwitchFieldIndex::Resume))); + Builder.CreateStore(NullPtr, GepIndex); +} + /// Replace an unwind call to llvm.coro.end. 
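// Editorial sketch (not part of the patch): what "done" means at the IR level
// for the switch-resumed ABI. markCoroutineAsDone above stores a null resume
// pointer into the frame, so a done-check reduces to loading that slot and
// comparing it against null. The field index 0 is assumed to mirror
// coro::Shape::SwitchFieldIndex::Resume; the helper name is illustrative.
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static Value *emitCoroIsDone(IRBuilder<> &B, StructType *FrameTy,
                             Value *FramePtr) {
  Value *Slot =
      B.CreateStructGEP(FrameTy, FramePtr, /*Resume*/ 0, "ResumeFn.addr");
  Value *ResumeFn = B.CreateLoad(FrameTy->getElementType(0), Slot, "resume.fn");
  return B.CreateIsNull(ResumeFn, "coro.done");
}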
static void replaceUnwindCoroEnd(AnyCoroEndInst *End, const coro::Shape &Shape, Value *FramePtr, bool InResume, @@ -288,10 +309,18 @@ static void replaceUnwindCoroEnd(AnyCoroEndInst *End, const coro::Shape &Shape, switch (Shape.ABI) { // In switch-lowering, this does nothing in the main function. - case coro::ABI::Switch: + case coro::ABI::Switch: { + // In C++'s specification, the coroutine should be marked as done + // if promise.unhandled_exception() throws. The frontend will + // call coro.end(true) along this path. + // + // FIXME: We should refactor this once there is other language + // which uses Switch-Resumed style other than C++. + markCoroutineAsDone(Builder, Shape, FramePtr); if (!InResume) return; break; + } // In async lowering this does nothing. case coro::ABI::Async: break; @@ -364,13 +393,9 @@ static void createResumeEntryBlock(Function &F, coro::Shape &Shape) { auto *Save = S->getCoroSave(); Builder.SetInsertPoint(Save); if (S->isFinal()) { - // Final suspend point is represented by storing zero in ResumeFnAddr. - auto *GepIndex = Builder.CreateStructGEP(FrameTy, FramePtr, - coro::Shape::SwitchFieldIndex::Resume, - "ResumeFn.addr"); - auto *NullPtr = ConstantPointerNull::get(cast<PointerType>( - FrameTy->getTypeAtIndex(coro::Shape::SwitchFieldIndex::Resume))); - Builder.CreateStore(NullPtr, GepIndex); + // The coroutine should be marked done if it reaches the final suspend + // point. + markCoroutineAsDone(Builder, Shape, FramePtr); } else { auto *GepIndex = Builder.CreateStructGEP( FrameTy, FramePtr, Shape.getSwitchIndexField(), "index.addr"); diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index e4883ef89db7..fba8b03e44ba 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -141,7 +141,6 @@ static bool isCoroutineIntrinsicName(StringRef Name) { "llvm.coro.id.retcon", "llvm.coro.id.retcon.once", "llvm.coro.noop", - "llvm.coro.param", "llvm.coro.prepare.async", "llvm.coro.prepare.retcon", "llvm.coro.promise", diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 93bb11433775..3a42a2cac928 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -835,14 +835,20 @@ bool ArgumentPromotionPass::areFunctionArgsABICompatible( const Function &F, const TargetTransformInfo &TTI, SmallPtrSetImpl<Argument *> &ArgsToPromote, SmallPtrSetImpl<Argument *> &ByValArgsToTransform) { + // TODO: Check individual arguments so we can promote a subset? 
+ SmallVector<Type *, 32> Types; + for (Argument *Arg : ArgsToPromote) + Types.push_back(Arg->getType()->getPointerElementType()); + for (Argument *Arg : ByValArgsToTransform) + Types.push_back(Arg->getParamByValType()); + for (const Use &U : F.uses()) { CallBase *CB = dyn_cast<CallBase>(U.getUser()); if (!CB) return false; const Function *Caller = CB->getCaller(); const Function *Callee = CB->getCalledFunction(); - if (!TTI.areFunctionArgsABICompatible(Caller, Callee, ArgsToPromote) || - !TTI.areFunctionArgsABICompatible(Caller, Callee, ByValArgsToTransform)) + if (!TTI.areTypesABICompatible(Caller, Callee, Types)) return false; } return true; diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index edadc79e3a9f..7e729e57153c 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -2139,12 +2139,10 @@ bool Attributor::shouldSeedAttribute(AbstractAttribute &AA) { bool Result = true; #ifndef NDEBUG if (SeedAllowList.size() != 0) - Result = - std::count(SeedAllowList.begin(), SeedAllowList.end(), AA.getName()); + Result = llvm::is_contained(SeedAllowList, AA.getName()); Function *Fn = AA.getAnchorScope(); if (FunctionSeedAllowList.size() != 0 && Fn) - Result &= std::count(FunctionSeedAllowList.begin(), - FunctionSeedAllowList.end(), Fn->getName()); + Result &= llvm::is_contained(FunctionSeedAllowList, Fn->getName()); #endif return Result; } diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index ec08287393de..b977821bcaa6 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -417,7 +417,7 @@ const Value *stripAndAccumulateMinimalOffsets( AttributorAnalysis); } -static const Value *getMinimalBaseOfAccsesPointerOperand( +static const Value *getMinimalBaseOfAccessPointerOperand( Attributor &A, const AbstractAttribute &QueryingAA, const Instruction *I, int64_t &BytesOffset, const DataLayout &DL, bool AllowNonInbounds = false) { const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false); @@ -2129,7 +2129,7 @@ static int64_t getKnownNonNullAndDerefBytesForUse( int64_t Offset; const Value *Base = - getMinimalBaseOfAccsesPointerOperand(A, QueryingAA, I, Offset, DL); + getMinimalBaseOfAccessPointerOperand(A, QueryingAA, I, Offset, DL); if (Base) { if (Base == &AssociatedValue && getPointerOperand(I, /* AllowVolatile */ false) == UseV) { @@ -6414,31 +6414,36 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { return indicatePessimisticFixpoint(); } + // Collect the types that will replace the privatizable type in the function + // signature. + SmallVector<Type *, 16> ReplacementTypes; + identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes); + // Verify callee and caller agree on how the promoted argument would be // passed. - // TODO: The use of the ArgumentPromotion interface here is ugly, we need a - // specialized form of TargetTransformInfo::areFunctionArgsABICompatible - // which doesn't require the arguments ArgumentPromotion wanted to pass. 
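// Editorial sketch (not part of the patch): the llvm/ADT/STLExtras.h range
// helpers the cleanups above switch to. llvm::reverse() replaces manual
// rbegin()/rend() loops and llvm::is_contained() replaces std::count()-style
// membership tests. The data here is illustrative.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
using namespace llvm;

static bool demoRangeHelpers() {
  SmallVector<StringRef, 4> Names = {"a", "b", "c"};
  for (StringRef N : llvm::reverse(Names))   // visits "c", "b", "a"
    (void)N;
  return llvm::is_contained(Names, "b");     // true
}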
Function &Fn = *getIRPosition().getAnchorScope(); - SmallPtrSet<Argument *, 1> ArgsToPromote, Dummy; - ArgsToPromote.insert(getAssociatedArgument()); const auto *TTI = A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(Fn); - if (!TTI || - !ArgumentPromotionPass::areFunctionArgsABICompatible( - Fn, *TTI, ArgsToPromote, Dummy) || - ArgsToPromote.empty()) { + if (!TTI) { + LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Missing TTI for function " + << Fn.getName() << "\n"); + return indicatePessimisticFixpoint(); + } + + auto CallSiteCheck = [&](AbstractCallSite ACS) { + CallBase *CB = ACS.getInstruction(); + return TTI->areTypesABICompatible( + CB->getCaller(), CB->getCalledFunction(), ReplacementTypes); + }; + bool AllCallSitesKnown; + if (!A.checkForAllCallSites(CallSiteCheck, *this, true, + AllCallSitesKnown)) { LLVM_DEBUG( dbgs() << "[AAPrivatizablePtr] ABI incompatibility detected for " << Fn.getName() << "\n"); return indicatePessimisticFixpoint(); } - // Collect the types that will replace the privatizable type in the function - // signature. - SmallVector<Type *, 16> ReplacementTypes; - identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes); - // Register a rewrite of the argument. Argument *Arg = getAssociatedArgument(); if (!A.isValidFunctionSignatureRewrite(*Arg, ReplacementTypes)) { @@ -6558,7 +6563,6 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { return false; }; - bool AllCallSitesKnown; if (!A.checkForAllCallSites(IsCompatiblePrivArgOfOtherCallSite, *this, true, AllCallSitesKnown)) return indicatePessimisticFixpoint(); diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index cde78713b554..321d4a19a585 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -76,6 +76,7 @@ STATISTIC(NumNoCapture, "Number of arguments marked nocapture"); STATISTIC(NumReturned, "Number of arguments marked returned"); STATISTIC(NumReadNoneArg, "Number of arguments marked readnone"); STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly"); +STATISTIC(NumWriteOnlyArg, "Number of arguments marked writeonly"); STATISTIC(NumNoAlias, "Number of function returns marked noalias"); STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull"); STATISTIC(NumNoRecurse, "Number of functions marked as norecurse"); @@ -580,16 +581,8 @@ struct ArgumentUsesTracker : public CaptureTracker { return true; } - // Note: the callee and the two successor blocks *follow* the argument - // operands. This means there is no need to adjust UseIndex to account for - // these. - - unsigned UseIndex = - std::distance(const_cast<const Use *>(CB->arg_begin()), U); - - assert(UseIndex < CB->data_operands_size() && - "Indirect function calls should have been filtered above!"); - + assert(!CB->isCallee(U) && "callee operand reported captured?"); + const unsigned UseIndex = CB->getDataOperandNo(U); if (UseIndex >= CB->arg_size()) { // Data operand, but not a argument operand -- must be a bundle operand assert(CB->hasOperandBundles() && "Must be!"); @@ -649,8 +642,8 @@ struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> { /// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone. 
static Attribute::AttrKind -determinePointerReadAttrs(Argument *A, - const SmallPtrSet<Argument *, 8> &SCCNodes) { +determinePointerAccessAttrs(Argument *A, + const SmallPtrSet<Argument *, 8> &SCCNodes) { SmallVector<Use *, 32> Worklist; SmallPtrSet<Use *, 32> Visited; @@ -659,7 +652,7 @@ determinePointerReadAttrs(Argument *A, return Attribute::None; bool IsRead = false; - // We don't need to track IsWritten. If A is written to, return immediately. + bool IsWrite = false; for (Use &U : A->uses()) { Visited.insert(&U); @@ -667,6 +660,10 @@ determinePointerReadAttrs(Argument *A, } while (!Worklist.empty()) { + if (IsWrite && IsRead) + // No point in searching further.. + return Attribute::None; + Use *U = Worklist.pop_back_val(); Instruction *I = cast<Instruction>(U->getUser()); @@ -684,73 +681,49 @@ determinePointerReadAttrs(Argument *A, case Instruction::Call: case Instruction::Invoke: { - bool Captures = true; + CallBase &CB = cast<CallBase>(*I); + if (CB.isCallee(U)) { + IsRead = true; + // Note that indirect calls do not capture, see comment in + // CaptureTracking for context + continue; + } - if (I->getType()->isVoidTy()) - Captures = false; + // Given we've explictily handled the callee operand above, what's left + // must be a data operand (e.g. argument or operand bundle) + const unsigned UseIndex = CB.getDataOperandNo(U); - auto AddUsersToWorklistIfCapturing = [&] { - if (Captures) + if (!CB.doesNotCapture(UseIndex)) { + if (!CB.onlyReadsMemory()) + // If the callee can save a copy into other memory, then simply + // scanning uses of the call is insufficient. We have no way + // of tracking copies of the pointer through memory to see + // if a reloaded copy is written to, thus we must give up. + return Attribute::None; + // Push users for processing once we finish this one + if (!I->getType()->isVoidTy()) for (Use &UU : I->uses()) if (Visited.insert(&UU).second) Worklist.push_back(&UU); - }; - - CallBase &CB = cast<CallBase>(*I); - if (CB.doesNotAccessMemory()) { - AddUsersToWorklistIfCapturing(); - continue; } + + if (CB.doesNotAccessMemory()) + continue; - Function *F = CB.getCalledFunction(); - if (!F) { - if (CB.onlyReadsMemory()) { - IsRead = true; - AddUsersToWorklistIfCapturing(); - continue; - } - return Attribute::None; - } - - // Note: the callee and the two successor blocks *follow* the argument - // operands. This means there is no need to adjust UseIndex to account - // for these. - - unsigned UseIndex = std::distance(CB.arg_begin(), U); - - // U cannot be the callee operand use: since we're exploring the - // transitive uses of an Argument, having such a use be a callee would - // imply the call site is an indirect call or invoke; and we'd take the - // early exit above. - assert(UseIndex < CB.data_operands_size() && - "Data operand use expected!"); - - bool IsOperandBundleUse = UseIndex >= CB.arg_size(); + if (Function *F = CB.getCalledFunction()) + if (CB.isArgOperand(U) && UseIndex < F->arg_size() && + SCCNodes.count(F->getArg(UseIndex))) + // This is an argument which is part of the speculative SCC. Note + // that only operands corresponding to formal arguments of the callee + // can participate in the speculation. + break; - if (UseIndex >= F->arg_size() && !IsOperandBundleUse) { - assert(F->isVarArg() && "More params than args in non-varargs call"); + // The accessors used on call site here do the right thing for calls and + // invokes with operand bundles. 
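// Editorial sketch (not part of the patch): how the rewritten use-walk above
// classifies a Use on a call site. getDataOperandNo() numbers the argument
// operands first and the operand-bundle operands after them, so comparing the
// index against arg_size() separates the two once the callee operand has been
// handled. The enum and helper are illustrative.
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

enum class UseKind { Callee, Argument, BundleOperand };

static UseKind classifyCallUse(const CallBase &CB, const Use &U) {
  if (CB.isCallee(&U))
    return UseKind::Callee;               // the called operand itself
  unsigned Idx = CB.getDataOperandNo(&U); // argument or bundle operand
  return Idx < CB.arg_size() ? UseKind::Argument : UseKind::BundleOperand;
}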
+ if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex)) return Attribute::None; - } - - Captures &= !CB.doesNotCapture(UseIndex); - - // Since the optimizer (by design) cannot see the data flow corresponding - // to a operand bundle use, these cannot participate in the optimistic SCC - // analysis. Instead, we model the operand bundle uses as arguments in - // call to a function external to the SCC. - if (IsOperandBundleUse || - !SCCNodes.count(&*std::next(F->arg_begin(), UseIndex))) { - - // The accessors used on call site here do the right thing for calls and - // invokes with operand bundles. - - if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex)) - return Attribute::None; - if (!CB.doesNotAccessMemory(UseIndex)) - IsRead = true; - } - - AddUsersToWorklistIfCapturing(); + if (!CB.doesNotAccessMemory(UseIndex)) + IsRead = true; break; } @@ -763,6 +736,19 @@ determinePointerReadAttrs(Argument *A, IsRead = true; break; + case Instruction::Store: + if (cast<StoreInst>(I)->getValueOperand() == *U) + // untrackable capture + return Attribute::None; + + // A volatile store has side effects beyond what writeonly can be relied + // upon. + if (cast<StoreInst>(I)->isVolatile()) + return Attribute::None; + + IsWrite = true; + break; + case Instruction::ICmp: case Instruction::Ret: break; @@ -772,7 +758,14 @@ determinePointerReadAttrs(Argument *A, } } - return IsRead ? Attribute::ReadOnly : Attribute::ReadNone; + if (IsWrite && IsRead) + return Attribute::None; + else if (IsRead) + return Attribute::ReadOnly; + else if (IsWrite) + return Attribute::WriteOnly; + else + return Attribute::ReadNone; } /// Deduce returned attributes for the SCC. @@ -865,9 +858,10 @@ static bool addArgumentAttrsFromCallsites(Function &F) { return Changed; } -static bool addReadAttr(Argument *A, Attribute::AttrKind R) { - assert((R == Attribute::ReadOnly || R == Attribute::ReadNone) - && "Must be a Read attribute."); +static bool addAccessAttr(Argument *A, Attribute::AttrKind R) { + assert((R == Attribute::ReadOnly || R == Attribute::ReadNone || + R == Attribute::WriteOnly) + && "Must be an access attribute."); assert(A && "Argument must not be null."); // If the argument already has the attribute, nothing needs to be done. @@ -880,7 +874,12 @@ static bool addReadAttr(Argument *A, Attribute::AttrKind R) { A->removeAttr(Attribute::ReadOnly); A->removeAttr(Attribute::ReadNone); A->addAttr(R); - R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg; + if (R == Attribute::ReadOnly) + ++NumReadOnlyArg; + else if (R == Attribute::WriteOnly) + ++NumWriteOnlyArg; + else + ++NumReadNoneArg; return true; } @@ -945,15 +944,15 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, // Otherwise, it's captured. Don't bother doing SCC analysis on it. } if (!HasNonLocalUses && !A->onlyReadsMemory()) { - // Can we determine that it's readonly/readnone without doing an SCC? - // Note that we don't allow any calls at all here, or else our result - // will be dependent on the iteration order through the functions in the - // SCC. + // Can we determine that it's readonly/readnone/writeonly without doing + // an SCC? Note that we don't allow any calls at all here, or else our + // result will be dependent on the iteration order through the + // functions in the SCC. 
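// Editorial sketch (not part of the patch): the kind of argument the new
// writeonly inference above targets. 'Out' is only ever stored through, never
// read or captured, so determinePointerAccessAttrs() can now report
// Attribute::WriteOnly for it; the read-only case was already handled.
void clearFlag(int *Out) { *Out = 0; }        // expect: nocapture writeonly
int loadFlag(const int *In) { return *In; }   // expect: nocapture readonly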
SmallPtrSet<Argument *, 8> Self; Self.insert(&*A); - Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self); + Attribute::AttrKind R = determinePointerAccessAttrs(&*A, Self); if (R != Attribute::None) - if (addReadAttr(A, R)) + if (addAccessAttr(A, R)) Changed.insert(F); } } @@ -979,6 +978,13 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, A->addAttr(Attribute::NoCapture); ++NumNoCapture; Changed.insert(A->getParent()); + + // Infer the access attributes given the new nocapture one + SmallPtrSet<Argument *, 8> Self; + Self.insert(&*A); + Attribute::AttrKind R = determinePointerAccessAttrs(&*A, Self); + if (R != Attribute::None) + addAccessAttr(A, R); } continue; } @@ -1023,10 +1029,10 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, Changed.insert(A->getParent()); } - // We also want to compute readonly/readnone. With a small number of false - // negatives, we can assume that any pointer which is captured isn't going - // to be provably readonly or readnone, since by definition we can't - // analyze all uses of a captured pointer. + // We also want to compute readonly/readnone/writeonly. With a small number + // of false negatives, we can assume that any pointer which is captured + // isn't going to be provably readonly or readnone, since by definition + // we can't analyze all uses of a captured pointer. // // The false negatives happen when the pointer is captured by a function // that promises readonly/readnone behaviour on the pointer, then the @@ -1034,24 +1040,28 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, // Also, a readonly/readnone pointer may be returned, but returning a // pointer is capturing it. - Attribute::AttrKind ReadAttr = Attribute::ReadNone; - for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { + auto meetAccessAttr = [](Attribute::AttrKind A, Attribute::AttrKind B) { + if (A == B) + return A; + if (A == Attribute::ReadNone) + return B; + if (B == Attribute::ReadNone) + return A; + return Attribute::None; + }; + + Attribute::AttrKind AccessAttr = Attribute::ReadNone; + for (unsigned i = 0, e = ArgumentSCC.size(); + i != e && AccessAttr != Attribute::None; ++i) { Argument *A = ArgumentSCC[i]->Definition; - Attribute::AttrKind K = determinePointerReadAttrs(A, ArgumentSCCNodes); - if (K == Attribute::ReadNone) - continue; - if (K == Attribute::ReadOnly) { - ReadAttr = Attribute::ReadOnly; - continue; - } - ReadAttr = K; - break; + Attribute::AttrKind K = determinePointerAccessAttrs(A, ArgumentSCCNodes); + AccessAttr = meetAccessAttr(AccessAttr, K); } - if (ReadAttr != Attribute::None) { + if (AccessAttr != Attribute::None) { for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; - if (addReadAttr(A, ReadAttr)) + if (addAccessAttr(A, AccessAttr)) Changed.insert(A->getParent()); } } diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index fbd083bb9bbf..2425646455bd 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -64,8 +64,8 @@ static cl::opt<unsigned> FuncSpecializationMaxIters( cl::desc("The maximum number of iterations function specialization is run"), cl::init(1)); -static cl::opt<unsigned> MaxConstantsThreshold( - "func-specialization-max-constants", cl::Hidden, +static cl::opt<unsigned> MaxClonesThreshold( + "func-specialization-max-clones", cl::Hidden, cl::desc("The maximum number of clones allowed for a single function " 
"specialization"), cl::init(3)); @@ -92,6 +92,28 @@ static cl::opt<bool> EnableSpecializationForLiteralConstant( cl::desc("Enable specialization of functions that take a literal constant " "as an argument.")); +namespace { +// Bookkeeping struct to pass data from the analysis and profitability phase +// to the actual transform helper functions. +struct ArgInfo { + Function *Fn; // The function to perform specialisation on. + Argument *Arg; // The Formal argument being analysed. + Constant *Const; // A corresponding actual constant argument. + InstructionCost Gain; // Profitability: Gain = Bonus - Cost. + + // Flag if this will be a partial specialization, in which case we will need + // to keep the original function around in addition to the added + // specializations. + bool Partial = false; + + ArgInfo(Function *F, Argument *A, Constant *C, InstructionCost G) + : Fn(F), Arg(A), Const(C), Gain(G){}; +}; +} // Anonymous namespace + +using FuncList = SmallVectorImpl<Function *>; +using ConstList = SmallVectorImpl<Constant *>; + // Helper to check if \p LV is either a constant or a constant // range with a single element. This should cover exactly the same cases as the // old ValueLatticeElement::isConstant() and is intended to be used in the @@ -169,7 +191,7 @@ static Constant *getConstantStackValue(CallInst *Call, Value *Val, // ret void // } // -static void constantArgPropagation(SmallVectorImpl<Function *> &WorkList, +static void constantArgPropagation(FuncList &WorkList, Module &M, SCCPSolver &Solver) { // Iterate over the argument tracked functions see if there // are any new constant values for the call instruction via @@ -254,40 +276,33 @@ public: /// /// \returns true if at least one function is specialized. bool - specializeFunctions(SmallVectorImpl<Function *> &FuncDecls, - SmallVectorImpl<Function *> &CurrentSpecializations) { - - // Attempt to specialize the argument-tracked functions. + specializeFunctions(FuncList &FuncDecls, + FuncList &CurrentSpecializations) { bool Changed = false; for (auto *F : FuncDecls) { - if (specializeFunction(F, CurrentSpecializations)) { - Changed = true; - LLVM_DEBUG(dbgs() << "FnSpecialization: Can specialize this func.\n"); - } else { + if (!isCandidateFunction(F, CurrentSpecializations)) + continue; + + auto Cost = getSpecializationCost(F); + if (!Cost.isValid()) { LLVM_DEBUG( - dbgs() << "FnSpecialization: Cannot specialize this func.\n"); + dbgs() << "FnSpecialization: Invalid specialisation cost.\n"); + continue; } - } - for (auto *SpecializedFunc : CurrentSpecializations) { - SpecializedFuncs.insert(SpecializedFunc); - - // Initialize the state of the newly created functions, marking them - // argument-tracked and executable. - if (SpecializedFunc->hasExactDefinition() && - !SpecializedFunc->hasFnAttribute(Attribute::Naked)) - Solver.addTrackedFunction(SpecializedFunc); - Solver.addArgumentTrackedFunction(SpecializedFunc); - FuncDecls.push_back(SpecializedFunc); - Solver.markBlockExecutable(&SpecializedFunc->front()); + auto ConstArgs = calculateGains(F, Cost); + if (ConstArgs.empty()) { + LLVM_DEBUG(dbgs() << "FnSpecialization: no possible constants found\n"); + continue; + } - // Replace the function arguments for the specialized functions. 
- for (Argument &Arg : SpecializedFunc->args()) - if (!Arg.use_empty() && tryToReplaceWithConstant(&Arg)) - LLVM_DEBUG(dbgs() << "FnSpecialization: Replaced constant argument: " - << Arg.getName() << "\n"); + for (auto &CA : ConstArgs) { + specializeFunction(CA, CurrentSpecializations); + Changed = true; + } } + updateSpecializedFuncs(FuncDecls, CurrentSpecializations); NumFuncSpecialized += NbFunctionsSpecialized; return Changed; } @@ -333,15 +348,83 @@ private: return Clone; } - /// This function decides whether to specialize function \p F based on the - /// known constant values its arguments can take on. Specialization is - /// performed on the first interesting argument. Specializations based on - /// additional arguments will be evaluated on following iterations of the - /// main IPSCCP solve loop. \returns true if the function is specialized and - /// false otherwise. - bool specializeFunction(Function *F, - SmallVectorImpl<Function *> &Specializations) { + /// This function decides whether it's worthwhile to specialize function \p F + /// based on the known constant values its arguments can take on, i.e. it + /// calculates a gain and returns a list of actual arguments that are deemed + /// profitable to specialize. Specialization is performed on the first + /// interesting argument. Specializations based on additional arguments will + /// be evaluated on following iterations of the main IPSCCP solve loop. + SmallVector<ArgInfo> calculateGains(Function *F, InstructionCost Cost) { + SmallVector<ArgInfo> Worklist; + // Determine if we should specialize the function based on the values the + // argument can take on. If specialization is not profitable, we continue + // on to the next argument. + for (Argument &FormalArg : F->args()) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing arg: " + << FormalArg.getName() << "\n"); + // Determine if this argument is interesting. If we know the argument can + // take on any constant values, they are collected in Constants. If the + // argument can only ever equal a constant value in Constants, the + // function will be completely specialized, and the IsPartial flag will + // be set to false by isArgumentInteresting (that function only adds + // values to the Constants list that are deemed profitable). + bool IsPartial = true; + SmallVector<Constant *> ActualConstArg; + if (!isArgumentInteresting(&FormalArg, ActualConstArg, IsPartial)) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is not interesting\n"); + continue; + } + + for (auto *ActualArg : ActualConstArg) { + InstructionCost Gain = + ForceFunctionSpecialization + ? 1 + : getSpecializationBonus(&FormalArg, ActualArg) - Cost; + if (Gain <= 0) + continue; + Worklist.push_back({F, &FormalArg, ActualArg, Gain}); + } + + if (Worklist.empty()) + continue; + + // Sort the candidates in descending order. + llvm::stable_sort(Worklist, [](const ArgInfo &L, const ArgInfo &R) { + return L.Gain > R.Gain; + }); + + // Truncate the worklist to 'MaxClonesThreshold' candidates if + // necessary. 
+ if (Worklist.size() > MaxClonesThreshold) { + LLVM_DEBUG(dbgs() << "FnSpecialization: number of candidates exceed " + << "the maximum number of clones threshold.\n" + << "Truncating worklist to " << MaxClonesThreshold + << " candidates.\n"); + Worklist.erase(Worklist.begin() + MaxClonesThreshold, + Worklist.end()); + } + + if (IsPartial || Worklist.size() < ActualConstArg.size()) + for (auto &ActualArg : Worklist) + ActualArg.Partial = true; + + LLVM_DEBUG(dbgs() << "Sorted list of candidates by gain:\n"; + for (auto &C + : Worklist) { + dbgs() << "- Function = " << C.Fn->getName() << ", "; + dbgs() << "FormalArg = " << C.Arg->getName() << ", "; + dbgs() << "ActualArg = " << C.Const->getName() << ", "; + dbgs() << "Gain = " << C.Gain << "\n"; + }); + + // FIXME: Only one argument per function. + break; + } + return Worklist; + } + + bool isCandidateFunction(Function *F, FuncList &Specializations) { // Do not specialize the cloned function again. if (SpecializedFuncs.contains(F)) return false; @@ -362,84 +445,32 @@ private: LLVM_DEBUG(dbgs() << "FnSpecialization: Try function: " << F->getName() << "\n"); + return true; + } - // Determine if it would be profitable to create a specialization of the - // function where the argument takes on the given constant value. If so, - // add the constant to Constants. - auto FnSpecCost = getSpecializationCost(F); - if (!FnSpecCost.isValid()) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Invalid specialisation cost.\n"); - return false; - } - - LLVM_DEBUG(dbgs() << "FnSpecialization: func specialisation cost: "; - FnSpecCost.print(dbgs()); dbgs() << "\n"); + void specializeFunction(ArgInfo &AI, FuncList &Specializations) { + Function *Clone = cloneCandidateFunction(AI.Fn); + Argument *ClonedArg = Clone->getArg(AI.Arg->getArgNo()); - // Determine if we should specialize the function based on the values the - // argument can take on. If specialization is not profitable, we continue - // on to the next argument. - for (Argument &A : F->args()) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing arg: " << A.getName() - << "\n"); - // True if this will be a partial specialization. We will need to keep - // the original function around in addition to the added specializations. - bool IsPartial = true; + // Rewrite calls to the function so that they call the clone instead. + rewriteCallSites(AI.Fn, Clone, *ClonedArg, AI.Const); - // Determine if this argument is interesting. If we know the argument can - // take on any constant values, they are collected in Constants. If the - // argument can only ever equal a constant value in Constants, the - // function will be completely specialized, and the IsPartial flag will - // be set to false by isArgumentInteresting (that function only adds - // values to the Constants list that are deemed profitable). 
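// Editorial sketch (not part of the patch): a source-level view of what the
// gain calculation above is ranking. With two constant function-pointer
// arguments, the specializer may emit up to -func-specialization-max-clones
// clones of 'compute', each with F replaced by a known callee so the indirect
// call can be resolved and inlined. All names below are illustrative.
static int square(int X) { return X * X; }
static int negate(int X) { return -X; }

void compute(int (*F)(int), int *D, int N) {
  for (int I = 0; I < N; ++I)
    D[I] = F(D[I]);            // indirect call in a loop: large potential bonus
}

void caller(int *A, int *B, int N) {
  compute(square, A, N);       // candidate: specialize on F == square
  compute(negate, B, N);       // candidate: specialize on F == negate
}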
- SmallVector<Constant *, 4> Constants; - if (!isArgumentInteresting(&A, Constants, FnSpecCost, IsPartial)) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is not interesting\n"); - continue; - } - - assert(!Constants.empty() && "No constants on which to specialize"); - LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is interesting!\n" - << "FnSpecialization: Specializing '" << F->getName() - << "' on argument: " << A << "\n" - << "FnSpecialization: Constants are:\n\n"; - for (unsigned I = 0; I < Constants.size(); ++I) dbgs() - << *Constants[I] << "\n"; - dbgs() << "FnSpecialization: End of constants\n\n"); - - // Create a version of the function in which the argument is marked - // constant with the given value. - for (auto *C : Constants) { - // Clone the function. We leave the ValueToValueMap empty to allow - // IPSCCP to propagate the constant arguments. - Function *Clone = cloneCandidateFunction(F); - Argument *ClonedArg = Clone->arg_begin() + A.getArgNo(); - - // Rewrite calls to the function so that they call the clone instead. - rewriteCallSites(F, Clone, *ClonedArg, C); - - // Initialize the lattice state of the arguments of the function clone, - // marking the argument on which we specialized the function constant - // with the given value. - Solver.markArgInFuncSpecialization(F, ClonedArg, C); - - // Mark all the specialized functions - Specializations.push_back(Clone); - NbFunctionsSpecialized++; - } + // Initialize the lattice state of the arguments of the function clone, + // marking the argument on which we specialized the function constant + // with the given value. + Solver.markArgInFuncSpecialization(AI.Fn, ClonedArg, AI.Const); - // If the function has been completely specialized, the original function - // is no longer needed. Mark it unreachable. - if (!IsPartial) - Solver.markFunctionUnreachable(F); - - // FIXME: Only one argument per function. - return true; - } + // Mark all the specialized functions + Specializations.push_back(Clone); + NbFunctionsSpecialized++; - return false; + // If the function has been completely specialized, the original function + // is no longer needed. Mark it unreachable. + if (!AI.Partial) + Solver.markFunctionUnreachable(AI.Fn); } - /// Compute the cost of specializing function \p F. + /// Compute and return the cost of specializing function \p F. InstructionCost getSpecializationCost(Function *F) { // Compute the code metrics for the function. SmallPtrSet<const Value *, 32> EphValues; @@ -578,9 +609,7 @@ private: /// /// \returns true if the function should be specialized on the given /// argument. - bool isArgumentInteresting(Argument *A, - SmallVectorImpl<Constant *> &Constants, - const InstructionCost &FnSpecCost, + bool isArgumentInteresting(Argument *A, ConstList &Constants, bool &IsPartial) { // For now, don't attempt to specialize functions based on the values of // composite types. @@ -608,42 +637,8 @@ private: // // TODO 2: this currently does not support constants, i.e. integer ranges. 
// - SmallVector<Constant *, 4> PossibleConstants; - bool AllConstant = getPossibleConstants(A, PossibleConstants); - if (PossibleConstants.empty()) { - LLVM_DEBUG(dbgs() << "FnSpecialization: no possible constants found\n"); - return false; - } - if (PossibleConstants.size() > MaxConstantsThreshold) { - LLVM_DEBUG(dbgs() << "FnSpecialization: number of constants found exceed " - << "the maximum number of constants threshold.\n"); - return false; - } - - for (auto *C : PossibleConstants) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Constant: " << *C << "\n"); - if (ForceFunctionSpecialization) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Forced!\n"); - Constants.push_back(C); - continue; - } - if (getSpecializationBonus(A, C) > FnSpecCost) { - LLVM_DEBUG(dbgs() << "FnSpecialization: profitable!\n"); - Constants.push_back(C); - } else { - LLVM_DEBUG(dbgs() << "FnSpecialization: not profitable\n"); - } - } - - // None of the constant values the argument can take on were deemed good - // candidates on which to specialize the function. - if (Constants.empty()) - return false; - - // This will be a partial specialization if some of the constants were - // rejected due to their profitability. - IsPartial = !AllConstant || PossibleConstants.size() != Constants.size(); - + IsPartial = !getPossibleConstants(A, Constants); + LLVM_DEBUG(dbgs() << "FnSpecialization: interesting arg: " << *A << "\n"); return true; } @@ -653,8 +648,7 @@ private: /// \returns true if all of the values the argument can take on are constant /// (e.g., the argument's parent function cannot be called with an /// overdefined value). - bool getPossibleConstants(Argument *A, - SmallVectorImpl<Constant *> &Constants) { + bool getPossibleConstants(Argument *A, ConstList &Constants) { Function *F = A->getParent(); bool AllConstant = true; @@ -681,7 +675,7 @@ private: // For now, constant expressions are fine but only if they are function // calls. - if (auto *CE = dyn_cast<ConstantExpr>(V)) + if (auto *CE = dyn_cast<ConstantExpr>(V)) if (!isa<Function>(CE->getOperand(0))) return false; @@ -737,6 +731,29 @@ private: } } } + + void updateSpecializedFuncs(FuncList &FuncDecls, + FuncList &CurrentSpecializations) { + for (auto *SpecializedFunc : CurrentSpecializations) { + SpecializedFuncs.insert(SpecializedFunc); + + // Initialize the state of the newly created functions, marking them + // argument-tracked and executable. + if (SpecializedFunc->hasExactDefinition() && + !SpecializedFunc->hasFnAttribute(Attribute::Naked)) + Solver.addTrackedFunction(SpecializedFunc); + + Solver.addArgumentTrackedFunction(SpecializedFunc); + FuncDecls.push_back(SpecializedFunc); + Solver.markBlockExecutable(&SpecializedFunc->front()); + + // Replace the function arguments for the specialized functions. + for (Argument &Arg : SpecializedFunc->args()) + if (!Arg.use_empty() && tryToReplaceWithConstant(&Arg)) + LLVM_DEBUG(dbgs() << "FnSpecialization: Replaced constant argument: " + << Arg.getName() << "\n"); + } + } }; } // namespace diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index ba7589c2bf60..b1f3ff15c97b 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -305,8 +305,9 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV, else if (auto *LI = dyn_cast<LoadInst>(U)) { // A load from zeroinitializer is always zeroinitializer, regardless of // any applied offset. 
- if (Init->isNullValue()) { - LI->replaceAllUsesWith(Constant::getNullValue(LI->getType())); + Type *Ty = LI->getType(); + if (Init->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy()) { + LI->replaceAllUsesWith(Constant::getNullValue(Ty)); EraseFromParent(LI); continue; } @@ -316,8 +317,7 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV, PtrOp = PtrOp->stripAndAccumulateConstantOffsets( DL, Offset, /* AllowNonInbounds */ true); if (PtrOp == GV) { - if (auto *Value = ConstantFoldLoadFromConst(Init, LI->getType(), - Offset, DL)) { + if (auto *Value = ConstantFoldLoadFromConst(Init, Ty, Offset, DL)) { LI->replaceAllUsesWith(Value); EraseFromParent(LI); } @@ -368,8 +368,7 @@ static bool isSafeSROAGEP(User *U) { return false; } - return llvm::all_of(U->users(), - [](User *UU) { return isSafeSROAElementUse(UU); }); + return llvm::all_of(U->users(), isSafeSROAElementUse); } /// Return true if the specified instruction is a safe user of a derived diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index 833049d6896f..a964fcde0396 100644 --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -294,7 +294,7 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region, // Find all incoming values from the outlining region. int NumIncomingVals = 0; for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) - if (find(Region, PN.getIncomingBlock(i)) != Region.end()) { + if (llvm::is_contained(Region, PN.getIncomingBlock(i))) { ++NumIncomingVals; if (NumIncomingVals > 1) { ++NumSplitExitPhis; diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp index 992c2b292e1e..4e3689f09536 100644 --- a/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/llvm/lib/Transforms/IPO/Inliner.cpp @@ -856,6 +856,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, if (InlineHistoryID != -1 && inlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) { + LLVM_DEBUG(dbgs() << "Skipping inlining due to history: " + << F.getName() << " -> " << Callee.getName() << "\n"); setInlineRemark(*CB, "recursive"); continue; } diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index f78971f0e586..c0bb19e184d6 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1774,8 +1774,9 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsJumpTableCanonical) { SmallSetVector<Constant *, 4> Constants; for (Use &U : llvm::make_early_inc_range(Old->uses())) { - // Skip block addresses - if (isa<BlockAddress>(U.getUser())) + // Skip block addresses and no_cfi values, which refer to the function + // body instead of the jump table. 
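// Editorial sketch (not part of the patch): the type guard the GlobalOpt
// change above adds. x86_mmx and x86_amx values have no usable zero constant,
// so a load of those types must not be folded to Constant::getNullValue()
// even when the global's initializer is all zeros. The helper is illustrative.
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static Constant *foldLoadOfZeroInitializer(LoadInst *LI) {
  Type *Ty = LI->getType();
  if (Ty->isX86_MMXTy() || Ty->isX86_AMXTy())
    return nullptr;                       // leave the load in place
  return Constant::getNullValue(Ty);      // zeroinitializer of the loaded type
}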
+ if (isa<BlockAddress, NoCFIValue>(U.getUser())) continue; // Skip direct calls to externally defined or non-dso_local functions @@ -1802,7 +1803,7 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, } void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) { - Old->replaceUsesWithIf(New, [](Use &U) { return isDirectCall(U); }); + Old->replaceUsesWithIf(New, isDirectCall); } bool LowerTypeTestsModule::lower() { diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 055ee6b50296..f289e3ecc979 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -3964,6 +3964,9 @@ struct AAKernelInfoCallSite : AAKernelInfo { case OMPRTL___kmpc_master: case OMPRTL___kmpc_end_master: case OMPRTL___kmpc_barrier: + case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2: + case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2: + case OMPRTL___kmpc_nvptx_end_reduce_nowait: break; case OMPRTL___kmpc_distribute_static_init_4: case OMPRTL___kmpc_distribute_static_init_4u: @@ -4010,6 +4013,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { break; case OMPRTL___kmpc_omp_task: // We do not look into tasks right now, just give up. + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.insert(&CB); ReachedUnknownParallelRegions.insert(&CB); break; @@ -4020,6 +4024,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { default: // Unknown OpenMP runtime calls cannot be executed in SPMD-mode, // generally. However, they do not hide parallel regions. + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.insert(&CB); break; } @@ -4079,6 +4084,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { SPMDCompatibilityTracker.insert(&CB); break; default: + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.insert(&CB); } diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp index bae9a1e27e75..7334bf695b67 100644 --- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp +++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp @@ -32,7 +32,7 @@ ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite, if (CalleeName.empty()) return getHottestChildContext(CallSite); - uint64_t Hash = nodeHash(CalleeName, CallSite); + uint64_t Hash = FunctionSamples::getCallSiteHash(CalleeName, CallSite); auto It = AllChildContext.find(Hash); if (It != AllChildContext.end()) return &It->second; @@ -65,7 +65,8 @@ ContextTrieNode::getHottestChildContext(const LineLocation &CallSite) { ContextTrieNode &ContextTrieNode::moveToChildContext( const LineLocation &CallSite, ContextTrieNode &&NodeToMove, uint32_t ContextFramesToRemove, bool DeleteNode) { - uint64_t Hash = nodeHash(NodeToMove.getFuncName(), CallSite); + uint64_t Hash = + FunctionSamples::getCallSiteHash(NodeToMove.getFuncName(), CallSite); assert(!AllChildContext.count(Hash) && "Node to remove must exist"); LineLocation OldCallSite = NodeToMove.CallSiteLoc; ContextTrieNode &OldParentContext = *NodeToMove.getParentContext(); @@ -108,7 +109,7 @@ ContextTrieNode &ContextTrieNode::moveToChildContext( void ContextTrieNode::removeChildContext(const LineLocation &CallSite, StringRef CalleeName) { - uint64_t Hash = nodeHash(CalleeName, CallSite); + uint64_t Hash = FunctionSamples::getCallSiteHash(CalleeName, CallSite); // Note this essentially calls dtor and destroys that child context AllChildContext.erase(Hash); } @@ -174,21 +175,9 @@ void 
ContextTrieNode::dumpTree() { } } -uint64_t ContextTrieNode::nodeHash(StringRef ChildName, - const LineLocation &Callsite) { - // We still use child's name for child hash, this is - // because for children of root node, we don't have - // different line/discriminator, and we'll rely on name - // to differentiate children. - uint64_t NameHash = std::hash<std::string>{}(ChildName.str()); - uint64_t LocId = - (((uint64_t)Callsite.LineOffset) << 32) | Callsite.Discriminator; - return NameHash + (LocId << 5) + LocId; -} - ContextTrieNode *ContextTrieNode::getOrCreateChildContext( const LineLocation &CallSite, StringRef CalleeName, bool AllowCreate) { - uint64_t Hash = nodeHash(CalleeName, CallSite); + uint64_t Hash = FunctionSamples::getCallSiteHash(CalleeName, CallSite); auto It = AllChildContext.find(Hash); if (It != AllChildContext.end()) { assert(It->second.getFuncName() == CalleeName && diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index b8fac9d47763..bc6051de90c4 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -467,6 +467,9 @@ protected: void emitOptimizationRemarksForInlineCandidates( const SmallVectorImpl<CallBase *> &Candidates, const Function &F, bool Hot); + void promoteMergeNotInlinedContextSamples( + DenseMap<CallBase *, const FunctionSamples *> NonInlinedCallSites, + const Function &F); std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG); std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(CallGraph &CG); void generateMDProfMetadata(Function &F); @@ -485,7 +488,7 @@ protected: std::unique_ptr<SampleContextTracker> ContextTracker; /// Flag indicating whether input profile is context-sensitive - bool ProfileIsCS = false; + bool ProfileIsCSFlat = false; /// Flag indicating which LTO/ThinLTO phase the pass is invoked in. /// @@ -602,7 +605,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) { // call instruction should have 0 count. // For CS profile, the callsite count of previously inlined callees is // populated with the entry count of the callees. - if (!ProfileIsCS) + if (!ProfileIsCSFlat) if (const auto *CB = dyn_cast<CallBase>(&Inst)) if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB)) return 0; @@ -641,7 +644,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) { // call instruction should have 0 count. // For CS profile, the callsite count of previously inlined callees is // populated with the entry count of the callees. 
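// Editorial sketch (not part of the patch): the call-site hash that
// ContextTrieNode::nodeHash used to compute locally (removed above) and that
// is now centralized in FunctionSamples::getCallSiteHash. The standalone
// function here is illustrative.
#include "llvm/ADT/StringRef.h"
#include <cstdint>
#include <functional>
#include <string>

static uint64_t callSiteHash(llvm::StringRef CalleeName, uint32_t LineOffset,
                             uint32_t Discriminator) {
  uint64_t NameHash = std::hash<std::string>{}(CalleeName.str());
  uint64_t LocId = (uint64_t(LineOffset) << 32) | Discriminator;
  return NameHash + (LocId << 5) + LocId;
}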
- if (!ProfileIsCS) + if (!ProfileIsCSFlat) if (const auto *CB = dyn_cast<CallBase>(&Inst)) if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB)) return 0; @@ -695,7 +698,7 @@ SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const { if (Function *Callee = Inst.getCalledFunction()) CalleeName = Callee->getName(); - if (ProfileIsCS) + if (ProfileIsCSFlat) return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName); const FunctionSamples *FS = findFunctionSamples(Inst); @@ -727,7 +730,7 @@ SampleProfileLoader::findIndirectCallFunctionSamples( FunctionSamples::getGUID(R->getName()); }; - if (ProfileIsCS) { + if (ProfileIsCSFlat) { auto CalleeSamples = ContextTracker->getIndirectCalleeContextSamplesFor(DIL); if (CalleeSamples.empty()) @@ -780,7 +783,7 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { auto it = DILocation2SampleMap.try_emplace(DIL,nullptr); if (it.second) { - if (ProfileIsCS) + if (ProfileIsCSFlat) it.first->second = ContextTracker->getContextSamplesFor(DIL); else it.first->second = @@ -1039,7 +1042,7 @@ void SampleProfileLoader::findExternalInlineCandidate( // For AutoFDO profile, retrieve candidate profiles by walking over // the nested inlinee profiles. - if (!ProfileIsCS) { + if (!ProfileIsCSFlat) { Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold); return; } @@ -1134,7 +1137,7 @@ bool SampleProfileLoader::inlineHotFunctions( assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) && "GUIDToFuncNameMap has to be populated"); AllCandidates.push_back(CB); - if (FS->getEntrySamples() > 0 || ProfileIsCS) + if (FS->getEntrySamples() > 0 || ProfileIsCSFlat) LocalNotInlinedCallSites.try_emplace(CB, FS); if (callsiteIsHot(FS, PSI, ProfAccForSymsInList)) Hot = true; @@ -1156,11 +1159,9 @@ bool SampleProfileLoader::inlineHotFunctions( } for (CallBase *I : CIS) { Function *CalledFunction = I->getCalledFunction(); - InlineCandidate Candidate = { - I, - LocalNotInlinedCallSites.count(I) ? LocalNotInlinedCallSites[I] - : nullptr, - 0 /* dummy count */, 1.0 /* dummy distribution factor */}; + InlineCandidate Candidate = {I, LocalNotInlinedCallSites.lookup(I), + 0 /* dummy count */, + 1.0 /* dummy distribution factor */}; // Do not inline recursive calls. if (CalledFunction == &F) continue; @@ -1198,53 +1199,9 @@ bool SampleProfileLoader::inlineHotFunctions( } // For CS profile, profile for not inlined context will be merged when - // base profile is being trieved - if (ProfileIsCS) - return Changed; - - // Accumulate not inlined callsite information into notInlinedSamples - for (const auto &Pair : LocalNotInlinedCallSites) { - CallBase *I = Pair.getFirst(); - Function *Callee = I->getCalledFunction(); - if (!Callee || Callee->isDeclaration()) - continue; - - ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline", - I->getDebugLoc(), I->getParent()) - << "previous inlining not repeated: '" - << ore::NV("Callee", Callee) << "' into '" - << ore::NV("Caller", &F) << "'"); - - ++NumCSNotInlined; - const FunctionSamples *FS = Pair.getSecond(); - if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) { - continue; - } - - if (ProfileMergeInlinee) { - // A function call can be replicated by optimizations like callsite - // splitting or jump threading and the replicates end up sharing the - // sample nested callee profile instead of slicing the original inlinee's - // profile. We want to do merge exactly once by filtering out callee - // profiles with a non-zero head sample count. 
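// Editorial sketch (not part of the patch): the DenseMap::lookup() idiom the
// InlineCandidate construction above switches to. lookup() returns a
// value-initialized result (nullptr here) when the key is missing, avoiding
// the earlier count()-then-operator[] double lookup. The stub type stands in
// for llvm::sampleprof::FunctionSamples.
#include "llvm/ADT/DenseMap.h"
using namespace llvm;

struct FunctionSamplesStub {};

static const FunctionSamplesStub *
findSamples(const DenseMap<void *, const FunctionSamplesStub *> &NotInlined,
            void *CallSite) {
  return NotInlined.lookup(CallSite); // nullptr when CallSite was never added
}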
- if (FS->getHeadSamples() == 0) { - // Use entry samples as head samples during the merge, as inlinees - // don't have head samples. - const_cast<FunctionSamples *>(FS)->addHeadSamples( - FS->getEntrySamples()); - - // Note that we have to do the merge right after processing function. - // This allows OutlineFS's profile to be used for annotation during - // top-down processing of functions' annotation. - FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee); - OutlineFS->merge(*FS); - } - } else { - auto pair = - notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); - pair.first->second.entryCount += FS->getEntrySamples(); - } - } + // base profile is being retrieved. + if (!FunctionSamples::ProfileIsCSFlat) + promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F); return Changed; } @@ -1285,7 +1242,7 @@ bool SampleProfileLoader::tryInlineCandidate( InlinedCallSites->push_back(I); } - if (ProfileIsCS) + if (ProfileIsCSFlat) ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples); ++NumCSInlined; @@ -1430,7 +1387,6 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { bool SampleProfileLoader::inlineHotFunctionsWithPriority( Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { - assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now"); // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure // Profile symbol list is ignored when profile-sample-accurate is on. @@ -1467,6 +1423,8 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( if (ExternalInlineAdvisor) SizeLimit = std::numeric_limits<unsigned>::max(); + DenseMap<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites; + // Perform iterative BFS call site prioritized inlining bool Changed = false; while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) { @@ -1521,6 +1479,8 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( } ICPCount++; Changed = true; + } else if (!ContextTracker) { + LocalNotInlinedCallSites.try_emplace(I, FS); } } } else if (CalledFunction && CalledFunction->getSubprogram() && @@ -1532,6 +1492,8 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( CQueue.emplace(NewCandidate); } Changed = true; + } else if (!ContextTracker) { + LocalNotInlinedCallSites.try_emplace(I, Candidate.CalleeSamples); } } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { findExternalInlineCandidate(I, findCalleeFunctionSamples(*I), @@ -1549,9 +1511,63 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( ++NumCSInlinedHitGrowthLimit; } + // For CS profile, profile for not inlined context will be merged when + // base profile is being retrieved. 
+ if (!FunctionSamples::ProfileIsCSFlat) + promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F); return Changed; } +void SampleProfileLoader::promoteMergeNotInlinedContextSamples( + DenseMap<CallBase *, const FunctionSamples *> NonInlinedCallSites, + const Function &F) { + // Accumulate not inlined callsite information into notInlinedSamples + for (const auto &Pair : NonInlinedCallSites) { + CallBase *I = Pair.getFirst(); + Function *Callee = I->getCalledFunction(); + if (!Callee || Callee->isDeclaration()) + continue; + + ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline", + I->getDebugLoc(), I->getParent()) + << "previous inlining not repeated: '" + << ore::NV("Callee", Callee) << "' into '" + << ore::NV("Caller", &F) << "'"); + + ++NumCSNotInlined; + const FunctionSamples *FS = Pair.getSecond(); + if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) { + continue; + } + + if (ProfileMergeInlinee) { + // A function call can be replicated by optimizations like callsite + // splitting or jump threading and the replicates end up sharing the + // sample nested callee profile instead of slicing the original + // inlinee's profile. We want to do merge exactly once by filtering out + // callee profiles with a non-zero head sample count. + if (FS->getHeadSamples() == 0) { + // Use entry samples as head samples during the merge, as inlinees + // don't have head samples. + const_cast<FunctionSamples *>(FS)->addHeadSamples( + FS->getEntrySamples()); + + // Note that we have to do the merge right after processing function. + // This allows OutlineFS's profile to be used for annotation during + // top-down processing of functions' annotation. + FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee); + OutlineFS->merge(*FS, 1); + // Set outlined profile to be synthetic to not bias the inliner. + OutlineFS->SetContextSynthetic(); + } + } else { + auto pair = + notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); + pair.first->second.entryCount += FS->getEntrySamples(); + } + } +} + /// Returns the sorted CallTargetMap \p M by count in descending order. static SmallVector<InstrProfValueData, 2> GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M) { @@ -1607,7 +1623,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { // With CSSPGO all indirect call targets are counted torwards the // original indirect call site in the profile, including both // inlined and non-inlined targets. 
- if (!FunctionSamples::ProfileIsCS) { + if (!FunctionSamples::ProfileIsCSFlat) { if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) { for (const auto &NameFS : *M) @@ -1754,7 +1770,7 @@ bool SampleProfileLoader::emitAnnotations(Function &F) { } DenseSet<GlobalValue::GUID> InlinedGUIDs; - if (ProfileIsCS && CallsitePrioritizedInline) + if (CallsitePrioritizedInline) Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs); else Changed |= inlineHotFunctions(F, InlinedGUIDs); @@ -1782,7 +1798,7 @@ INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", std::unique_ptr<ProfiledCallGraph> SampleProfileLoader::buildProfiledCallGraph(CallGraph &CG) { std::unique_ptr<ProfiledCallGraph> ProfiledCG; - if (ProfileIsCS) + if (ProfileIsCSFlat) ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker); else ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles()); @@ -1828,7 +1844,7 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) { assert(&CG->getModule() == &M); if (UseProfiledCallGraph || - (ProfileIsCS && !UseProfiledCallGraph.getNumOccurrences())) { + (ProfileIsCSFlat && !UseProfiledCallGraph.getNumOccurrences())) { // Use profiled call edges to augment the top-down order. There are cases // that the top-down order computed based on the static call graph doesn't // reflect real execution order. For example @@ -1961,10 +1977,8 @@ bool SampleProfileLoader::doInitialization(Module &M, } // Apply tweaks if context-sensitive profile is available. - if (Reader->profileIsCS()) { - ProfileIsCS = true; - FunctionSamples::ProfileIsCS = true; - + if (Reader->profileIsCSFlat() || Reader->profileIsCSNested()) { + ProfileIsCSFlat = Reader->profileIsCSFlat(); // Enable priority-base inliner and size inline by default for CSSPGO. if (!ProfileSizeInline.getNumOccurrences()) ProfileSizeInline = true; @@ -1982,10 +1996,15 @@ bool SampleProfileLoader::doInitialization(Module &M, // Enable iterative-BFI by default for CSSPGO. if (!UseIterativeBFIInference.getNumOccurrences()) UseIterativeBFIInference = true; - - // Tracker for profiles under different context - ContextTracker = std::make_unique<SampleContextTracker>( - Reader->getProfiles(), &GUIDToFuncNameMap); + // Enable Profi by default for CSSPGO. + if (!SampleProfileUseProfi.getNumOccurrences()) + SampleProfileUseProfi = true; + + if (FunctionSamples::ProfileIsCSFlat) { + // Tracker for profiles under different context + ContextTracker = std::make_unique<SampleContextTracker>( + Reader->getProfiles(), &GUIDToFuncNameMap); + } } // Load pseudo probe descriptors for probe-based function samples. @@ -1994,7 +2013,8 @@ bool SampleProfileLoader::doInitialization(Module &M, if (!ProbeManager->moduleIsProbed(M)) { const char *Msg = "Pseudo-probe-based profile requires SampleProfileProbePass"; - Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); + Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg, + DS_Warning)); return false; } } @@ -2062,7 +2082,7 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, } // Account for cold calls not inlined.... 
- if (!ProfileIsCS) + if (!ProfileIsCSFlat) for (const std::pair<Function *, NotInlinedProfileInfo> &pair : notInlinedCallInfo) updateProfileCallee(pair.first, pair.second.entryCount); @@ -2138,7 +2158,7 @@ bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) ORE = OwnedORE.get(); } - if (ProfileIsCS) + if (ProfileIsCSFlat) Samples = ContextTracker->getBaseSamplesFor(F); else Samples = Reader->getSamplesFor(F); diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 0cc1b37844f6..daaf6cbeb3fd 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -87,7 +87,8 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId, if (isa<Function>(&ExportGV) && allowPromotionAlias(OldName)) { // Create a local alias with the original name to avoid breaking // references from inline assembly. - std::string Alias = ".set " + OldName + "," + NewName + "\n"; + std::string Alias = + ".lto_set_conditional " + OldName + "," + NewName + "\n"; ExportM.appendModuleInlineAsm(Alias); } } diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 61054e7ae46f..6acace1d9fd4 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -359,6 +359,36 @@ template <> struct DenseMapInfo<VTableSlotSummary> { namespace { +// Returns true if the function must be unreachable based on ValueInfo. +// +// In particular, identifies a function as unreachable in the following +// conditions +// 1) All summaries are live. +// 2) All function summaries indicate it's unreachable +bool mustBeUnreachableFunction(ValueInfo TheFnVI) { + if ((!TheFnVI) || TheFnVI.getSummaryList().empty()) { + // Returns false if ValueInfo is absent, or the summary list is empty + // (e.g., function declarations). + return false; + } + + for (auto &Summary : TheFnVI.getSummaryList()) { + // Conservatively returns false if any non-live functions are seen. + // In general either all summaries should be live or all should be dead. + if (!Summary->isLive()) + return false; + if (auto *FS = dyn_cast<FunctionSummary>(Summary.get())) { + if (!FS->fflags().MustBeUnreachable) + return false; + } + // Do nothing if a non-function has the same GUID (which is rare). + // This is correct since non-function summaries are not relevant. + } + // All function summaries are live and all of them agree that the function is + // unreachble. + return true; +} + // A virtual call site. VTable is the loaded virtual table pointer, and CS is // the indirect virtual call. struct VirtualCallSite { @@ -562,10 +592,12 @@ struct DevirtModule { void buildTypeIdentifierMap( std::vector<VTableBits> &Bits, DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap); + bool tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot, const std::set<TypeMemberInfo> &TypeMemberInfos, - uint64_t ByteOffset); + uint64_t ByteOffset, + ModuleSummaryIndex *ExportSummary); void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn, bool &IsExported); @@ -640,6 +672,23 @@ struct DevirtModule { bool run(); + // Look up the corresponding ValueInfo entry of `TheFn` in `ExportSummary`. + // + // Caller guarantees that `ExportSummary` is not nullptr. 
+ static ValueInfo lookUpFunctionValueInfo(Function *TheFn, + ModuleSummaryIndex *ExportSummary); + + // Returns true if the function definition must be unreachable. + // + // Note if this helper function returns true, `F` is guaranteed + // to be unreachable; if it returns false, `F` might still + // be unreachable but not covered by this helper function. + // + // Implementation-wise, if function definition is present, IR is analyzed; if + // not, look up function flags from ExportSummary as a fallback. + static bool mustBeUnreachableFunction(Function *const F, + ModuleSummaryIndex *ExportSummary); + // Lower the module using the action and summary passed as command line // arguments. For testing purposes only. static bool @@ -969,7 +1018,8 @@ void DevirtModule::buildTypeIdentifierMap( bool DevirtModule::tryFindVirtualCallTargets( std::vector<VirtualCallTarget> &TargetsForSlot, - const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset) { + const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset, + ModuleSummaryIndex *ExportSummary) { for (const TypeMemberInfo &TM : TypeMemberInfos) { if (!TM.Bits->GV->isConstant()) return false; @@ -997,6 +1047,11 @@ bool DevirtModule::tryFindVirtualCallTargets( if (Fn->getName() == "__cxa_pure_virtual") continue; + // We can disregard unreachable functions as possible call targets, as + // unreachable functions shouldn't be called. + if (mustBeUnreachableFunction(Fn, ExportSummary)) + continue; + TargetsForSlot.push_back({Fn, &TM}); } @@ -1053,6 +1108,9 @@ bool DevirtIndex::tryFindVirtualCallTargets( if (VTP.VTableOffset != P.AddressPointOffset + ByteOffset) continue; + if (mustBeUnreachableFunction(VTP.FuncVI)) + continue; + TargetsForSlot.push_back(VTP.FuncVI); } } @@ -1744,7 +1802,7 @@ void DevirtModule::rebuildGlobal(VTableBits &B) { GlobalVariable::PrivateLinkage, NewInit, "", B.GV); NewGV->setSection(B.GV->getSection()); NewGV->setComdat(B.GV->getComdat()); - NewGV->setAlignment(MaybeAlign(B.GV->getAlignment())); + NewGV->setAlignment(B.GV->getAlign()); // Copy the original vtable's metadata to the anonymous global, adjusting // offsets as required. @@ -2014,6 +2072,44 @@ void DevirtModule::removeRedundantTypeTests() { } } +ValueInfo +DevirtModule::lookUpFunctionValueInfo(Function *TheFn, + ModuleSummaryIndex *ExportSummary) { + assert((ExportSummary != nullptr) && + "Caller guarantees ExportSummary is not nullptr"); + + const auto TheFnGUID = TheFn->getGUID(); + const auto TheFnGUIDWithExportedName = GlobalValue::getGUID(TheFn->getName()); + // Look up ValueInfo with the GUID in the current linkage. + ValueInfo TheFnVI = ExportSummary->getValueInfo(TheFnGUID); + // If no entry is found and GUID is different from GUID computed using + // exported name, look up ValueInfo with the exported name unconditionally. + // This is a fallback. + // + // The reason to have a fallback: + // 1. LTO could enable global value internalization via + // `enable-lto-internalization`. + // 2. The GUID in ExportedSummary is computed using exported name. + if ((!TheFnVI) && (TheFnGUID != TheFnGUIDWithExportedName)) { + TheFnVI = ExportSummary->getValueInfo(TheFnGUIDWithExportedName); + } + return TheFnVI; +} + +bool DevirtModule::mustBeUnreachableFunction( + Function *const F, ModuleSummaryIndex *ExportSummary) { + // First, learn unreachability by analyzing function IR. + if (!F->isDeclaration()) { + // A function must be unreachable if its entry block ends with an + // 'unreachable'. 
+ return isa<UnreachableInst>(F->getEntryBlock().getTerminator()); + } + // Learn unreachability from ExportSummary if ExportSummary is present. + return ExportSummary && + ::mustBeUnreachableFunction( + DevirtModule::lookUpFunctionValueInfo(F, ExportSummary)); +} + bool DevirtModule::run() { // If only some of the modules were split, we cannot correctly perform // this transformation. We already checked for the presense of type tests @@ -2137,7 +2233,7 @@ bool DevirtModule::run() { cast<MDString>(S.first.TypeID)->getString()) .WPDRes[S.first.ByteOffset]; if (tryFindVirtualCallTargets(TargetsForSlot, TypeMemberInfos, - S.first.ByteOffset)) { + S.first.ByteOffset, ExportSummary)) { if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) { DidVirtualConstProp |= diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index dc55b5a31596..de1034c910d5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1795,6 +1795,55 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I, } } + // (~A & B & C) | ... --> ... + // (~A | B | C) | ... --> ... + // TODO: One use checks are conservative. We just need to check that a total + // number of multiple used values does not exceed reduction + // in operations. + if (match(Op0, + m_OneUse(m_c_BinOp(FlippedOpcode, + m_BinOp(FlippedOpcode, m_Value(B), m_Value(C)), + m_CombineAnd(m_Value(X), m_Not(m_Value(A)))))) || + match(Op0, m_OneUse(m_c_BinOp( + FlippedOpcode, + m_c_BinOp(FlippedOpcode, m_Value(C), + m_CombineAnd(m_Value(X), m_Not(m_Value(A)))), + m_Value(B))))) { + // X = ~A + // (~A & B & C) | ~(A | B | C) --> ~(A | (B ^ C)) + // (~A | B | C) & ~(A & B & C) --> (~A | (B ^ C)) + if (match(Op1, m_OneUse(m_Not(m_c_BinOp( + Opcode, m_c_BinOp(Opcode, m_Specific(A), m_Specific(B)), + m_Specific(C))))) || + match(Op1, m_OneUse(m_Not(m_c_BinOp( + Opcode, m_c_BinOp(Opcode, m_Specific(B), m_Specific(C)), + m_Specific(A))))) || + match(Op1, m_OneUse(m_Not(m_c_BinOp( + Opcode, m_c_BinOp(Opcode, m_Specific(A), m_Specific(C)), + m_Specific(B)))))) { + Value *Xor = Builder.CreateXor(B, C); + return (Opcode == Instruction::Or) + ? BinaryOperator::CreateNot(Builder.CreateOr(Xor, A)) + : BinaryOperator::CreateOr(Xor, X); + } + + // (~A & B & C) | ~(A | B) --> (C | ~B) & ~A + // (~A | B | C) & ~(A & B) --> (C & ~B) | ~A + if (match(Op1, m_OneUse(m_Not(m_OneUse( + m_c_BinOp(Opcode, m_Specific(A), m_Specific(B))))))) + return BinaryOperator::Create( + FlippedOpcode, Builder.CreateBinOp(Opcode, C, Builder.CreateNot(B)), + X); + + // (~A & B & C) | ~(A | C) --> (B | ~C) & ~A + // (~A | B | C) & ~(A & C) --> (B & ~C) | ~A + if (match(Op1, m_OneUse(m_Not(m_OneUse( + m_c_BinOp(Opcode, m_Specific(A), m_Specific(C))))))) + return BinaryOperator::Create( + FlippedOpcode, Builder.CreateBinOp(Opcode, B, Builder.CreateNot(C)), + X); + } + return nullptr; } @@ -2102,6 +2151,15 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { Value *Cmp = Builder.CreateICmpSLT(X, Zero, "isneg"); return SelectInst::Create(Cmp, Y, Zero); } + // If there's a 'not' of the shifted value, swap the select operands: + // ~(iN X s>> (N-1)) & Y --> (X s< 0) ? 
0 : Y + if (match(&I, m_c_And(m_OneUse(m_Not( + m_AShr(m_Value(X), m_SpecificInt(FullShift)))), + m_Value(Y)))) { + Constant *Zero = ConstantInt::getNullValue(Ty); + Value *Cmp = Builder.CreateICmpSLT(X, Zero, "isneg"); + return SelectInst::Create(Cmp, Zero, Y); + } // (~x) & y --> ~(x | (~y)) iff that gets rid of inversions if (sinkNotIntoOtherHandOfAndOrOr(I)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 7da2669e1d13..14427bd1f2f4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2472,6 +2472,12 @@ static bool isSafeToEliminateVarargsCast(const CallBase &Call, Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) { if (!CI->getCalledFunction()) return nullptr; + // Skip optimizing notail and musttail calls so + // LibCallSimplifier::optimizeCall doesn't have to preserve those invariants. + // LibCallSimplifier::optimizeCall should try to preseve tail calls though. + if (CI->isMustTailCall() || CI->isNoTailCall()) + return nullptr; + auto InstCombineRAUW = [this](Instruction *From, Value *With) { replaceInstUsesWith(*From, With); }; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 33f217659c01..8df4a4529f47 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -157,7 +157,7 @@ Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI, Amt = Builder.CreateAdd(Amt, Off); } - AllocaInst *New = Builder.CreateAlloca(CastElTy, Amt); + AllocaInst *New = Builder.CreateAlloca(CastElTy, AI.getAddressSpace(), Amt); New->setAlignment(AI.getAlign()); New->takeName(&AI); New->setUsedWithInAlloca(AI.isUsedWithInAlloca()); @@ -965,13 +965,13 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) { if (match(Src, m_VScale(DL))) { if (Trunc.getFunction() && Trunc.getFunction()->hasFnAttribute(Attribute::VScaleRange)) { - unsigned MaxVScale = Trunc.getFunction() - ->getFnAttribute(Attribute::VScaleRange) - .getVScaleRangeArgs() - .second; - if (MaxVScale > 0 && Log2_32(MaxVScale) < DestWidth) { - Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); - return replaceInstUsesWith(Trunc, VScale); + Attribute Attr = + Trunc.getFunction()->getFnAttribute(Attribute::VScaleRange); + if (Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax()) { + if (Log2_32(MaxVScale.getValue()) < DestWidth) { + Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); + return replaceInstUsesWith(Trunc, VScale); + } } } } @@ -1337,14 +1337,13 @@ Instruction *InstCombinerImpl::visitZExt(ZExtInst &CI) { if (match(Src, m_VScale(DL))) { if (CI.getFunction() && CI.getFunction()->hasFnAttribute(Attribute::VScaleRange)) { - unsigned MaxVScale = CI.getFunction() - ->getFnAttribute(Attribute::VScaleRange) - .getVScaleRangeArgs() - .second; - unsigned TypeWidth = Src->getType()->getScalarSizeInBits(); - if (MaxVScale > 0 && Log2_32(MaxVScale) < TypeWidth) { - Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); - return replaceInstUsesWith(CI, VScale); + Attribute Attr = CI.getFunction()->getFnAttribute(Attribute::VScaleRange); + if (Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax()) { + unsigned TypeWidth = Src->getType()->getScalarSizeInBits(); + if (Log2_32(MaxVScale.getValue()) < TypeWidth) { + Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 
1)); + return replaceInstUsesWith(CI, VScale); + } } } } @@ -1608,13 +1607,12 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) { if (match(Src, m_VScale(DL))) { if (CI.getFunction() && CI.getFunction()->hasFnAttribute(Attribute::VScaleRange)) { - unsigned MaxVScale = CI.getFunction() - ->getFnAttribute(Attribute::VScaleRange) - .getVScaleRangeArgs() - .second; - if (MaxVScale > 0 && Log2_32(MaxVScale) < (SrcBitSize - 1)) { - Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); - return replaceInstUsesWith(CI, VScale); + Attribute Attr = CI.getFunction()->getFnAttribute(Attribute::VScaleRange); + if (Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax()) { + if (Log2_32(MaxVScale.getValue()) < (SrcBitSize - 1)) { + Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); + return replaceInstUsesWith(CI, VScale); + } } } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 20c75188ec9f..39b55b028110 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -600,6 +600,7 @@ public: /// Canonicalize the position of binops relative to shufflevector. Instruction *foldVectorBinop(BinaryOperator &Inst); Instruction *foldVectorSelect(SelectInst &Sel); + Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf); /// Given a binary operator, cast instruction, or select which has a PHI node /// as operand #0, see if we can fold the instruction into the PHI (which is diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 79a8a065d02a..0dbfdba353c4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -163,7 +163,7 @@ static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI, uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType()); if (!AllocaSize) return false; - return isDereferenceableAndAlignedPointer(V, Align(AI->getAlignment()), + return isDereferenceableAndAlignedPointer(V, AI->getAlign(), APInt(64, AllocaSize), DL); } @@ -183,7 +183,8 @@ static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC, if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) { if (C->getValue().getActiveBits() <= 64) { Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); - AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName()); + AllocaInst *New = IC.Builder.CreateAlloca(NewTy, AI.getAddressSpace(), + nullptr, AI.getName()); New->setAlignment(AI.getAlign()); // Scan to the end of the allocation instructions, to skip over a block of @@ -199,21 +200,13 @@ static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC, Type *IdxTy = IC.getDataLayout().getIntPtrType(AI.getType()); Value *NullIdx = Constant::getNullValue(IdxTy); Value *Idx[2] = {NullIdx, NullIdx}; - Instruction *NewI = GetElementPtrInst::CreateInBounds( + Instruction *GEP = GetElementPtrInst::CreateInBounds( NewTy, New, Idx, New->getName() + ".sub"); - IC.InsertNewInstBefore(NewI, *It); - - // Gracefully handle allocas in other address spaces. 
- if (AI.getType()->getPointerAddressSpace() != - NewI->getType()->getPointerAddressSpace()) { - NewI = - CastInst::CreatePointerBitCastOrAddrSpaceCast(NewI, AI.getType()); - IC.InsertNewInstBefore(NewI, *It); - } + IC.InsertNewInstBefore(GEP, *It); // Now make everything use the getelementptr instead of the original // allocation. - return IC.replaceInstUsesWith(AI, NewI); + return IC.replaceInstUsesWith(AI, GEP); } } @@ -640,7 +633,6 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) { return nullptr; StringRef Name = LI.getName(); - assert(LI.getAlignment() && "Alignment must be set at this point"); if (auto *ST = dyn_cast<StructType>(T)) { // If the struct only have one element, we unpack. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 779d298da7a4..aca7ec8d7325 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -755,6 +755,15 @@ Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) { if (simplifyDivRemOfSelectWithZeroOp(I)) return &I; + // If the divisor is a select-of-constants, try to constant fold all div ops: + // C / (select Cond, TrueC, FalseC) --> select Cond, (C / TrueC), (C / FalseC) + // TODO: Adapt simplifyDivRemOfSelectWithZeroOp to allow this and other folds. + if (match(Op0, m_ImmConstant()) && + match(Op1, m_Select(m_Value(), m_ImmConstant(), m_ImmConstant()))) { + if (Instruction *R = FoldOpIntoSelect(I, cast<SelectInst>(Op1))) + return R; + } + const APInt *C2; if (match(Op1, m_APInt(C2))) { Value *X; @@ -1461,6 +1470,15 @@ Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) { if (simplifyDivRemOfSelectWithZeroOp(I)) return &I; + // If the divisor is a select-of-constants, try to constant fold all rem ops: + // C % (select Cond, TrueC, FalseC) --> select Cond, (C % TrueC), (C % FalseC) + // TODO: Adapt simplifyDivRemOfSelectWithZeroOp to allow this and other folds. + if (match(Op0, m_ImmConstant()) && + match(Op1, m_Select(m_Value(), m_ImmConstant(), m_ImmConstant()))) { + if (Instruction *R = FoldOpIntoSelect(I, cast<SelectInst>(Op1))) + return R; + } + if (isa<Constant>(Op1)) { if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) { if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 35739c3b9a21..30f6aab2114b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -664,10 +664,7 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) { return nullptr; // When processing loads, we need to propagate two bits of information to the - // sunk load: whether it is volatile, and what its alignment is. We currently - // don't sink loads when some have their alignment specified and some don't. - // visitLoadInst will propagate an alignment onto the load when TD is around, - // and if TD isn't around, we can't handle the mixed case. + // sunk load: whether it is volatile, and what its alignment is. 
bool isVolatile = FirstLI->isVolatile(); Align LoadAlignment = FirstLI->getAlign(); unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace(); @@ -699,7 +696,7 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) { !isSafeAndProfitableToSinkLoad(LI)) return nullptr; - LoadAlignment = std::min(LoadAlignment, Align(LI->getAlign())); + LoadAlignment = std::min(LoadAlignment, LI->getAlign()); // If the PHI is of volatile loads and the load block has multiple // successors, sinking it would remove a load of the volatile value from diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 518d3952dce5..a6d6b5199105 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1482,7 +1482,12 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp, if (C0->getType() != Sel.getType()) return nullptr; - // FIXME: are there any magic icmp predicate+constant pairs we must not touch? + // ULT with 'add' of a constant is canonical. See foldICmpAddConstant(). + // FIXME: Are there more magic icmp predicate+constant pairs we must avoid? + // Or should we just abandon this transform entirely? + if (Pred == CmpInst::ICMP_ULT && match(X, m_Add(m_Value(), m_Constant()))) + return nullptr; + Value *SelVal0, *SelVal1; // We do not care which one is from where. match(&Sel, m_Select(m_Value(), m_Value(SelVal0), m_Value(SelVal1))); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index e357a9da8b12..4dc712f32536 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1595,12 +1595,6 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V, simplifyAndSetOp(I, 0, DemandedElts, UndefElts); simplifyAndSetOp(I, 1, DemandedElts, UndefElts2); - // Any change to an instruction with potential poison must clear those flags - // because we can not guarantee those constraints now. Other analysis may - // determine that it is safe to re-apply the flags. - if (MadeChange) - BO->dropPoisonGeneratingFlags(); - // Output elements are undefined if both are undefined. Consider things // like undef & 0. The result is known zero, not undef. UndefElts &= UndefElts2; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 32e537897140..c6a4602e59e3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -363,6 +363,18 @@ static APInt findDemandedEltsByAllUsers(Value *V) { return UnionUsedElts; } +/// Given a constant index for a extractelement or insertelement instruction, +/// return it with the canonical type if it isn't already canonical. We +/// arbitrarily pick 64 bit as our canonical type. The actual bitwidth doesn't +/// matter, we just want a consistent type to simplify CSE. 
+ConstantInt *getPreferredVectorIndex(ConstantInt *IndexC) { + const unsigned IndexBW = IndexC->getType()->getBitWidth(); + if (IndexBW == 64 || IndexC->getValue().getActiveBits() > 64) + return nullptr; + return ConstantInt::get(IndexC->getContext(), + IndexC->getValue().zextOrTrunc(64)); +} + Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { Value *SrcVec = EI.getVectorOperand(); Value *Index = EI.getIndexOperand(); @@ -374,6 +386,10 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { // find a previously computed scalar that was inserted into the vector. auto *IndexC = dyn_cast<ConstantInt>(Index); if (IndexC) { + // Canonicalize type of constant indices to i64 to simplify CSE + if (auto *NewIdx = getPreferredVectorIndex(IndexC)) + return replaceOperand(EI, 1, NewIdx); + ElementCount EC = EI.getVectorOperandType()->getElementCount(); unsigned NumElts = EC.getKnownMinValue(); @@ -401,37 +417,6 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { if (!EC.isScalable() && IndexC->getValue().uge(NumElts)) return nullptr; - // This instruction only demands the single element from the input vector. - // Skip for scalable type, the number of elements is unknown at - // compile-time. - if (!EC.isScalable() && NumElts != 1) { - // If the input vector has a single use, simplify it based on this use - // property. - if (SrcVec->hasOneUse()) { - APInt UndefElts(NumElts, 0); - APInt DemandedElts(NumElts, 0); - DemandedElts.setBit(IndexC->getZExtValue()); - if (Value *V = - SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts)) - return replaceOperand(EI, 0, V); - } else { - // If the input vector has multiple uses, simplify it based on a union - // of all elements used. - APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec); - if (!DemandedElts.isAllOnes()) { - APInt UndefElts(NumElts, 0); - if (Value *V = SimplifyDemandedVectorElts( - SrcVec, DemandedElts, UndefElts, 0 /* Depth */, - true /* AllowMultipleUsers */)) { - if (V != SrcVec) { - SrcVec->replaceAllUsesWith(V); - return &EI; - } - } - } - } - } - if (Instruction *I = foldBitcastExtElt(EI)) return I; @@ -473,11 +458,9 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { if (auto *I = dyn_cast<Instruction>(SrcVec)) { if (auto *IE = dyn_cast<InsertElementInst>(I)) { - // Extracting the inserted element? - if (IE->getOperand(2) == Index) - return replaceInstUsesWith(EI, IE->getOperand(1)); - // If the inserted and extracted elements are constants, they must not - // be the same value, extract from the pre-inserted value instead. + // instsimplify already handled the case where the indices are constants + // and equal by value, if both are constants, they must not be the same + // value, extract from the pre-inserted value instead. 
if (isa<Constant>(IE->getOperand(2)) && IndexC) return replaceOperand(EI, 0, IE->getOperand(0)); } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { @@ -497,30 +480,27 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { llvm::count_if(GEP->operands(), [](const Value *V) { return isa<VectorType>(V->getType()); }); - if (VectorOps > 1) - return nullptr; - assert(VectorOps == 1 && "Expected exactly one vector GEP operand!"); - - Value *NewPtr = GEP->getPointerOperand(); - if (isa<VectorType>(NewPtr->getType())) - NewPtr = Builder.CreateExtractElement(NewPtr, IndexC); - - SmallVector<Value *> NewOps; - for (unsigned I = 1; I != GEP->getNumOperands(); ++I) { - Value *Op = GEP->getOperand(I); - if (isa<VectorType>(Op->getType())) - NewOps.push_back(Builder.CreateExtractElement(Op, IndexC)); - else - NewOps.push_back(Op); - } + if (VectorOps == 1) { + Value *NewPtr = GEP->getPointerOperand(); + if (isa<VectorType>(NewPtr->getType())) + NewPtr = Builder.CreateExtractElement(NewPtr, IndexC); + + SmallVector<Value *> NewOps; + for (unsigned I = 1; I != GEP->getNumOperands(); ++I) { + Value *Op = GEP->getOperand(I); + if (isa<VectorType>(Op->getType())) + NewOps.push_back(Builder.CreateExtractElement(Op, IndexC)); + else + NewOps.push_back(Op); + } - GetElementPtrInst *NewGEP = GetElementPtrInst::Create( - cast<PointerType>(NewPtr->getType())->getElementType(), NewPtr, - NewOps); - NewGEP->setIsInBounds(GEP->isInBounds()); - return NewGEP; + GetElementPtrInst *NewGEP = GetElementPtrInst::Create( + cast<PointerType>(NewPtr->getType())->getElementType(), NewPtr, + NewOps); + NewGEP->setIsInBounds(GEP->isInBounds()); + return NewGEP; + } } - return nullptr; } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) { // If this is extracting an element from a shufflevector, figure out where // it came from and extract from the appropriate input element instead. @@ -554,6 +534,44 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { } } } + + // Run demanded elements after other transforms as this can drop flags on + // binops. If there's two paths to the same final result, we prefer the + // one which doesn't force us to drop flags. + if (IndexC) { + ElementCount EC = EI.getVectorOperandType()->getElementCount(); + unsigned NumElts = EC.getKnownMinValue(); + // This instruction only demands the single element from the input vector. + // Skip for scalable type, the number of elements is unknown at + // compile-time. + if (!EC.isScalable() && NumElts != 1) { + // If the input vector has a single use, simplify it based on this use + // property. + if (SrcVec->hasOneUse()) { + APInt UndefElts(NumElts, 0); + APInt DemandedElts(NumElts, 0); + DemandedElts.setBit(IndexC->getZExtValue()); + if (Value *V = + SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts)) + return replaceOperand(EI, 0, V); + } else { + // If the input vector has multiple uses, simplify it based on a union + // of all elements used. 
+ APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec); + if (!DemandedElts.isAllOnes()) { + APInt UndefElts(NumElts, 0); + if (Value *V = SimplifyDemandedVectorElts( + SrcVec, DemandedElts, UndefElts, 0 /* Depth */, + true /* AllowMultipleUsers */)) { + if (V != SrcVec) { + SrcVec->replaceAllUsesWith(V); + return &EI; + } + } + } + } + } + } return nullptr; } @@ -1476,6 +1494,11 @@ Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) { VecOp, ScalarOp, IdxOp, SQ.getWithInstruction(&IE))) return replaceInstUsesWith(IE, V); + // Canonicalize type of constant indices to i64 to simplify CSE + if (auto *IndexC = dyn_cast<ConstantInt>(IdxOp)) + if (auto *NewIdx = getPreferredVectorIndex(IndexC)) + return replaceOperand(IE, 2, NewIdx); + // If the scalar is bitcast and inserted into undef, do the insert in the // source type followed by bitcast. // TODO: Generalize for insert into any constant, not just undef? @@ -2008,9 +2031,7 @@ static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf, } /// Try to fold shuffles that are the equivalent of a vector select. -static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf, - InstCombiner::BuilderTy &Builder, - const DataLayout &DL) { +Instruction *InstCombinerImpl::foldSelectShuffle(ShuffleVectorInst &Shuf) { if (!Shuf.isSelect()) return nullptr; @@ -2118,21 +2139,23 @@ static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf, V = Builder.CreateShuffleVector(X, Y, Mask); } - Instruction *NewBO = ConstantsAreOp1 ? BinaryOperator::Create(BOpc, V, NewC) : - BinaryOperator::Create(BOpc, NewC, V); + Value *NewBO = ConstantsAreOp1 ? Builder.CreateBinOp(BOpc, V, NewC) : + Builder.CreateBinOp(BOpc, NewC, V); // Flags are intersected from the 2 source binops. But there are 2 exceptions: // 1. If we changed an opcode, poison conditions might have changed. // 2. If the shuffle had undef mask elements, the new binop might have undefs // where the original code did not. But if we already made a safe constant, // then there's no danger. - NewBO->copyIRFlags(B0); - NewBO->andIRFlags(B1); - if (DropNSW) - NewBO->setHasNoSignedWrap(false); - if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB) - NewBO->dropPoisonGeneratingFlags(); - return NewBO; + if (auto *NewI = dyn_cast<Instruction>(NewBO)) { + NewI->copyIRFlags(B0); + NewI->andIRFlags(B1); + if (DropNSW) + NewI->setHasNoSignedWrap(false); + if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB) + NewI->dropPoisonGeneratingFlags(); + } + return replaceInstUsesWith(Shuf, NewBO); } /// Convert a narrowing shuffle of a bitcasted vector into a vector truncate. 
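The foldSelectShuffle rework above now emits the folded binop through the IRBuilder and intersects IR flags afterwards. As a sanity check on the underlying identity, here is a small standalone sketch (not LLVM code; all names are illustrative) showing that selecting per lane between two binops that share an operand X is the same as selecting the constant first and applying the binop once, which is why only the constant operand needs to be shuffled.

    #include <array>
    #include <cassert>
    #include <cstdint>

    // Standalone model of the select-shuffle fold: a shuffle with a "select"
    // mask (each lane keeps its position and picks one of two inputs) over
    // two adds that share X equals one add of X and a lane-merged constant.
    int main() {
      constexpr int N = 4;
      std::array<int32_t, N> X{7, -3, 100, 0};
      std::array<int32_t, N> C0{1, 2, 3, 4};
      std::array<int32_t, N> C1{10, 20, 30, 40};
      std::array<bool, N> PickFirst{true, false, false, true}; // select mask

      for (int i = 0; i < N; ++i) {
        // Original form: two adds, then a per-lane select between them.
        int32_t Shuffled = PickFirst[i] ? (X[i] + C0[i]) : (X[i] + C1[i]);
        // Folded form: select the constant first, then a single add.
        int32_t MergedC = PickFirst[i] ? C0[i] : C1[i];
        int32_t Folded = X[i] + MergedC;
        assert(Shuffled == Folded);
      }
      return 0;
    }

In the real transform the wrap/poison flags of the two source binops are intersected (and nsw dropped when the opcode changes), since the merged constant can change where poison is produced; that is what the copyIRFlags/andIRFlags block above handles.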
@@ -2497,7 +2520,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (Instruction *I = canonicalizeInsertSplat(SVI, Builder)) return I; - if (Instruction *I = foldSelectShuffle(SVI, Builder, DL)) + if (Instruction *I = foldSelectShuffle(SVI)) return I; if (Instruction *I = foldTruncShuffle(SVI, DL.isBigEndian())) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 1f81624f79e7..eb5eadba194d 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2546,7 +2546,7 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { return nullptr; } -static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI, +static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo &TLI, Instruction *AI) { if (isa<ConstantPointerNull>(V)) return true; @@ -2557,12 +2557,34 @@ static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI, // through bitcasts of V can cause // the result statement below to be true, even when AI and V (ex: // i8* ->i32* ->i8* of AI) are the same allocations. - return isAllocLikeFn(V, TLI) && V != AI; + return isAllocLikeFn(V, &TLI) && V != AI; +} + +/// Given a call CB which uses an address UsedV, return true if we can prove the +/// call's only possible effect is storing to V. +static bool isRemovableWrite(CallBase &CB, Value *UsedV, + const TargetLibraryInfo &TLI) { + if (!CB.use_empty()) + // TODO: add recursion if returned attribute is present + return false; + + if (CB.isTerminator()) + // TODO: remove implementation restriction + return false; + + if (!CB.willReturn() || !CB.doesNotThrow()) + return false; + + // If the only possible side effect of the call is writing to the alloca, + // and the result isn't used, we can safely remove any reads implied by the + // call including those which might read the alloca itself. + Optional<MemoryLocation> Dest = MemoryLocation::getForDest(&CB, TLI); + return Dest && Dest->Ptr == UsedV; } static bool isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakTrackingVH> &Users, - const TargetLibraryInfo *TLI) { + const TargetLibraryInfo &TLI) { SmallVector<Instruction*, 4> Worklist; Worklist.push_back(AI); @@ -2627,12 +2649,17 @@ static bool isAllocSiteRemovable(Instruction *AI, } } - if (isFreeCall(I, TLI)) { + if (isRemovableWrite(*cast<CallBase>(I), PI, TLI)) { + Users.emplace_back(I); + continue; + } + + if (isFreeCall(I, &TLI)) { Users.emplace_back(I); continue; } - if (isReallocLikeFn(I, TLI, true)) { + if (isReallocLikeFn(I, &TLI, true)) { Users.emplace_back(I); Worklist.push_back(I); continue; @@ -2676,7 +2703,7 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false)); } - if (isAllocSiteRemovable(&MI, Users, &TLI)) { + if (isAllocSiteRemovable(&MI, Users, TLI)) { for (unsigned i = 0, e = Users.size(); i != e; ++i) { // Lowering all @llvm.objectsize calls first because they may // use a bitcast/GEP of the alloca we are removing. 
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 38c219ce3465..9f26b37bbc79 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -232,6 +232,12 @@ static cl::opt<int> ClTrackOrigins("dfsan-track-origins", cl::desc("Track origins of labels"), cl::Hidden, cl::init(0)); +static cl::opt<bool> ClIgnorePersonalityRoutine( + "dfsan-ignore-personality-routine", + cl::desc("If a personality routine is marked uninstrumented from the ABI " + "list, do not create a wrapper for it."), + cl::Hidden, cl::init(false)); + static StringRef getGlobalTypeString(const GlobalValue &G) { // Types of GlobalVariables are always pointer types. Type *GType = G.getValueType(); @@ -1115,7 +1121,7 @@ DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName, BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF); if (F->isVarArg()) { - NewF->removeFnAttrs(AttrBuilder().addAttribute("split-stack")); + NewF->removeFnAttr("split-stack"); CallInst::Create(DFSanVarargWrapperFn, IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "", BB); @@ -1357,9 +1363,24 @@ bool DataFlowSanitizer::runImpl(Module &M) { std::vector<Function *> FnsToInstrument; SmallPtrSet<Function *, 2> FnsWithNativeABI; SmallPtrSet<Function *, 2> FnsWithForceZeroLabel; + SmallPtrSet<Constant *, 1> PersonalityFns; for (Function &F : M) - if (!F.isIntrinsic() && !DFSanRuntimeFunctions.contains(&F)) + if (!F.isIntrinsic() && !DFSanRuntimeFunctions.contains(&F)) { FnsToInstrument.push_back(&F); + if (F.hasPersonalityFn()) + PersonalityFns.insert(F.getPersonalityFn()->stripPointerCasts()); + } + + if (ClIgnorePersonalityRoutine) { + for (auto *C : PersonalityFns) { + assert(isa<Function>(C) && "Personality routine is not a function!"); + Function *F = cast<Function>(C); + if (!isInstrumented(F)) + FnsToInstrument.erase( + std::remove(FnsToInstrument.begin(), FnsToInstrument.end(), F), + FnsToInstrument.end()); + } + } // Give function aliases prefixes when necessary, and build wrappers where the // instrumentedness is inconsistent. 
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index d1d3b8ffdf7a..de34348606ef 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -26,7 +26,9 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" @@ -40,6 +42,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/InstrProfCorrelator.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" @@ -57,6 +60,13 @@ using namespace llvm; #define DEBUG_TYPE "instrprof" +namespace llvm { +cl::opt<bool> + DebugInfoCorrelate("debug-info-correlate", cl::ZeroOrMore, + cl::desc("Use debug info to correlate profiles."), + cl::init(false)); +} // namespace llvm + namespace { cl::opt<bool> DoHashBasedCounterSplit( @@ -641,6 +651,12 @@ void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) { } void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { + // TODO: Value profiling heavily depends on the data section which is omitted + // in lightweight mode. We need to move the value profile pointer to the + // Counter struct to get this working. + assert( + !DebugInfoCorrelate && + "Value profiling is not yet supported with lightweight instrumentation"); GlobalVariable *Name = Ind->getName(); auto It = ProfileDataMap.find(Name); assert(It != ProfileDataMap.end() && It->second.DataVar && @@ -855,6 +871,12 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { GlobalValue::LinkageTypes Linkage = NamePtr->getLinkage(); GlobalValue::VisibilityTypes Visibility = NamePtr->getVisibility(); + // Use internal rather than private linkage so the counter variable shows up + // in the symbol table when using debug info for correlation. + if (DebugInfoCorrelate && TT.isOSBinFormatMachO() && + Linkage == GlobalValue::PrivateLinkage) + Linkage = GlobalValue::InternalLinkage; + // Due to the limitation of binder as of 2021/09/28, the duplicate weak // symbols in the same csect won't be discarded. 
When there are duplicate weak // symbols, we can NOT guarantee that the relocations get resolved to the @@ -916,6 +938,42 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { MaybeSetComdat(CounterPtr); CounterPtr->setLinkage(Linkage); PD.RegionCounters = CounterPtr; + if (DebugInfoCorrelate) { + if (auto *SP = Fn->getSubprogram()) { + DIBuilder DB(*M, true, SP->getUnit()); + Metadata *FunctionNameAnnotation[] = { + MDString::get(Ctx, InstrProfCorrelator::FunctionNameAttributeName), + MDString::get(Ctx, getPGOFuncNameVarInitializer(NamePtr)), + }; + Metadata *CFGHashAnnotation[] = { + MDString::get(Ctx, InstrProfCorrelator::CFGHashAttributeName), + ConstantAsMetadata::get(Inc->getHash()), + }; + Metadata *NumCountersAnnotation[] = { + MDString::get(Ctx, InstrProfCorrelator::NumCountersAttributeName), + ConstantAsMetadata::get(Inc->getNumCounters()), + }; + auto Annotations = DB.getOrCreateArray({ + MDNode::get(Ctx, FunctionNameAnnotation), + MDNode::get(Ctx, CFGHashAnnotation), + MDNode::get(Ctx, NumCountersAnnotation), + }); + auto *DICounter = DB.createGlobalVariableExpression( + SP, CounterPtr->getName(), /*LinkageName=*/StringRef(), SP->getFile(), + /*LineNo=*/0, DB.createUnspecifiedType("Profile Data Type"), + CounterPtr->hasLocalLinkage(), /*IsDefined=*/true, /*Expr=*/nullptr, + /*Decl=*/nullptr, /*TemplateParams=*/nullptr, /*AlignInBits=*/0, + Annotations); + CounterPtr->addDebugInfo(DICounter); + DB.finalize(); + } else { + std::string Msg = ("Missing debug info for function " + Fn->getName() + + "; required for profile correlation.") + .str(); + Ctx.diagnose( + DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning)); + } + } auto *Int8PtrTy = Type::getInt8PtrTy(Ctx); // Allocate statically the array of pointers to value profile nodes for @@ -939,6 +997,9 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx)); } + if (DebugInfoCorrelate) + return PD.RegionCounters; + // Create data variable. auto *IntPtrTy = M->getDataLayout().getIntPtrType(M->getContext()); auto *Int16Ty = Type::getInt16Ty(Ctx); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 4d15b784f486..446e601cd4d7 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -307,6 +307,11 @@ static cl::opt<bool> cl::desc("Enable KernelMemorySanitizer instrumentation"), cl::Hidden, cl::init(false)); +static cl::opt<bool> + ClDisableChecks("msan-disable-checks", + cl::desc("Apply no_sanitize to the whole file"), cl::Hidden, + cl::init(false)); + // This is an experiment to enable handling of cases where shadow is a non-zero // compile-time constant. For some unexplainable reason they were silently // ignored in the instrumentation. 
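The -msan-disable-checks option added above is described as applying no_sanitize to the whole file. For comparison, this is the existing per-function spelling of the same request; the attribute below is standard Clang, while the module-wide behaviour comes from the cl::opt in the hunk above. This is a sketch of the intent, not a claim of exact equivalence.

    // Per-function counterpart of the new module-wide option: with the
    // attribute (or with -msan-disable-checks), MSan does not insert checks
    // for this function.
    __attribute__((no_sanitize("memory")))
    int read_maybe_uninitialized(const int *p) {
      // A use of a possibly-uninitialized value here is not reported.
      return p ? *p : 0;
    }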
@@ -1095,7 +1100,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { MemorySanitizerVisitor(Function &F, MemorySanitizer &MS, const TargetLibraryInfo &TLI) : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)), TLI(&TLI) { - bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeMemory); + bool SanitizeFunction = + F.hasFnAttribute(Attribute::SanitizeMemory) && !ClDisableChecks; InsertChecks = SanitizeFunction; PropagateShadow = SanitizeFunction; PoisonStack = SanitizeFunction && ClPoisonStack; @@ -1214,7 +1220,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Shadow = SI->isAtomic() ? getCleanShadow(Val) : getShadow(Val); Value *ShadowPtr, *OriginPtr; Type *ShadowTy = Shadow->getType(); - const Align Alignment = assumeAligned(SI->getAlignment()); + const Align Alignment = SI->getAlign(); const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment); std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ true); @@ -3887,8 +3893,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { &I, IRB, IRB.getInt8Ty(), Align(1), /*isStore*/ true); Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0); - IRB.CreateMemSet(ShadowBase, PoisonValue, Len, - MaybeAlign(I.getAlignment())); + IRB.CreateMemSet(ShadowBase, PoisonValue, Len, I.getAlign()); } if (PoisonStack && MS.TrackOrigins) { diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index af5946325bbb..b6ba1fc2132c 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -273,14 +273,14 @@ static cl::opt<bool> PGOVerifyBFI( "internal option -pass-remakrs-analysis=pgo.")); static cl::opt<unsigned> PGOVerifyBFIRatio( - "pgo-verify-bfi-ratio", cl::init(5), cl::Hidden, - cl::desc("Set the threshold for pgo-verify-big -- only print out " + "pgo-verify-bfi-ratio", cl::init(2), cl::Hidden, + cl::desc("Set the threshold for pgo-verify-bfi: only print out " "mismatched BFI if the difference percentage is greater than " "this value (in percentage).")); static cl::opt<unsigned> PGOVerifyBFICutoff( - "pgo-verify-bfi-cutoff", cl::init(1), cl::Hidden, - cl::desc("Set the threshold for pgo-verify-bfi -- skip the counts whose " + "pgo-verify-bfi-cutoff", cl::init(5), cl::Hidden, + cl::desc("Set the threshold for pgo-verify-bfi: skip the counts whose " "profile count value is below.")); namespace llvm { @@ -291,6 +291,8 @@ extern cl::opt<PGOViewCountsType> PGOViewCounts; // Command line option to specify the name of the function for CFG dump // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name= extern cl::opt<std::string> ViewBlockFreqFuncName; + +extern cl::opt<bool> DebugInfoCorrelate; } // namespace llvm static cl::opt<bool> @@ -467,8 +469,9 @@ private: createProfileFileNameVar(M, InstrProfileOutput); // The variable in a comdat may be discarded by LTO. Ensure the // declaration will be retained. 
- appendToCompilerUsed( - M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, PGOInstrumentEntry)); + appendToCompilerUsed(M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, + PGOInstrumentEntry, + DebugInfoCorrelate)); return false; } std::string InstrProfileOutput; @@ -1616,7 +1619,8 @@ static bool InstrumentAllFunctions( // For the context-sensitve instrumentation, we should have a separated pass // (before LTO/ThinLTO linking) to create these variables. if (!IsCS) - createIRLevelProfileFlagVar(M, /*IsCS=*/false, PGOInstrumentEntry); + createIRLevelProfileFlagVar(M, /*IsCS=*/false, PGOInstrumentEntry, + DebugInfoCorrelate); std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers; collectComdatMembers(M, ComdatMembers); @@ -1638,8 +1642,9 @@ PGOInstrumentationGenCreateVar::run(Module &M, ModuleAnalysisManager &AM) { createProfileFileNameVar(M, CSInstrName); // The variable in a comdat may be discarded by LTO. Ensure the declaration // will be retained. - appendToCompilerUsed( - M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, PGOInstrumentEntry)); + appendToCompilerUsed(M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, + PGOInstrumentEntry, + DebugInfoCorrelate)); return PreservedAnalyses::all(); } @@ -1774,7 +1779,7 @@ static void verifyFuncBFI(PGOUseFunc &Func, LoopInfo &LI, uint64_t Diff = (BFICountValue >= CountValue) ? BFICountValue - CountValue : CountValue - BFICountValue; - if (Diff < CountValue / 100 * PGOVerifyBFIRatio) + if (Diff <= CountValue / 100 * PGOVerifyBFIRatio) continue; } BBMisMatchNum++; diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 27f54f8026e1..37a7053d778e 100644 --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -271,8 +271,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, // subtree of BB (subtree not including the BB itself). DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap; InsertPtsMap.reserve(Orders.size() + 1); - for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) { - BasicBlock *Node = *RIt; + for (BasicBlock *Node : llvm::reverse(Orders)) { bool NodeInBBs = BBs.count(Node); auto &InsertPts = InsertPtsMap[Node].first; BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second; diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 8c4523206070..dda1a2f08076 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -588,7 +588,7 @@ struct AllSwitchPaths { PrevBB = BB; } - if (TPath.isExitValueSet()) + if (TPath.isExitValueSet() && isSupported(TPath)) TPaths.push_back(TPath); } } @@ -683,6 +683,62 @@ private: return Res; } + /// The determinator BB should precede the switch-defining BB. + /// + /// Otherwise, it is possible that the state defined in the determinator block + /// defines the state for the next iteration of the loop, rather than for the + /// current one. 
+ /// + /// Currently supported paths: + /// \code + /// < switch bb1 determ def > [ 42, determ ] + /// < switch_and_def bb1 determ > [ 42, determ ] + /// < switch_and_def_and_determ bb1 > [ 42, switch_and_def_and_determ ] + /// \endcode + /// + /// Unsupported paths: + /// \code + /// < switch bb1 def determ > [ 43, determ ] + /// < switch_and_determ bb1 def > [ 43, switch_and_determ ] + /// \endcode + bool isSupported(const ThreadingPath &TPath) { + Instruction *SwitchCondI = dyn_cast<Instruction>(Switch->getCondition()); + assert(SwitchCondI); + if (!SwitchCondI) + return false; + + const BasicBlock *SwitchCondDefBB = SwitchCondI->getParent(); + const BasicBlock *SwitchCondUseBB = Switch->getParent(); + const BasicBlock *DeterminatorBB = TPath.getDeterminatorBB(); + + assert( + SwitchCondUseBB == TPath.getPath().front() && + "The first BB in a threading path should have the switch instruction"); + if (SwitchCondUseBB != TPath.getPath().front()) + return false; + + // Make DeterminatorBB the first element in Path. + PathType Path = TPath.getPath(); + auto ItDet = std::find(Path.begin(), Path.end(), DeterminatorBB); + std::rotate(Path.begin(), ItDet, Path.end()); + + bool IsDetBBSeen = false; + bool IsDefBBSeen = false; + bool IsUseBBSeen = false; + for (BasicBlock *BB : Path) { + if (BB == DeterminatorBB) + IsDetBBSeen = true; + if (BB == SwitchCondDefBB) + IsDefBBSeen = true; + if (BB == SwitchCondUseBB) + IsUseBBSeen = true; + if (IsDetBBSeen && IsUseBBSeen && !IsDefBBSeen) + return false; + } + + return true; + } + SwitchInst *Switch; BasicBlock *SwitchBlock; OptimizationRemarkEmitter *ORE; diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index e0d3a6accadd..eadbb4293539 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -175,44 +175,6 @@ static cl::opt<bool> using OverlapIntervalsTy = std::map<int64_t, int64_t>; using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>; -/// If the value of this instruction and the memory it writes to is unused, may -/// we delete this instruction? -static bool isRemovable(Instruction *I) { - // Don't remove volatile/atomic stores. - if (StoreInst *SI = dyn_cast<StoreInst>(I)) - return SI->isUnordered(); - - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - switch (II->getIntrinsicID()) { - default: llvm_unreachable("Does not have LocForWrite"); - case Intrinsic::lifetime_end: - // Never remove dead lifetime_end's, e.g. because it is followed by a - // free. - return false; - case Intrinsic::init_trampoline: - // Always safe to remove init_trampoline. - return true; - case Intrinsic::memset: - case Intrinsic::memmove: - case Intrinsic::memcpy: - case Intrinsic::memcpy_inline: - // Don't remove volatile memory intrinsics. - return !cast<MemIntrinsic>(II)->isVolatile(); - case Intrinsic::memcpy_element_unordered_atomic: - case Intrinsic::memmove_element_unordered_atomic: - case Intrinsic::memset_element_unordered_atomic: - case Intrinsic::masked_store: - return true; - } - } - - // note: only get here for calls with analyzable writes - i.e. libcalls - if (auto *CB = dyn_cast<CallBase>(I)) - return CB->use_empty(); - - return false; -} - /// Returns true if the end of this instruction can be safely shortened in /// length. 
static bool isShortenableAtTheEnd(Instruction *I) { @@ -835,7 +797,7 @@ struct DSEState { auto *MD = dyn_cast_or_null<MemoryDef>(MA); if (MD && MemDefs.size() < MemorySSADefsPerBlockLimit && - (getLocForWriteEx(&I) || isMemTerminatorInst(&I))) + (getLocForWrite(&I) || isMemTerminatorInst(&I))) MemDefs.push_back(MD); } } @@ -1022,48 +984,39 @@ struct DSEState { return I.first->second; } - Optional<MemoryLocation> getLocForWriteEx(Instruction *I) const { + Optional<MemoryLocation> getLocForWrite(Instruction *I) const { if (!I->mayWriteToMemory()) return None; - if (auto *MTI = dyn_cast<AnyMemIntrinsic>(I)) - return {MemoryLocation::getForDest(MTI)}; + if (auto *CB = dyn_cast<CallBase>(I)) + return MemoryLocation::getForDest(CB, TLI); + + return MemoryLocation::getOrNone(I); + } + + /// Assuming this instruction has a dead analyzable write, can we delete + /// this instruction? + bool isRemovable(Instruction *I) { + assert(getLocForWrite(I) && "Must have analyzable write"); + + // Don't remove volatile/atomic stores. + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->isUnordered(); if (auto *CB = dyn_cast<CallBase>(I)) { - // If the functions may write to memory we do not know about, bail out. - if (!CB->onlyAccessesArgMemory() && - !CB->onlyAccessesInaccessibleMemOrArgMem()) - return None; + // Don't remove volatile memory intrinsics. + if (auto *MI = dyn_cast<MemIntrinsic>(CB)) + return !MI->isVolatile(); - LibFunc LF; - if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) { - switch (LF) { - case LibFunc_strncpy: - if (const auto *Len = dyn_cast<ConstantInt>(CB->getArgOperand(2))) - return MemoryLocation(CB->getArgOperand(0), - LocationSize::precise(Len->getZExtValue()), - CB->getAAMetadata()); - LLVM_FALLTHROUGH; - case LibFunc_strcpy: - case LibFunc_strcat: - case LibFunc_strncat: - return {MemoryLocation::getAfter(CB->getArgOperand(0))}; - default: - break; - } - } - switch (CB->getIntrinsicID()) { - case Intrinsic::init_trampoline: - return {MemoryLocation::getAfter(CB->getArgOperand(0))}; - case Intrinsic::masked_store: - return {MemoryLocation::getForArgument(CB, 1, TLI)}; - default: - break; - } - return None; + // Never remove dead lifetime intrinsics, e.g. because they are followed + // by a free. + if (CB->isLifetimeStartOrEnd()) + return false; + + return CB->use_empty() && CB->willReturn() && CB->doesNotThrow(); } - return MemoryLocation::getOrNone(I); + return false; } /// Returns true if \p UseInst completely overwrites \p DefLoc @@ -1081,7 +1034,7 @@ struct DSEState { return false; int64_t InstWriteOffset, DepWriteOffset; - if (auto CC = getLocForWriteEx(UseInst)) + if (auto CC = getLocForWrite(UseInst)) return isOverwrite(UseInst, DefInst, *CC, DefLoc, InstWriteOffset, DepWriteOffset) == OW_Complete; return false; @@ -1093,7 +1046,7 @@ struct DSEState { << *Def->getMemoryInst() << ") is at the end the function \n"); - auto MaybeLoc = getLocForWriteEx(Def->getMemoryInst()); + auto MaybeLoc = getLocForWrite(Def->getMemoryInst()); if (!MaybeLoc) { LLVM_DEBUG(dbgs() << " ... could not get location for write.\n"); return false; @@ -1237,30 +1190,14 @@ struct DSEState { /// loop. In particular, this guarantees that it only references a single /// MemoryLocation during execution of the containing function. 
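/// [Editorial illustration, not part of the patch; the IR value names below are invented.]
/// The simplified check that follows looks through a GEP with all-constant indices
/// (stripping casts of its base) and then accepts the pointer only if the remaining
/// base is a non-instruction (argument, global) or is defined in the entry block:
///   %base = alloca [16 x i32]                        ; entry block      -> invariant
///   %elt  = getelementptr ..., ptr %base, i64 0, i64 4 ; constant-index GEP of %base -> invariant
///   %cur  = phi ptr [ %start, %ph ], [ %next, %latch ] ; defined inside the loop     -> not invariant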
bool isGuaranteedLoopInvariant(const Value *Ptr) { - auto IsGuaranteedLoopInvariantBase = [this](const Value *Ptr) { - Ptr = Ptr->stripPointerCasts(); - if (auto *I = dyn_cast<Instruction>(Ptr)) { - if (isa<AllocaInst>(Ptr)) - return true; - - if (isAllocLikeFn(I, &TLI)) - return true; - - return false; - } - return true; - }; - Ptr = Ptr->stripPointerCasts(); - if (auto *I = dyn_cast<Instruction>(Ptr)) { - if (I->getParent()->isEntryBlock()) - return true; - } - if (auto *GEP = dyn_cast<GEPOperator>(Ptr)) { - return IsGuaranteedLoopInvariantBase(GEP->getPointerOperand()) && - GEP->hasAllConstantIndices(); - } - return IsGuaranteedLoopInvariantBase(Ptr); + if (auto *GEP = dyn_cast<GEPOperator>(Ptr)) + if (GEP->hasAllConstantIndices()) + Ptr = GEP->getPointerOperand()->stripPointerCasts(); + + if (auto *I = dyn_cast<Instruction>(Ptr)) + return I->getParent()->isEntryBlock(); + return true; } // Find a MemoryDef writing to \p KillingLoc and dominating \p StartAccess, @@ -1372,7 +1309,7 @@ struct DSEState { // If Current does not have an analyzable write location or is not // removable, skip it. - CurrentLoc = getLocForWriteEx(CurrentI); + CurrentLoc = getLocForWrite(CurrentI); if (!CurrentLoc || !isRemovable(CurrentI)) { CanOptimize = false; continue; @@ -1729,14 +1666,13 @@ struct DSEState { LLVM_DEBUG( dbgs() << "Trying to eliminate MemoryDefs at the end of the function\n"); - for (int I = MemDefs.size() - 1; I >= 0; I--) { - MemoryDef *Def = MemDefs[I]; - if (SkipStores.contains(Def) || !isRemovable(Def->getMemoryInst())) + for (MemoryDef *Def : llvm::reverse(MemDefs)) { + if (SkipStores.contains(Def)) continue; Instruction *DefI = Def->getMemoryInst(); - auto DefLoc = getLocForWriteEx(DefI); - if (!DefLoc) + auto DefLoc = getLocForWrite(DefI); + if (!DefLoc || !isRemovable(DefI)) continue; // NOTE: Currently eliminating writes at the end of a function is limited @@ -1763,13 +1699,19 @@ struct DSEState { /// \returns true if \p Def is a no-op store, either because it /// directly stores back a loaded value or stores zero to a calloced object. 
bool storeIsNoop(MemoryDef *Def, const Value *DefUO) { - StoreInst *Store = dyn_cast<StoreInst>(Def->getMemoryInst()); - MemSetInst *MemSet = dyn_cast<MemSetInst>(Def->getMemoryInst()); + Instruction *DefI = Def->getMemoryInst(); + StoreInst *Store = dyn_cast<StoreInst>(DefI); + MemSetInst *MemSet = dyn_cast<MemSetInst>(DefI); Constant *StoredConstant = nullptr; if (Store) StoredConstant = dyn_cast<Constant>(Store->getOperand(0)); - if (MemSet) + else if (MemSet) StoredConstant = dyn_cast<Constant>(MemSet->getValue()); + else + return false; + + if (!isRemovable(DefI)) + return false; if (StoredConstant && StoredConstant->isNullValue()) { auto *DefUOInst = dyn_cast<Instruction>(DefUO); @@ -1902,7 +1844,7 @@ struct DSEState { bool Changed = false; for (auto OI : IOL) { Instruction *DeadI = OI.first; - MemoryLocation Loc = *getLocForWriteEx(DeadI); + MemoryLocation Loc = *getLocForWrite(DeadI); assert(isRemovable(DeadI) && "Expect only removable instruction"); const Value *Ptr = Loc.Ptr->stripPointerCasts(); @@ -1925,9 +1867,14 @@ struct DSEState { LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs that write the " "already existing value\n"); for (auto *Def : MemDefs) { - if (SkipStores.contains(Def) || MSSA.isLiveOnEntryDef(Def) || - !isRemovable(Def->getMemoryInst())) + if (SkipStores.contains(Def) || MSSA.isLiveOnEntryDef(Def)) continue; + + Instruction *DefInst = Def->getMemoryInst(); + auto MaybeDefLoc = getLocForWrite(DefInst); + if (!MaybeDefLoc || !isRemovable(DefInst)) + continue; + MemoryDef *UpperDef; // To conserve compile-time, we avoid walking to the next clobbering def. // Instead, we just try to get the optimized access, if it exists. DSE @@ -1939,17 +1886,14 @@ struct DSEState { if (!UpperDef || MSSA.isLiveOnEntryDef(UpperDef)) continue; - Instruction *DefInst = Def->getMemoryInst(); Instruction *UpperInst = UpperDef->getMemoryInst(); - auto IsRedundantStore = [this, DefInst, - UpperInst](MemoryLocation UpperLoc) { + auto IsRedundantStore = [&]() { if (DefInst->isIdenticalTo(UpperInst)) return true; if (auto *MemSetI = dyn_cast<MemSetInst>(UpperInst)) { if (auto *SI = dyn_cast<StoreInst>(DefInst)) { - auto MaybeDefLoc = getLocForWriteEx(DefInst); - if (!MaybeDefLoc) - return false; + // MemSetInst must have a write location. 
+ MemoryLocation UpperLoc = *getLocForWrite(UpperInst); int64_t InstWriteOffset = 0; int64_t DepWriteOffset = 0; auto OR = isOverwrite(UpperInst, DefInst, UpperLoc, *MaybeDefLoc, @@ -1962,9 +1906,7 @@ struct DSEState { return false; }; - auto MaybeUpperLoc = getLocForWriteEx(UpperInst); - if (!MaybeUpperLoc || !IsRedundantStore(*MaybeUpperLoc) || - isReadClobber(*MaybeUpperLoc, DefInst)) + if (!IsRedundantStore() || isReadClobber(*MaybeDefLoc, DefInst)) continue; LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *DefInst << '\n'); @@ -1995,7 +1937,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, MaybeKillingLoc = State.getLocForTerminator(KillingI).map( [](const std::pair<MemoryLocation, bool> &P) { return P.first; }); else - MaybeKillingLoc = State.getLocForWriteEx(KillingI); + MaybeKillingLoc = State.getLocForWrite(KillingI); if (!MaybeKillingLoc) { LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for " @@ -2059,7 +2001,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, if (!DebugCounter::shouldExecute(MemorySSACounter)) continue; - MemoryLocation DeadLoc = *State.getLocForWriteEx(DeadI); + MemoryLocation DeadLoc = *State.getLocForWrite(DeadI); if (IsMemTerm) { const Value *DeadUndObj = getUnderlyingObject(DeadLoc.Ptr); @@ -2124,8 +2066,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, } // Check if the store is a no-op. - if (!Shortend && isRemovable(KillingI) && - State.storeIsNoop(KillingDef, KillingUndObj)) { + if (!Shortend && State.storeIsNoop(KillingDef, KillingUndObj)) { LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *KillingI << '\n'); State.deleteDeadInstruction(KillingI); diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 90f71f7729a7..a24997dd3fd4 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -1366,8 +1366,16 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); continue; } - if (auto *I = dyn_cast<Instruction>(V)) - I->andIRFlags(&Inst); + if (auto *I = dyn_cast<Instruction>(V)) { + // If I being poison triggers UB, there is no need to drop those + // flags. Otherwise, only retain flags present on both I and Inst. + // TODO: Currently some fast-math flags are not treated as + // poison-generating even though they should. Until this is fixed, + // always retain flags present on both I and Inst for floating point + // instructions. 
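// [Editorial illustration, not from the patch] For example, if the available
// value I is 'add nuw nsw %x, %y' and Inst is 'add nuw %x, %y', the
// andIRFlags() call below keeps only the common 'nuw' on I. The intersection
// is skipped when poison on I is already known to trigger UB, since in any
// well-defined execution the stronger flags then remain sound.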
+ if (isa<FPMathOperator>(I) || (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I))) + I->andIRFlags(&Inst); + } Inst.replaceAllUsesWith(V); salvageKnowledge(&Inst, &AC); removeMSSA(Inst); diff --git a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index e54a270fb276..44017b555769 100644 --- a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -13,10 +13,12 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/FlattenCFG.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -24,11 +26,11 @@ using namespace llvm; #define DEBUG_TYPE "flattencfg" namespace { -struct FlattenCFGPass : public FunctionPass { +struct FlattenCFGLegacyPass : public FunctionPass { static char ID; // Pass identification, replacement for typeid public: - FlattenCFGPass() : FunctionPass(ID) { - initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry()); + FlattenCFGLegacyPass() : FunctionPass(ID) { + initializeFlattenCFGLegacyPassPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -39,21 +41,10 @@ public: private: AliasAnalysis *AA; }; -} - -char FlattenCFGPass::ID = 0; -INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, - false) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, - false) - -// Public interface to the FlattenCFG pass -FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); } /// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function, /// iterating until no more changes are made. -static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { +bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { bool Changed = false; bool LocalChange = true; @@ -78,8 +69,22 @@ static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { } return Changed; } +} // namespace -bool FlattenCFGPass::runOnFunction(Function &F) { +char FlattenCFGLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(FlattenCFGLegacyPass, "flattencfg", "Flatten the CFG", + false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_END(FlattenCFGLegacyPass, "flattencfg", "Flatten the CFG", + false, false) + +// Public interface to the FlattenCFG pass +FunctionPass *llvm::createFlattenCFGPass() { + return new FlattenCFGLegacyPass(); +} + +bool FlattenCFGLegacyPass::runOnFunction(Function &F) { AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool EverChanged = false; // iterativelyFlattenCFG can make some blocks dead. @@ -89,3 +94,15 @@ bool FlattenCFGPass::runOnFunction(Function &F) { } return EverChanged; } + +PreservedAnalyses FlattenCFGPass::run(Function &F, + FunctionAnalysisManager &AM) { + bool EverChanged = false; + AliasAnalysis *AA = &AM.getResult<AAManager>(F); + // iterativelyFlattenCFG can make some blocks dead. + while (iterativelyFlattenCFG(F, AA)) { + removeUnreachableBlocks(F); + EverChanged = true; + } + return EverChanged ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 6f97f3e93123..bc792ca3d8da 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -107,11 +107,6 @@ static cl::opt<bool> ControlFlowHoisting( "licm-control-flow-hoisting", cl::Hidden, cl::init(false), cl::desc("Enable control flow (and PHI) hoisting in LICM")); -static cl::opt<unsigned> HoistSinkColdnessThreshold( - "licm-coldness-threshold", cl::Hidden, cl::init(4), - cl::desc("Relative coldness Threshold of hoisting/sinking destination " - "block for LICM to be considered beneficial")); - static cl::opt<uint32_t> MaxNumUsesTraversed( "licm-max-num-uses-traversed", cl::Hidden, cl::init(8), cl::desc("Max num uses visited for identifying load " @@ -819,35 +814,6 @@ public: }; } // namespace -// Hoisting/sinking instruction out of a loop isn't always beneficial. It's only -// only worthwhile if the destination block is actually colder than current -// block. -static bool worthSinkOrHoistInst(Instruction &I, BasicBlock *DstBlock, - OptimizationRemarkEmitter *ORE, - BlockFrequencyInfo *BFI) { - // Check block frequency only when runtime profile is available - // to avoid pathological cases. With static profile, lean towards - // hosting because it helps canonicalize the loop for vectorizer. - if (!DstBlock->getParent()->hasProfileData()) - return true; - - if (!HoistSinkColdnessThreshold || !BFI) - return true; - - BasicBlock *SrcBlock = I.getParent(); - if (BFI->getBlockFreq(DstBlock).getFrequency() / HoistSinkColdnessThreshold > - BFI->getBlockFreq(SrcBlock).getFrequency()) { - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "SinkHoistInst", &I) - << "failed to sink or hoist instruction because containing block " - "has lower frequency than destination block"; - }); - return false; - } - - return true; -} - /// Walk the specified region of the CFG (defined by all blocks dominated by /// the specified block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. This allows us to visit definitions before @@ -909,7 +875,6 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I, AA, DT, CurLoop, /*CurAST*/ nullptr, MSSAU, true, &Flags, ORE) && - worthSinkOrHoistInst(I, CurLoop->getLoopPreheader(), ORE, BFI) && isSafeToExecuteUnconditionally( I, DT, TLI, CurLoop, SafetyInfo, ORE, CurLoop->getLoopPreheader()->getTerminator())) { @@ -1741,7 +1706,6 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, // First check if I is worth sinking for all uses. Sink only when it is worth // across all uses. SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end()); - SmallVector<PHINode *, 8> ExitPNs; for (auto *UI : Users) { auto *User = cast<Instruction>(UI); @@ -1751,14 +1715,6 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, PHINode *PN = cast<PHINode>(User); assert(ExitBlockSet.count(PN->getParent()) && "The LCSSA PHI is not in an exit block!"); - if (!worthSinkOrHoistInst(I, PN->getParent(), ORE, BFI)) { - return Changed; - } - - ExitPNs.push_back(PN); - } - - for (auto *PN : ExitPNs) { // The PHI must be trivially replaceable. 
Instruction *New = sinkThroughTriviallyReplaceablePHI( diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 77d76609c926..57e36e5b9b90 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -224,8 +224,8 @@ bool LoopDataPrefetch::run() { bool MadeChange = false; for (Loop *I : *LI) - for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L) - MadeChange |= runOnLoop(*L); + for (Loop *L : depth_first(I)) + MadeChange |= runOnLoop(L); return MadeChange; } diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 42da86a9ecf5..5d00fa56e888 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -786,9 +786,9 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, Type *IntIdxTy = DL->getIndexType(StorePtr->getType()); const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize); if (processLoopStridedStore(StorePtr, StoreSizeSCEV, - MaybeAlign(HeadStore->getAlignment()), - StoredVal, HeadStore, AdjacentStores, StoreEv, - BECount, IsNegStride)) { + MaybeAlign(HeadStore->getAlign()), StoredVal, + HeadStore, AdjacentStores, StoreEv, BECount, + IsNegStride)) { TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end()); Changed = true; } @@ -967,12 +967,22 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, << "\n"); if (PositiveStrideSCEV != MemsetSizeSCEV) { - // TODO: folding can be done to the SCEVs - // The folding is to fold expressions that is covered by the loop guard - // at loop entry. After the folding, compare again and proceed - // optimization if equal. - LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n"); - return false; + // If an expression is covered by the loop guard, compare again and + // proceed with optimization if equal. + const SCEV *FoldedPositiveStride = + SE->applyLoopGuards(PositiveStrideSCEV, CurLoop); + const SCEV *FoldedMemsetSize = + SE->applyLoopGuards(MemsetSizeSCEV, CurLoop); + + LLVM_DEBUG(dbgs() << " Try to fold SCEV based on loop guard\n" + << " FoldedMemsetSize: " << *FoldedMemsetSize << "\n" + << " FoldedPositiveStride: " << *FoldedPositiveStride + << "\n"); + + if (FoldedPositiveStride != FoldedMemsetSize) { + LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n"); + return false; + } } } diff --git a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index 56d66b93dd69..9d22eceb987f 100644 --- a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -1456,16 +1456,12 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) { } // Remove instructions associated with non-base iterations. - for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend(); - J != JE;) { - unsigned I = Uses[&*J].find_first(); + for (Instruction &Inst : llvm::make_early_inc_range(llvm::reverse(*Header))) { + unsigned I = Uses[&Inst].find_first(); if (I > 0 && I < IL_All) { - LLVM_DEBUG(dbgs() << "LRR: removing: " << *J << "\n"); - J++->eraseFromParent(); - continue; + LLVM_DEBUG(dbgs() << "LRR: removing: " << Inst << "\n"); + Inst.eraseFromParent(); } - - ++J; } // Rewrite each BaseInst using SCEV. 
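[Editorial note, not part of the patch] The LoopReroll hunk above replaces a hand-rolled reverse_iterator walk with llvm::make_early_inc_range(llvm::reverse(*Header)). A minimal stand-alone sketch of that idiom follows; the helper name and the ShouldErase predicate are invented for illustration.

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"

// Erase instructions matching a predicate while walking a block bottom-up.
// make_early_inc_range advances the iterator before the loop body runs, so
// erasing the current instruction does not invalidate the traversal.
static void eraseMatchingBottomUp(
    llvm::BasicBlock &BB,
    llvm::function_ref<bool(llvm::Instruction &)> ShouldErase) {
  for (llvm::Instruction &I :
       llvm::make_early_inc_range(llvm::reverse(BB)))
    if (ShouldErase(I))
      I.eraseFromParent();
}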
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index a9a2266e1196..798af48c2337 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -6011,7 +6011,7 @@ struct SCEVDbgValueBuilder { // See setFinalExpression: prepend our opcodes on the start of any old // expression opcodes. assert(!DI.hasArgList()); - llvm::SmallVector<uint64_t, 6> FinalExpr(Expr.begin() + 2, Expr.end()); + llvm::SmallVector<uint64_t, 6> FinalExpr(llvm::drop_begin(Expr, 2)); auto *NewExpr = DIExpression::prependOpcodes(OldExpr, FinalExpr, /*StackValue*/ true); DI.setExpression(NewExpr); diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 39c8b65968aa..893928fb0560 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1136,6 +1136,31 @@ static LoopUnrollResult tryToUnrollLoop( TransformationMode TM = hasUnrollTransformation(L); if (TM & TM_Disable) return LoopUnrollResult::Unmodified; + + // If this loop isn't forced to be unrolled, avoid unrolling it when the + // parent loop has an explicit unroll-and-jam pragma. This is to prevent + // automatic unrolling from interfering with the user requested + // transformation. + Loop *ParentL = L->getParentLoop(); + if (ParentL != NULL && + hasUnrollAndJamTransformation(ParentL) == TM_ForcedByUser && + hasUnrollTransformation(L) != TM_ForcedByUser) { + LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has" + << " llvm.loop.unroll_and_jam.\n"); + return LoopUnrollResult::Unmodified; + } + + // If this loop isn't forced to be unrolled, avoid unrolling it when the + // loop has an explicit unroll-and-jam pragma. This is to prevent automatic + // unrolling from interfering with the user requested transformation. + if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser && + hasUnrollTransformation(L) != TM_ForcedByUser) { + LLVM_DEBUG( + dbgs() + << " Not unrolling loop since it has llvm.loop.unroll_and_jam.\n"); + return LoopUnrollResult::Unmodified; + } + if (!L->isLoopSimplifyForm()) { LLVM_DEBUG( dbgs() << " Not unrolling loop which is not in loop-simplify form.\n"); diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 91215cd19e2b..10a8742940b1 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -638,6 +638,7 @@ class NewGVN { BitVector TouchedInstructions; DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange; + mutable DenseMap<const IntrinsicInst *, const Value *> IntrinsicInstPred; #ifndef NDEBUG // Debugging for how many times each block and instruction got processed. @@ -794,7 +795,7 @@ private: BasicBlock *PHIBlock) const; const Expression *performSymbolicAggrValueEvaluation(Instruction *) const; ExprResult performSymbolicCmpEvaluation(Instruction *) const; - ExprResult performSymbolicPredicateInfoEvaluation(Instruction *) const; + ExprResult performSymbolicPredicateInfoEvaluation(IntrinsicInst *) const; // Congruence finding. bool someEquivalentDominates(const Instruction *, const Instruction *) const; @@ -815,6 +816,8 @@ private: // Ranking unsigned int getRank(const Value *) const; bool shouldSwapOperands(const Value *, const Value *) const; + bool shouldSwapOperandsForIntrinsic(const Value *, const Value *, + const IntrinsicInst *I) const; // Reachability handling. 
void updateReachableEdge(BasicBlock *, BasicBlock *); @@ -1552,7 +1555,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const { } NewGVN::ExprResult -NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { +NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const { auto *PI = PredInfo->getPredicateInfoFor(I); if (!PI) return ExprResult::none(); @@ -1572,7 +1575,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { Value *AdditionallyUsedValue = CmpOp0; // Sort the ops. - if (shouldSwapOperands(FirstOp, SecondOp)) { + if (shouldSwapOperandsForIntrinsic(FirstOp, SecondOp, I)) { std::swap(FirstOp, SecondOp); Predicate = CmpInst::getSwappedPredicate(Predicate); AdditionallyUsedValue = CmpOp1; @@ -1598,7 +1601,7 @@ NewGVN::ExprResult NewGVN::performSymbolicCallEvaluation(Instruction *I) const { // Intrinsics with the returned attribute are copies of arguments. if (auto *ReturnedValue = II->getReturnedArgOperand()) { if (II->getIntrinsicID() == Intrinsic::ssa_copy) - if (auto Res = performSymbolicPredicateInfoEvaluation(I)) + if (auto Res = performSymbolicPredicateInfoEvaluation(II)) return Res; return ExprResult::some(createVariableOrConstant(ReturnedValue)); } @@ -2951,6 +2954,7 @@ void NewGVN::cleanupTables() { PredicateToUsers.clear(); MemoryToUsers.clear(); RevisitOnReachabilityChange.clear(); + IntrinsicInstPred.clear(); } // Assign local DFS number mapping to instructions, and leave space for Value @@ -4152,6 +4156,29 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const { return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B); } +bool NewGVN::shouldSwapOperandsForIntrinsic(const Value *A, const Value *B, + const IntrinsicInst *I) const { + auto LookupResult = IntrinsicInstPred.find(I); + if (shouldSwapOperands(A, B)) { + if (LookupResult == IntrinsicInstPred.end()) + IntrinsicInstPred.insert({I, B}); + else + LookupResult->second = B; + return true; + } + + if (LookupResult != IntrinsicInstPred.end()) { + auto *SeenPredicate = LookupResult->second; + if (SeenPredicate) { + if (SeenPredicate == B) + return true; + else + LookupResult->second = nullptr; + } + } + return false; +} + namespace { class NewGVNLegacyPass : public FunctionPass { diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 2d3490b2d29e..e12eca0ed287 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1359,16 +1359,6 @@ static constexpr Attribute::AttrKind FnAttrsToStrip[] = Attribute::InaccessibleMemOrArgMemOnly, Attribute::NoSync, Attribute::NoFree}; -// List of all parameter and return attributes which must be stripped when -// lowering from the abstract machine model. Note that we list attributes -// here which aren't valid as return attributes, that is okay. There are -// also some additional attributes with arguments which are handled -// explicitly and are not in this list. -static constexpr Attribute::AttrKind ParamAttrsToStrip[] = - {Attribute::ReadNone, Attribute::ReadOnly, Attribute::WriteOnly, - Attribute::NoAlias, Attribute::NoFree}; - - // Create new attribute set containing only attributes which can be transferred // from original call to the safepoint. 
static AttributeList legalizeCallAttributes(LLVMContext &Ctx, @@ -2650,24 +2640,19 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, return !Records.empty(); } -// Handles both return values and arguments for Functions and calls. -template <typename AttrHolder> -static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, - unsigned Index) { +// List of all parameter and return attributes which must be stripped when +// lowering from the abstract machine model. Note that we list attributes +// here which aren't valid as return attributes, that is okay. +static AttrBuilder getParamAndReturnAttributesToRemove() { AttrBuilder R; - AttributeSet AS = AH.getAttributes().getAttributes(Index); - if (AS.getDereferenceableBytes()) - R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable, - AS.getDereferenceableBytes())); - if (AS.getDereferenceableOrNullBytes()) - R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull, - AS.getDereferenceableOrNullBytes())); - for (auto Attr : ParamAttrsToStrip) - if (AS.hasAttribute(Attr)) - R.addAttribute(Attr); - - if (!R.empty()) - AH.setAttributes(AH.getAttributes().removeAttributesAtIndex(Ctx, Index, R)); + R.addDereferenceableAttr(1); + R.addDereferenceableOrNullAttr(1); + R.addAttribute(Attribute::ReadNone); + R.addAttribute(Attribute::ReadOnly); + R.addAttribute(Attribute::WriteOnly); + R.addAttribute(Attribute::NoAlias); + R.addAttribute(Attribute::NoFree); + return R; } static void stripNonValidAttributesFromPrototype(Function &F) { @@ -2683,13 +2668,13 @@ static void stripNonValidAttributesFromPrototype(Function &F) { return; } + AttrBuilder R = getParamAndReturnAttributesToRemove(); for (Argument &A : F.args()) if (isa<PointerType>(A.getType())) - RemoveNonValidAttrAtIndex(Ctx, F, - A.getArgNo() + AttributeList::FirstArgIndex); + F.removeParamAttrs(A.getArgNo(), R); if (isa<PointerType>(F.getReturnType())) - RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex); + F.removeRetAttrs(R); for (auto Attr : FnAttrsToStrip) F.removeFnAttr(Attr); @@ -2757,13 +2742,13 @@ static void stripNonValidDataFromBody(Function &F) { stripInvalidMetadataFromInstruction(I); + AttrBuilder R = getParamAndReturnAttributesToRemove(); if (auto *Call = dyn_cast<CallBase>(&I)) { for (int i = 0, e = Call->arg_size(); i != e; i++) if (isa<PointerType>(Call->getArgOperand(i)->getType())) - RemoveNonValidAttrAtIndex(Ctx, *Call, - i + AttributeList::FirstArgIndex); + Call->removeParamAttrs(i, R); if (isa<PointerType>(Call->getType())) - RemoveNonValidAttrAtIndex(Ctx, *Call, AttributeList::ReturnIndex); + Call->removeRetAttrs(R); } } diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 28e00c873361..ff2f8a25f379 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -101,8 +101,7 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { Constant *Const = nullptr; if (V->getType()->isStructTy()) { std::vector<ValueLatticeElement> IVs = Solver.getStructLatticeValueFor(V); - if (any_of(IVs, - [](const ValueLatticeElement &LV) { return isOverdefined(LV); })) + if (llvm::any_of(IVs, isOverdefined)) return false; std::vector<Constant *> ConstVals; auto *ST = cast<StructType>(V->getType()); diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp index a041af0d70d0..f9650efc051f 100644 --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -54,7 +54,7 @@ void 
llvm::initializeScalarOpts(PassRegistry &Registry) { initializeMakeGuardsExplicitLegacyPassPass(Registry); initializeGVNHoistLegacyPassPass(Registry); initializeGVNSinkLegacyPassPass(Registry); - initializeFlattenCFGPassPass(Registry); + initializeFlattenCFGLegacyPassPass(Registry); initializeIRCELegacyPassPass(Registry); initializeIndVarSimplifyLegacyPassPass(Registry); initializeInferAddressSpacesPass(Registry); diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index ffa2f9adb978..d23925042b0a 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -648,13 +648,13 @@ Value *ConstantOffsetExtractor::applyExts(Value *V) { Value *Current = V; // ExtInsts is built in the use-def order. Therefore, we apply them to V // in the reversed order. - for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) { + for (CastInst *I : llvm::reverse(ExtInsts)) { if (Constant *C = dyn_cast<Constant>(Current)) { // If Current is a constant, apply s/zext using ConstantExpr::getCast. // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt. - Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType()); + Current = ConstantExpr::getCast(I->getOpcode(), C, I->getType()); } else { - Instruction *Ext = (*I)->clone(); + Instruction *Ext = I->clone(); Ext->setOperand(0, Current); Ext->insertBefore(IP); Current = Ext; diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp new file mode 100644 index 000000000000..dfb9f608eab2 --- /dev/null +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -0,0 +1,942 @@ +//===- CodeLayout.cpp - Implementation of code layout algorithms ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// ExtTSP - layout of basic blocks with i-cache optimization. +// +// The algorithm tries to find a layout of nodes (basic blocks) of a given CFG +// optimizing jump locality and thus processor I-cache utilization. This is +// achieved via increasing the number of fall-through jumps and co-locating +// frequently executed nodes together. The name follows the underlying +// optimization problem, Extended-TSP, which is a generalization of classical +// (maximum) Traveling Salesmen Problem. +// +// The algorithm is a greedy heuristic that works with chains (ordered lists) +// of basic blocks. Initially all chains are isolated basic blocks. On every +// iteration, we pick a pair of chains whose merging yields the biggest increase +// in the ExtTSP score, which models how i-cache "friendly" a specific chain is. +// A pair of chains giving the maximum gain is merged into a new chain. The +// procedure stops when there is only one chain left, or when merging does not +// increase ExtTSP. In the latter case, the remaining chains are sorted by +// density in the decreasing order. +// +// An important aspect is the way two chains are merged. Unlike earlier +// algorithms (e.g., based on the approach of Pettis-Hansen), two +// chains, X and Y, are first split into three, X1, X2, and Y. 
Then we
+// consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,
+// X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
+// This improves the quality of the final result (the search space is larger)
+// while keeping the implementation sufficiently fast.
+//
+// Reference:
+// * A. Newell and S. Pupyrev, Improved Basic Block Reordering,
+// IEEE Transactions on Computers, 2020
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CodeLayout.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+#define DEBUG_TYPE "code-layout"
+
+// Algorithm-specific constants. The values are tuned for the best performance
+// of large-scale front-end bound binaries.
+static cl::opt<double>
+ ForwardWeight("ext-tsp-forward-weight", cl::Hidden, cl::init(0.1),
+ cl::desc("The weight of forward jumps for ExtTSP value"));
+
+static cl::opt<double>
+ BackwardWeight("ext-tsp-backward-weight", cl::Hidden, cl::init(0.1),
+ cl::desc("The weight of backward jumps for ExtTSP value"));
+
+static cl::opt<unsigned> ForwardDistance(
+ "ext-tsp-forward-distance", cl::Hidden, cl::init(1024),
+ cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
+
+static cl::opt<unsigned> BackwardDistance(
+ "ext-tsp-backward-distance", cl::Hidden, cl::init(640),
+ cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
+
+// The maximum size of a chain for splitting. Larger values of the threshold
+// may yield better quality at the cost of worse run-time.
+static cl::opt<unsigned> ChainSplitThreshold(
+ "ext-tsp-chain-split-threshold", cl::Hidden, cl::init(128),
+ cl::desc("The maximum size of a chain to apply splitting"));
+
+// The option enables splitting (large) chains along in-coming and out-going
+// jumps. This typically results in better quality.
+static cl::opt<bool> EnableChainSplitAlongJumps(
+ "ext-tsp-enable-chain-split-along-jumps", cl::Hidden, cl::init(true),
+ cl::desc("Enable splitting chains along in-coming and out-going jumps"));
+
+namespace {
+
+// Epsilon for comparison of doubles.
+constexpr double EPS = 1e-8;
+
+// Compute the Ext-TSP score for a jump between a given pair of blocks,
+// using their sizes, (estimated) addresses and the jump execution count.
+double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
+ uint64_t Count) {
+ // Fallthrough
+ if (SrcAddr + SrcSize == DstAddr) {
+ // Assume that FallthroughWeight = 1.0 after normalization
+ return static_cast<double>(Count);
+ }
+ // Forward
+ if (SrcAddr + SrcSize < DstAddr) {
+ const auto Dist = DstAddr - (SrcAddr + SrcSize);
+ if (Dist <= ForwardDistance) {
+ double Prob = 1.0 - static_cast<double>(Dist) / ForwardDistance;
+ return ForwardWeight * Prob * Count;
+ }
+ return 0;
+ }
+ // Backward
+ const auto Dist = SrcAddr + SrcSize - DstAddr;
+ if (Dist <= BackwardDistance) {
+ double Prob = 1.0 - static_cast<double>(Dist) / BackwardDistance;
+ return BackwardWeight * Prob * Count;
+ }
+ return 0;
+}
+
+/// A type of merging two chains, X and Y. The former chain is split into
+/// X1 and X2 and then concatenated with Y in the order specified by the type.
+enum class MergeTypeTy : int { X_Y, X1_Y_X2, Y_X2_X1, X2_X1_Y };
+
+/// The gain of merging two chains, that is, the Ext-TSP score of the merge
+/// together with the corresponding merge 'type' and 'offset'.
+class MergeGainTy { +public: + explicit MergeGainTy() {} + explicit MergeGainTy(double Score, size_t MergeOffset, MergeTypeTy MergeType) + : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {} + + double score() const { return Score; } + + size_t mergeOffset() const { return MergeOffset; } + + MergeTypeTy mergeType() const { return MergeType; } + + // Returns 'true' iff Other is preferred over this. + bool operator<(const MergeGainTy &Other) const { + return (Other.Score > EPS && Other.Score > Score + EPS); + } + + // Update the current gain if Other is preferred over this. + void updateIfLessThan(const MergeGainTy &Other) { + if (*this < Other) + *this = Other; + } + +private: + double Score{-1.0}; + size_t MergeOffset{0}; + MergeTypeTy MergeType{MergeTypeTy::X_Y}; +}; + +class Block; +class Jump; +class Chain; +class ChainEdge; + +/// A node in the graph, typically corresponding to a basic block in CFG. +class Block { +public: + Block(const Block &) = delete; + Block(Block &&) = default; + Block &operator=(const Block &) = delete; + Block &operator=(Block &&) = default; + + // The original index of the block in CFG. + size_t Index{0}; + // The index of the block in the current chain. + size_t CurIndex{0}; + // Size of the block in the binary. + uint64_t Size{0}; + // Execution count of the block in the profile data. + uint64_t ExecutionCount{0}; + // Current chain of the node. + Chain *CurChain{nullptr}; + // An offset of the block in the current chain. + mutable uint64_t EstimatedAddr{0}; + // Forced successor of the block in CFG. + Block *ForcedSucc{nullptr}; + // Forced predecessor of the block in CFG. + Block *ForcedPred{nullptr}; + // Outgoing jumps from the block. + std::vector<Jump *> OutJumps; + // Incoming jumps to the block. + std::vector<Jump *> InJumps; + +public: + explicit Block(size_t Index, uint64_t Size_, uint64_t EC) + : Index(Index), Size(Size_), ExecutionCount(EC) {} + bool isEntry() const { return Index == 0; } +}; + +/// An arc in the graph, typically corresponding to a jump between two blocks. +class Jump { +public: + Jump(const Jump &) = delete; + Jump(Jump &&) = default; + Jump &operator=(const Jump &) = delete; + Jump &operator=(Jump &&) = default; + + // Source block of the jump. + Block *Source; + // Target block of the jump. + Block *Target; + // Execution count of the arc in the profile data. + uint64_t ExecutionCount{0}; + +public: + explicit Jump(Block *Source, Block *Target, uint64_t ExecutionCount) + : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {} +}; + +/// A chain (ordered sequence) of blocks. 
+class Chain { +public: + Chain(const Chain &) = delete; + Chain(Chain &&) = default; + Chain &operator=(const Chain &) = delete; + Chain &operator=(Chain &&) = default; + + explicit Chain(uint64_t Id, Block *Block) + : Id(Id), Score(0), Blocks(1, Block) {} + + uint64_t id() const { return Id; } + + bool isEntry() const { return Blocks[0]->Index == 0; } + + double score() const { return Score; } + + void setScore(double NewScore) { Score = NewScore; } + + const std::vector<Block *> &blocks() const { return Blocks; } + + const std::vector<std::pair<Chain *, ChainEdge *>> &edges() const { + return Edges; + } + + ChainEdge *getEdge(Chain *Other) const { + for (auto It : Edges) { + if (It.first == Other) + return It.second; + } + return nullptr; + } + + void removeEdge(Chain *Other) { + auto It = Edges.begin(); + while (It != Edges.end()) { + if (It->first == Other) { + Edges.erase(It); + return; + } + It++; + } + } + + void addEdge(Chain *Other, ChainEdge *Edge) { + Edges.push_back(std::make_pair(Other, Edge)); + } + + void merge(Chain *Other, const std::vector<Block *> &MergedBlocks) { + Blocks = MergedBlocks; + // Update the block's chains + for (size_t Idx = 0; Idx < Blocks.size(); Idx++) { + Blocks[Idx]->CurChain = this; + Blocks[Idx]->CurIndex = Idx; + } + } + + void mergeEdges(Chain *Other); + + void clear() { + Blocks.clear(); + Blocks.shrink_to_fit(); + Edges.clear(); + Edges.shrink_to_fit(); + } + +private: + // Unique chain identifier. + uint64_t Id; + // Cached ext-tsp score for the chain. + double Score; + // Blocks of the chain. + std::vector<Block *> Blocks; + // Adjacent chains and corresponding edges (lists of jumps). + std::vector<std::pair<Chain *, ChainEdge *>> Edges; +}; + +/// An edge in CFG representing jumps between two chains. +/// When blocks are merged into chains, the edges are combined too so that +/// there is always at most one edge between a pair of chains +class ChainEdge { +public: + ChainEdge(const ChainEdge &) = delete; + ChainEdge(ChainEdge &&) = default; + ChainEdge &operator=(const ChainEdge &) = delete; + ChainEdge &operator=(ChainEdge &&) = default; + + explicit ChainEdge(Jump *Jump) + : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain), + Jumps(1, Jump) {} + + const std::vector<Jump *> &jumps() const { return Jumps; } + + void changeEndpoint(Chain *From, Chain *To) { + if (From == SrcChain) + SrcChain = To; + if (From == DstChain) + DstChain = To; + } + + void appendJump(Jump *Jump) { Jumps.push_back(Jump); } + + void moveJumps(ChainEdge *Other) { + Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end()); + Other->Jumps.clear(); + Other->Jumps.shrink_to_fit(); + } + + bool hasCachedMergeGain(Chain *Src, Chain *Dst) const { + return Src == SrcChain ? CacheValidForward : CacheValidBackward; + } + + MergeGainTy getCachedMergeGain(Chain *Src, Chain *Dst) const { + return Src == SrcChain ? CachedGainForward : CachedGainBackward; + } + + void setCachedMergeGain(Chain *Src, Chain *Dst, MergeGainTy MergeGain) { + if (Src == SrcChain) { + CachedGainForward = MergeGain; + CacheValidForward = true; + } else { + CachedGainBackward = MergeGain; + CacheValidBackward = true; + } + } + + void invalidateCache() { + CacheValidForward = false; + CacheValidBackward = false; + } + +private: + // Source chain. + Chain *SrcChain{nullptr}; + // Destination chain. + Chain *DstChain{nullptr}; + // Original jumps in the binary with correspinding execution counts. + std::vector<Jump *> Jumps; + // Cached ext-tsp value for merging the pair of chains. 
+ // Since the gain of merging (Src, Dst) and (Dst, Src) might be different, + // we store both values here. + MergeGainTy CachedGainForward; + MergeGainTy CachedGainBackward; + // Whether the cached value must be recomputed. + bool CacheValidForward{false}; + bool CacheValidBackward{false}; +}; + +void Chain::mergeEdges(Chain *Other) { + assert(this != Other && "cannot merge a chain with itself"); + + // Update edges adjacent to chain Other + for (auto EdgeIt : Other->Edges) { + const auto DstChain = EdgeIt.first; + const auto DstEdge = EdgeIt.second; + const auto TargetChain = DstChain == Other ? this : DstChain; + auto CurEdge = getEdge(TargetChain); + if (CurEdge == nullptr) { + DstEdge->changeEndpoint(Other, this); + this->addEdge(TargetChain, DstEdge); + if (DstChain != this && DstChain != Other) { + DstChain->addEdge(this, DstEdge); + } + } else { + CurEdge->moveJumps(DstEdge); + } + // Cleanup leftover edge + if (DstChain != Other) { + DstChain->removeEdge(Other); + } + } +} + +using BlockIter = std::vector<Block *>::const_iterator; + +/// A wrapper around three chains of blocks; it is used to avoid extra +/// instantiation of the vectors. +class MergedChain { +public: + MergedChain(BlockIter Begin1, BlockIter End1, BlockIter Begin2 = BlockIter(), + BlockIter End2 = BlockIter(), BlockIter Begin3 = BlockIter(), + BlockIter End3 = BlockIter()) + : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3), + End3(End3) {} + + template <typename F> void forEach(const F &Func) const { + for (auto It = Begin1; It != End1; It++) + Func(*It); + for (auto It = Begin2; It != End2; It++) + Func(*It); + for (auto It = Begin3; It != End3; It++) + Func(*It); + } + + std::vector<Block *> getBlocks() const { + std::vector<Block *> Result; + Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) + + std::distance(Begin3, End3)); + Result.insert(Result.end(), Begin1, End1); + Result.insert(Result.end(), Begin2, End2); + Result.insert(Result.end(), Begin3, End3); + return Result; + } + + const Block *getFirstBlock() const { return *Begin1; } + +private: + BlockIter Begin1; + BlockIter End1; + BlockIter Begin2; + BlockIter End2; + BlockIter Begin3; + BlockIter End3; +}; + +/// The implementation of the ExtTSP algorithm. +class ExtTSPImpl { + using EdgeT = std::pair<uint64_t, uint64_t>; + using EdgeCountMap = DenseMap<EdgeT, uint64_t>; + +public: + ExtTSPImpl(size_t NumNodes, const std::vector<uint64_t> &NodeSizes, + const std::vector<uint64_t> &NodeCounts, + const EdgeCountMap &EdgeCounts) + : NumNodes(NumNodes) { + initialize(NodeSizes, NodeCounts, EdgeCounts); + } + + /// Run the algorithm and return an optimized ordering of blocks. + void run(std::vector<uint64_t> &Result) { + // Pass 1: Merge blocks with their mutually forced successors + mergeForcedPairs(); + + // Pass 2: Merge pairs of chains while improving the ExtTSP objective + mergeChainPairs(); + + // Pass 3: Merge cold blocks to reduce code size + mergeColdChains(); + + // Collect blocks from all chains + concatChains(Result); + } + +private: + /// Initialize the algorithm's data structures. 
+ void initialize(const std::vector<uint64_t> &NodeSizes, + const std::vector<uint64_t> &NodeCounts, + const EdgeCountMap &EdgeCounts) { + // Initialize blocks + AllBlocks.reserve(NumNodes); + for (uint64_t Node = 0; Node < NumNodes; Node++) { + uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL); + uint64_t ExecutionCount = NodeCounts[Node]; + // The execution count of the entry block is set to at least 1 + if (Node == 0 && ExecutionCount == 0) + ExecutionCount = 1; + AllBlocks.emplace_back(Node, Size, ExecutionCount); + } + + // Initialize jumps between blocks + SuccNodes = std::vector<std::vector<uint64_t>>(NumNodes); + PredNodes = std::vector<std::vector<uint64_t>>(NumNodes); + AllJumps.reserve(EdgeCounts.size()); + for (auto It : EdgeCounts) { + auto Pred = It.first.first; + auto Succ = It.first.second; + // Ignore self-edges + if (Pred == Succ) + continue; + + SuccNodes[Pred].push_back(Succ); + PredNodes[Succ].push_back(Pred); + auto ExecutionCount = It.second; + if (ExecutionCount > 0) { + auto &Block = AllBlocks[Pred]; + auto &SuccBlock = AllBlocks[Succ]; + AllJumps.emplace_back(&Block, &SuccBlock, ExecutionCount); + SuccBlock.InJumps.push_back(&AllJumps.back()); + Block.OutJumps.push_back(&AllJumps.back()); + } + } + + // Initialize chains + AllChains.reserve(NumNodes); + HotChains.reserve(NumNodes); + for (auto &Block : AllBlocks) { + AllChains.emplace_back(Block.Index, &Block); + Block.CurChain = &AllChains.back(); + if (Block.ExecutionCount > 0) { + HotChains.push_back(&AllChains.back()); + } + } + + // Initialize chain edges + AllEdges.reserve(AllJumps.size()); + for (auto &Block : AllBlocks) { + for (auto &Jump : Block.OutJumps) { + const auto SuccBlock = Jump->Target; + auto CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain); + // this edge is already present in the graph + if (CurEdge != nullptr) { + assert(SuccBlock->CurChain->getEdge(Block.CurChain) != nullptr); + CurEdge->appendJump(Jump); + continue; + } + // this is a new edge + AllEdges.emplace_back(Jump); + Block.CurChain->addEdge(SuccBlock->CurChain, &AllEdges.back()); + SuccBlock->CurChain->addEdge(Block.CurChain, &AllEdges.back()); + } + } + } + + /// For a pair of blocks, A and B, block B is the forced successor of A, + /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps + /// to B are from A. Such blocks should be adjacent in the optimal ordering; + /// the method finds and merges such pairs of blocks. + void mergeForcedPairs() { + // Find fallthroughs based on edge weights + for (auto &Block : AllBlocks) { + if (SuccNodes[Block.Index].size() == 1 && + PredNodes[SuccNodes[Block.Index][0]].size() == 1 && + SuccNodes[Block.Index][0] != 0) { + size_t SuccIndex = SuccNodes[Block.Index][0]; + Block.ForcedSucc = &AllBlocks[SuccIndex]; + AllBlocks[SuccIndex].ForcedPred = &Block; + } + } + + // There might be 'cycles' in the forced dependencies, since profile + // data isn't 100% accurate. Typically this is observed in loops, when the + // loop edges are the hottest successors for the basic blocks of the loop. + // Break the cycles by choosing the block with the smallest index as the + // head. This helps to keep the original order of the loops, which likely + // have already been rotated in the optimized manner. 
+ for (auto &Block : AllBlocks) { + if (Block.ForcedSucc == nullptr || Block.ForcedPred == nullptr) + continue; + + auto SuccBlock = Block.ForcedSucc; + while (SuccBlock != nullptr && SuccBlock != &Block) { + SuccBlock = SuccBlock->ForcedSucc; + } + if (SuccBlock == nullptr) + continue; + // Break the cycle + AllBlocks[Block.ForcedPred->Index].ForcedSucc = nullptr; + Block.ForcedPred = nullptr; + } + + // Merge blocks with their fallthrough successors + for (auto &Block : AllBlocks) { + if (Block.ForcedPred == nullptr && Block.ForcedSucc != nullptr) { + auto CurBlock = &Block; + while (CurBlock->ForcedSucc != nullptr) { + const auto NextBlock = CurBlock->ForcedSucc; + mergeChains(Block.CurChain, NextBlock->CurChain, 0, MergeTypeTy::X_Y); + CurBlock = NextBlock; + } + } + } + } + + /// Merge pairs of chains while improving the ExtTSP objective. + void mergeChainPairs() { + /// Deterministically compare pairs of chains + auto compareChainPairs = [](const Chain *A1, const Chain *B1, + const Chain *A2, const Chain *B2) { + if (A1 != A2) + return A1->id() < A2->id(); + return B1->id() < B2->id(); + }; + + while (HotChains.size() > 1) { + Chain *BestChainPred = nullptr; + Chain *BestChainSucc = nullptr; + auto BestGain = MergeGainTy(); + // Iterate over all pairs of chains + for (auto ChainPred : HotChains) { + // Get candidates for merging with the current chain + for (auto EdgeIter : ChainPred->edges()) { + auto ChainSucc = EdgeIter.first; + auto ChainEdge = EdgeIter.second; + // Ignore loop edges + if (ChainPred == ChainSucc) + continue; + + // Compute the gain of merging the two chains + auto CurGain = getBestMergeGain(ChainPred, ChainSucc, ChainEdge); + if (CurGain.score() <= EPS) + continue; + + if (BestGain < CurGain || + (std::abs(CurGain.score() - BestGain.score()) < EPS && + compareChainPairs(ChainPred, ChainSucc, BestChainPred, + BestChainSucc))) { + BestGain = CurGain; + BestChainPred = ChainPred; + BestChainSucc = ChainSucc; + } + } + } + + // Stop merging when there is no improvement + if (BestGain.score() <= EPS) + break; + + // Merge the best pair of chains + mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(), + BestGain.mergeType()); + } + } + + /// Merge cold blocks to reduce code size. + void mergeColdChains() { + for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) { + // Iterating over neighbors in the reverse order to make sure original + // fallthrough jumps are merged first + size_t NumSuccs = SuccNodes[SrcBB].size(); + for (size_t Idx = 0; Idx < NumSuccs; Idx++) { + auto DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1]; + auto SrcChain = AllBlocks[SrcBB].CurChain; + auto DstChain = AllBlocks[DstBB].CurChain; + if (SrcChain != DstChain && !DstChain->isEntry() && + SrcChain->blocks().back()->Index == SrcBB && + DstChain->blocks().front()->Index == DstBB) { + mergeChains(SrcChain, DstChain, 0, MergeTypeTy::X_Y); + } + } + } + } + + /// Compute the Ext-TSP score for a given block order and a list of jumps. 
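/// [Editorial example, not from the patch] With the default weights declared
/// above (fallthrough 1.0, forward 0.1 within 1024 bytes, backward 0.1 within
/// 640 bytes), a fallthrough jump taken 1000 times contributes 1000 to the
/// score, while a 512-byte forward jump taken 1000 times contributes about
/// 0.1 * (1 - 512/1024) * 1000 = 50.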
+ double extTSPScore(const MergedChain &MergedBlocks, + const std::vector<Jump *> &Jumps) const { + if (Jumps.empty()) + return 0.0; + uint64_t CurAddr = 0; + MergedBlocks.forEach([&](const Block *BB) { + BB->EstimatedAddr = CurAddr; + CurAddr += BB->Size; + }); + + double Score = 0; + for (auto &Jump : Jumps) { + const auto SrcBlock = Jump->Source; + const auto DstBlock = Jump->Target; + Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size, + DstBlock->EstimatedAddr, Jump->ExecutionCount); + } + return Score; + } + + /// Compute the gain of merging two chains. + /// + /// The function considers all possible ways of merging two chains and + /// computes the one having the largest increase in ExtTSP objective. The + /// result is a pair with the first element being the gain and the second + /// element being the corresponding merging type. + MergeGainTy getBestMergeGain(Chain *ChainPred, Chain *ChainSucc, + ChainEdge *Edge) const { + if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) { + return Edge->getCachedMergeGain(ChainPred, ChainSucc); + } + + // Precompute jumps between ChainPred and ChainSucc + auto Jumps = Edge->jumps(); + auto EdgePP = ChainPred->getEdge(ChainPred); + if (EdgePP != nullptr) { + Jumps.insert(Jumps.end(), EdgePP->jumps().begin(), EdgePP->jumps().end()); + } + assert(!Jumps.empty() && "trying to merge chains w/o jumps"); + + // The object holds the best currently chosen gain of merging the two chains + MergeGainTy Gain = MergeGainTy(); + + /// Given a merge offset and a list of merge types, try to merge two chains + /// and update Gain with a better alternative + auto tryChainMerging = [&](size_t Offset, + const std::vector<MergeTypeTy> &MergeTypes) { + // Skip merging corresponding to concatenation w/o splitting + if (Offset == 0 || Offset == ChainPred->blocks().size()) + return; + // Skip merging if it breaks Forced successors + auto BB = ChainPred->blocks()[Offset - 1]; + if (BB->ForcedSucc != nullptr) + return; + // Apply the merge, compute the corresponding gain, and update the best + // value, if the merge is beneficial + for (auto &MergeType : MergeTypes) { + Gain.updateIfLessThan( + computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType)); + } + }; + + // Try to concatenate two chains w/o splitting + Gain.updateIfLessThan( + computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeTy::X_Y)); + + if (EnableChainSplitAlongJumps) { + // Attach (a part of) ChainPred before the first block of ChainSucc + for (auto &Jump : ChainSucc->blocks().front()->InJumps) { + const auto SrcBlock = Jump->Source; + if (SrcBlock->CurChain != ChainPred) + continue; + size_t Offset = SrcBlock->CurIndex + 1; + tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::X2_X1_Y}); + } + + // Attach (a part of) ChainPred after the last block of ChainSucc + for (auto &Jump : ChainSucc->blocks().back()->OutJumps) { + const auto DstBlock = Jump->Source; + if (DstBlock->CurChain != ChainPred) + continue; + size_t Offset = DstBlock->CurIndex; + tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1}); + } + } + + // Try to break ChainPred in various ways and concatenate with ChainSucc + if (ChainPred->blocks().size() <= ChainSplitThreshold) { + for (size_t Offset = 1; Offset < ChainPred->blocks().size(); Offset++) { + // Try to split the chain in different ways. 
In practice, applying
+// X2_Y_X1 merging almost never provides benefits; thus, we exclude
+// it from consideration to reduce the search space
+ tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1,
+ MergeTypeTy::X2_X1_Y});
+ }
+ }
+ Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
+ return Gain;
+ }
+
+ /// Compute the score gain of merging two chains, respecting a given
+ /// merge 'type' and 'offset'.
+ ///
+ /// The two chains are not modified in the method.
+ MergeGainTy computeMergeGain(const Chain *ChainPred, const Chain *ChainSucc,
+ const std::vector<Jump *> &Jumps,
+ size_t MergeOffset,
+ MergeTypeTy MergeType) const {
+ auto MergedBlocks = mergeBlocks(ChainPred->blocks(), ChainSucc->blocks(),
+ MergeOffset, MergeType);
+
+ // Do not allow a merge that does not preserve the original entry block
+ if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
+ !MergedBlocks.getFirstBlock()->isEntry())
+ return MergeGainTy();
+
+ // The gain for the new chain
+ auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->score();
+ return MergeGainTy(NewGainScore, MergeOffset, MergeType);
+ }
+
+ /// Merge two chains of blocks respecting a given merge 'type' and 'offset'.
+ ///
+ /// If MergeType == 0, then the result is a concatenation of two chains.
+ /// Otherwise, the first chain is cut into two sub-chains at the offset,
+ /// and merged using all possible ways of concatenating three chains.
+ MergedChain mergeBlocks(const std::vector<Block *> &X,
+ const std::vector<Block *> &Y, size_t MergeOffset,
+ MergeTypeTy MergeType) const {
+ // Split the first chain, X, into X1 and X2
+ BlockIter BeginX1 = X.begin();
+ BlockIter EndX1 = X.begin() + MergeOffset;
+ BlockIter BeginX2 = X.begin() + MergeOffset;
+ BlockIter EndX2 = X.end();
+ BlockIter BeginY = Y.begin();
+ BlockIter EndY = Y.end();
+
+ // Construct a new chain from the three existing ones
+ switch (MergeType) {
+ case MergeTypeTy::X_Y:
+ return MergedChain(BeginX1, EndX2, BeginY, EndY);
+ case MergeTypeTy::X1_Y_X2:
+ return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
+ case MergeTypeTy::Y_X2_X1:
+ return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
+ case MergeTypeTy::X2_X1_Y:
+ return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
+ }
+ llvm_unreachable("unexpected chain merge type");
+ }
+
+ /// Merge chain From into chain Into, update the list of active chains,
+ /// adjacency information, and the corresponding cached values.
+ void mergeChains(Chain *Into, Chain *From, size_t MergeOffset,
+ MergeTypeTy MergeType) {
+ assert(Into != From && "a chain cannot be merged with itself");
+
+ // Merge the blocks
+ auto MergedBlocks =
+ mergeBlocks(Into->blocks(), From->blocks(), MergeOffset, MergeType);
+ Into->merge(From, MergedBlocks.getBlocks());
+ Into->mergeEdges(From);
+ From->clear();
+
+ // Update cached ext-tsp score for the new chain
+ auto SelfEdge = Into->getEdge(Into);
+ if (SelfEdge != nullptr) {
+ MergedBlocks = MergedChain(Into->blocks().begin(), Into->blocks().end());
+ Into->setScore(extTSPScore(MergedBlocks, SelfEdge->jumps()));
+ }
+
+ // Remove chain From from the list of active chains
+ auto Iter = std::remove(HotChains.begin(), HotChains.end(), From);
+ HotChains.erase(Iter, HotChains.end());
+
+ // Invalidate caches
+ for (auto EdgeIter : Into->edges()) {
+ EdgeIter.second->invalidateCache();
+ }
+ }
+
+ /// Concatenate all chains into a final order of blocks.
+ void concatChains(std::vector<uint64_t> &Order) { + // Collect chains and calculate some stats for their sorting + std::vector<Chain *> SortedChains; + DenseMap<const Chain *, double> ChainDensity; + for (auto &Chain : AllChains) { + if (!Chain.blocks().empty()) { + SortedChains.push_back(&Chain); + // Using doubles to avoid overflow of ExecutionCount + double Size = 0; + double ExecutionCount = 0; + for (auto Block : Chain.blocks()) { + Size += static_cast<double>(Block->Size); + ExecutionCount += static_cast<double>(Block->ExecutionCount); + } + assert(Size > 0 && "a chain of zero size"); + ChainDensity[&Chain] = ExecutionCount / Size; + } + } + + // Sorting chains by density in the decreasing order + std::stable_sort(SortedChains.begin(), SortedChains.end(), + [&](const Chain *C1, const Chain *C2) { + // Makre sure the original entry block is at the + // beginning of the order + if (C1->isEntry() != C2->isEntry()) { + return C1->isEntry(); + } + + const double D1 = ChainDensity[C1]; + const double D2 = ChainDensity[C2]; + // Compare by density and break ties by chain identifiers + return (D1 != D2) ? (D1 > D2) : (C1->id() < C2->id()); + }); + + // Collect the blocks in the order specified by their chains + Order.reserve(NumNodes); + for (auto Chain : SortedChains) { + for (auto Block : Chain->blocks()) { + Order.push_back(Block->Index); + } + } + } + +private: + /// The number of nodes in the graph. + const size_t NumNodes; + + /// Successors of each node. + std::vector<std::vector<uint64_t>> SuccNodes; + + /// Predecessors of each node. + std::vector<std::vector<uint64_t>> PredNodes; + + /// All basic blocks. + std::vector<Block> AllBlocks; + + /// All jumps between blocks. + std::vector<Jump> AllJumps; + + /// All chains of basic blocks. + std::vector<Chain> AllChains; + + /// All edges between chains. + std::vector<ChainEdge> AllEdges; + + /// Active chains. The vector gets updated at runtime when chains are merged. + std::vector<Chain *> HotChains; +}; + +} // end of anonymous namespace + +std::vector<uint64_t> llvm::applyExtTspLayout( + const std::vector<uint64_t> &NodeSizes, + const std::vector<uint64_t> &NodeCounts, + const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) { + size_t NumNodes = NodeSizes.size(); + + // Verify correctness of the input data. + assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input"); + assert(NumNodes > 2 && "Incorrect input"); + + // Apply the reordering algorithm. + auto Alg = ExtTSPImpl(NumNodes, NodeSizes, NodeCounts, EdgeCounts); + std::vector<uint64_t> Result; + Alg.run(Result); + + // Verify correctness of the output. 
+ assert(Result.front() == 0 && "Original entry point is not preserved"); + assert(Result.size() == NumNodes && "Incorrect size of reordered layout"); + return Result; +} + +double llvm::calcExtTspScore( + const std::vector<uint64_t> &Order, const std::vector<uint64_t> &NodeSizes, + const std::vector<uint64_t> &NodeCounts, + const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) { + // Estimate addresses of the blocks in memory + auto Addr = std::vector<uint64_t>(NodeSizes.size(), 0); + for (size_t Idx = 1; Idx < Order.size(); Idx++) { + Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]]; + } + + // Increase the score for each jump + double Score = 0; + for (auto It : EdgeCounts) { + auto Pred = It.first.first; + auto Succ = It.first.second; + uint64_t Count = It.second; + Score += extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count); + } + return Score; +} + +double llvm::calcExtTspScore( + const std::vector<uint64_t> &NodeSizes, + const std::vector<uint64_t> &NodeCounts, + const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) { + auto Order = std::vector<uint64_t>(NodeSizes.size()); + for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) { + Order[Idx] = Idx; + } + return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts); +} diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index fc7083b0c30d..589622d69578 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -596,7 +596,7 @@ bool llvm::checkDebugInfoMetadata(Module &M, auto DILocsBefore = DIPreservationMap[NameOfWrappedPass].DILocations; auto DILocsAfter = DIPreservationAfter[NameOfWrappedPass].DILocations; - auto InstToDelete = DIPreservationAfter[NameOfWrappedPass].InstToDelete; + auto InstToDelete = DIPreservationMap[NameOfWrappedPass].InstToDelete; auto DIVarsBefore = DIPreservationMap[NameOfWrappedPass].DIVariables; auto DIVarsAfter = DIPreservationAfter[NameOfWrappedPass].DIVariables; diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp index 326864803d7c..06596f7b04e1 100644 --- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp +++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp @@ -58,6 +58,14 @@ int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const { return 0; } +int FunctionComparator::cmpAligns(Align L, Align R) const { + if (L.value() < R.value()) + return -1; + if (L.value() > R.value()) + return 1; + return 0; +} + int FunctionComparator::cmpOrderings(AtomicOrdering L, AtomicOrdering R) const { if ((int)L < (int)R) return -1; @@ -556,13 +564,12 @@ int FunctionComparator::cmpOperations(const Instruction *L, if (int Res = cmpTypes(AI->getAllocatedType(), cast<AllocaInst>(R)->getAllocatedType())) return Res; - return cmpNumbers(AI->getAlignment(), cast<AllocaInst>(R)->getAlignment()); + return cmpAligns(AI->getAlign(), cast<AllocaInst>(R)->getAlign()); } if (const LoadInst *LI = dyn_cast<LoadInst>(L)) { if (int Res = cmpNumbers(LI->isVolatile(), cast<LoadInst>(R)->isVolatile())) return Res; - if (int Res = - cmpNumbers(LI->getAlignment(), cast<LoadInst>(R)->getAlignment())) + if (int Res = cmpAligns(LI->getAlign(), cast<LoadInst>(R)->getAlign())) return Res; if (int Res = cmpOrderings(LI->getOrdering(), cast<LoadInst>(R)->getOrdering())) @@ -578,8 +585,7 @@ int FunctionComparator::cmpOperations(const Instruction *L, if (int Res = cmpNumbers(SI->isVolatile(), cast<StoreInst>(R)->isVolatile())) return Res; - 
if (int Res = - cmpNumbers(SI->getAlignment(), cast<StoreInst>(R)->getAlignment())) + if (int Res = cmpAligns(SI->getAlign(), cast<StoreInst>(R)->getAlign())) return Res; if (int Res = cmpOrderings(SI->getOrdering(), cast<StoreInst>(R)->getOrdering())) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index ec926b1f5a94..ecad79b68185 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -402,6 +402,18 @@ bool llvm::isInstructionTriviallyDead(Instruction *I, return wouldInstructionBeTriviallyDead(I, TLI); } +bool llvm::wouldInstructionBeTriviallyDeadOnUnusedPaths( + Instruction *I, const TargetLibraryInfo *TLI) { + // Instructions that are "markers" and have implied meaning on code around + // them (without explicit uses), are not dead on unused paths. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + if (II->getIntrinsicID() == Intrinsic::stacksave || + II->getIntrinsicID() == Intrinsic::launder_invariant_group || + II->isLifetimeStartOrEnd()) + return false; + return wouldInstructionBeTriviallyDead(I, TLI); +} + bool llvm::wouldInstructionBeTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI) { if (I->isTerminator()) diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index f3cf42be8ba1..69fd110dc3c2 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -104,9 +104,7 @@ bool llvm::canPeel(Loop *L) { // note that LoopPeeling currently can only update the branch weights of latch // blocks and branch weights to blocks with deopt or unreachable do not need // updating. - return all_of(Exits, [](const BasicBlock *BB) { - return IsBlockFollowedByDeoptOrUnreachable(BB); - }); + return llvm::all_of(Exits, IsBlockFollowedByDeoptOrUnreachable); } // This function calculates the number of iterations after which the given Phi @@ -333,6 +331,31 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, return DesiredPeelCount; } +/// This "heuristic" exactly matches implicit behavior which used to exist +/// inside getLoopEstimatedTripCount. It was added here to keep an +/// improvement inside that API from causing peeling to become more agressive. +/// This should probably be removed. +static bool violatesLegacyMultiExitLoopCheck(Loop *L) { + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch) + return true; + + BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator()); + if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch)) + return true; + + assert((LatchBR->getSuccessor(0) == L->getHeader() || + LatchBR->getSuccessor(1) == L->getHeader()) && + "At least one edge out of the latch must go to the header"); + + SmallVector<BasicBlock *, 4> ExitBlocks; + L->getUniqueNonLatchExitBlocks(ExitBlocks); + return any_of(ExitBlocks, [](const BasicBlock *EB) { + return !EB->getTerminatingDeoptimizeCall(); + }); +} + + // Return the number of iterations we want to peel off. void llvm::computePeelCount(Loop *L, unsigned LoopSize, TargetTransformInfo::PeelingPreferences &PP, @@ -436,6 +459,8 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, // We only do this in the presence of profile information, since otherwise // our estimates of the trip count are not reliable enough. 
if (L->getHeader()->getParent()->hasProfileData()) { + if (violatesLegacyMultiExitLoopCheck(L)) + return; Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L); if (!PeelCount) return; diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index c8e42acdffb3..93157bd87c34 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -773,8 +773,8 @@ void llvm::breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE, } -/// Checks if \p L has single exit through latch block except possibly -/// "deoptimizing" exits. Returns branch instruction terminating the loop +/// Checks if \p L has an exiting latch branch. There may also be other +/// exiting blocks. Returns branch instruction terminating the loop /// latch if above check is successful, nullptr otherwise. static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) { BasicBlock *Latch = L->getLoopLatch(); @@ -789,53 +789,61 @@ static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) { LatchBR->getSuccessor(1) == L->getHeader()) && "At least one edge out of the latch must go to the header"); - SmallVector<BasicBlock *, 4> ExitBlocks; - L->getUniqueNonLatchExitBlocks(ExitBlocks); - if (any_of(ExitBlocks, [](const BasicBlock *EB) { - return !EB->getTerminatingDeoptimizeCall(); - })) - return nullptr; - return LatchBR; } -Optional<unsigned> -llvm::getLoopEstimatedTripCount(Loop *L, - unsigned *EstimatedLoopInvocationWeight) { - // Support loops with an exiting latch and other existing exists only - // deoptimize. - BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); - if (!LatchBranch) - return None; - +/// Return the estimated trip count for any exiting branch which dominates +/// the loop latch. +static Optional<uint64_t> +getEstimatedTripCount(BranchInst *ExitingBranch, Loop *L, + uint64_t &OrigExitWeight) { // To estimate the number of times the loop body was executed, we want to // know the number of times the backedge was taken, vs. the number of times // we exited the loop. - uint64_t BackedgeTakenWeight, LatchExitWeight; - if (!LatchBranch->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight)) + uint64_t LoopWeight, ExitWeight; + if (!ExitingBranch->extractProfMetadata(LoopWeight, ExitWeight)) return None; - if (LatchBranch->getSuccessor(0) != L->getHeader()) - std::swap(BackedgeTakenWeight, LatchExitWeight); + if (L->contains(ExitingBranch->getSuccessor(1))) + std::swap(LoopWeight, ExitWeight); - if (!LatchExitWeight) + if (!ExitWeight) + // Don't have a way to return predicated infinite return None; - if (EstimatedLoopInvocationWeight) - *EstimatedLoopInvocationWeight = LatchExitWeight; + OrigExitWeight = ExitWeight; - // Estimated backedge taken count is a ratio of the backedge taken weight by - // the weight of the edge exiting the loop, rounded to nearest. - uint64_t BackedgeTakenCount = - llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight); - // Estimated trip count is one plus estimated backedge taken count. - return BackedgeTakenCount + 1; + // Estimated exit count is a ratio of the loop weight by the weight of the + // edge exiting the loop, rounded to nearest. + uint64_t ExitCount = llvm::divideNearest(LoopWeight, ExitWeight); + // Estimated trip count is one plus estimated exit count. 
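// (Worked example, for illustration only: latch branch weights of 99 for
// the back edge and 1 for the exit give
// ExitCount = divideNearest(99, 1) == 99, i.e. an estimated trip count
// of 100.)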
+ return ExitCount + 1; +} + +Optional<unsigned> +llvm::getLoopEstimatedTripCount(Loop *L, + unsigned *EstimatedLoopInvocationWeight) { + // Currently we take the estimate exit count only from the loop latch, + // ignoring other exiting blocks. This can overestimate the trip count + // if we exit through another exit, but can never underestimate it. + // TODO: incorporate information from other exits + if (BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L)) { + uint64_t ExitWeight; + if (Optional<uint64_t> EstTripCount = + getEstimatedTripCount(LatchBranch, L, ExitWeight)) { + if (EstimatedLoopInvocationWeight) + *EstimatedLoopInvocationWeight = ExitWeight; + return *EstTripCount; + } + } + return None; } bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount, unsigned EstimatedloopInvocationWeight) { - // Support loops with an exiting latch and other existing exists only - // deoptimize. + // At the moment, we currently support changing the estimate trip count of + // the latch branch only. We could extend this API to manipulate estimated + // trip counts for any exit. BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); if (!LatchBranch) return false; @@ -923,8 +931,7 @@ Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, // Helper to generate an ordered reduction. Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src, - unsigned Op, RecurKind RdxKind, - ArrayRef<Value *> RedOps) { + unsigned Op, RecurKind RdxKind) { unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements(); // Extract and apply reduction ops in ascending order: @@ -942,9 +949,6 @@ Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src, "Invalid min/max"); Result = createMinMaxOp(Builder, RdxKind, Result, Ext); } - - if (!RedOps.empty()) - propagateIRFlags(Result, RedOps); } return Result; @@ -952,14 +956,20 @@ Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src, // Helper to generate a log2 shuffle reduction. Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, - unsigned Op, RecurKind RdxKind, - ArrayRef<Value *> RedOps) { + unsigned Op, RecurKind RdxKind) { unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements(); // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles // and vector ops, reducing the set of values being computed by half each // round. assert(isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!"); + // Note: fast-math-flags flags are controlled by the builder configuration + // and are assumed to apply to all generated arithmetic instructions. Other + // poison generating flags (nsw/nuw/inbounds/inrange/exact) are not part + // of the builder configuration, and since they're not passed explicitly, + // will never be relevant here. Note that it would be generally unsound to + // propagate these from an intrinsic call to the expansion anyways as we/ + // change the order of operations. Value *TmpVec = Src; SmallVector<int, 32> ShuffleMask(VF); for (unsigned i = VF; i != 1; i >>= 1) { @@ -973,7 +983,6 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, Value *Shuf = Builder.CreateShuffleVector(TmpVec, ShuffleMask, "rdx.shuf"); if (Op != Instruction::ICmp && Op != Instruction::FCmp) { - // The builder propagates its fast-math-flags setting. 
TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"); } else { @@ -981,13 +990,6 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, "Invalid min/max"); TmpVec = createMinMaxOp(Builder, RdxKind, TmpVec, Shuf); } - if (!RedOps.empty()) - propagateIRFlags(TmpVec, RedOps); - - // We may compute the reassociated scalar ops in a way that does not - // preserve nsw/nuw etc. Conservatively, drop those flags. - if (auto *ReductionInst = dyn_cast<Instruction>(TmpVec)) - ReductionInst->dropPoisonGeneratingFlags(); } // The result is in the first element of the vector. return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); @@ -1035,8 +1037,7 @@ Value *llvm::createSelectCmpTargetReduction(IRBuilderBase &Builder, Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, const TargetTransformInfo *TTI, - Value *Src, RecurKind RdxKind, - ArrayRef<Value *> RedOps) { + Value *Src, RecurKind RdxKind) { auto *SrcVecEltTy = cast<VectorType>(Src->getType())->getElementType(); switch (RdxKind) { case RecurKind::Add: diff --git a/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/llvm/lib/Transforms/Utils/MetaRenamer.cpp index 3ce10535d45f..9fba2f3f86b5 100644 --- a/llvm/lib/Transforms/Utils/MetaRenamer.cpp +++ b/llvm/lib/Transforms/Utils/MetaRenamer.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Utils/MetaRenamer.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -31,10 +32,36 @@ #include "llvm/IR/TypeFinder.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils.h" using namespace llvm; +static cl::opt<std::string> RenameExcludeFunctionPrefixes( + "rename-exclude-function-prefixes", + cl::desc("Prefixes for functions that don't need to be renamed, separated " + "by a comma"), + cl::Hidden); + +static cl::opt<std::string> RenameExcludeAliasPrefixes( + "rename-exclude-alias-prefixes", + cl::desc("Prefixes for aliases that don't need to be renamed, separated " + "by a comma"), + cl::Hidden); + +static cl::opt<std::string> RenameExcludeGlobalPrefixes( + "rename-exclude-global-prefixes", + cl::desc( + "Prefixes for global values that don't need to be renamed, separated " + "by a comma"), + cl::Hidden); + +static cl::opt<std::string> RenameExcludeStructPrefixes( + "rename-exclude-struct-prefixes", + cl::desc("Prefixes for structs that don't need to be renamed, separated " + "by a comma"), + cl::Hidden); + static const char *const metaNames[] = { // See http://en.wikipedia.org/wiki/Metasyntactic_variable "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge", @@ -66,6 +93,18 @@ struct Renamer { PRNG prng; }; +static void +parseExcludedPrefixes(StringRef PrefixesStr, + SmallVectorImpl<StringRef> &ExcludedPrefixes) { + for (;;) { + auto PrefixesSplit = PrefixesStr.split(','); + if (PrefixesSplit.first.empty()) + break; + ExcludedPrefixes.push_back(PrefixesSplit.first); + PrefixesStr = PrefixesSplit.second; + } +} + void MetaRename(Function &F) { for (Argument &Arg : F.args()) if (!Arg.getType()->isVoidTy()) @@ -91,10 +130,26 @@ void MetaRename(Module &M, Renamer renamer(randSeed); + SmallVector<StringRef, 8> ExcludedAliasesPrefixes; + SmallVector<StringRef, 8> ExcludedGlobalsPrefixes; + SmallVector<StringRef, 8> ExcludedStructsPrefixes; + SmallVector<StringRef, 8> ExcludedFuncPrefixes; + 
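// (Usage sketch, illustrative and not part of the patch: each of the new
// cl::opt flags parsed below takes a comma-separated list of prefixes,
// e.g. running the pass with
//   -rename-exclude-function-prefixes=init_,test_
// leaves every function whose name starts with "init_" or "test_"
// unrenamed; the other three flags behave the same way for aliases,
// globals, and struct types.)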
parseExcludedPrefixes(RenameExcludeAliasPrefixes, ExcludedAliasesPrefixes); + parseExcludedPrefixes(RenameExcludeGlobalPrefixes, ExcludedGlobalsPrefixes); + parseExcludedPrefixes(RenameExcludeStructPrefixes, ExcludedStructsPrefixes); + parseExcludedPrefixes(RenameExcludeFunctionPrefixes, ExcludedFuncPrefixes); + + auto IsNameExcluded = [](StringRef &Name, + SmallVectorImpl<StringRef> &ExcludedPrefixes) { + return any_of(ExcludedPrefixes, + [&Name](auto &Prefix) { return Name.startswith(Prefix); }); + }; + // Rename all aliases for (GlobalAlias &GA : M.aliases()) { StringRef Name = GA.getName(); - if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) + if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) || + IsNameExcluded(Name, ExcludedAliasesPrefixes)) continue; GA.setName("alias"); @@ -103,7 +158,8 @@ void MetaRename(Module &M, // Rename all global variables for (GlobalVariable &GV : M.globals()) { StringRef Name = GV.getName(); - if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) + if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) || + IsNameExcluded(Name, ExcludedGlobalsPrefixes)) continue; GV.setName("global"); @@ -113,7 +169,9 @@ void MetaRename(Module &M, TypeFinder StructTypes; StructTypes.run(M, true); for (StructType *STy : StructTypes) { - if (STy->isLiteral() || STy->getName().empty()) + StringRef Name = STy->getName(); + if (STy->isLiteral() || Name.empty() || + IsNameExcluded(Name, ExcludedStructsPrefixes)) continue; SmallString<128> NameStorage; @@ -128,7 +186,8 @@ void MetaRename(Module &M, // Leave library functions alone because their presence or absence could // affect the behavior of other passes. if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) || - GetTLI(F).getLibFunc(F, Tmp)) + GetTLI(F).getLibFunc(F, Tmp) || + IsNameExcluded(Name, ExcludedFuncPrefixes)) continue; // Leave @main alone. The output of -metarenamer might be passed to diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp index 3ebc89158173..65207056a3f4 100644 --- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp +++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp @@ -144,6 +144,10 @@ static void convertToRelLookupTable(GlobalVariable &LookupTable) { Value *Offset = Builder.CreateShl(Index, ConstantInt::get(IntTy, 2), "reltable.shift"); + // Insert the call to load.relative instrinsic before LOAD. + // GEP might not be immediately followed by a LOAD, like it can be hoisted + // outside the loop or another instruction might be inserted them in between. + Builder.SetInsertPoint(Load); Function *LoadRelIntrinsic = llvm::Intrinsic::getDeclaration( &M, Intrinsic::load_relative, {Index->getType()}); Value *Base = Builder.CreateBitCast(RelLookupTable, Builder.getInt8PtrTy()); diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp index 9495e442e0bf..2f2dff6b5f0b 100644 --- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp +++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp @@ -220,7 +220,7 @@ private: Now = Pred; } - assert(PathCapacity > 0 && "found incorrect augmenting path"); + assert(PathCapacity > 0 && "found an incorrect augmenting path"); // Update the flow along the path Now = Target; @@ -271,6 +271,352 @@ private: uint64_t Target; }; +/// A post-processing adjustment of control flow. 
It applies two steps by +/// rerouting some flow and making it more realistic: +/// +/// - First, it removes all isolated components ("islands") with a positive flow +/// that are unreachable from the entry block. For every such component, we +/// find the shortest from the entry to an exit passing through the component, +/// and increase the flow by one unit along the path. +/// +/// - Second, it identifies all "unknown subgraphs" consisting of basic blocks +/// with no sampled counts. Then it rebalnces the flow that goes through such +/// a subgraph so that each branch is taken with probability 50%. +/// An unknown subgraph is such that for every two nodes u and v: +/// - u dominates v and u is not unknown; +/// - v post-dominates u; and +/// - all inner-nodes of all (u,v)-paths are unknown. +/// +class FlowAdjuster { +public: + FlowAdjuster(FlowFunction &Func) : Func(Func) { + assert(Func.Blocks[Func.Entry].isEntry() && + "incorrect index of the entry block"); + } + + // Run the post-processing + void run() { + /// Adjust the flow to get rid of isolated components. + joinIsolatedComponents(); + + /// Rebalance the flow inside unknown subgraphs. + rebalanceUnknownSubgraphs(); + } + + /// The probability for the first successor of a unknown subgraph + static constexpr double UnknownFirstSuccProbability = 0.5; + +private: + void joinIsolatedComponents() { + // Find blocks that are reachable from the source + auto Visited = std::vector<bool>(NumBlocks(), false); + findReachable(Func.Entry, Visited); + + // Iterate over all non-reachable blocks and adjust their weights + for (uint64_t I = 0; I < NumBlocks(); I++) { + auto &Block = Func.Blocks[I]; + if (Block.Flow > 0 && !Visited[I]) { + // Find a path from the entry to an exit passing through the block I + auto Path = findShortestPath(I); + // Increase the flow along the path + assert(Path.size() > 0 && Path[0]->Source == Func.Entry && + "incorrectly computed path adjusting control flow"); + Func.Blocks[Func.Entry].Flow += 1; + for (auto &Jump : Path) { + Jump->Flow += 1; + Func.Blocks[Jump->Target].Flow += 1; + // Update reachability + findReachable(Jump->Target, Visited); + } + } + } + } + + /// Run BFS from a given block along the jumps with a positive flow and mark + /// all reachable blocks. + void findReachable(uint64_t Src, std::vector<bool> &Visited) { + if (Visited[Src]) + return; + std::queue<uint64_t> Queue; + Queue.push(Src); + Visited[Src] = true; + while (!Queue.empty()) { + Src = Queue.front(); + Queue.pop(); + for (auto Jump : Func.Blocks[Src].SuccJumps) { + uint64_t Dst = Jump->Target; + if (Jump->Flow > 0 && !Visited[Dst]) { + Queue.push(Dst); + Visited[Dst] = true; + } + } + } + } + + /// Find the shortest path from the entry block to an exit block passing + /// through a given block. + std::vector<FlowJump *> findShortestPath(uint64_t BlockIdx) { + // A path from the entry block to BlockIdx + auto ForwardPath = findShortestPath(Func.Entry, BlockIdx); + // A path from BlockIdx to an exit block + auto BackwardPath = findShortestPath(BlockIdx, AnyExitBlock); + + // Concatenate the two paths + std::vector<FlowJump *> Result; + Result.insert(Result.end(), ForwardPath.begin(), ForwardPath.end()); + Result.insert(Result.end(), BackwardPath.begin(), BackwardPath.end()); + return Result; + } + + /// Apply the Dijkstra algorithm to find the shortest path from a given + /// Source to a given Target block. + /// If Target == -1, then the path ends at an exit block. 
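/// (Added note: "-1" here is the AnyExitBlock sentinel, uint64_t(-1),
/// defined further below; the loop body checks Target == AnyExitBlock to
/// accept any exit block as the destination.)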
+ std::vector<FlowJump *> findShortestPath(uint64_t Source, uint64_t Target) { + // Quit early, if possible + if (Source == Target) + return std::vector<FlowJump *>(); + if (Func.Blocks[Source].isExit() && Target == AnyExitBlock) + return std::vector<FlowJump *>(); + + // Initialize data structures + auto Distance = std::vector<int64_t>(NumBlocks(), INF); + auto Parent = std::vector<FlowJump *>(NumBlocks(), nullptr); + Distance[Source] = 0; + std::set<std::pair<uint64_t, uint64_t>> Queue; + Queue.insert(std::make_pair(Distance[Source], Source)); + + // Run the Dijkstra algorithm + while (!Queue.empty()) { + uint64_t Src = Queue.begin()->second; + Queue.erase(Queue.begin()); + // If we found a solution, quit early + if (Src == Target || + (Func.Blocks[Src].isExit() && Target == AnyExitBlock)) + break; + + for (auto Jump : Func.Blocks[Src].SuccJumps) { + uint64_t Dst = Jump->Target; + int64_t JumpDist = jumpDistance(Jump); + if (Distance[Dst] > Distance[Src] + JumpDist) { + Queue.erase(std::make_pair(Distance[Dst], Dst)); + + Distance[Dst] = Distance[Src] + JumpDist; + Parent[Dst] = Jump; + + Queue.insert(std::make_pair(Distance[Dst], Dst)); + } + } + } + // If Target is not provided, find the closest exit block + if (Target == AnyExitBlock) { + for (uint64_t I = 0; I < NumBlocks(); I++) { + if (Func.Blocks[I].isExit() && Parent[I] != nullptr) { + if (Target == AnyExitBlock || Distance[Target] > Distance[I]) { + Target = I; + } + } + } + } + assert(Parent[Target] != nullptr && "a path does not exist"); + + // Extract the constructed path + std::vector<FlowJump *> Result; + uint64_t Now = Target; + while (Now != Source) { + assert(Now == Parent[Now]->Target && "incorrect parent jump"); + Result.push_back(Parent[Now]); + Now = Parent[Now]->Source; + } + // Reverse the path, since it is extracted from Target to Source + std::reverse(Result.begin(), Result.end()); + return Result; + } + + /// A distance of a path for a given jump. 
+ /// In order to incite the path to use blocks/jumps with large positive flow, + /// and avoid changing branch probability of outgoing edges drastically, + /// set the distance as follows: + /// if Jump.Flow > 0, then distance = max(100 - Jump->Flow, 0) + /// if Block.Weight > 0, then distance = 1 + /// otherwise distance >> 1 + int64_t jumpDistance(FlowJump *Jump) const { + int64_t BaseDistance = 100; + if (Jump->IsUnlikely) + return MinCostMaxFlow::AuxCostUnlikely; + if (Jump->Flow > 0) + return std::max(BaseDistance - (int64_t)Jump->Flow, (int64_t)0); + if (Func.Blocks[Jump->Target].Weight > 0) + return BaseDistance; + return BaseDistance * (NumBlocks() + 1); + }; + + uint64_t NumBlocks() const { return Func.Blocks.size(); } + + /// Rebalance unknown subgraphs so as each branch splits with probabilities + /// UnknownFirstSuccProbability and 1 - UnknownFirstSuccProbability + void rebalanceUnknownSubgraphs() { + assert(UnknownFirstSuccProbability >= 0.0 && + UnknownFirstSuccProbability <= 1.0 && + "the share of the unknown successor should be between 0 and 1"); + // Try to find unknown subgraphs from each non-unknown block + for (uint64_t I = 0; I < Func.Blocks.size(); I++) { + auto SrcBlock = &Func.Blocks[I]; + // Do not attempt to find unknown successors from a unknown or a + // zero-flow block + if (SrcBlock->UnknownWeight || SrcBlock->Flow == 0) + continue; + + std::vector<FlowBlock *> UnknownSuccs; + FlowBlock *DstBlock = nullptr; + // Find a unknown subgraphs starting at block SrcBlock + if (!findUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs)) + continue; + // At the moment, we do not rebalance subgraphs containing cycles among + // unknown blocks + if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownSuccs)) + continue; + + // Rebalance the flow + rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs); + } + } + + /// Find a unknown subgraph starting at block SrcBlock. + /// If the search is successful, the method sets DstBlock and UnknownSuccs. 
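/// (Worked example, illustrative: if SrcBlock A branches to two blocks B
/// and C that carry no samples, and both B and C jump to a sampled block
/// D, then findUnknownSubgraph(A, DstBlock, UnknownSuccs) succeeds with
/// DstBlock == D and UnknownSuccs == {B, C}.)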
+ bool findUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *&DstBlock, + std::vector<FlowBlock *> &UnknownSuccs) { + // Run BFS from SrcBlock and make sure all paths are going through unknown + // blocks and end at a non-unknown DstBlock + auto Visited = std::vector<bool>(NumBlocks(), false); + std::queue<uint64_t> Queue; + DstBlock = nullptr; + + Queue.push(SrcBlock->Index); + Visited[SrcBlock->Index] = true; + while (!Queue.empty()) { + auto &Block = Func.Blocks[Queue.front()]; + Queue.pop(); + // Process blocks reachable from Block + for (auto Jump : Block.SuccJumps) { + uint64_t Dst = Jump->Target; + if (Visited[Dst]) + continue; + Visited[Dst] = true; + if (!Func.Blocks[Dst].UnknownWeight) { + // If we see non-unique non-unknown block reachable from SrcBlock, + // stop processing and skip rebalancing + FlowBlock *CandidateDstBlock = &Func.Blocks[Dst]; + if (DstBlock != nullptr && DstBlock != CandidateDstBlock) + return false; + DstBlock = CandidateDstBlock; + } else { + Queue.push(Dst); + UnknownSuccs.push_back(&Func.Blocks[Dst]); + } + } + } + + // If the list of unknown blocks is empty, we don't need rebalancing + if (UnknownSuccs.empty()) + return false; + // If all reachable nodes from SrcBlock are unknown, skip rebalancing + if (DstBlock == nullptr) + return false; + // If any of the unknown blocks is an exit block, skip rebalancing + for (auto Block : UnknownSuccs) { + if (Block->isExit()) + return false; + } + + return true; + } + + /// Verify if the given unknown subgraph is acyclic, and if yes, reorder + /// UnknownSuccs in the topological order (so that all jumps are "forward"). + bool isAcyclicSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock, + std::vector<FlowBlock *> &UnknownSuccs) { + // Extract local in-degrees in the considered subgraph + auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0); + for (auto Jump : SrcBlock->SuccJumps) { + LocalInDegree[Jump->Target]++; + } + for (uint64_t I = 0; I < UnknownSuccs.size(); I++) { + for (auto Jump : UnknownSuccs[I]->SuccJumps) { + LocalInDegree[Jump->Target]++; + } + } + // A loop containing SrcBlock + if (LocalInDegree[SrcBlock->Index] > 0) + return false; + + std::vector<FlowBlock *> AcyclicOrder; + std::queue<uint64_t> Queue; + Queue.push(SrcBlock->Index); + while (!Queue.empty()) { + auto &Block = Func.Blocks[Queue.front()]; + Queue.pop(); + // Stop propagation once we reach DstBlock + if (Block.Index == DstBlock->Index) + break; + + AcyclicOrder.push_back(&Block); + // Add to the queue all successors with zero local in-degree + for (auto Jump : Block.SuccJumps) { + uint64_t Dst = Jump->Target; + LocalInDegree[Dst]--; + if (LocalInDegree[Dst] == 0) { + Queue.push(Dst); + } + } + } + + // If there is a cycle in the subgraph, AcyclicOrder contains only a subset + // of all blocks + if (UnknownSuccs.size() + 1 != AcyclicOrder.size()) + return false; + UnknownSuccs = AcyclicOrder; + return true; + } + + /// Rebalance a given subgraph. 
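/// (Worked example, illustrative: a block with TotalFlow == 100 and three
/// successor jumps gets the split 50 / 25 / 25 below, since every jump but
/// the last receives UnknownFirstSuccProbability of the remaining flow and
/// the last jump takes whatever is left.)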
+ void rebalanceUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock, + std::vector<FlowBlock *> &UnknownSuccs) { + assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph"); + assert(UnknownSuccs.front() == SrcBlock && "incorrect order of unknowns"); + + for (auto Block : UnknownSuccs) { + // Block's flow is the sum of incoming flows + uint64_t TotalFlow = 0; + if (Block == SrcBlock) { + TotalFlow = Block->Flow; + } else { + for (auto Jump : Block->PredJumps) { + TotalFlow += Jump->Flow; + } + Block->Flow = TotalFlow; + } + + // Process all successor jumps and update corresponding flow values + for (uint64_t I = 0; I < Block->SuccJumps.size(); I++) { + auto Jump = Block->SuccJumps[I]; + if (I + 1 == Block->SuccJumps.size()) { + Jump->Flow = TotalFlow; + continue; + } + uint64_t Flow = uint64_t(TotalFlow * UnknownFirstSuccProbability); + Jump->Flow = Flow; + TotalFlow -= Flow; + } + } + } + + /// A constant indicating an arbitrary exit block of a function. + static constexpr uint64_t AnyExitBlock = uint64_t(-1); + + /// The function. + FlowFunction &Func; +}; + /// Initializing flow network for a given function. /// /// Every block is split into three nodes that are responsible for (i) an @@ -440,6 +786,39 @@ void verifyWeights(const FlowFunction &Func) { } } assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow"); + + // Verify that there are no isolated flow components + // One could modify FlowFunction to hold edges indexed by the sources, which + // will avoid a creation of the object + auto PositiveFlowEdges = std::vector<std::vector<uint64_t>>(NumBlocks); + for (auto &Jump : Func.Jumps) { + if (Jump.Flow > 0) { + PositiveFlowEdges[Jump.Source].push_back(Jump.Target); + } + } + + // Run BFS from the source along edges with positive flow + std::queue<uint64_t> Queue; + auto Visited = std::vector<bool>(NumBlocks, false); + Queue.push(Func.Entry); + Visited[Func.Entry] = true; + while (!Queue.empty()) { + uint64_t Src = Queue.front(); + Queue.pop(); + for (uint64_t Dst : PositiveFlowEdges[Src]) { + if (!Visited[Dst]) { + Queue.push(Dst); + Visited[Dst] = true; + } + } + } + + // Verify that every block that has a positive flow is reached from the source + // along edges with a positive flow + for (uint64_t I = 0; I < NumBlocks; I++) { + auto &Block = Func.Blocks[I]; + assert((Visited[I] || Block.Flow == 0) && "an isolated flow component"); + } } #endif @@ -455,6 +834,10 @@ void llvm::applyFlowInference(FlowFunction &Func) { // Extract flow values for every block and every edge extractWeights(InferenceNetwork, Func); + // Post-processing adjustments to the flow + auto Adjuster = FlowAdjuster(Func); + Adjuster.run(); + #ifndef NDEBUG // Verify the result verifyWeights(Func); diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 71c15d5c51fc..c840ee85795f 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1047,9 +1047,9 @@ bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) { if (SE.DT.dominates(IncV, InsertPos)) break; } - for (auto I = IVIncs.rbegin(), E = IVIncs.rend(); I != E; ++I) { - fixupInsertPoints(*I); - (*I)->moveBefore(InsertPos); + for (Instruction *I : llvm::reverse(IVIncs)) { + fixupInsertPoints(I); + I->moveBefore(InsertPos); } return true; } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 
afa3ecde77f9..1046998c26de 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3629,7 +3629,7 @@ static bool tryWidenCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, return false; // TODO // Use lambda to lazily compute expensive condition after cheap ones. auto NoSideEffects = [](BasicBlock &BB) { - return !llvm::any_of(BB, [](const Instruction &I) { + return llvm::none_of(BB, [](const Instruction &I) { return I.mayWriteToMemory() || I.mayHaveSideEffects(); }); }; diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index e190a1294eb3..02727a3dbf9c 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -193,6 +193,19 @@ static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef<unsigned> A } } +// Copy CallInst "flags" like musttail, notail, and tail. Return New param for +// easier chaining. Calls to emit* and B.createCall should probably be wrapped +// in this function when New is created to replace Old. Callers should take +// care to check Old.isMustTailCall() if they aren't replacing Old directly +// with New. +static Value *copyFlags(const CallInst &Old, Value *New) { + assert(!Old.isMustTailCall() && "do not copy musttail call flags"); + assert(!Old.isNoTailCall() && "do not copy notail call flags"); + if (auto *NewCI = dyn_cast_or_null<CallInst>(New)) + NewCI->setTailCallKind(Old.getTailCallKind()); + return New; +} + //===----------------------------------------------------------------------===// // String and Memory Library Call Optimizations //===----------------------------------------------------------------------===// @@ -215,7 +228,7 @@ Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilderBase &B) { if (Len == 0) return Dst; - return emitStrLenMemCpy(Src, Dst, Len, B); + return copyFlags(*CI, emitStrLenMemCpy(Src, Dst, Len, B)); } Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, @@ -279,7 +292,7 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilderBase &B) { // strncat(x, s, c) -> strcat(x, s) // s is constant so the strcat can be optimized further. - return emitStrLenMemCpy(Src, Dst, SrcLen, B); + return copyFlags(*CI, emitStrLenMemCpy(Src, Dst, SrcLen, B)); } Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) { @@ -300,9 +313,11 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) { if (!FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32. return nullptr; - return emitMemChr(SrcStr, CI->getArgOperand(1), // include nul. - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), - B, DL, TLI); + return copyFlags( + *CI, + emitMemChr(SrcStr, CI->getArgOperand(1), // include nul. 
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), B, + DL, TLI)); } // Otherwise, the character is a constant, see if the first argument is @@ -340,7 +355,7 @@ Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilderBase &B) { if (!getConstantStringInfo(SrcStr, Str)) { // strrchr(s, 0) -> strchr(s, 0) if (CharC->isZero()) - return emitStrChr(SrcStr, '\0', B, TLI); + return copyFlags(*CI, emitStrChr(SrcStr, '\0', B, TLI)); return nullptr; } @@ -385,25 +400,28 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilderBase &B) { annotateDereferenceableBytes(CI, 1, Len2); if (Len1 && Len2) { - return emitMemCmp(Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), - std::min(Len1, Len2)), - B, DL, TLI); + return copyFlags( + *CI, emitMemCmp(Str1P, Str2P, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), + std::min(Len1, Len2)), + B, DL, TLI)); } // strcmp to memcmp if (!HasStr1 && HasStr2) { if (canTransformToMemCmp(CI, Str1P, Len2, DL)) - return emitMemCmp( - Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL, - TLI); + return copyFlags( + *CI, + emitMemCmp(Str1P, Str2P, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), + B, DL, TLI)); } else if (HasStr1 && !HasStr2) { if (canTransformToMemCmp(CI, Str2P, Len1, DL)) - return emitMemCmp( - Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL, - TLI); + return copyFlags( + *CI, + emitMemCmp(Str1P, Str2P, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), + B, DL, TLI)); } annotateNonNullNoUndefBasedOnAccess(CI, {0, 1}); @@ -430,7 +448,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) { return ConstantInt::get(CI->getType(), 0); if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) - return emitMemCmp(Str1P, Str2P, Size, B, DL, TLI); + return copyFlags(*CI, emitMemCmp(Str1P, Str2P, Size, B, DL, TLI)); StringRef Str1, Str2; bool HasStr1 = getConstantStringInfo(Str1P, Str1); @@ -462,17 +480,19 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) { if (!HasStr1 && HasStr2) { Len2 = std::min(Len2, Length); if (canTransformToMemCmp(CI, Str1P, Len2, DL)) - return emitMemCmp( - Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL, - TLI); + return copyFlags( + *CI, + emitMemCmp(Str1P, Str2P, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), + B, DL, TLI)); } else if (HasStr1 && !HasStr2) { Len1 = std::min(Len1, Length); if (canTransformToMemCmp(CI, Str2P, Len1, DL)) - return emitMemCmp( - Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL, - TLI); + return copyFlags( + *CI, + emitMemCmp(Str1P, Str2P, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), + B, DL, TLI)); } return nullptr; @@ -485,7 +505,7 @@ Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilderBase &B) { if (SrcLen && Size) { annotateDereferenceableBytes(CI, 0, SrcLen); if (SrcLen <= Size->getZExtValue() + 1) - return emitStrDup(Src, B, TLI); + return copyFlags(*CI, emitStrDup(Src, B, TLI)); } return nullptr; @@ -495,7 +515,7 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) { Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // strcpy(x,x) -> x return Src; - + annotateNonNullNoUndefBasedOnAccess(CI, {0, 1}); // See if we can get the length of the input string. 
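// (Illustrative example: for strcpy(dst, "abc") the source length is known
// to be 4 including the terminating nul, so the call is rewritten below
// into llvm.memcpy(dst, "abc", 4), and with this patch the new memcpy also
// inherits the tail-call kind via copyFlags.)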
uint64_t Len = GetStringLength(Src); @@ -511,6 +531,7 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) { ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return Dst; } @@ -520,7 +541,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) { // stpcpy(d,s) -> strcpy(d,s) if the result is not used. if (CI->use_empty()) - return emitStrCpy(Dst, Src, B, TLI); + return copyFlags(*CI, emitStrCpy(Dst, Src, B, TLI)); if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) Value *StrLen = emitStrLen(Src, B, DL, TLI); @@ -544,6 +565,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) { CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), LenV); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return DstEnd; } @@ -583,6 +605,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) { AttrBuilder ArgAttrs(CI->getAttributes().getParamAttrs(0)); NewCI->setAttributes(NewCI->getAttributes().addParamAttributes( CI->getContext(), 0, ArgAttrs)); + copyFlags(*CI, NewCI); return Dst; } @@ -606,6 +629,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) { ConstantInt::get(DL.getIntPtrType(PT), Len)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return Dst; } @@ -737,7 +761,7 @@ Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilderBase &B) { // strpbrk(s, "a") -> strchr(s, 'a') if (HasS2 && S2.size() == 1) - return emitStrChr(CI->getArgOperand(0), S2[0], B, TLI); + return copyFlags(*CI, emitStrChr(CI->getArgOperand(0), S2[0], B, TLI)); return nullptr; } @@ -793,7 +817,7 @@ Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilderBase &B) { // strcspn(s, "") -> strlen(s) if (HasS2 && S2.empty()) - return emitStrLen(CI->getArgOperand(0), B, DL, TLI); + return copyFlags(*CI, emitStrLen(CI->getArgOperand(0), B, DL, TLI)); return nullptr; } @@ -1062,7 +1086,7 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilderBase &B) { Value *LHS = CI->getArgOperand(0); Value *RHS = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); - return emitBCmp(LHS, RHS, Size, B, DL, TLI); + return copyFlags(*CI, emitBCmp(LHS, RHS, Size, B, DL, TLI)); } return nullptr; @@ -1083,6 +1107,7 @@ Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilderBase &B) { CI->getArgOperand(1), Align(1), Size); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } @@ -1110,7 +1135,8 @@ Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilderBase &B) { size_t Pos = SrcStr.find(StopChar->getSExtValue() & 0xFF); if (Pos == StringRef::npos) { if (N->getZExtValue() <= SrcStr.size()) { - B.CreateMemCpy(Dst, Align(1), Src, Align(1), CI->getArgOperand(3)); + copyFlags(*CI, B.CreateMemCpy(Dst, Align(1), Src, Align(1), + CI->getArgOperand(3))); return Constant::getNullValue(CI->getType()); } return nullptr; @@ -1119,7 +1145,7 @@ Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilderBase &B) { Value *NewN = ConstantInt::get(N->getType(), std::min(uint64_t(Pos + 1), N->getZExtValue())); // memccpy -> llvm.memcpy - 
B.CreateMemCpy(Dst, Align(1), Src, Align(1), NewN); + copyFlags(*CI, B.CreateMemCpy(Dst, Align(1), Src, Align(1), NewN)); return Pos + 1 <= N->getZExtValue() ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, NewN) : Constant::getNullValue(CI->getType()); @@ -1136,6 +1162,7 @@ Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilderBase &B) { // TODO: Attach return value attributes to the 1st operand to preserve them? NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, N); } @@ -1150,6 +1177,7 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilderBase &B) { CI->getArgOperand(1), Align(1), Size); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } @@ -1164,12 +1192,13 @@ Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilderBase &B) { CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align(1)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) { if (isa<ConstantPointerNull>(CI->getArgOperand(0))) - return emitMalloc(CI->getArgOperand(1), B, DL, TLI); + return copyFlags(*CI, emitMalloc(CI->getArgOperand(1), B, DL, TLI)); return nullptr; } @@ -1190,7 +1219,7 @@ static Value *replaceUnaryCall(CallInst *CI, IRBuilderBase &B, Function *F = Intrinsic::getDeclaration(M, IID, CI->getType()); CallInst *NewCall = B.CreateCall(F, V); NewCall->takeName(CI); - return NewCall; + return copyFlags(*CI, NewCall); } /// Return a variant of Val with float type. 
@@ -1311,7 +1340,8 @@ Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilderBase &B) { Function *FSqrt = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::sqrt, CI->getType()); - return B.CreateCall(FSqrt, B.CreateFAdd(RealReal, ImagImag), "cabs"); + return copyFlags( + *CI, B.CreateCall(FSqrt, B.CreateFAdd(RealReal, ImagImag), "cabs")); } static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func, @@ -1334,14 +1364,16 @@ static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func, // sin(-X) --> -sin(X) // tan(-X) --> -tan(X) if (match(Call->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) - return B.CreateFNeg(B.CreateCall(Call->getCalledFunction(), X)); + return B.CreateFNeg( + copyFlags(*Call, B.CreateCall(Call->getCalledFunction(), X))); break; case LibFunc_cos: case LibFunc_cosf: case LibFunc_cosl: // cos(-X) --> cos(X) if (match(Call->getArgOperand(0), m_FNeg(m_Value(X)))) - return B.CreateCall(Call->getCalledFunction(), X, "cos"); + return copyFlags(*Call, + B.CreateCall(Call->getCalledFunction(), X, "cos")); break; default: break; @@ -1476,9 +1508,10 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo)) && hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { if (Value *ExpoI = getIntToFPVal(Expo, B, TLI->getIntSize())) - return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, TLI, - LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl, - B, Attrs); + return copyFlags(*Pow, + emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, + TLI, LibFunc_ldexp, LibFunc_ldexpf, + LibFunc_ldexpl, B, Attrs)); } // pow(2.0 ** n, x) -> exp2(n * x) @@ -1496,11 +1529,13 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { double N = NI.logBase2() * (IsReciprocal ? -1.0 : 1.0); Value *FMul = B.CreateFMul(Expo, ConstantFP::get(Ty, N), "mul"); if (Pow->doesNotAccessMemory()) - return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty), - FMul, "exp2"); + return copyFlags(*Pow, B.CreateCall(Intrinsic::getDeclaration( + Mod, Intrinsic::exp2, Ty), + FMul, "exp2")); else - return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f, - LibFunc_exp2l, B, Attrs); + return copyFlags(*Pow, emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, + LibFunc_exp2f, + LibFunc_exp2l, B, Attrs)); } } @@ -1508,8 +1543,9 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { // TODO: There is no exp10() intrinsic yet, but some day there shall be one. 
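// (Worked example for the 2.0 ** n case above, illustrative: pow(8.0, x)
// has logBase2(8) == 3 and is turned into exp2(3.0 * x), while
// pow(0.125, x) takes the reciprocal path and becomes exp2(-3.0 * x).)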
if (match(Base, m_SpecificFP(10.0)) && hasFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l)) - return emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f, - LibFunc_exp10l, B, Attrs); + return copyFlags(*Pow, emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, + LibFunc_exp10f, LibFunc_exp10l, + B, Attrs)); // pow(x, y) -> exp2(log2(x) * y) if (Pow->hasApproxFunc() && Pow->hasNoNaNs() && BaseF->isFiniteNonZero() && @@ -1528,11 +1564,13 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { if (Log) { Value *FMul = B.CreateFMul(Log, Expo, "mul"); if (Pow->doesNotAccessMemory()) - return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty), - FMul, "exp2"); + return copyFlags(*Pow, B.CreateCall(Intrinsic::getDeclaration( + Mod, Intrinsic::exp2, Ty), + FMul, "exp2")); else if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) - return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f, - LibFunc_exp2l, B, Attrs); + return copyFlags(*Pow, emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, + LibFunc_exp2f, + LibFunc_exp2l, B, Attrs)); } } @@ -1595,6 +1633,8 @@ Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilderBase &B) { Sqrt = B.CreateCall(FAbsFn, Sqrt, "abs"); } + Sqrt = copyFlags(*Pow, Sqrt); + // Handle non finite base by expanding to // (x == -infinity ? +infinity : sqrt(x)). if (!Pow->hasNoInfs()) { @@ -1721,15 +1761,18 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) { if (ExpoF->isInteger() && ExpoF->convertToInteger(IntExpo, APFloat::rmTowardZero, &Ignored) == APFloat::opOK) { - return createPowWithIntegerExponent( - Base, ConstantInt::get(B.getIntNTy(TLI->getIntSize()), IntExpo), M, B); + return copyFlags( + *Pow, + createPowWithIntegerExponent( + Base, ConstantInt::get(B.getIntNTy(TLI->getIntSize()), IntExpo), + M, B)); } } // powf(x, itofp(y)) -> powi(x, y) if (AllowApprox && (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo))) { if (Value *ExpoI = getIntToFPVal(Expo, B, TLI->getIntSize())) - return createPowWithIntegerExponent(Base, ExpoI, M, B); + return copyFlags(*Pow, createPowWithIntegerExponent(Base, ExpoI, M, B)); } // Shrink pow() to powf() if the arguments are single precision, @@ -1792,7 +1835,8 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) { Intrinsic::ID IID = Callee->getName().startswith("fmin") ? Intrinsic::minnum : Intrinsic::maxnum; Function *F = Intrinsic::getDeclaration(CI->getModule(), IID, CI->getType()); - return B.CreateCall(F, { CI->getArgOperand(0), CI->getArgOperand(1) }); + return copyFlags( + *CI, B.CreateCall(F, {CI->getArgOperand(0), CI->getArgOperand(1)})); } Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { @@ -2010,9 +2054,9 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) { // of the square root calculation. Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, ArgType); Value *SqrtCall = B.CreateCall(Sqrt, OtherOp, "sqrt"); - return B.CreateFMul(FabsCall, SqrtCall); + return copyFlags(*CI, B.CreateFMul(FabsCall, SqrtCall)); } - return FabsCall; + return copyFlags(*CI, FabsCall); } // TODO: Generalize to handle any trig function and its inverse. @@ -2327,7 +2371,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) { // printf("x") -> putchar('x'), even for "%" and "%%". 
if (FormatStr.size() == 1 || FormatStr == "%%") - return emitPutChar(B.getInt32(FormatStr[0]), B, TLI); + return copyFlags(*CI, emitPutChar(B.getInt32(FormatStr[0]), B, TLI)); // Try to remove call or emit putchar/puts. if (FormatStr == "%s" && CI->arg_size() > 1) { @@ -2339,12 +2383,12 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) { return (Value *)CI; // printf("%s", "a") --> putchar('a') if (OperandStr.size() == 1) - return emitPutChar(B.getInt32(OperandStr[0]), B, TLI); + return copyFlags(*CI, emitPutChar(B.getInt32(OperandStr[0]), B, TLI)); // printf("%s", str"\n") --> puts(str) if (OperandStr.back() == '\n') { OperandStr = OperandStr.drop_back(); Value *GV = B.CreateGlobalString(OperandStr, "str"); - return emitPutS(GV, B, TLI); + return copyFlags(*CI, emitPutS(GV, B, TLI)); } return nullptr; } @@ -2356,19 +2400,19 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) { // pass to be run after this pass, to merge duplicate strings. FormatStr = FormatStr.drop_back(); Value *GV = B.CreateGlobalString(FormatStr, "str"); - return emitPutS(GV, B, TLI); + return copyFlags(*CI, emitPutS(GV, B, TLI)); } // Optimize specific format strings. // printf("%c", chr) --> putchar(chr) if (FormatStr == "%c" && CI->arg_size() > 1 && CI->getArgOperand(1)->getType()->isIntegerTy()) - return emitPutChar(CI->getArgOperand(1), B, TLI); + return copyFlags(*CI, emitPutChar(CI->getArgOperand(1), B, TLI)); // printf("%s\n", str) --> puts(str) if (FormatStr == "%s\n" && CI->arg_size() > 1 && CI->getArgOperand(1)->getType()->isPointerTy()) - return emitPutS(CI->getArgOperand(1), B, TLI); + return copyFlags(*CI, emitPutS(CI->getArgOperand(1), B, TLI)); return nullptr; } @@ -2459,7 +2503,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, if (CI->use_empty()) // sprintf(dest, "%s", str) -> strcpy(dest, str) - return emitStrCpy(Dest, CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, emitStrCpy(Dest, CI->getArgOperand(2), B, TLI)); uint64_t SrcLen = GetStringLength(CI->getArgOperand(2)); if (SrcLen) { @@ -2558,10 +2602,12 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, // snprintf(dst, size, fmt) -> llvm.memcpy(align 1 dst, align 1 fmt, // strlen(fmt)+1) - B.CreateMemCpy( - CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), - FormatStr.size() + 1)); // Copy the null byte. + copyFlags( + *CI, + B.CreateMemCpy( + CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), + FormatStr.size() + 1))); // Copy the null byte. return ConstantInt::get(CI->getType(), FormatStr.size()); } @@ -2599,8 +2645,10 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, else if (N < Str.size() + 1) return nullptr; - B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(3), - Align(1), ConstantInt::get(CI->getType(), Str.size() + 1)); + copyFlags( + *CI, B.CreateMemCpy(CI->getArgOperand(0), Align(1), + CI->getArgOperand(3), Align(1), + ConstantInt::get(CI->getType(), Str.size() + 1))); // The snprintf result is the unincremented number of bytes in the string. return ConstantInt::get(CI->getType(), Str.size()); @@ -2640,10 +2688,11 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, if (FormatStr.contains('%')) return nullptr; // We found a format specifier. 
- return emitFWrite( - CI->getArgOperand(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size()), - CI->getArgOperand(0), B, DL, TLI); + return copyFlags( + *CI, emitFWrite(CI->getArgOperand(1), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), + FormatStr.size()), + CI->getArgOperand(0), B, DL, TLI)); } // The remaining optimizations require the format string to be "%s" or "%c" @@ -2656,14 +2705,16 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, // fprintf(F, "%c", chr) --> fputc(chr, F) if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return nullptr; - return emitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI); + return copyFlags( + *CI, emitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI)); } if (FormatStr[1] == 's') { // fprintf(F, "%s", str) --> fputs(str, F) if (!CI->getArgOperand(2)->getType()->isPointerTy()) return nullptr; - return emitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI); + return copyFlags( + *CI, emitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI)); } return nullptr; } @@ -2750,10 +2801,11 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilderBase &B) { return nullptr; // Known to have no uses (see above). - return emitFWrite( - CI->getArgOperand(0), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1), - CI->getArgOperand(1), B, DL, TLI); + return copyFlags( + *CI, + emitFWrite(CI->getArgOperand(0), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1), + CI->getArgOperand(1), B, DL, TLI)); } Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) { @@ -2765,15 +2817,16 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) { // puts("") -> putchar('\n') StringRef Str; if (getConstantStringInfo(CI->getArgOperand(0), Str) && Str.empty()) - return emitPutChar(B.getInt32('\n'), B, TLI); + return copyFlags(*CI, emitPutChar(B.getInt32('\n'), B, TLI)); return nullptr; } Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) { // bcopy(src, dst, n) -> llvm.memmove(dst, src, n) - return B.CreateMemMove(CI->getArgOperand(1), Align(1), CI->getArgOperand(0), - Align(1), CI->getArgOperand(2)); + return copyFlags(*CI, B.CreateMemMove(CI->getArgOperand(1), Align(1), + CI->getArgOperand(0), Align(1), + CI->getArgOperand(2))); } bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { @@ -2971,6 +3024,8 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, } Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) { + assert(!CI->isMustTailCall() && "These transforms aren't musttail safe."); + // TODO: Split out the code below that operates on FP calls so that // we can all non-FP calls with the StrictFP attribute to be // optimized. 
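Note on the copyFlags(*CI, ...) wrappers used throughout the SimplifyLibCalls hunks above: they forward instruction-level flags, most importantly fast-math flags, from the original library call onto whatever replacement the simplifier emits, so the fold no longer silently drops them. The helper itself lives in SimplifyLibCalls.cpp; the code below is only a minimal sketch of its likely shape (name and guards are mine, not lifted from the patch), assuming the replacement may be null, may not be an instruction, or may not carry FP math flags at all.

  // Sketch only: propagate fast-math flags from the call being simplified
  // to its replacement, when both sides can actually carry such flags.
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Operator.h"
  #include "llvm/Support/Casting.h"
  using namespace llvm;

  static Value *copyFlagsSketch(const CallInst &Old, Value *New) {
    // Some emit* helpers return nullptr when the libcall is unavailable,
    // and non-FP folds (memcpy, puts, ...) have no flags to copy.
    if (isa<FPMathOperator>(&Old))
      if (auto *NewI = dyn_cast_or_null<Instruction>(New))
        if (isa<FPMathOperator>(NewI))
          NewI->copyFastMathFlags(&Old);
    return New;
  }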
@@ -3212,6 +3267,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, Align(1), CI->getArgOperand(2)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } return nullptr; @@ -3225,6 +3281,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, Align(1), CI->getArgOperand(2)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } return nullptr; @@ -3238,6 +3295,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, CI->getArgOperand(2), Align(1)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } return nullptr; @@ -3252,7 +3310,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemPCpyChk(CallInst *CI, CallInst *NewCI = cast<CallInst>(Call); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); - return NewCI; + return copyFlags(*CI, NewCI); } return nullptr; } @@ -3277,9 +3335,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, // string lengths for varying. if (isFortifiedCallFoldable(CI, 2, None, 1)) { if (Func == LibFunc_strcpy_chk) - return emitStrCpy(Dst, Src, B, TLI); + return copyFlags(*CI, emitStrCpy(Dst, Src, B, TLI)); else - return emitStpCpy(Dst, Src, B, TLI); + return copyFlags(*CI, emitStpCpy(Dst, Src, B, TLI)); } if (OnlyLowerUnknownSize) @@ -3303,14 +3361,14 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, // a __memcpy_chk, we still need to return the correct end pointer. 
if (Ret && Func == LibFunc_stpcpy_chk) return B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(SizeTTy, Len - 1)); - return Ret; + return copyFlags(*CI, cast<CallInst>(Ret)); } Value *FortifiedLibCallSimplifier::optimizeStrLenChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 1, None, 0)) - return emitStrLen(CI->getArgOperand(0), B, CI->getModule()->getDataLayout(), - TLI); + return copyFlags(*CI, emitStrLen(CI->getArgOperand(0), B, + CI->getModule()->getDataLayout(), TLI)); return nullptr; } @@ -3319,11 +3377,13 @@ Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI, LibFunc Func) { if (isFortifiedCallFoldable(CI, 3, 2)) { if (Func == LibFunc_strncpy_chk) - return emitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, + emitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI)); else - return emitStpNCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, + emitStpNCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI)); } return nullptr; @@ -3332,8 +3392,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeMemCCpyChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 4, 3)) - return emitMemCCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), CI->getArgOperand(3), B, TLI); + return copyFlags( + *CI, emitMemCCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), CI->getArgOperand(3), B, TLI)); return nullptr; } @@ -3342,8 +3403,9 @@ Value *FortifiedLibCallSimplifier::optimizeSNPrintfChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3, 1, None, 2)) { SmallVector<Value *, 8> VariadicArgs(drop_begin(CI->args(), 5)); - return emitSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(4), VariadicArgs, B, TLI); + return copyFlags(*CI, + emitSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(4), VariadicArgs, B, TLI)); } return nullptr; @@ -3353,8 +3415,9 @@ Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 2, None, None, 1)) { SmallVector<Value *, 8> VariadicArgs(drop_begin(CI->args(), 4)); - return emitSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), VariadicArgs, - B, TLI); + return copyFlags(*CI, + emitSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), + VariadicArgs, B, TLI)); } return nullptr; @@ -3363,7 +3426,8 @@ Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 2)) - return emitStrCat(CI->getArgOperand(0), CI->getArgOperand(1), B, TLI); + return copyFlags( + *CI, emitStrCat(CI->getArgOperand(0), CI->getArgOperand(1), B, TLI)); return nullptr; } @@ -3371,8 +3435,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3)) - return emitStrLCat(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, + emitStrLCat(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI)); return nullptr; } @@ -3380,8 +3445,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI, Value 
*FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3)) - return emitStrNCat(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, + emitStrNCat(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI)); return nullptr; } @@ -3389,8 +3455,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3)) - return emitStrLCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, + emitStrLCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI)); return nullptr; } @@ -3398,8 +3465,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3, 1, None, 2)) - return emitVSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(4), CI->getArgOperand(5), B, TLI); + return copyFlags( + *CI, emitVSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(4), CI->getArgOperand(5), B, TLI)); return nullptr; } @@ -3407,8 +3475,9 @@ Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeVSPrintfChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 2, None, None, 1)) - return emitVSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), - CI->getArgOperand(4), B, TLI); + return copyFlags(*CI, + emitVSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), + CI->getArgOperand(4), B, TLI)); return nullptr; } diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index c3eafd6b2492..b822db938af8 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -450,6 +450,12 @@ Value *Mapper::mapValue(const Value *V) { DSOLocalEquivalent::get(Func), NewTy); } + if (const auto *NC = dyn_cast<NoCFIValue>(C)) { + auto *Val = mapValue(NC->getGlobalValue()); + GlobalValue *GV = cast<GlobalValue>(Val); + return getVM()[NC] = NoCFIValue::get(GV); + } + auto mapValueOrNull = [this](Value *V) { auto Mapped = mapValue(V); assert((Mapped || (Flags & RF_NullMapMissingGlobalValues)) && diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 805011191da0..81e5aa223c07 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -55,22 +55,23 @@ static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma")); -// FIXME: When scalable vectorization is stable enough, change the default -// to SK_PreferFixedWidth. 
-static cl::opt<LoopVectorizeHints::ScalableForceKind> ScalableVectorization( - "scalable-vectorization", cl::init(LoopVectorizeHints::SK_FixedWidthOnly), - cl::Hidden, - cl::desc("Control whether the compiler can use scalable vectors to " - "vectorize a loop"), - cl::values( - clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off", - "Scalable vectorization is disabled."), - clEnumValN(LoopVectorizeHints::SK_PreferFixedWidth, "on", - "Scalable vectorization is available, but favor fixed-width " - "vectorization when the cost is inconclusive."), - clEnumValN(LoopVectorizeHints::SK_PreferScalable, "preferred", - "Scalable vectorization is available and favored when the " - "cost is inconclusive."))); +static cl::opt<LoopVectorizeHints::ScalableForceKind> + ForceScalableVectorization( + "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified), + cl::Hidden, + cl::desc("Control whether the compiler can use scalable vectors to " + "vectorize a loop"), + cl::values( + clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off", + "Scalable vectorization is disabled."), + clEnumValN( + LoopVectorizeHints::SK_PreferScalable, "preferred", + "Scalable vectorization is available and favored when the " + "cost is inconclusive."), + clEnumValN( + LoopVectorizeHints::SK_PreferScalable, "on", + "Scalable vectorization is available and favored when the " + "cost is inconclusive."))); /// Maximum vectorization interleave count. static const unsigned MaxInterleaveFactor = 16; @@ -95,7 +96,8 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced, - OptimizationRemarkEmitter &ORE) + OptimizationRemarkEmitter &ORE, + const TargetTransformInfo *TTI) : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE), Force("vectorize.enable", FK_Undefined, HK_FORCE), @@ -110,14 +112,32 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, if (VectorizerParams::isInterleaveForced()) Interleave.Value = VectorizerParams::VectorizationInterleave; + // If the metadata doesn't explicitly specify whether to enable scalable + // vectorization, then decide based on the following criteria (increasing + // level of priority): + // - Target default + // - Metadata width + // - Force option (always overrides) + if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) { + if (TTI) + Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable + : SK_FixedWidthOnly; + + if (Width.Value) + // If the width is set, but the metadata says nothing about the scalable + // property, then assume it concerns only a fixed-width UserVF. + // If width is not set, the flag takes precedence. + Scalable.Value = SK_FixedWidthOnly; + } + + // If the flag is set to force any use of scalable vectors, override the loop + // hints. + if (ForceScalableVectorization.getValue() != + LoopVectorizeHints::SK_Unspecified) + Scalable.Value = ForceScalableVectorization.getValue(); + + // Scalable vectorization is disabled if no preference is specified. if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) - // If the width is set, but the metadata says nothing about the scalable - // property, then assume it concerns only a fixed-width UserVF. - // If width is not set, the flag takes precedence. - Scalable.Value = Width.Value ? 
SK_FixedWidthOnly : ScalableVectorization; - else if (ScalableVectorization == SK_FixedWidthOnly) - // If the flag is set to disable any use of scalable vectors, override the - // loop hint. Scalable.Value = SK_FixedWidthOnly; if (IsVectorized.Value != 1) @@ -929,7 +949,7 @@ bool LoopVectorizationLegality::canVectorizeFPMath( })); } -bool LoopVectorizationLegality::isInductionPhi(const Value *V) { +bool LoopVectorizationLegality::isInductionPhi(const Value *V) const { Value *In0 = const_cast<Value *>(V); PHINode *PN = dyn_cast_or_null<PHINode>(In0); if (!PN) @@ -938,16 +958,29 @@ bool LoopVectorizationLegality::isInductionPhi(const Value *V) { return Inductions.count(PN); } -bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) { +const InductionDescriptor * +LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const { + if (!isInductionPhi(Phi)) + return nullptr; + auto &ID = getInductionVars().find(Phi)->second; + if (ID.getKind() == InductionDescriptor::IK_IntInduction || + ID.getKind() == InductionDescriptor::IK_FpInduction) + return &ID; + return nullptr; +} + +bool LoopVectorizationLegality::isCastedInductionVariable( + const Value *V) const { auto *Inst = dyn_cast<Instruction>(V); return (Inst && InductionCastsToIgnore.count(Inst)); } -bool LoopVectorizationLegality::isInductionVariable(const Value *V) { +bool LoopVectorizationLegality::isInductionVariable(const Value *V) const { return isInductionPhi(V) || isCastedInductionVariable(V); } -bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { +bool LoopVectorizationLegality::isFirstOrderRecurrence( + const PHINode *Phi) const { return FirstOrderRecurrences.count(Phi); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index a7d6609f8c56..71eb39a18d2f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -45,16 +45,17 @@ class VPBuilder { VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator(); VPInstruction *createInstruction(unsigned Opcode, - ArrayRef<VPValue *> Operands) { - VPInstruction *Instr = new VPInstruction(Opcode, Operands); + ArrayRef<VPValue *> Operands, DebugLoc DL) { + VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL); if (BB) BB->insert(Instr, InsertPt); return Instr; } VPInstruction *createInstruction(unsigned Opcode, - std::initializer_list<VPValue *> Operands) { - return createInstruction(Opcode, ArrayRef<VPValue *>(Operands)); + std::initializer_list<VPValue *> Operands, + DebugLoc DL) { + return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL); } public: @@ -123,30 +124,33 @@ public: /// its underlying Instruction. 
VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, Instruction *Inst = nullptr) { - VPInstruction *NewVPInst = createInstruction(Opcode, Operands); + DebugLoc DL; + if (Inst) + DL = Inst->getDebugLoc(); + VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL); NewVPInst->setUnderlyingValue(Inst); return NewVPInst; } - VPValue *createNaryOp(unsigned Opcode, - std::initializer_list<VPValue *> Operands, - Instruction *Inst = nullptr) { - return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst); + VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, + DebugLoc DL) { + return createInstruction(Opcode, Operands, DL); } - VPValue *createNot(VPValue *Operand) { - return createInstruction(VPInstruction::Not, {Operand}); + VPValue *createNot(VPValue *Operand, DebugLoc DL) { + return createInstruction(VPInstruction::Not, {Operand}, DL); } - VPValue *createAnd(VPValue *LHS, VPValue *RHS) { - return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}); + VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL) { + return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL); } - VPValue *createOr(VPValue *LHS, VPValue *RHS) { - return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}); + VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL) { + return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL); } - VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) { - return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}); + VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, + DebugLoc DL) { + return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL); } //===--------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5ca0adb4242c..4747f34fcc62 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -428,6 +428,8 @@ class GeneratedRTChecks; namespace llvm { +AnalysisKey ShouldRunExtraVectorPasses::Key; + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -506,8 +508,8 @@ public: /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, - VPValue *Def, VPValue *CastDef, + void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID, + Value *Start, TruncInst *Trunc, VPValue *Def, VPTransformState &State); /// Construct the vector value of a scalarized value \p V one lane at a time. @@ -534,7 +536,7 @@ public: /// Returns true if the reordering of FP operations is not allowed, but we are /// able to vectorize with strict in-order reductions for the given RdxDesc. - bool useOrderedReductions(RecurrenceDescriptor &RdxDesc); + bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction @@ -619,7 +621,7 @@ protected: /// can also be a truncate instruction. 
void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID, VPValue *Def, - VPValue *CastDef, VPTransformState &State); + VPTransformState &State); /// Create a vector induction phi node based on an existing scalar one. \p /// EntryVal is the value from the original loop that maps to the vector phi @@ -629,7 +631,6 @@ protected: void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, Value *Step, Value *Start, Instruction *EntryVal, VPValue *Def, - VPValue *CastDef, VPTransformState &State); /// Returns true if an instruction \p I should be scalarized instead of @@ -639,29 +640,6 @@ protected: /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; - /// If there is a cast involved in the induction variable \p ID, which should - /// be ignored in the vectorized loop body, this function records the - /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the - /// cast. We had already proved that the casted Phi is equal to the uncasted - /// Phi in the vectorized loop (under a runtime guard), and therefore - /// there is no need to vectorize the cast - the same value can be used in the - /// vector loop for both the Phi and the cast. - /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, - /// Otherwise, \p VectorLoopValue is a widened/vectorized value. - /// - /// \p EntryVal is the value from the original loop that maps to the vector - /// phi node and is used to distinguish what is the IV currently being - /// processed - original one (if \p EntryVal is a phi corresponding to the - /// original IV) or the "newly-created" one based on the proof mentioned above - /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the - /// latter case \p EntryVal is a TruncInst and we must not record anything for - /// that IV, but it's error-prone to expect callers of this routine to care - /// about that, hence this explicit parameter. - void recordVectorLoopValueForInductionCast( - const InductionDescriptor &ID, const Instruction *EntryVal, - Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, - unsigned Part, unsigned Lane = UINT_MAX); - /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -698,7 +676,8 @@ protected: /// flags, which can be found from the original scalar operations. Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, - const InductionDescriptor &ID) const; + const InductionDescriptor &ID, + BasicBlock *VectorHeader) const; /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, /// vector loop preheader, middle block and scalar preheader. Also @@ -1728,7 +1707,8 @@ private: /// disabled or unsupported, then the scalable part will be equal to /// ElementCount::getScalable(0). FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF); + ElementCount UserVF, + bool FoldTailByMasking); /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. @@ -1741,7 +1721,8 @@ private: ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF); + const ElementCount &MaxSafeVF, + bool FoldTailByMasking); /// \return the maximum legal scalable VF, based on the safe max number /// of elements. 
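A note on the -scalable-vectorization rework shown a few hunks back in LoopVectorizationLegality.cpp: the hint is now resolved in increasing order of priority, i.e. the target default first, then an explicit vectorize.width (which, absent a scalable property, pins the hint to fixed width), and finally the command-line flag, which always wins. The standalone sketch below models that ordering; the enum and parameters are simplified stand-ins for the real LoopVectorizeHints members, not the patch's code.

  // Simplified model of the priority ordering: target default < width
  // metadata < command-line force flag. C++17, no LLVM dependencies.
  #include <optional>

  enum ScalableForceKind { SK_Unspecified, SK_FixedWidthOnly, SK_PreferScalable };

  ScalableForceKind resolveScalableHint(ScalableForceKind FromMetadata,
                                        bool TargetPrefersScalable,
                                        bool WidthMetadataSet,
                                        std::optional<ScalableForceKind> ForceFlag) {
    ScalableForceKind Result = FromMetadata;
    if (Result == SK_Unspecified) {
      // Lowest priority: whatever the target reports as its preference.
      Result = TargetPrefersScalable ? SK_PreferScalable : SK_FixedWidthOnly;
      // An explicit vectorize.width with no scalable property is taken to
      // describe a fixed-width user VF.
      if (WidthMetadataSet)
        Result = SK_FixedWidthOnly;
    }
    // Highest priority: the -scalable-vectorization flag, when given.
    if (ForceFlag && *ForceFlag != SK_Unspecified)
      Result = *ForceFlag;
    // Anything still unspecified falls back to fixed-width only.
    return Result == SK_Unspecified ? SK_FixedWidthOnly : Result;
  }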
@@ -2356,8 +2337,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( const InductionDescriptor &II, Value *Step, Value *Start, - Instruction *EntryVal, VPValue *Def, VPValue *CastDef, - VPTransformState &State) { + Instruction *EntryVal, VPValue *Def, VPTransformState &State) { + IRBuilder<> &Builder = State.Builder; assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); @@ -2373,7 +2354,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( } Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); - Value *SplatStart = Builder.CreateVectorSplat(VF, Start); + Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); Value *SteppedStart = getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); @@ -2394,9 +2375,9 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( Type *StepType = Step->getType(); Value *RuntimeVF; if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF); + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); else - RuntimeVF = getRuntimeVF(Builder, StepType, VF); + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); // Create a vector splat to use in the induction update. @@ -2405,8 +2386,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. Value *SplatVF = isa<Constant>(Mul) - ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) - : Builder.CreateVectorSplat(VF, Mul); + ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -2420,8 +2401,6 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( if (isa<TruncInst>(EntryVal)) addMetadata(LastInduction, EntryVal); - recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, - State, Part); LastInduction = cast<Instruction>( Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); @@ -2455,56 +2434,21 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { return llvm::any_of(IV->users(), isScalarInst); } -void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( - const InductionDescriptor &ID, const Instruction *EntryVal, - Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, - unsigned Part, unsigned Lane) { - assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && - "Expected either an induction phi-node or a truncate of it!"); - - // This induction variable is not the phi from the original loop but the - // newly-created IV based on the proof that casted Phi is equal to the - // uncasted Phi in the vectorized loop (under a runtime guard possibly). It - // re-uses the same InductionDescriptor that original IV uses but we don't - // have to do any recording in this case - that is done when original IV is - // processed. - if (isa<TruncInst>(EntryVal)) - return; - - if (!CastDef) { - assert(ID.getCastInsts().empty() && - "there are casts for ID, but no CastDef"); - return; - } - assert(!ID.getCastInsts().empty() && - "there is a CastDef, but no casts for ID"); - // Only the first Cast instruction in the Casts vector is of interest. 
- // The rest of the Casts (if exist) have no uses outside the - // induction update chain itself. - if (Lane < UINT_MAX) - State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); - else - State.set(CastDef, VectorLoopVal, Part); -} - -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, - TruncInst *Trunc, VPValue *Def, - VPValue *CastDef, +void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, + const InductionDescriptor &ID, + Value *Start, TruncInst *Trunc, + VPValue *Def, VPTransformState &State) { + IRBuilder<> &Builder = State.Builder; assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); - - auto II = Legal->getInductionVars().find(IV); - assert(II != Legal->getInductionVars().end() && "IV is not an induction"); - - auto ID = II->second; assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); // The value from the original loop to which we are mapping the new induction // variable. Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; - auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + auto &DL = EntryVal->getModule()->getDataLayout(); // Generate code for the induction step. Note that induction steps are // required to be loop-invariant @@ -2514,7 +2458,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, if (PSE.getSE()->isSCEVable(IV->getType())) { SCEVExpander Exp(*PSE.getSE(), DL, "induction"); return Exp.expandCodeFor(Step, Step->getType(), - LoopVectorPreHeader->getTerminator()); + State.CFG.VectorPreHeader->getTerminator()); } return cast<SCEVUnknown>(Step)->getValue(); }; @@ -2530,7 +2474,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) : Builder.CreateCast(Instruction::SIToFP, Induction, IV->getType()); - ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); + ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, + State.CFG.PrevBB); ScalarIV->setName("offset.idx"); } if (Trunc) { @@ -2548,20 +2493,19 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); + assert(!State.VF.isScalable() && "scalable vectors not yet supported."); Value *StartIdx; if (Step->getType()->isFloatingPointTy()) - StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part); + StartIdx = + getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part); else - StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part); + StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part); Value *EntryPart = getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); State.set(Def, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); - recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, - State, Part); } }; @@ -2572,7 +2516,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // Now do the actual transformations, and start with creating the step value. 
Value *Step = CreateStepValue(ID.getStep()); - if (VF.isZero() || VF.isScalar()) { + if (State.VF.isZero() || State.VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); CreateSplatIV(ScalarIV, Step); return; @@ -2583,8 +2527,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // least one user in the loop that is not widened. auto NeedsScalarIV = needsScalarInduction(EntryVal); if (!NeedsScalarIV) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, - State); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); return; } @@ -2592,14 +2535,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // create the phi node, we will splat the scalar induction variable in each // loop iteration. if (!shouldScalarizeInstruction(EntryVal)) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, - State); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); Value *ScalarIV = CreateScalarIV(Step); // Create scalar steps that can be used by instructions we will later // scalarize. Note that the addition of the scalar steps will not increase // the number of instructions in the loop in the common case prior to // InstCombine. We will be trading one vector extract for each scalar step. - buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); return; } @@ -2609,7 +2551,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, Value *ScalarIV = CreateScalarIV(Step); if (!Cost->isScalarEpilogueAllowed()) CreateSplatIV(ScalarIV, Step); - buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); } Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, @@ -2663,10 +2605,11 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID, - VPValue *Def, VPValue *CastDef, + VPValue *Def, VPTransformState &State) { + IRBuilder<> &Builder = State.Builder; // We shouldn't have to build scalar steps if we aren't vectorizing. - assert(VF.isVector() && "VF should be greater than one"); + assert(State.VF.isVector() && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); assert(ScalarIVTy == Step->getType() && @@ -2688,33 +2631,32 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. bool IsUniform = - Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); - unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); + Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF); + unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); // Compute the scalar steps and save the results in State. 
Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), ScalarIVTy->getScalarSizeInBits()); Type *VecIVTy = nullptr; Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; - if (!IsUniform && VF.isScalable()) { - VecIVTy = VectorType::get(ScalarIVTy, VF); - UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); - SplatStep = Builder.CreateVectorSplat(VF, Step); - SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); + if (!IsUniform && State.VF.isScalable()) { + VecIVTy = VectorType::get(ScalarIVTy, State.VF); + UnitStepVec = + Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); + SplatStep = Builder.CreateVectorSplat(State.VF, Step); + SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); } - for (unsigned Part = 0; Part < UF; ++Part) { - Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); - if (!IsUniform && VF.isScalable()) { - auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); + if (!IsUniform && State.VF.isScalable()) { + auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); if (ScalarIVTy->isFloatingPointTy()) InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); State.set(Def, Add, Part); - recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, - Part); // It's useful to record the lane values too for the known minimum number // of elements so we do those below. This improves the code quality when // trying to extract the first element, for example. @@ -2728,14 +2670,12 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); // The step returned by `createStepForVF` is a runtime-evaluated value // when VF is scalable. Otherwise, it should be folded into a Constant. - assert((VF.isScalable() || isa<Constant>(StartIdx)) && + assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && "Expected StartIdx to be folded to a constant when VF is not " "scalable"); auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); State.set(Def, Add, VPIteration(Part, Lane)); - recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, - Part, Lane); } } } @@ -3023,21 +2963,19 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized // instruction could feed a poison value to the base address of the widen // load/store. - if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0) + if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) Cloned->dropPoisonGeneratingFlags(); State.Builder.SetInsertPoint(Builder.GetInsertBlock(), Builder.GetInsertPoint()); // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. 
- for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) { - auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); + for (auto &I : enumerate(RepRecipe->operands())) { auto InputInstance = Instance; - if (!Operand || !OrigLoop->contains(Operand) || - (Cost->isUniformAfterVectorization(Operand, State.VF))) + VPValue *Operand = I.value(); + if (State.Plan->isUniformAfterVectorization(Operand)) InputInstance.Lane = VPLane::getFirstLane(); - auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance); - Cloned->setOperand(op, NewOp); + Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); } addNewMetadata(Cloned, Instr); @@ -3339,7 +3277,7 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, Value *InnerLoopVectorizer::emitTransformedIndex( IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, - const InductionDescriptor &ID) const { + const InductionDescriptor &ID, BasicBlock *VectorHeader) const { SCEVExpander Exp(*SE, DL, "induction"); auto Step = ID.getStep(); @@ -3382,15 +3320,15 @@ Value *InnerLoopVectorizer::emitTransformedIndex( }; // Get a suitable insert point for SCEV expansion. For blocks in the vector - // loop, choose the end of the vector loop header (=LoopVectorBody), because + // loop, choose the end of the vector loop header (=VectorHeader), because // the DomTree is not kept up-to-date for additional blocks generated in the // vector loop. By using the header as insertion point, we guarantee that the // expanded instructions dominate all their uses. - auto GetInsertPoint = [this, &B]() { + auto GetInsertPoint = [this, &B, VectorHeader]() { BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); if (InsertBB != LoopVectorBody && - LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) - return LoopVectorBody->getTerminator(); + LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB)) + return VectorHeader->getTerminator(); return &*B.GetInsertPoint(); }; @@ -3538,7 +3476,8 @@ void InnerLoopVectorizer::createInductionResumeValues( CastInst::getCastOpcode(VectorTripCount, true, StepType, true); Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); - EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + EndValue = + emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); EndValue->setName("ind.end"); // Compute the end value for the additional bypass (if applicable). 
@@ -3549,7 +3488,7 @@ void InnerLoopVectorizer::createInductionResumeValues( CRD = B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); EndValueFromAdditionalBypass = - emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); EndValueFromAdditionalBypass->setName("ind.end"); } } @@ -3623,7 +3562,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, if (MDNode *LID = OrigLoop->getLoopID()) L->setLoopID(LID); - LoopVectorizeHints Hints(L, true, *ORE); + LoopVectorizeHints Hints(L, true, *ORE, TTI); Hints.setAlreadyVectorized(); #ifdef EXPENSIVE_CHECKS @@ -3780,7 +3719,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, II.getStep()->getType()) : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); CMO->setName("cast.cmo"); - Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); + Value *Escape = + emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody); Escape->setName("ind.escape"); MissingVals[UI] = Escape; } @@ -4573,7 +4513,8 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { } } -bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { +bool InnerLoopVectorizer::useOrderedReductions( + const RecurrenceDescriptor &RdxDesc) { return Cost->useOrderedReductions(RdxDesc); } @@ -4648,8 +4589,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Value *Idx = Builder.CreateAdd( PartStart, ConstantInt::get(PtrInd->getType(), Lane)); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); - Value *SclrGep = - emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); + Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), + DL, II, State.CFG.PrevBB); SclrGep->setName("next.gep"); State.set(PhiR, SclrGep, VPIteration(Part, Lane)); } @@ -5368,13 +5309,9 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { // Limit MaxScalableVF by the maximum safe dependence distance. Optional<unsigned> MaxVScale = TTI.getMaxVScale(); - if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { - unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange) - .getVScaleRangeArgs() - .second; - if (VScaleMax > 0) - MaxVScale = VScaleMax; - } + if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) + MaxVScale = + TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); MaxScalableVF = ElementCount::getScalable( MaxVScale ? 
(MaxSafeElements / MaxVScale.getValue()) : 0); if (!MaxScalableVF) @@ -5386,9 +5323,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { return MaxScalableVF; } -FixedScalableVFPair -LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF) { +FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( + unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5475,12 +5411,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, FixedScalableVFPair Result(ElementCount::getFixed(1), ElementCount::getScalable(0)); - if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, - WidestType, MaxSafeFixedVF)) + if (auto MaxVF = + getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, + MaxSafeFixedVF, FoldTailByMasking)) Result.FixedVF = MaxVF; - if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, - WidestType, MaxSafeScalableVF)) + if (auto MaxVF = + getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, + MaxSafeScalableVF, FoldTailByMasking)) if (MaxVF.isScalable()) { Result.ScalableVF = MaxVF; LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF @@ -5513,7 +5451,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return computeFeasibleMaxVF(TC, UserVF); + return computeFeasibleMaxVF(TC, UserVF, false); case CM_ScalarEpilogueNotAllowedUsePredicate: LLVM_FALLTHROUGH; case CM_ScalarEpilogueNotNeededUsePredicate: @@ -5551,7 +5489,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return computeFeasibleMaxVF(TC, UserVF); + return computeFeasibleMaxVF(TC, UserVF, false); } return FixedScalableVFPair::getNone(); } @@ -5568,7 +5506,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF); + FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); // Avoid tail folding if the trip count is known to be a multiple of any VF // we chose. // FIXME: The condition below pessimises the case for fixed-width vectors, @@ -5641,7 +5579,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF) { + const ElementCount &MaxSafeVF, bool FoldTailByMasking) { bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); TypeSize WidestRegister = TTI.getRegisterBitWidth( ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector @@ -5673,14 +5611,17 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( const auto TripCountEC = ElementCount::getFixed(ConstTripCount); if (ConstTripCount && ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && - isPowerOf2_32(ConstTripCount)) { - // We need to clamp the VF to be the ConstTripCount. 
There is no point in - // choosing a higher viable VF as done in the loop below. If - // MaxVectorElementCount is scalable, we only fall back on a fixed VF when - // the TC is less than or equal to the known number of lanes. - LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " - << ConstTripCount << "\n"); - return TripCountEC; + (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { + // If loop trip count (TC) is known at compile time there is no point in + // choosing VF greater than TC (as done in the loop below). Select maximum + // power of two which doesn't exceed TC. + // If MaxVectorElementCount is scalable, we only fall back on a fixed VF + // when the TC is less than or equal to the known number of lanes. + auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); + LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " + "exceeding the constant trip count: " + << ClampedConstTripCount << "\n"); + return ElementCount::getFixed(ClampedConstTripCount); } ElementCount MaxVF = MaxVectorElementCount; @@ -5758,12 +5699,11 @@ bool LoopVectorizationCostModel::isMoreProfitable( EstimatedWidthB *= VScale.getValue(); } - // When set to preferred, for now assume vscale may be larger than 1 (or the - // one being tuned for), so that scalable vectorization is slightly favorable - // over fixed-width vectorization. - if (Hints->isScalableVectorizationPreferred()) - if (A.Width.isScalable() && !B.Width.isScalable()) - return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); + // Assume vscale may be larger than 1 (or the value being tuned for), + // so that scalable vectorization is slightly favorable over fixed-width + // vectorization. + if (A.Width.isScalable() && !B.Width.isScalable()) + return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); // To avoid the need for FP division: // (CostA / A.Width) < (CostB / B.Width) @@ -6068,7 +6008,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { if (auto *PN = dyn_cast<PHINode>(&I)) { if (!Legal->isReductionVariable(PN)) continue; - const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN]; + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(PN)->second; if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || TTI.preferInLoopReduction(RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), @@ -7002,7 +6943,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; const RecurrenceDescriptor &RdxDesc = - Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; + Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; InstructionCost BaseCost = TTI.getArithmeticReductionCost( RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); @@ -7079,22 +7020,41 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { if (match(Op0, m_ZExtOrSExt(m_Value())) && Op0->getOpcode() == Op1->getOpcode() && - Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { bool IsUnsigned = isa<ZExtInst>(Op0); - auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); - // Matched reduce(mul(ext, ext)) - InstructionCost ExtCost = - TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, - TTI::CastContextHint::None, CostKind, Op0); + Type *Op0Ty = 
Op0->getOperand(0)->getType(); + Type *Op1Ty = Op1->getOperand(0)->getType(); + Type *LargestOpTy = + Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty + : Op0Ty; + auto *ExtType = VectorType::get(LargestOpTy, VectorTy); + + // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of + // different sizes. We take the largest type as the ext to reduce, and add + // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). + InstructionCost ExtCost0 = TTI.getCastInstrCost( + Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), + TTI::CastContextHint::None, CostKind, Op0); + InstructionCost ExtCost1 = TTI.getCastInstrCost( + Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), + TTI::CastContextHint::None, CostKind, Op1); InstructionCost MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); InstructionCost RedCost = TTI.getExtendedAddReductionCost( /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); + InstructionCost ExtraExtCost = 0; + if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { + Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; + ExtraExtCost = TTI.getCastInstrCost( + ExtraExtOp->getOpcode(), ExtType, + VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), + TTI::CastContextHint::None, CostKind, ExtraExtOp); + } - if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) + if (RedCost.isValid() && + (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) return I == RetI ? RedCost : 0; } else if (!match(I, m_ZExtOrSExt(m_Value()))) { // Matched reduce(mul()) @@ -7570,8 +7530,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) CondTy = VectorType::get(CondTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind, I); + + CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) + Pred = Cmp->getPredicate(); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, + CostKind, I); } case Instruction::ICmp: case Instruction::FCmp: { @@ -7581,7 +7545,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, - CmpInst::BAD_ICMP_PREDICATE, CostKind, I); + cast<CmpInst>(I)->getPredicate(), CostKind, + I); } case Instruction::Store: case Instruction::Load: { @@ -7762,14 +7727,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore type-promoting instructions we identified during reduction // detection. for (auto &Reduction : Legal->getReductionVars()) { - RecurrenceDescriptor &RedDes = Reduction.second; + const RecurrenceDescriptor &RedDes = Reduction.second; const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } // Ignore type-casting instructions we identified during induction // detection. 
for (auto &Induction : Legal->getInductionVars()) { - InductionDescriptor &IndDes = Induction.second; + const InductionDescriptor &IndDes = Induction.second; const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } @@ -7778,7 +7743,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { void LoopVectorizationCostModel::collectInLoopReductions() { for (auto &Reduction : Legal->getReductionVars()) { PHINode *Phi = Reduction.first; - RecurrenceDescriptor &RdxDesc = Reduction.second; + const RecurrenceDescriptor &RdxDesc = Reduction.second; // We don't collect reductions that are type promoted (yet). if (RdxDesc.getRecurrenceType() != Phi->getType()) @@ -8064,18 +8029,6 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions( return U == Ind || DeadInstructions.count(cast<Instruction>(U)); })) DeadInstructions.insert(IndUpdate); - - // We record as "Dead" also the type-casting instructions we had identified - // during induction analysis. We don't need any handling for them in the - // vectorized loop because we have proven that, under a proper runtime - // test guarding the vectorized loop, the value of the phi, and the casted - // value of the phi, are the same. The last instruction in this casting chain - // will get its scalar/vector/widened def from the scalar/vector/widened def - // of the respective phi node. Any other casts in the induction def-use chain - // have no other uses outside the phi update chain, and will be ignored. - InductionDescriptor &IndDes = Induction.second; - const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); - DeadInstructions.insert(Casts.begin(), Casts.end()); } } @@ -8461,7 +8414,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, assert(EdgeMask && "No Edge Mask found for condition"); if (BI->getSuccessor(0) != Dst) - EdgeMask = Builder.createNot(EdgeMask); + EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. // The condition is 'SrcMask && EdgeMask', which is equivalent to @@ -8470,7 +8423,8 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, // EdgeMask is poison. Using 'and' here introduces undefined behavior. VPValue *False = Plan->getOrAddVPValue( ConstantInt::getFalse(BI->getCondition()->getType())); - EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); + EdgeMask = + Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); } return EdgeMaskCache[Edge] = EdgeMask; @@ -8492,22 +8446,24 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - // Create the block in mask as the first non-phi instruction in the block. - VPBuilder::InsertPointGuard Guard(Builder); - auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); - Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); - // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. - // Start by constructing the desired canonical IV. + // Start by constructing the desired canonical IV in the header block. 
VPValue *IV = nullptr; if (Legal->getPrimaryInduction()) IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); else { + VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); auto *IVRecipe = new VPWidenCanonicalIVRecipe(); - Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); + HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi()); IV = IVRecipe; } + + // Create the block in mask as the first non-phi instruction in the block. + VPBuilder::InsertPointGuard Guard(Builder); + auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); + Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); + VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); bool TailFolded = !CM.isScalarEpilogueAllowed(); @@ -8534,7 +8490,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { continue; } - BlockMask = Builder.createOr(BlockMask, EdgeMask); + BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); } return BlockMaskCache[BB] = BlockMask; @@ -8591,14 +8547,10 @@ VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef<VPValue *> Operands) const { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. - InductionDescriptor II = Legal->getInductionVars().lookup(Phi); - if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) { - assert(II.getStartValue() == + if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) { + assert(II->getStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); - const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); - return new VPWidenIntOrFpInductionRecipe( - Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); + return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II); } return nullptr; @@ -8624,11 +8576,10 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( if (LoopVectorizationPlanner::getDecisionAndClampRange( isOptimizableIVTruncate(I), Range)) { - InductionDescriptor II = - Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); + auto *Phi = cast<PHINode>(I->getOperand(0)); + const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); - return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), - Start, nullptr, I); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I); } return nullptr; } @@ -8844,13 +8795,17 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( return VPBB; } LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); - assert(VPBB->getSuccessors().empty() && - "VPBB has successors when handling predicated replication."); + + VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); + assert(SingleSucc && "VPBB must have a single successor when handling " + "predicated replication."); + VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); // Record predicated instructions for above packing optimizations. 
VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); VPBlockUtils::insertBlockAfter(Region, VPBB); auto *RegSucc = new VPBasicBlock(); VPBlockUtils::insertBlockAfter(RegSucc, Region); + VPBlockUtils::connectBlocks(RegSucc, SingleSucc); return RegSucc; } @@ -8910,7 +8865,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { VPValue *StartV = Operands[0]; if (Legal->isReductionVariable(Phi)) { - RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(Phi)->second; assert(RdxDesc.getRecurrenceStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, @@ -9031,7 +8987,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } for (auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; - RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); + RecurKind Kind = + Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; RecipeBuilder.recordRecipeOf(Phi); @@ -9069,30 +9026,25 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - auto Plan = std::make_unique<VPlan>(); + // Create initial VPlan skeleton, with separate header and latch blocks. + VPBasicBlock *HeaderVPBB = new VPBasicBlock(); + VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); + VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); + auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); + auto Plan = std::make_unique<VPlan>(TopRegion); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); DFS.perform(LI); - VPBasicBlock *VPBB = nullptr; - VPBasicBlock *HeaderVPBB = nullptr; + VPBasicBlock *VPBB = HeaderVPBB; SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. unsigned VPBBsForBB = 0; - auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); - if (VPBB) - VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); - else { - auto *TopRegion = new VPRegionBlock("vector loop"); - TopRegion->setEntry(FirstVPBBForBB); - Plan->setEntry(TopRegion); - HeaderVPBB = FirstVPBBForBB; - } - VPBB = FirstVPBBForBB; + VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. @@ -9159,13 +9111,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( : ""); } } + + VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); + VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); } + // Fold the last, empty block into its predecessor. + VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); + assert(VPBB && "expected to fold last (empty) block"); + // After here, VPBB should not be used. 
+ VPBB = nullptr; + assert(isa<VPRegionBlock>(Plan->getEntry()) && !Plan->getEntry()->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); - cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB); RecipeBuilder.fixHeaderPhis(); // --------------------------------------------------------------------------- @@ -9231,18 +9191,19 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); VPBlockUtils::connectBlocks(SplitPred, SinkRegion); VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); - if (VPBB == SplitPred) - VPBB = SplitBlock; } } + VPlanTransforms::removeRedundantInductionCasts(*Plan); + // Now that sink-after is done, move induction recipes for optimized truncates // to the phi section of the header block. for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); + adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, + RecipeBuilder, Range.Start); // Introduce a recipe to combine the incoming and previous values of a // first-order recurrence. @@ -9322,6 +9283,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RSO.flush(); Plan->setName(PlanName); + // Fold Exit block into its predecessor if possible. + // TODO: Fold block earlier once all VPlan transforms properly maintain a + // VPBasicBlock as exit. + VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); + assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; } @@ -9355,9 +9321,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { } SmallPtrSet<Instruction *, 1> DeadInstructions; - VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, - Legal->getInductionVars(), - DeadInstructions, *PSE.getSE()); + VPlanTransforms::VPInstructionsToVPRecipes( + OrigLoop, Plan, + [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, + DeadInstructions, *PSE.getSE()); return Plan; } @@ -9371,7 +9338,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( ElementCount MinVF) { for (auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; - RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(Phi)->second; const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) @@ -9565,7 +9533,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { // exact, etc.). The control flow has been linearized and the // instruction is no longer guarded by the predicate, which could make // the flag properties to no longer hold. 
- if (State.MayGeneratePoisonRecipes.count(this) > 0) + if (State.MayGeneratePoisonRecipes.contains(this)) VecOp->dropPoisonGeneratingFlags(); } @@ -9714,9 +9682,9 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), - getTruncInst(), getVPValue(0), - getCastValue(), State); + State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(), + getStartValue()->getLiveInIRValue(), + getTruncInst(), getVPValue(0), State); } void VPWidenPHIRecipe::execute(VPTransformState &State) { @@ -10293,7 +10261,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"); - LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); + LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); LLVM_DEBUG( dbgs() << "LV: Loop hints:" @@ -10747,8 +10715,17 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, PA.preserve<LoopAnalysis>(); PA.preserve<DominatorTreeAnalysis>(); } - if (!Result.MadeCFGChange) + + if (Result.MadeCFGChange) { + // Making CFG changes likely means a loop got vectorized. Indicate that + // extra simplification passes should be run. + // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only + // be run if runtime checks have been added. + AM.getResult<ShouldRunExtraVectorPasses>(F); + PA.preserve<ShouldRunExtraVectorPasses>(); + } else { PA.preserveSet<CFGAnalyses>(); + } return PA; } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 95061e9053fa..37ae13666f7a 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -631,27 +631,26 @@ static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) { /// after: 6 3 5 4 7 2 1 0 static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) { const unsigned Sz = Order.size(); - SmallBitVector UsedIndices(Sz); - SmallVector<int> MaskedIndices; + SmallBitVector UnusedIndices(Sz, /*t=*/true); + SmallBitVector MaskedIndices(Sz); for (unsigned I = 0; I < Sz; ++I) { if (Order[I] < Sz) - UsedIndices.set(Order[I]); + UnusedIndices.reset(Order[I]); else - MaskedIndices.push_back(I); + MaskedIndices.set(I); } - if (MaskedIndices.empty()) + if (MaskedIndices.none()) return; - SmallVector<int> AvailableIndices(MaskedIndices.size()); - unsigned Cnt = 0; - int Idx = UsedIndices.find_first(); - do { - AvailableIndices[Cnt] = Idx; - Idx = UsedIndices.find_next(Idx); - ++Cnt; - } while (Idx > 0); - assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); - for (int I = 0, E = MaskedIndices.size(); I < E; ++I) - Order[MaskedIndices[I]] = AvailableIndices[I]; + assert(UnusedIndices.count() == MaskedIndices.count() && + "Non-synced masked/available indices."); + int Idx = UnusedIndices.find_first(); + int MIdx = MaskedIndices.find_first(); + while (MIdx >= 0) { + assert(Idx >= 0 && "Indices must be synced."); + Order[MIdx] = Idx; + Idx = UnusedIndices.find_next(Idx); + MIdx = MaskedIndices.find_next(MIdx); + } } namespace llvm { @@ -812,6 +811,13 @@ public: /// ExtractElement, ExtractValue), which can be part of the graph. Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE); + /// Gets reordering data for the given tree entry. 
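The rewritten fixupOrderingIndices above replaces the index vectors with two bit vectors walked in lock-step: one marks positions holding out-of-range values, the other marks indices not yet used, and each masked position takes the next unused index. A standalone model of the same walk with std::vector (illustration only; the "before" values at positions 1 and 4 are assumed, chosen to reproduce the documented "after: 6 3 5 4 7 2 1 0"):

#include <cassert>
#include <cstdio>
#include <vector>

// Every position whose value is out of range (>= Sz) is filled with the
// smallest index that no in-range entry already uses.
static void fixupOrderingIndices(std::vector<unsigned> &Order) {
  const unsigned Sz = Order.size();
  std::vector<bool> Unused(Sz, true);   // indices not used by in-range entries
  std::vector<bool> Masked(Sz, false);  // positions holding out-of-range values
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      Unused[Order[I]] = false;
    else
      Masked[I] = true;
  }
  unsigned Idx = 0;
  for (unsigned MIdx = 0; MIdx < Sz; ++MIdx) {
    if (!Masked[MIdx])
      continue;
    while (Idx < Sz && !Unused[Idx])
      ++Idx;
    assert(Idx < Sz && "masked and unused counts must match");
    Order[MIdx] = Idx++;
  }
}

int main() {
  // Hypothetical "before": positions 1 and 4 hold the out-of-range marker 8.
  std::vector<unsigned> Order = {6, 8, 5, 4, 8, 2, 1, 0};
  fixupOrderingIndices(Order);
  for (unsigned V : Order)
    std::printf("%u ", V); // prints: 6 3 5 4 7 2 1 0
  std::printf("\n");
  return 0;
}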
If the entry is vectorized + /// - just return ReorderIndices, otherwise check if the scalars can be + /// reordered and return the most optimal order. + /// \param TopToBottom If true, include the order of vectorized stores and + /// insertelement nodes, otherwise skip them. + Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom); + /// Reorders the current graph to the most profitable order starting from the /// root node to the leaf nodes. The best order is chosen only from the nodes /// of the same size (vectorization factor). Smaller nodes are considered @@ -1010,18 +1016,25 @@ public: std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); } - // The hard-coded scores listed here are not very important. When computing - // the scores of matching one sub-tree with another, we are basically - // counting the number of values that are matching. So even if all scores - // are set to 1, we would still get a decent matching result. + // The hard-coded scores listed here are not very important, though it shall + // be higher for better matches to improve the resulting cost. When + // computing the scores of matching one sub-tree with another, we are + // basically counting the number of values that are matching. So even if all + // scores are set to 1, we would still get a decent matching result. // However, sometimes we have to break ties. For example we may have to // choose between matching loads vs matching opcodes. This is what these - // scores are helping us with: they provide the order of preference. + // scores are helping us with: they provide the order of preference. Also, + // this is important if the scalar is externally used or used in another + // tree entry node in the different lane. /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). - static const int ScoreConsecutiveLoads = 3; + static const int ScoreConsecutiveLoads = 4; + /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). + static const int ScoreReversedLoads = 3; /// ExtractElementInst from same vector and consecutive indexes. - static const int ScoreConsecutiveExtracts = 3; + static const int ScoreConsecutiveExtracts = 4; + /// ExtractElementInst from same vector and reversed indices. + static const int ScoreReversedExtracts = 3; /// Constants. static const int ScoreConstants = 2; /// Instructions with the same opcode. @@ -1041,7 +1054,10 @@ public: /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, - ScalarEvolution &SE) { + ScalarEvolution &SE, int NumLanes) { + if (V1 == V2) + return VLOperands::ScoreSplat; + auto *LI1 = dyn_cast<LoadInst>(V1); auto *LI2 = dyn_cast<LoadInst>(V2); if (LI1 && LI2) { @@ -1051,8 +1067,17 @@ public: Optional<int> Dist = getPointersDiff( LI1->getType(), LI1->getPointerOperand(), LI2->getType(), LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); - return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads - : VLOperands::ScoreFail; + if (!Dist) + return VLOperands::ScoreFail; + // The distance is too large - still may be profitable to use masked + // loads/gathers. + if (std::abs(*Dist) > NumLanes / 2) + return VLOperands::ScoreAltOpcodes; + // This still will detect consecutive loads, but we might have "holes" + // in some cases. It is ok for non-power-2 vectorization and may produce + // better results. It should not affect current vectorization. + return (*Dist > 0) ? 
VLOperands::ScoreConsecutiveLoads + : VLOperands::ScoreReversedLoads; } auto *C1 = dyn_cast<Constant>(V1); @@ -1062,18 +1087,41 @@ public: // Extracts from consecutive indexes of the same vector better score as // the extracts could be optimized away. - Value *EV; - ConstantInt *Ex1Idx, *Ex2Idx; - if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) && - match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) && - Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue()) - return VLOperands::ScoreConsecutiveExtracts; + Value *EV1; + ConstantInt *Ex1Idx; + if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) { + // Undefs are always profitable for extractelements. + if (isa<UndefValue>(V2)) + return VLOperands::ScoreConsecutiveExtracts; + Value *EV2 = nullptr; + ConstantInt *Ex2Idx = nullptr; + if (match(V2, + m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx), + m_Undef())))) { + // Undefs are always profitable for extractelements. + if (!Ex2Idx) + return VLOperands::ScoreConsecutiveExtracts; + if (isUndefVector(EV2) && EV2->getType() == EV1->getType()) + return VLOperands::ScoreConsecutiveExtracts; + if (EV2 == EV1) { + int Idx1 = Ex1Idx->getZExtValue(); + int Idx2 = Ex2Idx->getZExtValue(); + int Dist = Idx2 - Idx1; + // The distance is too large - still may be profitable to use + // shuffles. + if (std::abs(Dist) > NumLanes / 2) + return VLOperands::ScoreAltOpcodes; + return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts + : VLOperands::ScoreReversedExtracts; + } + } + } auto *I1 = dyn_cast<Instruction>(V1); auto *I2 = dyn_cast<Instruction>(V2); if (I1 && I2) { - if (I1 == I2) - return VLOperands::ScoreSplat; + if (I1->getParent() != I2->getParent()) + return VLOperands::ScoreFail; InstructionsState S = getSameOpcode({I1, I2}); // Note: Only consider instructions with <= 2 operands to avoid // complexity explosion. @@ -1088,11 +1136,13 @@ public: return VLOperands::ScoreFail; } - /// Holds the values and their lane that are taking part in the look-ahead + /// Holds the values and their lanes that are taking part in the look-ahead /// score calculation. This is used in the external uses cost calculation. - SmallDenseMap<Value *, int> InLookAheadValues; + /// Need to hold all the lanes in case of splat/broadcast at least to + /// correctly check for the use in the different lane. + SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues; - /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are + /// \returns the additional cost due to uses of \p LHS and \p RHS that are /// either external to the vectorized code, or require shuffling. int getExternalUsesCost(const std::pair<Value *, int> &LHS, const std::pair<Value *, int> &RHS) { @@ -1116,22 +1166,30 @@ public: for (User *U : V->users()) { if (const TreeEntry *UserTE = R.getTreeEntry(U)) { // The user is in the VectorizableTree. Check if we need to insert. - auto It = llvm::find(UserTE->Scalars, U); - assert(It != UserTE->Scalars.end() && "U is in UserTE"); - int UserLn = std::distance(UserTE->Scalars.begin(), It); + int UserLn = UserTE->findLaneForValue(U); assert(UserLn >= 0 && "Bad lane"); - if (UserLn != Ln) + // If the values are different, check just the line of the current + // value. If the values are the same, need to add UserInDiffLaneCost + // only if UserLn does not match both line numbers. 
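The updated look-ahead scoring above prefers strictly increasing (consecutive) loads and extracts, scores reversed order slightly lower, and falls back to the alternate-opcode score when the element distance exceeds half the number of lanes, since a masked gather or shuffle may still pay off. A small standalone function modelling that decision table; the score values mirror the constants introduced above, but the function itself is only an illustration, not the LLVM code:

#include <cassert>
#include <cstdlib>

// Scores as introduced above; higher means a better match. Identical values
// are scored as a splat before any distance is computed.
enum Score {
  ScoreAltOpcodes = 1,
  ScoreReversed = 3,
  ScoreConsecutive = 4
};

// Dist is the element distance between the two accesses (second minus first);
// NumLanes is the width of the bundle being built.
static int scoreByDistance(int Dist, int NumLanes) {
  if (std::abs(Dist) > NumLanes / 2)
    return ScoreAltOpcodes; // far apart: maybe still a masked gather/shuffle
  return Dist > 0 ? ScoreConsecutive : ScoreReversed;
}

int main() {
  assert(scoreByDistance(1, 8) == ScoreConsecutive); // A[i], A[i+1]
  assert(scoreByDistance(-1, 8) == ScoreReversed);   // A[i+1], A[i]
  assert(scoreByDistance(7, 8) == ScoreAltOpcodes);  // too large a hole
  return 0;
}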
+ if ((LHS.first != RHS.first && UserLn != Ln) || + (LHS.first == RHS.first && UserLn != LHS.second && + UserLn != RHS.second)) { Cost += UserInDiffLaneCost; + break; + } } else { // Check if the user is in the look-ahead code. auto It2 = InLookAheadValues.find(U); if (It2 != InLookAheadValues.end()) { // The user is in the look-ahead code. Check the lane. - if (It2->second != Ln) + if (!It2->getSecond().contains(Ln)) { Cost += UserInDiffLaneCost; + break; + } } else { // The user is neither in SLP tree nor in the look-ahead code. Cost += ExternalUseCost; + break; } } // Limit the number of visited uses to cap compilation time. @@ -1170,32 +1228,36 @@ public: Value *V1 = LHS.first; Value *V2 = RHS.first; // Get the shallow score of V1 and V2. - int ShallowScoreAtThisLevel = - std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) - - getExternalUsesCost(LHS, RHS)); + int ShallowScoreAtThisLevel = std::max( + (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) - + getExternalUsesCost(LHS, RHS)); int Lane1 = LHS.second; int Lane2 = RHS.second; // If reached MaxLevel, // or if V1 and V2 are not instructions, // or if they are SPLAT, - // or if they are not consecutive, early return the current cost. + // or if they are not consecutive, + // or if profitable to vectorize loads or extractelements, early return + // the current cost. auto *I1 = dyn_cast<Instruction>(V1); auto *I2 = dyn_cast<Instruction>(V2); if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || ShallowScoreAtThisLevel == VLOperands::ScoreFail || - (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel)) + (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) || + (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) && + ShallowScoreAtThisLevel)) return ShallowScoreAtThisLevel; assert(I1 && I2 && "Should have early exited."); // Keep track of in-tree values for determining the external-use cost. - InLookAheadValues[V1] = Lane1; - InLookAheadValues[V2] = Lane2; + InLookAheadValues[V1].insert(Lane1); + InLookAheadValues[V2].insert(Lane2); // Contains the I2 operand indexes that got matched with I1 operands. SmallSet<unsigned, 4> Op2Used; - // Recursion towards the operands of I1 and I2. We are trying all possbile + // Recursion towards the operands of I1 and I2. We are trying all possible // operand pairs, and keeping track of the best score. for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); OpIdx1 != NumOperands1; ++OpIdx1) { @@ -1319,27 +1381,79 @@ public: return None; } - /// Helper for reorderOperandVecs. \Returns the lane that we should start - /// reordering from. This is the one which has the least number of operands - /// that can freely move about. + /// Helper for reorderOperandVecs. + /// \returns the lane that we should start reordering from. This is the one + /// which has the least number of operands that can freely move about or + /// less profitable because it already has the most optimal set of operands. 
unsigned getBestLaneToStartReordering() const { - unsigned BestLane = 0; unsigned Min = UINT_MAX; - for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; - ++Lane) { - unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane); - if (NumFreeOps < Min) { - Min = NumFreeOps; - BestLane = Lane; + unsigned SameOpNumber = 0; + // std::pair<unsigned, unsigned> is used to implement a simple voting + // algorithm and choose the lane with the least number of operands that + // can freely move about or less profitable because it already has the + // most optimal set of operands. The first unsigned is a counter for + // voting, the second unsigned is the counter of lanes with instructions + // with same/alternate opcodes and same parent basic block. + MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap; + // Try to be closer to the original results, if we have multiple lanes + // with same cost. If 2 lanes have the same cost, use the one with the + // lowest index. + for (int I = getNumLanes(); I > 0; --I) { + unsigned Lane = I - 1; + OperandsOrderData NumFreeOpsHash = + getMaxNumOperandsThatCanBeReordered(Lane); + // Compare the number of operands that can move and choose the one with + // the least number. + if (NumFreeOpsHash.NumOfAPOs < Min) { + Min = NumFreeOpsHash.NumOfAPOs; + SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; + HashMap.clear(); + HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); + } else if (NumFreeOpsHash.NumOfAPOs == Min && + NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) { + // Select the most optimal lane in terms of number of operands that + // should be moved around. + SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; + HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); + } else if (NumFreeOpsHash.NumOfAPOs == Min && + NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) { + ++HashMap[NumFreeOpsHash.Hash].first; + } + } + // Select the lane with the minimum counter. + unsigned BestLane = 0; + unsigned CntMin = UINT_MAX; + for (const auto &Data : reverse(HashMap)) { + if (Data.second.first < CntMin) { + CntMin = Data.second.first; + BestLane = Data.second.second; } } return BestLane; } - /// \Returns the maximum number of operands that are allowed to be reordered - /// for \p Lane. This is used as a heuristic for selecting the first lane to - /// start operand reordering. - unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { + /// Data structure that helps to reorder operands. + struct OperandsOrderData { + /// The best number of operands with the same APOs, which can be + /// reordered. + unsigned NumOfAPOs = UINT_MAX; + /// Number of operands with the same/alternate instruction opcode and + /// parent. + unsigned NumOpsWithSameOpcodeParent = 0; + /// Hash for the actual operands ordering. + /// Used to count operands, actually their position id and opcode + /// value. It is used in the voting mechanism to find the lane with the + /// least number of operands that can freely move about or less profitable + /// because it already has the most optimal set of operands. Can be + /// replaced with SmallVector<unsigned> instead but hash code is faster + /// and requires less memory. + unsigned Hash = 0; + }; + /// \returns the maximum number of operands that are allowed to be reordered + /// for \p Lane and the number of compatible instructions(with the same + /// parent/opcode). This is used as a heuristic for selecting the first lane + /// to start operand reordering. 
+ OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { unsigned CntTrue = 0; unsigned NumOperands = getNumOperands(); // Operands with the same APO can be reordered. We therefore need to count @@ -1348,11 +1462,45 @@ public: // a map. Instead we can simply count the number of operands that // correspond to one of them (in this case the 'true' APO), and calculate // the other by subtracting it from the total number of operands. - for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) - if (getData(OpIdx, Lane).APO) + // Operands with the same instruction opcode and parent are more + // profitable since we don't need to move them in many cases, with a high + // probability such lane already can be vectorized effectively. + bool AllUndefs = true; + unsigned NumOpsWithSameOpcodeParent = 0; + Instruction *OpcodeI = nullptr; + BasicBlock *Parent = nullptr; + unsigned Hash = 0; + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + const OperandData &OpData = getData(OpIdx, Lane); + if (OpData.APO) ++CntTrue; - unsigned CntFalse = NumOperands - CntTrue; - return std::max(CntTrue, CntFalse); + // Use Boyer-Moore majority voting for finding the majority opcode and + // the number of times it occurs. + if (auto *I = dyn_cast<Instruction>(OpData.V)) { + if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() || + I->getParent() != Parent) { + if (NumOpsWithSameOpcodeParent == 0) { + NumOpsWithSameOpcodeParent = 1; + OpcodeI = I; + Parent = I->getParent(); + } else { + --NumOpsWithSameOpcodeParent; + } + } else { + ++NumOpsWithSameOpcodeParent; + } + } + Hash = hash_combine( + Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1))); + AllUndefs = AllUndefs && isa<UndefValue>(OpData.V); + } + if (AllUndefs) + return {}; + OperandsOrderData Data; + Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue); + Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent; + Data.Hash = Hash; + return Data; } /// Go through the instructions in VL and append their operands. @@ -1500,11 +1648,37 @@ public: ReorderingModes[OpIdx] = ReorderingMode::Failed; } + // Check that we don't have same operands. No need to reorder if operands + // are just perfect diamond or shuffled diamond match. Do not do it only + // for possible broadcasts or non-power of 2 number of scalars (just for + // now). + auto &&SkipReordering = [this]() { + SmallPtrSet<Value *, 4> UniqueValues; + ArrayRef<OperandData> Op0 = OpsVec.front(); + for (const OperandData &Data : Op0) + UniqueValues.insert(Data.V); + for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) { + if (any_of(Op, [&UniqueValues](const OperandData &Data) { + return !UniqueValues.contains(Data.V); + })) + return false; + } + // TODO: Check if we can remove a check for non-power-2 number of + // scalars after full support of non-power-2 vectorization. + return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size()); + }; + // If the initial strategy fails for any of the operand indexes, then we // perform reordering again in a second pass. This helps avoid assigning // high priority to the failed strategy, and should improve reordering for // the non-failed operand indexes. for (int Pass = 0; Pass != 2; ++Pass) { + // Check if no need to reorder operands since they're are perfect or + // shuffled diamond match. + // Need to to do it to avoid extra external use cost counting for + // shuffled matches, which may cause regressions. + if (SkipReordering()) + break; // Skip the second pass if the first pass did not fail. 
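getMaxNumOperandsThatCanBeReordered above uses Boyer-Moore majority voting to count, in one pass and with constant state, how well the dominant opcode/parent combination is represented among a lane's operands. The voting scheme in isolation, over plain integers (a sketch, not the LLVM code):

#include <cassert>
#include <vector>

// Boyer-Moore majority vote: keep one candidate and a counter. A matching
// element increments the counter, a mismatch decrements it, and a counter of
// zero installs the current element as the new candidate. If a strict
// majority exists, it is guaranteed to be the surviving candidate.
static int majorityCandidate(const std::vector<int> &Values) {
  int Candidate = 0;
  unsigned Count = 0;
  for (int V : Values) {
    if (Count == 0) {
      Candidate = V;
      Count = 1;
    } else if (V == Candidate) {
      ++Count;
    } else {
      --Count;
    }
  }
  return Candidate; // must be verified when a majority is not guaranteed
}

int main() {
  // 1 = "add", 2 = "sub": the add opcode dominates this operand list.
  int Winner = majorityCandidate({1, 2, 1, 1, 2, 1});
  assert(Winner == 1);
  (void)Winner;
  return 0;
}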
bool StrategyFailed = false; // Mark all operand data as free to use. @@ -1792,9 +1966,10 @@ private: if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); assert(Operands[OpIdx].empty() && "Already resized?"); - Operands[OpIdx].resize(Scalars.size()); - for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) - Operands[OpIdx][Lane] = OpVL[Lane]; + assert(OpVL.size() <= Scalars.size() && + "Number of operands is greater than the number of scalars."); + Operands[OpIdx].resize(OpVL.size()); + copy(OpVL, Operands[OpIdx].begin()); } /// Set the operands of this bundle in their original order. @@ -1944,7 +2119,7 @@ private: if (ReuseShuffleIndices.empty()) dbgs() << "Empty"; else - for (unsigned ReuseIdx : ReuseShuffleIndices) + for (int ReuseIdx : ReuseShuffleIndices) dbgs() << ReuseIdx << ", "; dbgs() << "\n"; dbgs() << "ReorderIndices: "; @@ -2819,6 +2994,50 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { return None; } +Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, + bool TopToBottom) { + // No need to reorder if need to shuffle reuses, still need to shuffle the + // node. + if (!TE.ReuseShuffleIndices.empty()) + return None; + if (TE.State == TreeEntry::Vectorize && + (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) || + (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) && + !TE.isAltShuffle()) + return TE.ReorderIndices; + if (TE.State == TreeEntry::NeedToGather) { + // TODO: add analysis of other gather nodes with extractelement + // instructions and other values/instructions, not only undefs. + if (((TE.getOpcode() == Instruction::ExtractElement && + !TE.isAltShuffle()) || + (all_of(TE.Scalars, + [](Value *V) { + return isa<UndefValue, ExtractElementInst>(V); + }) && + any_of(TE.Scalars, + [](Value *V) { return isa<ExtractElementInst>(V); }))) && + all_of(TE.Scalars, + [](Value *V) { + auto *EE = dyn_cast<ExtractElementInst>(V); + return !EE || isa<FixedVectorType>(EE->getVectorOperandType()); + }) && + allSameType(TE.Scalars)) { + // Check that gather of extractelements can be represented as + // just a shuffle of a single vector. + OrdersType CurrentOrder; + bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder); + if (Reuse || !CurrentOrder.empty()) { + if (!CurrentOrder.empty()) + fixupOrderingIndices(CurrentOrder); + return CurrentOrder; + } + } + if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE)) + return CurrentOrder; + } + return None; +} + void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries; @@ -2826,42 +3045,15 @@ void BoUpSLP::reorderTopToBottom() { // their ordering. DenseMap<const TreeEntry *, OrdersType> GathersToOrders; // Find all reorderable nodes with the given VF. - // Currently the are vectorized loads,extracts + some gathering of extracts. + // Currently the are vectorized stores,loads,extracts + some gathering of + // extracts. for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders]( const std::unique_ptr<TreeEntry> &TE) { - // No need to reorder if need to shuffle reuses, still need to shuffle the - // node. 
- if (!TE->ReuseShuffleIndices.empty()) - return; - if (TE->State == TreeEntry::Vectorize && - isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst, - InsertElementInst>(TE->getMainOp()) && - !TE->isAltShuffle()) { + if (Optional<OrdersType> CurrentOrder = + getReorderingData(*TE.get(), /*TopToBottom=*/true)) { VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); - return; - } - if (TE->State == TreeEntry::NeedToGather) { - if (TE->getOpcode() == Instruction::ExtractElement && - !TE->isAltShuffle() && - isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp()) - ->getVectorOperandType()) && - allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { - // Check that gather of extractelements can be represented as - // just a shuffle of a single vector. - OrdersType CurrentOrder; - bool Reuse = - canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); - if (Reuse || !CurrentOrder.empty()) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); - GathersToOrders.try_emplace(TE.get(), CurrentOrder); - return; - } - } - if (Optional<OrdersType> CurrentOrder = - findReusedOrderedScalars(*TE.get())) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); - } } }); @@ -2993,44 +3185,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { const std::unique_ptr<TreeEntry> &TE) { if (TE->State != TreeEntry::Vectorize) NonVectorized.push_back(TE.get()); - // No need to reorder if need to shuffle reuses, still need to shuffle the - // node. - if (!TE->ReuseShuffleIndices.empty()) - return; - if (TE->State == TreeEntry::Vectorize && - isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE->getMainOp()) && - !TE->isAltShuffle()) { + if (Optional<OrdersType> CurrentOrder = + getReorderingData(*TE.get(), /*TopToBottom=*/false)) { OrderedEntries.insert(TE.get()); - return; - } - if (TE->State == TreeEntry::NeedToGather) { - if (TE->getOpcode() == Instruction::ExtractElement && - !TE->isAltShuffle() && - isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp()) - ->getVectorOperandType()) && - allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { - // Check that gather of extractelements can be represented as - // just a shuffle of a single vector with a single user only. - OrdersType CurrentOrder; - bool Reuse = - canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); - if ((Reuse || !CurrentOrder.empty()) && - !any_of(VectorizableTree, - [&TE](const std::unique_ptr<TreeEntry> &Entry) { - return Entry->State == TreeEntry::NeedToGather && - Entry.get() != TE.get() && - Entry->isSame(TE->Scalars); - })) { - OrderedEntries.insert(TE.get()); - GathersToOrders.try_emplace(TE.get(), CurrentOrder); - return; - } - } - if (Optional<OrdersType> CurrentOrder = - findReusedOrderedScalars(*TE.get())) { - OrderedEntries.insert(TE.get()); + if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); - } } }); @@ -3392,9 +3551,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Check that every instruction appears once in this bundle. DenseMap<Value *, unsigned> UniquePositions; for (Value *V : VL) { + if (isConstant(V)) { + ReuseShuffleIndicies.emplace_back( + isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size()); + UniqueValues.emplace_back(V); + continue; + } auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? 
-1 - : Res.first->second); + ReuseShuffleIndicies.emplace_back(Res.first->second); if (Res.second) UniqueValues.emplace_back(V); } @@ -3404,6 +3568,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (NumUniqueScalarValues <= 1 || + (UniquePositions.size() == 1 && all_of(UniqueValues, + [](Value *V) { + return isa<UndefValue>(V) || + !isConstant(V); + })) || !llvm::isPowerOf2_32(NumUniqueScalarValues)) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); @@ -3508,11 +3677,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - // If any of the scalars is marked as a value that needs to stay scalar, then - // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. for (Value *V : VL) { - if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { + if (is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, @@ -4219,10 +4386,17 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, SmallVectorImpl<unsigned> &CurrentOrder) const { - Instruction *E0 = cast<Instruction>(OpValue); - assert(E0->getOpcode() == Instruction::ExtractElement || - E0->getOpcode() == Instruction::ExtractValue); - assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode"); + const auto *It = find_if(VL, [](Value *V) { + return isa<ExtractElementInst, ExtractValueInst>(V); + }); + assert(It != VL.end() && "Expected at least one extract instruction."); + auto *E0 = cast<Instruction>(*It); + assert(all_of(VL, + [](Value *V) { + return isa<UndefValue, ExtractElementInst, ExtractValueInst>( + V); + }) && + "Invalid opcode"); // Check if all of the extracts come from the same vector and from the // correct offset. Value *Vec = E0->getOperand(0); @@ -4255,23 +4429,28 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, // Also, later we can check that all the indices are used and we have a // consecutive access in the extract instructions, by checking that no // element of CurrentOrder still has value E + 1. 
- CurrentOrder.assign(E, E + 1); + CurrentOrder.assign(E, E); unsigned I = 0; for (; I < E; ++I) { - auto *Inst = cast<Instruction>(VL[I]); + auto *Inst = dyn_cast<Instruction>(VL[I]); + if (!Inst) + continue; if (Inst->getOperand(0) != Vec) break; + if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) + if (isa<UndefValue>(EE->getIndexOperand())) + continue; Optional<unsigned> Idx = getExtractIndex(Inst); if (!Idx) break; const unsigned ExtIdx = *Idx; if (ExtIdx != I) { - if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1) + if (ExtIdx >= E || CurrentOrder[ExtIdx] != E) break; ShouldKeepOrder = false; CurrentOrder[ExtIdx] = I; } else { - if (CurrentOrder[I] != E + 1) + if (CurrentOrder[I] != E) break; CurrentOrder[I] = I; } @@ -4287,8 +4466,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, bool BoUpSLP::areAllUsersVectorized(Instruction *I, ArrayRef<Value *> VectorizedVals) const { return (I->hasOneUse() && is_contained(VectorizedVals, I)) || - llvm::all_of(I->users(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0; + all_of(I->users(), [this](User *U) { + return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U); }); } @@ -4348,6 +4527,10 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, for (auto *V : VL) { ++Idx; + // Need to exclude undefs from analysis. + if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem) + continue; + // Reached the start of a new vector registers. if (Idx % EltsPerVector == 0) { AllConsecutive = true; @@ -4357,9 +4540,11 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, // Check all extracts for a vector register on the target directly // extract values in order. unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); - unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); - AllConsecutive &= PrevIdx + 1 == CurrentIdx && - CurrentIdx % EltsPerVector == Idx % EltsPerVector; + if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) { + unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); + AllConsecutive &= PrevIdx + 1 == CurrentIdx && + CurrentIdx % EltsPerVector == Idx % EltsPerVector; + } if (AllConsecutive) continue; @@ -4442,9 +4627,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // FIXME: it tries to fix a problem with MSVC buildbots. TargetTransformInfo &TTIRef = *TTI; auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy, - VectorizedVals](InstructionCost &Cost, - bool IsGather) { + VectorizedVals, E](InstructionCost &Cost) { DenseMap<Value *, int> ExtractVectorsTys; + SmallPtrSet<Value *, 4> CheckedExtracts; for (auto *V : VL) { if (isa<UndefValue>(V)) continue; @@ -4452,7 +4637,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. - if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals)) + // Also, avoid adjusting the cost for extractelements with multiple uses + // in different graph entries. 
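The canReuseExtract changes above use E itself as the "unassigned" marker and let undef elements pass through: each extract either sits at its own position (identity so far) or claims the slot named by its extract index, and the gather is representable as a single-vector shuffle as long as no slot is claimed twice. A standalone model of that bookkeeping, with undef elements encoded as -1 (illustration only, not the LLVM routine):

#include <cassert>
#include <vector>

// ExtIdxs[I] is the constant lane extracted at position I, or -1 for an
// undef/missing element. Returns true if the gather can be represented as a
// shuffle of one source vector; Order[ExtIdx] = I records that shuffle, with
// E used as the "unassigned" marker.
static bool canReuseExtract(const std::vector<int> &ExtIdxs,
                            std::vector<unsigned> &Order, bool &IsIdentity) {
  const unsigned E = ExtIdxs.size();
  Order.assign(E, E);
  IsIdentity = true;
  for (unsigned I = 0; I < E; ++I) {
    int ExtIdx = ExtIdxs[I];
    if (ExtIdx < 0)
      continue; // undefs do not constrain the order
    if (static_cast<unsigned>(ExtIdx) != I) {
      if (static_cast<unsigned>(ExtIdx) >= E || Order[ExtIdx] != E)
        return false; // out of range or slot already claimed
      IsIdentity = false;
      Order[ExtIdx] = I;
    } else {
      if (Order[I] != E)
        return false;
      Order[I] = I;
    }
  }
  return true;
}

int main() {
  std::vector<unsigned> Order;
  bool IsIdentity = true;
  // extract 1, undef, extract 0, extract 3 -> single-vector shuffle, not identity.
  bool Ok = canReuseExtract({1, -1, 0, 3}, Order, IsIdentity);
  assert(Ok && !IsIdentity);
  (void)Ok;
  return 0;
}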
+ const TreeEntry *VE = getTreeEntry(V); + if (!CheckedExtracts.insert(V).second || + !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || + (VE && VE != E)) continue; auto *EE = cast<ExtractElementInst>(V); Optional<unsigned> EEIdx = getExtractIndex(EE); @@ -4549,11 +4739,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, } return GatherCost; } - if (isSplat(VL)) { - // Found the broadcasting of the single scalar, calculate the cost as the - // broadcast. - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); - } if ((E->getOpcode() == Instruction::ExtractElement || all_of(E->Scalars, [](Value *V) { @@ -4571,13 +4756,20 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // single input vector or of 2 input vectors. InstructionCost Cost = computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI); - AdjustExtractsCost(Cost, /*IsGather=*/true); + AdjustExtractsCost(Cost); if (NeedToShuffleReuses) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); return Cost; } } + if (isSplat(VL)) { + // Found the broadcasting of the single scalar, calculate the cost as the + // broadcast. + assert(VecTy == FinalVecTy && + "No reused scalars expected for broadcast."); + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); + } InstructionCost ReuseShuffleCost = 0; if (NeedToShuffleReuses) ReuseShuffleCost = TTI->getShuffleCost( @@ -4755,7 +4947,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); } } else { - AdjustExtractsCost(CommonCost, /*IsGather=*/false); + AdjustExtractsCost(CommonCost); } return CommonCost; } @@ -5211,15 +5403,15 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, FoundOr = true; } // Check if the input is an extended load of the required or/shift expression. - Value *LoadPtr; + Value *Load; if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root || - !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load)) return false; // Require that the total load bit width is a legal integer type. // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. - Type *SrcTy = LoadPtr->getType()->getPointerElementType(); + Type *SrcTy = Load->getType(); unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth))) return false; @@ -9061,8 +9253,7 @@ private: "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; - return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, - ReductionOps.back()); + return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind); } }; @@ -9473,6 +9664,59 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, return Changed; } +/// Compare two cmp instructions. If IsCompatibility is true, function returns +/// true if 2 cmps have same/swapped predicates and mos compatible corresponding +/// operands. If IsCompatibility is false, function implements strict weak +/// ordering relation between two cmp instructions, returning true if the first +/// instruction is "less" than the second, i.e. its predicate is less than the +/// predicate of the second or the operands IDs are less than the operands IDs +/// of the second cmp instruction. 
+template <bool IsCompatibility> +static bool compareCmp(Value *V, Value *V2, + function_ref<bool(Instruction *)> IsDeleted) { + auto *CI1 = cast<CmpInst>(V); + auto *CI2 = cast<CmpInst>(V2); + if (IsDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType()->getTypeID() < + CI2->getOperand(0)->getType()->getTypeID()) + return !IsCompatibility; + if (CI1->getOperand(0)->getType()->getTypeID() > + CI2->getOperand(0)->getType()->getTypeID()) + return false; + CmpInst::Predicate Pred1 = CI1->getPredicate(); + CmpInst::Predicate Pred2 = CI2->getPredicate(); + CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1); + CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2); + CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1); + CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2); + if (BasePred1 < BasePred2) + return !IsCompatibility; + if (BasePred1 > BasePred2) + return false; + // Compare operands. + bool LEPreds = Pred1 <= Pred2; + bool GEPreds = Pred1 >= Pred2; + for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) { + auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1); + auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1); + if (Op1->getValueID() < Op2->getValueID()) + return !IsCompatibility; + if (Op1->getValueID() > Op2->getValueID()) + return false; + if (auto *I1 = dyn_cast<Instruction>(Op1)) + if (auto *I2 = dyn_cast<Instruction>(Op2)) { + if (I1->getParent() != I2->getParent()) + return false; + InstructionsState S = getSameOpcode({I1, I2}); + if (S.getOpcode()) + continue; + return false; + } + } + return IsCompatibility; +} + bool SLPVectorizerPass::vectorizeSimpleInstructions( SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, bool AtTerminator) { @@ -9504,37 +9748,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions( } // Try to vectorize list of compares. // Sort by type, compare predicate, etc. - // TODO: Add analysis on the operand opcodes (profitable to vectorize - // instructions with same/alternate opcodes/const values). 
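compareCmp above keys its ordering on the minimum of a predicate and its swapped form, so "a < b" and "b > a" fall into the same bucket when compare instructions are sorted and later checked for compatibility. A toy version of that canonicalisation over a made-up predicate enum (the names and values are illustrative, not the CmpInst::Predicate encoding):

#include <algorithm>
#include <cassert>

// Toy predicate set; swapped() returns the predicate with operands exchanged.
enum Pred { LT, GT, LE, GE, EQ };

static Pred swapped(Pred P) {
  switch (P) {
  case LT: return GT;
  case GT: return LT;
  case LE: return GE;
  case GE: return LE;
  case EQ: return EQ;
  }
  return P;
}

// Canonical key: a predicate and its swapped form share the same base, so a
// sort using this key groups "a < b" next to "b > a".
static Pred basePred(Pred P) { return std::min(P, swapped(P)); }

static bool lessPred(Pred A, Pred B) { return basePred(A) < basePred(B); }
static bool compatiblePred(Pred A, Pred B) { return basePred(A) == basePred(B); }

int main() {
  assert(compatiblePred(LT, GT));                // swapped forms compare equal
  assert(!lessPred(LT, GT) && !lessPred(GT, LT)); // neither orders before the other
  assert(lessPred(LT, LE) || lessPred(LE, LT));   // different bases still ordered
  return 0;
}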
auto &&CompareSorter = [&R](Value *V, Value *V2) { - auto *CI1 = cast<CmpInst>(V); - auto *CI2 = cast<CmpInst>(V2); - if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) - return false; - if (CI1->getOperand(0)->getType()->getTypeID() < - CI2->getOperand(0)->getType()->getTypeID()) - return true; - if (CI1->getOperand(0)->getType()->getTypeID() > - CI2->getOperand(0)->getType()->getTypeID()) - return false; - return CI1->getPredicate() < CI2->getPredicate() || - (CI1->getPredicate() > CI2->getPredicate() && - CI1->getPredicate() < - CmpInst::getSwappedPredicate(CI2->getPredicate())); + return compareCmp<false>(V, V2, + [&R](Instruction *I) { return R.isDeleted(I); }); }; auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) { if (V1 == V2) return true; - auto *CI1 = cast<CmpInst>(V1); - auto *CI2 = cast<CmpInst>(V2); - if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) - return false; - if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType()) - return false; - return CI1->getPredicate() == CI2->getPredicate() || - CI1->getPredicate() == - CmpInst::getSwappedPredicate(CI2->getPredicate()); + return compareCmp<true>(V1, V2, + [&R](Instruction *I) { return R.isDeleted(I); }); }; auto Limit = [&R](Value *V) { unsigned EltSize = R.getVectorElementSize(V); @@ -9592,10 +9815,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return true; if (Opcodes1.size() > Opcodes2.size()) return false; + Optional<bool> ConstOrder; for (int I = 0, E = Opcodes1.size(); I < E; ++I) { // Undefs are compatible with any other value. - if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) + if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) { + if (!ConstOrder) + ConstOrder = + !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]); continue; + } if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent()); @@ -9614,14 +9842,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; return I1->getOpcode() < I2->getOpcode(); } - if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) + if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) { + if (!ConstOrder) + ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID(); continue; + } if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID()) return true; if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID()) return false; } - return false; + return ConstOrder && *ConstOrder; }; auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) { if (V1 == V2) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 44b5e1df0839..1d9e71663cd2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -374,8 +374,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { assert((SplitAt == end() || SplitAt->getParent() == this) && "can only split at a position in the same block"); - SmallVector<VPBlockBase *, 2> Succs(getSuccessors().begin(), - getSuccessors().end()); + SmallVector<VPBlockBase *, 2> Succs(successors()); // First, disconnect the current block from its successors. 
for (VPBlockBase *Succ : Succs) VPBlockUtils::disconnectBlocks(this, Succ); @@ -642,6 +641,7 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB, void VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilder<> &Builder = State.Builder; + Builder.SetCurrentDebugLocation(DL); if (Instruction::isBinaryOp(getOpcode())) { Value *A = State.get(getOperand(0), Part); @@ -768,6 +768,11 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, O << " "; Operand->printAsOperand(O, SlotTracker); } + + if (DL) { + O << ", !dbg "; + DL.print(O); + } } #endif diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 810dd5030f95..f4a1883e35d5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -39,6 +39,7 @@ #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/IRBuilder.h" #include "llvm/Support/InstructionCost.h" #include <algorithm> @@ -51,6 +52,7 @@ namespace llvm { class BasicBlock; class DominatorTree; +class InductionDescriptor; class InnerLoopVectorizer; class LoopInfo; class raw_ostream; @@ -500,6 +502,8 @@ public: const VPBlocksTy &getSuccessors() const { return Successors; } VPBlocksTy &getSuccessors() { return Successors; } + iterator_range<VPBlockBase **> successors() { return Successors; } + const VPBlocksTy &getPredecessors() const { return Predecessors; } VPBlocksTy &getPredecessors() { return Predecessors; } @@ -795,6 +799,7 @@ private: typedef unsigned char OpcodeTy; OpcodeTy Opcode; FastMathFlags FMF; + DebugLoc DL; /// Utility method serving execute(): generates a single instance of the /// modeled instruction. @@ -804,12 +809,14 @@ protected: void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } public: - VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands) + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL) : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands), - VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} + VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode), + DL(DL) {} - VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) - : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {} + VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, + DebugLoc DL = {}) + : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL) {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPValue *V) { @@ -818,7 +825,7 @@ public: VPInstruction *clone() const { SmallVector<VPValue *, 2> Operands(operands()); - return new VPInstruction(Opcode, Operands); + return new VPInstruction(Opcode, Operands, DL); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1003,21 +1010,22 @@ public: /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. 
-class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { +class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue { PHINode *IV; + const InductionDescriptor &IndDesc; public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, Instruction *Cast, - TruncInst *Trunc = nullptr) - : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV) { - if (Trunc) - new VPValue(Trunc, this); - else - new VPValue(IV, this); + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + const InductionDescriptor &IndDesc) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this), + IV(IV), IndDesc(IndDesc) {} + + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + const InductionDescriptor &IndDesc, + TruncInst *Trunc) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this), + IV(IV), IndDesc(IndDesc) {} - if (Cast) - new VPValue(Cast, this); - } ~VPWidenIntOrFpInductionRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1038,13 +1046,6 @@ public: /// Returns the start value of the induction. VPValue *getStartValue() { return getOperand(0); } - /// Returns the cast VPValue, if one is attached, or nullptr otherwise. - VPValue *getCastValue() { - if (getNumDefinedValues() != 2) - return nullptr; - return getVPValue(1); - } - /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { @@ -1053,6 +1054,9 @@ public: const TruncInst *getTruncInst() const { return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue()); } + + /// Returns the induction descriptor for the recipe. + const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } }; /// A recipe for handling first order recurrences and pointer inductions. For @@ -1169,7 +1173,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe { /// operand. class VPReductionPHIRecipe : public VPWidenPHIRecipe { /// Descriptor for the reduction. - RecurrenceDescriptor &RdxDesc; + const RecurrenceDescriptor &RdxDesc; /// The phi is part of an in-loop reduction. bool IsInLoop; @@ -1180,7 +1184,7 @@ class VPReductionPHIRecipe : public VPWidenPHIRecipe { public: /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p /// RdxDesc. - VPReductionPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, + VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, VPValue &Start, bool IsInLoop = false, bool IsOrdered = false) : VPWidenPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start), @@ -1210,7 +1214,9 @@ public: VPSlotTracker &SlotTracker) const override; #endif - RecurrenceDescriptor &getRecurrenceDescriptor() { return RdxDesc; } + const RecurrenceDescriptor &getRecurrenceDescriptor() const { + return RdxDesc; + } /// Returns true, if the phi is part of an ordered reduction. bool isOrdered() const { return IsOrdered; } @@ -1340,13 +1346,13 @@ public: /// The Operands are {ChainOp, VecOp, [Condition]}. class VPReductionRecipe : public VPRecipeBase, public VPValue { /// The recurrence decriptor for the reduction in question. 
- RecurrenceDescriptor *RdxDesc; + const RecurrenceDescriptor *RdxDesc; /// Pointer to the TTI, needed to create the target reduction const TargetTransformInfo *TTI; public: - VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp, - VPValue *VecOp, VPValue *CondOp, + VPReductionRecipe(const RecurrenceDescriptor *R, Instruction *I, + VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, const TargetTransformInfo *TTI) : VPRecipeBase(VPRecipeBase::VPReductionSC, {ChainOp, VecOp}), VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), TTI(TTI) { @@ -2252,6 +2258,12 @@ public: return map_range(Operands, Fn); } + /// Returns true if \p VPV is uniform after vectorization. + bool isUniformAfterVectorization(VPValue *VPV) const { + auto RepR = dyn_cast_or_null<VPReplicateRecipe>(VPV->getDef()); + return !VPV->getDef() || (RepR && RepR->isUniform()); + } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. @@ -2340,18 +2352,23 @@ public: /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p - /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. If \p BlockPtr - /// has more than one successor, its conditional bit is propagated to \p - /// NewBlock. \p NewBlock must have neither successors nor predecessors. + /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's + /// successors are moved from \p BlockPtr to \p NewBlock and \p BlockPtr's + /// conditional bit is propagated to \p NewBlock. \p NewBlock must have + /// neither successors nor predecessors. static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) { assert(NewBlock->getSuccessors().empty() && - "Can't insert new block with successors."); - // TODO: move successors from BlockPtr to NewBlock when this functionality - // is necessary. For now, setBlockSingleSuccessor will assert if BlockPtr - // already has successors. - BlockPtr->setOneSuccessor(NewBlock); - NewBlock->setPredecessors({BlockPtr}); + NewBlock->getPredecessors().empty() && + "Can't insert new block with predecessors or successors."); NewBlock->setParent(BlockPtr->getParent()); + SmallVector<VPBlockBase *> Succs(BlockPtr->successors()); + for (VPBlockBase *Succ : Succs) { + disconnectBlocks(BlockPtr, Succ); + connectBlocks(NewBlock, Succ); + } + NewBlock->setCondBit(BlockPtr->getCondBit()); + BlockPtr->setCondBit(nullptr); + connectBlocks(BlockPtr, NewBlock); } /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p @@ -2394,6 +2411,31 @@ public: To->removePredecessor(From); } + /// Try to merge \p Block into its single predecessor, if \p Block is a + /// VPBasicBlock and its predecessor has a single successor. Returns a pointer + /// to the predecessor \p Block was merged into or nullptr otherwise. 
+  static VPBasicBlock *tryToMergeBlockIntoPredecessor(VPBlockBase *Block) {
+    auto *VPBB = dyn_cast<VPBasicBlock>(Block);
+    auto *PredVPBB =
+        dyn_cast_or_null<VPBasicBlock>(Block->getSinglePredecessor());
+    if (!VPBB || !PredVPBB || PredVPBB->getNumSuccessors() != 1)
+      return nullptr;
+
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
+      R.moveBefore(*PredVPBB, PredVPBB->end());
+    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
+    auto *ParentRegion = cast<VPRegionBlock>(Block->getParent());
+    if (ParentRegion->getExit() == Block)
+      ParentRegion->setExit(PredVPBB);
+    SmallVector<VPBlockBase *> Successors(Block->successors());
+    for (auto *Succ : Successors) {
+      VPBlockUtils::disconnectBlocks(Block, Succ);
+      VPBlockUtils::connectBlocks(PredVPBB, Succ);
+    }
+    delete Block;
+    return PredVPBB;
+  }
+
   /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge.
   static bool isBackEdge(const VPBlockBase *FromBlock,
                          const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) {
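
As a usage illustration for the two block utilities above, here is a minimal sketch. It assumes a file inside llvm/lib/Transforms/Vectorize so the private "VPlan.h" header is reachable; the helper name and the splitting scenario are hypothetical and not part of this change.

#include "VPlan.h"
using namespace llvm;

// With the new semantics, insertBlockAfter moves BlockPtr's existing
// successors (and its conditional bit) onto the freshly inserted block, so it
// can split below a block that already has successors. If nothing ends up in
// the new block, tryToMergeBlockIntoPredecessor folds it back and deletes it,
// returning the predecessor, or nullptr when the two blocks cannot be merged.
static VPBasicBlock *splitAfterAndRefold(VPBlockBase *BlockPtr) {
  auto *SplitBB = new VPBasicBlock("split");
  VPBlockUtils::insertBlockAfter(SplitBB, BlockPtr);
  // ... recipes could be moved into SplitBB here; if none are, fold it away.
  return VPBlockUtils::tryToMergeBlockIntoPredecessor(SplitBB);
}
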
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index ac3b3505dc34..86ecd6817873 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -50,14 +50,14 @@ VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB,
 
   case EdgeType::FALSE_EDGE:
     // CurrBB is the False successor of PredBB - compute not of CBV.
-    IntermediateVal = Builder.createNot(CBV);
+    IntermediateVal = Builder.createNot(CBV, {});
     break;
   }
 
   // Now AND intermediate value with PredBB's block predicate if it has one.
   VPValue *BP = PredBB->getPredicate();
   if (BP)
-    return Builder.createAnd(BP, IntermediateVal);
+    return Builder.createAnd(BP, IntermediateVal, {});
   else
     return IntermediateVal;
 }
@@ -96,7 +96,7 @@ VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) {
     Worklist.pop_front();
 
     // Create an OR of these values.
-    VPValue *Or = Builder.createOr(LHS, RHS);
+    VPValue *Or = Builder.createOr(LHS, RHS, {});
 
     // Push OR to the back of the worklist.
     Worklist.push_back(Or);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
index c52c8a2229e8..9e19e172dea5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -467,8 +467,9 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
     return markFailed();
 
   assert(CombinedOperands.size() > 0 && "Need more some operands");
-  auto *VPI = new VPInstruction(Opcode, CombinedOperands);
-  VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
+  auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr();
+  auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc());
+  VPI->setUnderlyingInstr(Inst);
 
   LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
                     << *cast<VPInstruction>(Values[0]) << "\n");
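
Both hunks above thread a debug location into the VPInstruction being built: the predicator passes an empty one via {}, while VPlanSLP forwards the location of the instruction the combination was seeded from. A minimal sketch of that constructor form, assuming the in-tree VPlan.h context after this patch; the helper name and variables are illustrative only.

#include "VPlan.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Build a combined OR whose debug location comes from the seeding instruction
// when one is available, and stays empty otherwise (mirroring the {} above).
static VPInstruction *makeCombinedOr(VPValue *LHS, VPValue *RHS,
                                     Instruction *Seed) {
  SmallVector<VPValue *, 2> Ops = {LHS, RHS};
  return new VPInstruction(Instruction::Or, Ops,
                           Seed ? Seed->getDebugLoc() : DebugLoc());
}
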
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ded5bc04beb5..d2daf558c2c5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -18,7 +18,8 @@ using namespace llvm;
 
 void VPlanTransforms::VPInstructionsToVPRecipes(
     Loop *OrigLoop, VPlanPtr &Plan,
-    LoopVectorizationLegality::InductionList &Inductions,
+    function_ref<const InductionDescriptor *(PHINode *)>
+        GetIntOrFpInductionDescriptor,
     SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE) {
 
   auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry());
@@ -44,11 +45,9 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
       VPRecipeBase *NewRecipe = nullptr;
       if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
         auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
-        InductionDescriptor II = Inductions.lookup(Phi);
-        if (II.getKind() == InductionDescriptor::IK_IntInduction ||
-            II.getKind() == InductionDescriptor::IK_FpInduction) {
-          VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
-          NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, nullptr);
+        if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) {
+          VPValue *Start = Plan->getOrAddVPValue(II->getStartValue());
+          NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, *II);
         } else {
           Plan->addVPValue(Phi, VPPhi);
           continue;
@@ -158,8 +157,7 @@ bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) {
 
      // TODO: add ".cloned" suffix to name of Clone's VPValue.
      Clone->insertBefore(SinkCandidate);
-     SmallVector<VPUser *, 4> Users(SinkCandidate->user_begin(),
-                                    SinkCandidate->user_end());
+     SmallVector<VPUser *, 4> Users(SinkCandidate->users());
      for (auto *U : Users) {
        auto *UI = cast<VPRecipeBase>(U);
        if (UI->getParent() == SinkTo)
@@ -266,8 +264,7 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) {
     VPValue *PredInst1 =
         cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
     VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
-    SmallVector<VPUser *> Users(Phi1ToMoveV->user_begin(),
-                                Phi1ToMoveV->user_end());
+    SmallVector<VPUser *> Users(Phi1ToMoveV->users());
     for (VPUser *U : Users) {
       auto *UI = dyn_cast<VPRecipeBase>(U);
       if (!UI || UI->getParent() != Then2)
@@ -295,3 +292,35 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) {
     delete ToDelete;
   return Changed;
 }
+
+void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) {
+  SmallVector<std::pair<VPRecipeBase *, VPValue *>> CastsToRemove;
+  for (auto &Phi : Plan.getEntry()->getEntryBasicBlock()->phis()) {
+    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
+    if (!IV || IV->getTruncInst())
+      continue;
+
+    // Visit all casts connected to IV and in Casts. Collect them.
+    // remember them for removal.
+    auto &Casts = IV->getInductionDescriptor().getCastInsts();
+    VPValue *FindMyCast = IV;
+    for (Instruction *IRCast : reverse(Casts)) {
+      VPRecipeBase *FoundUserCast = nullptr;
+      for (auto *U : FindMyCast->users()) {
+        auto *UserCast = cast<VPRecipeBase>(U);
+        if (UserCast->getNumDefinedValues() == 1 &&
+            UserCast->getVPSingleValue()->getUnderlyingValue() == IRCast) {
+          FoundUserCast = UserCast;
+          break;
+        }
+      }
+      assert(FoundUserCast && "Missing a cast to remove");
+      CastsToRemove.emplace_back(FoundUserCast, IV);
+      FindMyCast = FoundUserCast->getVPSingleValue();
+    }
+  }
+  for (auto &E : CastsToRemove) {
+    E.first->getVPSingleValue()->replaceAllUsesWith(E.second);
+    E.first->eraseFromParent();
+  }
+}
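
With the InductionList parameter gone, callers of VPInstructionsToVPRecipes now supply the int/FP filtering themselves. A sketch of what such a callback might look like at a call site like LoopVectorize.cpp, assuming OrigLoop, Plan, Legal (a LoopVectorizationLegality *), DeadInstructions and PSE are in scope there; this is illustrative, not the exact upstream call.

VPlanTransforms::VPInstructionsToVPRecipes(
    OrigLoop, Plan,
    [&](PHINode *P) -> const InductionDescriptor * {
      // Yield a descriptor only for integer/FP induction phis; anything else
      // keeps its widened phi recipe.
      auto It = Legal->getInductionVars().find(P);
      if (It == Legal->getInductionVars().end())
        return nullptr;
      const InductionDescriptor &ID = It->second;
      if (ID.getKind() == InductionDescriptor::IK_IntInduction ||
          ID.getKind() == InductionDescriptor::IK_FpInduction)
        return &ID;
      return nullptr;
    },
    DeadInstructions, *PSE.getSE());
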
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index c740f2c022da..a82a562d5e35 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -14,24 +14,37 @@
 #define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
 
 #include "VPlan.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 
 namespace llvm {
 
+class InductionDescriptor;
 class Instruction;
+class PHINode;
 class ScalarEvolution;
 
 struct VPlanTransforms {
 
   /// Replaces the VPInstructions in \p Plan with corresponding
   /// widen recipes.
-  static void VPInstructionsToVPRecipes(
-      Loop *OrigLoop, VPlanPtr &Plan,
-      LoopVectorizationLegality::InductionList &Inductions,
-      SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE);
+  static void
+  VPInstructionsToVPRecipes(Loop *OrigLoop, VPlanPtr &Plan,
+                            function_ref<const InductionDescriptor *(PHINode *)>
+                                GetIntOrFpInductionDescriptor,
+                            SmallPtrSetImpl<Instruction *> &DeadInstructions,
+                            ScalarEvolution &SE);
 
   static bool sinkScalarOperands(VPlan &Plan);
 
   static bool mergeReplicateRegions(VPlan &Plan);
+
+  /// Remove redundant casts of inductions.
+  ///
+  /// Such redundant casts are casts of induction variables that can be ignored,
+  /// because we already proved that the casted phi is equal to the uncasted phi
+  /// in the vectorized loop. There is no need to vectorize the cast - the same
+  /// value can be used for both the phi and casts in the vector loop.
+  static void removeRedundantInductionCasts(VPlan &Plan);
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 6d6ea4eb30f1..7732d9367985 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -156,5 +156,31 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) {
       RecipeI++;
     }
   }
+
+  const VPRegionBlock *TopRegion = cast<VPRegionBlock>(Plan.getEntry());
+  const VPBasicBlock *Entry = dyn_cast<VPBasicBlock>(TopRegion->getEntry());
+  if (!Entry) {
+    errs() << "VPlan entry block is not a VPBasicBlock\n";
+    return false;
+  }
+  const VPBasicBlock *Exit = dyn_cast<VPBasicBlock>(TopRegion->getExit());
+  if (!Exit) {
+    errs() << "VPlan exit block is not a VPBasicBlock\n";
+    return false;
+  }
+
+  for (const VPRegionBlock *Region :
+       VPBlockUtils::blocksOnly<const VPRegionBlock>(
+           depth_first(VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(
+               Plan.getEntry())))) {
+    if (Region->getEntry()->getNumPredecessors() != 0) {
+      errs() << "region entry block has predecessors\n";
+      return false;
+    }
+    if (Region->getExit()->getNumSuccessors() != 0) {
+      errs() << "region exit block has successors\n";
+      return false;
+    }
+  }
   return true;
 }
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 57b11e9414ba..c0aedab2fed0 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -989,9 +989,9 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
   if (!FixedVT)
     return false;
 
-  InstructionCost OriginalCost = TTI.getMemoryOpCost(
-      Instruction::Load, LI->getType(), Align(LI->getAlignment()),
-      LI->getPointerAddressSpace());
+  InstructionCost OriginalCost =
+      TTI.getMemoryOpCost(Instruction::Load, LI->getType(), LI->getAlign(),
+                          LI->getPointerAddressSpace());
   InstructionCost ScalarizedCost = 0;
 
   Instruction *LastCheckedInst = LI;
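
The VectorCombine hunk above replaces the older integer-returning getAlignment() call with LoadInst::getAlign(), which yields an llvm::Align directly. A stand-alone sketch of the same cost query; the helper name is illustrative and not part of this patch.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Cost of keeping the vector load as-is; getAlign() feeds the alignment
// through without re-wrapping an integer in Align(...).
static InstructionCost originalLoadCost(const TargetTransformInfo &TTI,
                                        LoadInst *LI) {
  return TTI.getMemoryOpCost(Instruction::Load, LI->getType(), LI->getAlign(),
                             LI->getPointerAddressSpace());
}
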
