Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--  llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp | 6
-rw-r--r--  llvm/lib/Transforms/CFGuard/CFGuard.cpp | 8
-rw-r--r--  llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 33
-rw-r--r--  llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 41
-rw-r--r--  llvm/lib/Transforms/Coroutines/Coroutines.cpp | 1
-rw-r--r--  llvm/lib/Transforms/IPO/ArgumentPromotion.cpp | 10
-rw-r--r--  llvm/lib/Transforms/IPO/Attributor.cpp | 6
-rw-r--r--  llvm/lib/Transforms/IPO/AttributorAttributes.cpp | 38
-rw-r--r--  llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 208
-rw-r--r--  llvm/lib/Transforms/IPO/FunctionSpecialization.cpp | 317
-rw-r--r--  llvm/lib/Transforms/IPO/GlobalOpt.cpp | 11
-rw-r--r--  llvm/lib/Transforms/IPO/HotColdSplitting.cpp | 2
-rw-r--r--  llvm/lib/Transforms/IPO/Inliner.cpp | 2
-rw-r--r--  llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 7
-rw-r--r--  llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 6
-rw-r--r--  llvm/lib/Transforms/IPO/SampleContextTracker.cpp | 21
-rw-r--r--  llvm/lib/Transforms/IPO/SampleProfile.cpp | 174
-rw-r--r--  llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 3
-rw-r--r--  llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp | 104
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 58
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 6
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 44
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 1
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 20
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 18
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp | 7
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 7
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 6
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 165
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 39
-rw-r--r--  llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 25
-rw-r--r--  llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp | 61
-rw-r--r--  llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 13
-rw-r--r--  llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 25
-rw-r--r--  llvm/lib/Transforms/Scalar/ConstantHoisting.cpp | 3
-rw-r--r--  llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp | 58
-rw-r--r--  llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp | 187
-rw-r--r--  llvm/lib/Transforms/Scalar/EarlyCSE.cpp | 12
-rw-r--r--  llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp | 49
-rw-r--r--  llvm/lib/Transforms/Scalar/LICM.cpp | 44
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 28
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopRerollPass.cpp | 12
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 25
-rw-r--r--  llvm/lib/Transforms/Scalar/NewGVN.cpp | 35
-rw-r--r--  llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 51
-rw-r--r--  llvm/lib/Transforms/Scalar/SCCP.cpp | 3
-rw-r--r--  llvm/lib/Transforms/Scalar/Scalar.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Utils/CodeLayout.cpp | 942
-rw-r--r--  llvm/lib/Transforms/Utils/Debugify.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Utils/FunctionComparator.cpp | 16
-rw-r--r--  llvm/lib/Transforms/Utils/Local.cpp | 12
-rw-r--r--  llvm/lib/Transforms/Utils/LoopPeel.cpp | 31
-rw-r--r--  llvm/lib/Transforms/Utils/LoopUtils.cpp | 101
-rw-r--r--  llvm/lib/Transforms/Utils/MetaRenamer.cpp | 67
-rw-r--r--  llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Utils/SampleProfileInference.cpp | 385
-rw-r--r--  llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp | 293
-rw-r--r--  llvm/lib/Transforms/Utils/ValueMapper.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 89
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h | 38
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 463
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 633
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp | 9
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h | 118
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanSLP.cpp | 5
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 49
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 21
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 26
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 6
75 files changed, 3820 insertions, 1524 deletions
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
index abac3f801a22..4624b735bef8 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -475,12 +475,12 @@ void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
// any of its operands, this way, when we get to the operand, we already
// removed the instructions (from the expression dag) that uses it.
CurrentTruncInst->eraseFromParent();
- for (auto I = InstInfoMap.rbegin(), E = InstInfoMap.rend(); I != E; ++I) {
+ for (auto &I : llvm::reverse(InstInfoMap)) {
// We still need to check that the instruction has no users before we erase
// it, because {SExt, ZExt}Inst Instruction might have other users that was
// not reduced, in such case, we need to keep that instruction.
- if (I->first->use_empty())
- I->first->eraseFromParent();
+ if (I.first->use_empty())
+ I.first->eraseFromParent();
}
}
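
The hunk above swaps a manual rbegin()/rend() loop for a range-based walk. A minimal standalone sketch of the same pattern follows (assuming only llvm/ADT/STLExtras.h; the container and the "has users" test are hypothetical stand-ins, not LLVM code):

    #include "llvm/ADT/STLExtras.h"
    #include <utility>
    #include <vector>

    // Visit entries from last to first; llvm::reverse adapts any bidirectional
    // range, so no explicit reverse iterators are spelled out.
    static void eraseUnusedEntries(std::vector<std::pair<int, bool>> &InfoMap) {
      for (auto &Entry : llvm::reverse(InfoMap)) {
        if (Entry.second) // stand-in for "instruction still has users"
          continue;
        Entry.first = 0;  // stand-in for erasing the instruction
      }
    }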
diff --git a/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
index 96c083a144b2..5fc5295969d0 100644
--- a/llvm/lib/Transforms/CFGuard/CFGuard.cpp
+++ b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
@@ -165,6 +165,12 @@ void CFGuard::insertCFGuardCheck(CallBase *CB) {
IRBuilder<> B(CB);
Value *CalledOperand = CB->getCalledOperand();
+ // If the indirect call is made from within a catchpad or cleanuppad,
+ // we need to copy the "funclet" bundle of the call.
+ SmallVector<llvm::OperandBundleDef, 1> Bundles;
+ if (auto Bundle = CB->getOperandBundle(LLVMContext::OB_funclet))
+ Bundles.push_back(OperandBundleDef(*Bundle));
+
// Load the global symbol as a pointer to the check function.
LoadInst *GuardCheckLoad = B.CreateLoad(GuardFnPtrType, GuardFnGlobal);
@@ -172,7 +178,7 @@ void CFGuard::insertCFGuardCheck(CallBase *CB) {
// even if the original CallBase is an Invoke or CallBr instruction.
CallInst *GuardCheck =
B.CreateCall(GuardFnType, GuardCheckLoad,
- {B.CreateBitCast(CalledOperand, B.getInt8PtrTy())});
+ {B.CreateBitCast(CalledOperand, B.getInt8PtrTy())}, Bundles);
// Ensure that the first argument is passed in the correct register
// (e.g. ECX on 32-bit X86 targets).
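
To show the shape of this change in isolation, here is a hedged sketch (assuming LLVM headers; rebuildCallKeepingFunclet, NewFn, and the empty argument list are illustrative, not part of the patch) of forwarding a call's "funclet" operand bundle onto a replacement call so it stays valid inside a catchpad or cleanuppad:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/InstrTypes.h"
    #include "llvm/IR/LLVMContext.h"

    // Copy the "funclet" operand bundle, if present, onto a newly built call.
    static llvm::CallInst *rebuildCallKeepingFunclet(llvm::CallBase *Orig,
                                                     llvm::FunctionCallee NewFn) {
      llvm::SmallVector<llvm::OperandBundleDef, 1> Bundles;
      if (auto Bundle = Orig->getOperandBundle(llvm::LLVMContext::OB_funclet))
        Bundles.push_back(llvm::OperandBundleDef(*Bundle));

      llvm::SmallVector<llvm::Value *, 0> Args; // placeholder argument list
      llvm::IRBuilder<> B(Orig);
      return B.CreateCall(NewFn, Args, Bundles);
    }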
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index ac3d078714ce..a0d12865bd3a 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -1237,8 +1237,10 @@ namespace {
struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
using Base = PtrUseVisitor<AllocaUseVisitor>;
AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT,
- const CoroBeginInst &CB, const SuspendCrossingInfo &Checker)
- : PtrUseVisitor(DL), DT(DT), CoroBegin(CB), Checker(Checker) {}
+ const CoroBeginInst &CB, const SuspendCrossingInfo &Checker,
+ bool ShouldUseLifetimeStartInfo)
+ : PtrUseVisitor(DL), DT(DT), CoroBegin(CB), Checker(Checker),
+ ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {}
void visit(Instruction &I) {
Users.insert(&I);
@@ -1390,6 +1392,7 @@ private:
SmallPtrSet<Instruction *, 4> Users{};
SmallPtrSet<IntrinsicInst *, 2> LifetimeStarts{};
bool MayWriteBeforeCoroBegin{false};
+ bool ShouldUseLifetimeStartInfo{true};
mutable llvm::Optional<bool> ShouldLiveOnFrame{};
@@ -1398,7 +1401,7 @@ private:
// more precise. We look at every pair of lifetime.start intrinsic and
// every basic block that uses the pointer to see if they cross suspension
// points. The uses cover both direct uses as well as indirect uses.
- if (!LifetimeStarts.empty()) {
+ if (ShouldUseLifetimeStartInfo && !LifetimeStarts.empty()) {
for (auto *I : Users)
for (auto *S : LifetimeStarts)
if (Checker.isDefinitionAcrossSuspend(*S, I))
@@ -2484,8 +2487,15 @@ static void collectFrameAllocas(Function &F, coro::Shape &Shape,
continue;
}
DominatorTree DT(F);
+ // The code that uses the lifetime.start intrinsic does not work for
+ // functions with loops that never exit. Disable it on ABIs known to
+ // generate such code.
+ bool ShouldUseLifetimeStartInfo =
+ (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
+ Shape.ABI != coro::ABI::RetconOnce);
AllocaUseVisitor Visitor{F.getParent()->getDataLayout(), DT,
- *Shape.CoroBegin, Checker};
+ *Shape.CoroBegin, Checker,
+ ShouldUseLifetimeStartInfo};
Visitor.visitPtr(*AI);
if (!Visitor.getShouldLiveOnFrame())
continue;
@@ -2572,9 +2582,15 @@ void coro::salvageDebugInfo(
DVI->setExpression(Expr);
/// It makes no sense to move the dbg.value intrinsic.
if (!isa<DbgValueInst>(DVI)) {
- if (auto *InsertPt = dyn_cast<Instruction>(Storage))
+ if (auto *II = dyn_cast<InvokeInst>(Storage))
+ DVI->moveBefore(II->getNormalDest()->getFirstNonPHI());
+ else if (auto *CBI = dyn_cast<CallBrInst>(Storage))
+ DVI->moveBefore(CBI->getDefaultDest()->getFirstNonPHI());
+ else if (auto *InsertPt = dyn_cast<Instruction>(Storage)) {
+ assert(!InsertPt->isTerminator() &&
+ "Unexpected terminator that could return storage.");
DVI->moveAfter(InsertPt);
- else if (isa<Argument>(Storage))
+ } else if (isa<Argument>(Storage))
DVI->moveAfter(F->getEntryBlock().getFirstNonPHI());
}
}
@@ -2664,7 +2680,10 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
}
}
- sinkLifetimeStartMarkers(F, Shape, Checker);
+ if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
+ Shape.ABI != coro::ABI::RetconOnce)
+ sinkLifetimeStartMarkers(F, Shape, Checker);
+
if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty())
collectFrameAllocas(F, Shape, Checker, FrameData.Allocas);
LLVM_DEBUG(dumpAllocas(FrameData.Allocas));
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index fa1d92f439b8..12c1829524ef 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -280,6 +280,27 @@ static void replaceFallthroughCoroEnd(AnyCoroEndInst *End,
BB->getTerminator()->eraseFromParent();
}
+// Mark a coroutine as done, which implies that the coroutine is finished and
+// will never be resumed.
+//
+// In the switch-resumed ABI, the done state is represented by storing zero in
+// ResumeFnAddr.
+//
+// NOTE: We cannot omit the argument `FramePtr`. It is necessary because the
+// pointer to the frame in the split function is not stored in `Shape`.
+static void markCoroutineAsDone(IRBuilder<> &Builder, const coro::Shape &Shape,
+ Value *FramePtr) {
+ assert(
+ Shape.ABI == coro::ABI::Switch &&
+ "markCoroutineAsDone is only supported for Switch-Resumed ABI for now.");
+ auto *GepIndex = Builder.CreateStructGEP(
+ Shape.FrameTy, FramePtr, coro::Shape::SwitchFieldIndex::Resume,
+ "ResumeFn.addr");
+ auto *NullPtr = ConstantPointerNull::get(cast<PointerType>(
+ Shape.FrameTy->getTypeAtIndex(coro::Shape::SwitchFieldIndex::Resume)));
+ Builder.CreateStore(NullPtr, GepIndex);
+}
+
/// Replace an unwind call to llvm.coro.end.
static void replaceUnwindCoroEnd(AnyCoroEndInst *End, const coro::Shape &Shape,
Value *FramePtr, bool InResume,
@@ -288,10 +309,18 @@ static void replaceUnwindCoroEnd(AnyCoroEndInst *End, const coro::Shape &Shape,
switch (Shape.ABI) {
// In switch-lowering, this does nothing in the main function.
- case coro::ABI::Switch:
+ case coro::ABI::Switch: {
+ // In the C++ specification, the coroutine should be marked as done
+ // if promise.unhandled_exception() throws. The frontend will
+ // call coro.end(true) along this path.
+ //
+ // FIXME: We should refactor this once another language besides C++
+ // uses the Switch-Resumed style.
+ markCoroutineAsDone(Builder, Shape, FramePtr);
if (!InResume)
return;
break;
+ }
// In async lowering this does nothing.
case coro::ABI::Async:
break;
@@ -364,13 +393,9 @@ static void createResumeEntryBlock(Function &F, coro::Shape &Shape) {
auto *Save = S->getCoroSave();
Builder.SetInsertPoint(Save);
if (S->isFinal()) {
- // Final suspend point is represented by storing zero in ResumeFnAddr.
- auto *GepIndex = Builder.CreateStructGEP(FrameTy, FramePtr,
- coro::Shape::SwitchFieldIndex::Resume,
- "ResumeFn.addr");
- auto *NullPtr = ConstantPointerNull::get(cast<PointerType>(
- FrameTy->getTypeAtIndex(coro::Shape::SwitchFieldIndex::Resume)));
- Builder.CreateStore(NullPtr, GepIndex);
+ // The coroutine should be marked done if it reaches the final suspend
+ // point.
+ markCoroutineAsDone(Builder, Shape, FramePtr);
} else {
auto *GepIndex = Builder.CreateStructGEP(
FrameTy, FramePtr, Shape.getSwitchIndexField(), "index.addr");
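
As a plain-C++ analogy (not LLVM code; the struct below is only a sketch of the switch-lowered frame layout), the "done" state that markCoroutineAsDone stores is simply a null resume-function pointer in the first frame slot, which is also what a coro.done-style check would test:

    // Minimal analogy of a switch-lowered coroutine frame header.
    struct FrameHeader {
      void (*ResumeFn)(FrameHeader *);  // slot that markCoroutineAsDone nulls out
      void (*DestroyFn)(FrameHeader *);
    };

    // "Done" means the coroutine can never be resumed again.
    static bool coroutineIsDone(const FrameHeader *Frame) {
      return Frame->ResumeFn == nullptr;
    }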
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index e4883ef89db7..fba8b03e44ba 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -141,7 +141,6 @@ static bool isCoroutineIntrinsicName(StringRef Name) {
"llvm.coro.id.retcon",
"llvm.coro.id.retcon.once",
"llvm.coro.noop",
- "llvm.coro.param",
"llvm.coro.prepare.async",
"llvm.coro.prepare.retcon",
"llvm.coro.promise",
diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index 93bb11433775..3a42a2cac928 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -835,14 +835,20 @@ bool ArgumentPromotionPass::areFunctionArgsABICompatible(
const Function &F, const TargetTransformInfo &TTI,
SmallPtrSetImpl<Argument *> &ArgsToPromote,
SmallPtrSetImpl<Argument *> &ByValArgsToTransform) {
+ // TODO: Check individual arguments so we can promote a subset?
+ SmallVector<Type *, 32> Types;
+ for (Argument *Arg : ArgsToPromote)
+ Types.push_back(Arg->getType()->getPointerElementType());
+ for (Argument *Arg : ByValArgsToTransform)
+ Types.push_back(Arg->getParamByValType());
+
for (const Use &U : F.uses()) {
CallBase *CB = dyn_cast<CallBase>(U.getUser());
if (!CB)
return false;
const Function *Caller = CB->getCaller();
const Function *Callee = CB->getCalledFunction();
- if (!TTI.areFunctionArgsABICompatible(Caller, Callee, ArgsToPromote) ||
- !TTI.areFunctionArgsABICompatible(Caller, Callee, ByValArgsToTransform))
+ if (!TTI.areTypesABICompatible(Caller, Callee, Types))
return false;
}
return true;
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index edadc79e3a9f..7e729e57153c 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -2139,12 +2139,10 @@ bool Attributor::shouldSeedAttribute(AbstractAttribute &AA) {
bool Result = true;
#ifndef NDEBUG
if (SeedAllowList.size() != 0)
- Result =
- std::count(SeedAllowList.begin(), SeedAllowList.end(), AA.getName());
+ Result = llvm::is_contained(SeedAllowList, AA.getName());
Function *Fn = AA.getAnchorScope();
if (FunctionSeedAllowList.size() != 0 && Fn)
- Result &= std::count(FunctionSeedAllowList.begin(),
- FunctionSeedAllowList.end(), Fn->getName());
+ Result &= llvm::is_contained(FunctionSeedAllowList, Fn->getName());
#endif
return Result;
}
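
A tiny standalone sketch of the llvm::is_contained idiom adopted above (assuming llvm/ADT/STLExtras.h; the allow-list type is illustrative):

    #include "llvm/ADT/STLExtras.h"
    #include <string>
    #include <vector>

    // Reads as the intended question ("is Name in the list?") rather than a
    // count compared against zero, but computes the same boolean result.
    static bool isAllowed(const std::vector<std::string> &AllowList,
                          const std::string &Name) {
      return llvm::is_contained(AllowList, Name);
    }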
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index ec08287393de..b977821bcaa6 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -417,7 +417,7 @@ const Value *stripAndAccumulateMinimalOffsets(
AttributorAnalysis);
}
-static const Value *getMinimalBaseOfAccsesPointerOperand(
+static const Value *getMinimalBaseOfAccessPointerOperand(
Attributor &A, const AbstractAttribute &QueryingAA, const Instruction *I,
int64_t &BytesOffset, const DataLayout &DL, bool AllowNonInbounds = false) {
const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false);
@@ -2129,7 +2129,7 @@ static int64_t getKnownNonNullAndDerefBytesForUse(
int64_t Offset;
const Value *Base =
- getMinimalBaseOfAccsesPointerOperand(A, QueryingAA, I, Offset, DL);
+ getMinimalBaseOfAccessPointerOperand(A, QueryingAA, I, Offset, DL);
if (Base) {
if (Base == &AssociatedValue &&
getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
@@ -6414,31 +6414,36 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
return indicatePessimisticFixpoint();
}
+ // Collect the types that will replace the privatizable type in the function
+ // signature.
+ SmallVector<Type *, 16> ReplacementTypes;
+ identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes);
+
// Verify callee and caller agree on how the promoted argument would be
// passed.
- // TODO: The use of the ArgumentPromotion interface here is ugly, we need a
- // specialized form of TargetTransformInfo::areFunctionArgsABICompatible
- // which doesn't require the arguments ArgumentPromotion wanted to pass.
Function &Fn = *getIRPosition().getAnchorScope();
- SmallPtrSet<Argument *, 1> ArgsToPromote, Dummy;
- ArgsToPromote.insert(getAssociatedArgument());
const auto *TTI =
A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(Fn);
- if (!TTI ||
- !ArgumentPromotionPass::areFunctionArgsABICompatible(
- Fn, *TTI, ArgsToPromote, Dummy) ||
- ArgsToPromote.empty()) {
+ if (!TTI) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Missing TTI for function "
+ << Fn.getName() << "\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ auto CallSiteCheck = [&](AbstractCallSite ACS) {
+ CallBase *CB = ACS.getInstruction();
+ return TTI->areTypesABICompatible(
+ CB->getCaller(), CB->getCalledFunction(), ReplacementTypes);
+ };
+ bool AllCallSitesKnown;
+ if (!A.checkForAllCallSites(CallSiteCheck, *this, true,
+ AllCallSitesKnown)) {
LLVM_DEBUG(
dbgs() << "[AAPrivatizablePtr] ABI incompatibility detected for "
<< Fn.getName() << "\n");
return indicatePessimisticFixpoint();
}
- // Collect the types that will replace the privatizable type in the function
- // signature.
- SmallVector<Type *, 16> ReplacementTypes;
- identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes);
-
// Register a rewrite of the argument.
Argument *Arg = getAssociatedArgument();
if (!A.isValidFunctionSignatureRewrite(*Arg, ReplacementTypes)) {
@@ -6558,7 +6563,6 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
return false;
};
- bool AllCallSitesKnown;
if (!A.checkForAllCallSites(IsCompatiblePrivArgOfOtherCallSite, *this, true,
AllCallSitesKnown))
return indicatePessimisticFixpoint();
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index cde78713b554..321d4a19a585 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -76,6 +76,7 @@ STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
STATISTIC(NumReturned, "Number of arguments marked returned");
STATISTIC(NumReadNoneArg, "Number of arguments marked readnone");
STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly");
+STATISTIC(NumWriteOnlyArg, "Number of arguments marked writeonly");
STATISTIC(NumNoAlias, "Number of function returns marked noalias");
STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull");
STATISTIC(NumNoRecurse, "Number of functions marked as norecurse");
@@ -580,16 +581,8 @@ struct ArgumentUsesTracker : public CaptureTracker {
return true;
}
- // Note: the callee and the two successor blocks *follow* the argument
- // operands. This means there is no need to adjust UseIndex to account for
- // these.
-
- unsigned UseIndex =
- std::distance(const_cast<const Use *>(CB->arg_begin()), U);
-
- assert(UseIndex < CB->data_operands_size() &&
- "Indirect function calls should have been filtered above!");
-
+ assert(!CB->isCallee(U) && "callee operand reported captured?");
+ const unsigned UseIndex = CB->getDataOperandNo(U);
if (UseIndex >= CB->arg_size()) {
// Data operand, but not a argument operand -- must be a bundle operand
assert(CB->hasOperandBundles() && "Must be!");
@@ -649,8 +642,8 @@ struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> {
/// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone.
static Attribute::AttrKind
-determinePointerReadAttrs(Argument *A,
- const SmallPtrSet<Argument *, 8> &SCCNodes) {
+determinePointerAccessAttrs(Argument *A,
+ const SmallPtrSet<Argument *, 8> &SCCNodes) {
SmallVector<Use *, 32> Worklist;
SmallPtrSet<Use *, 32> Visited;
@@ -659,7 +652,7 @@ determinePointerReadAttrs(Argument *A,
return Attribute::None;
bool IsRead = false;
- // We don't need to track IsWritten. If A is written to, return immediately.
+ bool IsWrite = false;
for (Use &U : A->uses()) {
Visited.insert(&U);
@@ -667,6 +660,10 @@ determinePointerReadAttrs(Argument *A,
}
while (!Worklist.empty()) {
+ if (IsWrite && IsRead)
+ // No point in searching further.
+ return Attribute::None;
+
Use *U = Worklist.pop_back_val();
Instruction *I = cast<Instruction>(U->getUser());
@@ -684,73 +681,49 @@ determinePointerReadAttrs(Argument *A,
case Instruction::Call:
case Instruction::Invoke: {
- bool Captures = true;
+ CallBase &CB = cast<CallBase>(*I);
+ if (CB.isCallee(U)) {
+ IsRead = true;
+ // Note that indirect calls do not capture, see comment in
+ // CaptureTracking for context
+ continue;
+ }
- if (I->getType()->isVoidTy())
- Captures = false;
+ // Given we've explicitly handled the callee operand above, what's left
+ // must be a data operand (e.g. argument or operand bundle)
+ const unsigned UseIndex = CB.getDataOperandNo(U);
- auto AddUsersToWorklistIfCapturing = [&] {
- if (Captures)
+ if (!CB.doesNotCapture(UseIndex)) {
+ if (!CB.onlyReadsMemory())
+ // If the callee can save a copy into other memory, then simply
+ // scanning uses of the call is insufficient. We have no way
+ // of tracking copies of the pointer through memory to see
+ // if a reloaded copy is written to, thus we must give up.
+ return Attribute::None;
+ // Push users for processing once we finish this one
+ if (!I->getType()->isVoidTy())
for (Use &UU : I->uses())
if (Visited.insert(&UU).second)
Worklist.push_back(&UU);
- };
-
- CallBase &CB = cast<CallBase>(*I);
- if (CB.doesNotAccessMemory()) {
- AddUsersToWorklistIfCapturing();
- continue;
}
+
+ if (CB.doesNotAccessMemory())
+ continue;
- Function *F = CB.getCalledFunction();
- if (!F) {
- if (CB.onlyReadsMemory()) {
- IsRead = true;
- AddUsersToWorklistIfCapturing();
- continue;
- }
- return Attribute::None;
- }
-
- // Note: the callee and the two successor blocks *follow* the argument
- // operands. This means there is no need to adjust UseIndex to account
- // for these.
-
- unsigned UseIndex = std::distance(CB.arg_begin(), U);
-
- // U cannot be the callee operand use: since we're exploring the
- // transitive uses of an Argument, having such a use be a callee would
- // imply the call site is an indirect call or invoke; and we'd take the
- // early exit above.
- assert(UseIndex < CB.data_operands_size() &&
- "Data operand use expected!");
-
- bool IsOperandBundleUse = UseIndex >= CB.arg_size();
+ if (Function *F = CB.getCalledFunction())
+ if (CB.isArgOperand(U) && UseIndex < F->arg_size() &&
+ SCCNodes.count(F->getArg(UseIndex)))
+ // This is an argument which is part of the speculative SCC. Note
+ // that only operands corresponding to formal arguments of the callee
+ // can participate in the speculation.
+ break;
- if (UseIndex >= F->arg_size() && !IsOperandBundleUse) {
- assert(F->isVarArg() && "More params than args in non-varargs call");
+ // The accessors used on call site here do the right thing for calls and
+ // invokes with operand bundles.
+ if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex))
return Attribute::None;
- }
-
- Captures &= !CB.doesNotCapture(UseIndex);
-
- // Since the optimizer (by design) cannot see the data flow corresponding
- // to a operand bundle use, these cannot participate in the optimistic SCC
- // analysis. Instead, we model the operand bundle uses as arguments in
- // call to a function external to the SCC.
- if (IsOperandBundleUse ||
- !SCCNodes.count(&*std::next(F->arg_begin(), UseIndex))) {
-
- // The accessors used on call site here do the right thing for calls and
- // invokes with operand bundles.
-
- if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex))
- return Attribute::None;
- if (!CB.doesNotAccessMemory(UseIndex))
- IsRead = true;
- }
-
- AddUsersToWorklistIfCapturing();
+ if (!CB.doesNotAccessMemory(UseIndex))
+ IsRead = true;
break;
}
@@ -763,6 +736,19 @@ determinePointerReadAttrs(Argument *A,
IsRead = true;
break;
+ case Instruction::Store:
+ if (cast<StoreInst>(I)->getValueOperand() == *U)
+ // untrackable capture
+ return Attribute::None;
+
+ // A volatile store has side effects beyond what the writeonly attribute
+ // can be relied upon to describe.
+ if (cast<StoreInst>(I)->isVolatile())
+ return Attribute::None;
+
+ IsWrite = true;
+ break;
+
case Instruction::ICmp:
case Instruction::Ret:
break;
@@ -772,7 +758,14 @@ determinePointerReadAttrs(Argument *A,
}
}
- return IsRead ? Attribute::ReadOnly : Attribute::ReadNone;
+ if (IsWrite && IsRead)
+ return Attribute::None;
+ else if (IsRead)
+ return Attribute::ReadOnly;
+ else if (IsWrite)
+ return Attribute::WriteOnly;
+ else
+ return Attribute::ReadNone;
}
/// Deduce returned attributes for the SCC.
@@ -865,9 +858,10 @@ static bool addArgumentAttrsFromCallsites(Function &F) {
return Changed;
}
-static bool addReadAttr(Argument *A, Attribute::AttrKind R) {
- assert((R == Attribute::ReadOnly || R == Attribute::ReadNone)
- && "Must be a Read attribute.");
+static bool addAccessAttr(Argument *A, Attribute::AttrKind R) {
+ assert((R == Attribute::ReadOnly || R == Attribute::ReadNone ||
+ R == Attribute::WriteOnly)
+ && "Must be an access attribute.");
assert(A && "Argument must not be null.");
// If the argument already has the attribute, nothing needs to be done.
@@ -880,7 +874,12 @@ static bool addReadAttr(Argument *A, Attribute::AttrKind R) {
A->removeAttr(Attribute::ReadOnly);
A->removeAttr(Attribute::ReadNone);
A->addAttr(R);
- R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
+ if (R == Attribute::ReadOnly)
+ ++NumReadOnlyArg;
+ else if (R == Attribute::WriteOnly)
+ ++NumWriteOnlyArg;
+ else
+ ++NumReadNoneArg;
return true;
}
@@ -945,15 +944,15 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes,
// Otherwise, it's captured. Don't bother doing SCC analysis on it.
}
if (!HasNonLocalUses && !A->onlyReadsMemory()) {
- // Can we determine that it's readonly/readnone without doing an SCC?
- // Note that we don't allow any calls at all here, or else our result
- // will be dependent on the iteration order through the functions in the
- // SCC.
+ // Can we determine that it's readonly/readnone/writeonly without doing
+ // an SCC? Note that we don't allow any calls at all here, or else our
+ // result will be dependent on the iteration order through the
+ // functions in the SCC.
SmallPtrSet<Argument *, 8> Self;
Self.insert(&*A);
- Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self);
+ Attribute::AttrKind R = determinePointerAccessAttrs(&*A, Self);
if (R != Attribute::None)
- if (addReadAttr(A, R))
+ if (addAccessAttr(A, R))
Changed.insert(F);
}
}
@@ -979,6 +978,13 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes,
A->addAttr(Attribute::NoCapture);
++NumNoCapture;
Changed.insert(A->getParent());
+
+ // Infer the access attributes given the new nocapture one
+ SmallPtrSet<Argument *, 8> Self;
+ Self.insert(&*A);
+ Attribute::AttrKind R = determinePointerAccessAttrs(&*A, Self);
+ if (R != Attribute::None)
+ addAccessAttr(A, R);
}
continue;
}
@@ -1023,10 +1029,10 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes,
Changed.insert(A->getParent());
}
- // We also want to compute readonly/readnone. With a small number of false
- // negatives, we can assume that any pointer which is captured isn't going
- // to be provably readonly or readnone, since by definition we can't
- // analyze all uses of a captured pointer.
+ // We also want to compute readonly/readnone/writeonly. With a small number
+ // of false negatives, we can assume that any pointer which is captured
+ // isn't going to be provably readonly or readnone, since by definition
+ // we can't analyze all uses of a captured pointer.
//
// The false negatives happen when the pointer is captured by a function
// that promises readonly/readnone behaviour on the pointer, then the
@@ -1034,24 +1040,28 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes,
// Also, a readonly/readnone pointer may be returned, but returning a
// pointer is capturing it.
- Attribute::AttrKind ReadAttr = Attribute::ReadNone;
- for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
+ auto meetAccessAttr = [](Attribute::AttrKind A, Attribute::AttrKind B) {
+ if (A == B)
+ return A;
+ if (A == Attribute::ReadNone)
+ return B;
+ if (B == Attribute::ReadNone)
+ return A;
+ return Attribute::None;
+ };
+
+ Attribute::AttrKind AccessAttr = Attribute::ReadNone;
+ for (unsigned i = 0, e = ArgumentSCC.size();
+ i != e && AccessAttr != Attribute::None; ++i) {
Argument *A = ArgumentSCC[i]->Definition;
- Attribute::AttrKind K = determinePointerReadAttrs(A, ArgumentSCCNodes);
- if (K == Attribute::ReadNone)
- continue;
- if (K == Attribute::ReadOnly) {
- ReadAttr = Attribute::ReadOnly;
- continue;
- }
- ReadAttr = K;
- break;
+ Attribute::AttrKind K = determinePointerAccessAttrs(A, ArgumentSCCNodes);
+ AccessAttr = meetAccessAttr(AccessAttr, K);
}
- if (ReadAttr != Attribute::None) {
+ if (AccessAttr != Attribute::None) {
for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
Argument *A = ArgumentSCC[i]->Definition;
- if (addReadAttr(A, ReadAttr))
+ if (addAccessAttr(A, AccessAttr))
Changed.insert(A->getParent());
}
}
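
The meetAccessAttr lambda above defines a small lattice over the access kinds; a self-contained sketch with illustrative names (not the LLVM enum) makes the meet explicit:

    // ReadNone acts as the identity element, so the accumulator above can start
    // at ReadNone; ReadOnly and WriteOnly are incompatible and collapse to None,
    // as does meeting anything with None.
    enum class Access { None, ReadNone, ReadOnly, WriteOnly };

    static Access meetAccess(Access A, Access B) {
      if (A == B)
        return A;
      if (A == Access::ReadNone)
        return B;
      if (B == Access::ReadNone)
        return A;
      return Access::None;
    }

    // Examples: meetAccess(ReadOnly, ReadNone)  == ReadOnly
    //           meetAccess(ReadOnly, WriteOnly) == None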
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index fbd083bb9bbf..2425646455bd 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -64,8 +64,8 @@ static cl::opt<unsigned> FuncSpecializationMaxIters(
cl::desc("The maximum number of iterations function specialization is run"),
cl::init(1));
-static cl::opt<unsigned> MaxConstantsThreshold(
- "func-specialization-max-constants", cl::Hidden,
+static cl::opt<unsigned> MaxClonesThreshold(
+ "func-specialization-max-clones", cl::Hidden,
cl::desc("The maximum number of clones allowed for a single function "
"specialization"),
cl::init(3));
@@ -92,6 +92,28 @@ static cl::opt<bool> EnableSpecializationForLiteralConstant(
cl::desc("Enable specialization of functions that take a literal constant "
"as an argument."));
+namespace {
+// Bookkeeping struct to pass data from the analysis and profitability phase
+// to the actual transform helper functions.
+struct ArgInfo {
+ Function *Fn; // The function to perform specialisation on.
+ Argument *Arg; // The formal argument being analysed.
+ Constant *Const; // A corresponding actual constant argument.
+ InstructionCost Gain; // Profitability: Gain = Bonus - Cost.
+
+ // Flag if this will be a partial specialization, in which case we will need
+ // to keep the original function around in addition to the added
+ // specializations.
+ bool Partial = false;
+
+ ArgInfo(Function *F, Argument *A, Constant *C, InstructionCost G)
+ : Fn(F), Arg(A), Const(C), Gain(G){};
+};
+} // Anonymous namespace
+
+using FuncList = SmallVectorImpl<Function *>;
+using ConstList = SmallVectorImpl<Constant *>;
+
// Helper to check if \p LV is either a constant or a constant
// range with a single element. This should cover exactly the same cases as the
// old ValueLatticeElement::isConstant() and is intended to be used in the
@@ -169,7 +191,7 @@ static Constant *getConstantStackValue(CallInst *Call, Value *Val,
// ret void
// }
//
-static void constantArgPropagation(SmallVectorImpl<Function *> &WorkList,
+static void constantArgPropagation(FuncList &WorkList,
Module &M, SCCPSolver &Solver) {
// Iterate over the argument tracked functions see if there
// are any new constant values for the call instruction via
@@ -254,40 +276,33 @@ public:
///
/// \returns true if at least one function is specialized.
bool
- specializeFunctions(SmallVectorImpl<Function *> &FuncDecls,
- SmallVectorImpl<Function *> &CurrentSpecializations) {
-
- // Attempt to specialize the argument-tracked functions.
+ specializeFunctions(FuncList &FuncDecls,
+ FuncList &CurrentSpecializations) {
bool Changed = false;
for (auto *F : FuncDecls) {
- if (specializeFunction(F, CurrentSpecializations)) {
- Changed = true;
- LLVM_DEBUG(dbgs() << "FnSpecialization: Can specialize this func.\n");
- } else {
+ if (!isCandidateFunction(F, CurrentSpecializations))
+ continue;
+
+ auto Cost = getSpecializationCost(F);
+ if (!Cost.isValid()) {
LLVM_DEBUG(
- dbgs() << "FnSpecialization: Cannot specialize this func.\n");
+ dbgs() << "FnSpecialization: Invalid specialisation cost.\n");
+ continue;
}
- }
- for (auto *SpecializedFunc : CurrentSpecializations) {
- SpecializedFuncs.insert(SpecializedFunc);
-
- // Initialize the state of the newly created functions, marking them
- // argument-tracked and executable.
- if (SpecializedFunc->hasExactDefinition() &&
- !SpecializedFunc->hasFnAttribute(Attribute::Naked))
- Solver.addTrackedFunction(SpecializedFunc);
- Solver.addArgumentTrackedFunction(SpecializedFunc);
- FuncDecls.push_back(SpecializedFunc);
- Solver.markBlockExecutable(&SpecializedFunc->front());
+ auto ConstArgs = calculateGains(F, Cost);
+ if (ConstArgs.empty()) {
+ LLVM_DEBUG(dbgs() << "FnSpecialization: no possible constants found\n");
+ continue;
+ }
- // Replace the function arguments for the specialized functions.
- for (Argument &Arg : SpecializedFunc->args())
- if (!Arg.use_empty() && tryToReplaceWithConstant(&Arg))
- LLVM_DEBUG(dbgs() << "FnSpecialization: Replaced constant argument: "
- << Arg.getName() << "\n");
+ for (auto &CA : ConstArgs) {
+ specializeFunction(CA, CurrentSpecializations);
+ Changed = true;
+ }
}
+ updateSpecializedFuncs(FuncDecls, CurrentSpecializations);
NumFuncSpecialized += NbFunctionsSpecialized;
return Changed;
}
@@ -333,15 +348,83 @@ private:
return Clone;
}
- /// This function decides whether to specialize function \p F based on the
- /// known constant values its arguments can take on. Specialization is
- /// performed on the first interesting argument. Specializations based on
- /// additional arguments will be evaluated on following iterations of the
- /// main IPSCCP solve loop. \returns true if the function is specialized and
- /// false otherwise.
- bool specializeFunction(Function *F,
- SmallVectorImpl<Function *> &Specializations) {
+ /// This function decides whether it's worthwhile to specialize function \p F
+ /// based on the known constant values its arguments can take on, i.e. it
+ /// calculates a gain and returns a list of actual arguments that are deemed
+ /// profitable to specialize. Specialization is performed on the first
+ /// interesting argument. Specializations based on additional arguments will
+ /// be evaluated on following iterations of the main IPSCCP solve loop.
+ SmallVector<ArgInfo> calculateGains(Function *F, InstructionCost Cost) {
+ SmallVector<ArgInfo> Worklist;
+ // Determine if we should specialize the function based on the values the
+ // argument can take on. If specialization is not profitable, we continue
+ // on to the next argument.
+ for (Argument &FormalArg : F->args()) {
+ LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing arg: "
+ << FormalArg.getName() << "\n");
+ // Determine if this argument is interesting. If we know the argument can
+ // take on any constant values, they are collected in Constants. If the
+ // argument can only ever equal a constant value in Constants, the
+ // function will be completely specialized, and the IsPartial flag will
+ // be set to false by isArgumentInteresting (that function only adds
+ // values to the Constants list that are deemed profitable).
+ bool IsPartial = true;
+ SmallVector<Constant *> ActualConstArg;
+ if (!isArgumentInteresting(&FormalArg, ActualConstArg, IsPartial)) {
+ LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is not interesting\n");
+ continue;
+ }
+
+ for (auto *ActualArg : ActualConstArg) {
+ InstructionCost Gain =
+ ForceFunctionSpecialization
+ ? 1
+ : getSpecializationBonus(&FormalArg, ActualArg) - Cost;
+ if (Gain <= 0)
+ continue;
+ Worklist.push_back({F, &FormalArg, ActualArg, Gain});
+ }
+
+ if (Worklist.empty())
+ continue;
+
+ // Sort the candidates in descending order.
+ llvm::stable_sort(Worklist, [](const ArgInfo &L, const ArgInfo &R) {
+ return L.Gain > R.Gain;
+ });
+
+ // Truncate the worklist to 'MaxClonesThreshold' candidates if
+ // necessary.
+ if (Worklist.size() > MaxClonesThreshold) {
+ LLVM_DEBUG(dbgs() << "FnSpecialization: number of candidates exceeds "
+ << "the maximum number of clones threshold.\n"
+ << "Truncating worklist to " << MaxClonesThreshold
+ << " candidates.\n");
+ Worklist.erase(Worklist.begin() + MaxClonesThreshold,
+ Worklist.end());
+ }
+
+ if (IsPartial || Worklist.size() < ActualConstArg.size())
+ for (auto &ActualArg : Worklist)
+ ActualArg.Partial = true;
+
+ LLVM_DEBUG(dbgs() << "Sorted list of candidates by gain:\n";
+ for (auto &C
+ : Worklist) {
+ dbgs() << "- Function = " << C.Fn->getName() << ", ";
+ dbgs() << "FormalArg = " << C.Arg->getName() << ", ";
+ dbgs() << "ActualArg = " << C.Const->getName() << ", ";
+ dbgs() << "Gain = " << C.Gain << "\n";
+ });
+
+ // FIXME: Only one argument per function.
+ break;
+ }
+ return Worklist;
+ }
+
+ bool isCandidateFunction(Function *F, FuncList &Specializations) {
// Do not specialize the cloned function again.
if (SpecializedFuncs.contains(F))
return false;
@@ -362,84 +445,32 @@ private:
LLVM_DEBUG(dbgs() << "FnSpecialization: Try function: " << F->getName()
<< "\n");
+ return true;
+ }
- // Determine if it would be profitable to create a specialization of the
- // function where the argument takes on the given constant value. If so,
- // add the constant to Constants.
- auto FnSpecCost = getSpecializationCost(F);
- if (!FnSpecCost.isValid()) {
- LLVM_DEBUG(dbgs() << "FnSpecialization: Invalid specialisation cost.\n");
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "FnSpecialization: func specialisation cost: ";
- FnSpecCost.print(dbgs()); dbgs() << "\n");
+ void specializeFunction(ArgInfo &AI, FuncList &Specializations) {
+ Function *Clone = cloneCandidateFunction(AI.Fn);
+ Argument *ClonedArg = Clone->getArg(AI.Arg->getArgNo());
- // Determine if we should specialize the function based on the values the
- // argument can take on. If specialization is not profitable, we continue
- // on to the next argument.
- for (Argument &A : F->args()) {
- LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing arg: " << A.getName()
- << "\n");
- // True if this will be a partial specialization. We will need to keep
- // the original function around in addition to the added specializations.
- bool IsPartial = true;
+ // Rewrite calls to the function so that they call the clone instead.
+ rewriteCallSites(AI.Fn, Clone, *ClonedArg, AI.Const);
- // Determine if this argument is interesting. If we know the argument can
- // take on any constant values, they are collected in Constants. If the
- // argument can only ever equal a constant value in Constants, the
- // function will be completely specialized, and the IsPartial flag will
- // be set to false by isArgumentInteresting (that function only adds
- // values to the Constants list that are deemed profitable).
- SmallVector<Constant *, 4> Constants;
- if (!isArgumentInteresting(&A, Constants, FnSpecCost, IsPartial)) {
- LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is not interesting\n");
- continue;
- }
-
- assert(!Constants.empty() && "No constants on which to specialize");
- LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is interesting!\n"
- << "FnSpecialization: Specializing '" << F->getName()
- << "' on argument: " << A << "\n"
- << "FnSpecialization: Constants are:\n\n";
- for (unsigned I = 0; I < Constants.size(); ++I) dbgs()
- << *Constants[I] << "\n";
- dbgs() << "FnSpecialization: End of constants\n\n");
-
- // Create a version of the function in which the argument is marked
- // constant with the given value.
- for (auto *C : Constants) {
- // Clone the function. We leave the ValueToValueMap empty to allow
- // IPSCCP to propagate the constant arguments.
- Function *Clone = cloneCandidateFunction(F);
- Argument *ClonedArg = Clone->arg_begin() + A.getArgNo();
-
- // Rewrite calls to the function so that they call the clone instead.
- rewriteCallSites(F, Clone, *ClonedArg, C);
-
- // Initialize the lattice state of the arguments of the function clone,
- // marking the argument on which we specialized the function constant
- // with the given value.
- Solver.markArgInFuncSpecialization(F, ClonedArg, C);
-
- // Mark all the specialized functions
- Specializations.push_back(Clone);
- NbFunctionsSpecialized++;
- }
+ // Initialize the lattice state of the arguments of the function clone,
+ // marking the argument on which we specialized the function constant
+ // with the given value.
+ Solver.markArgInFuncSpecialization(AI.Fn, ClonedArg, AI.Const);
- // If the function has been completely specialized, the original function
- // is no longer needed. Mark it unreachable.
- if (!IsPartial)
- Solver.markFunctionUnreachable(F);
-
- // FIXME: Only one argument per function.
- return true;
- }
+ // Mark all the specialized functions
+ Specializations.push_back(Clone);
+ NbFunctionsSpecialized++;
- return false;
+ // If the function has been completely specialized, the original function
+ // is no longer needed. Mark it unreachable.
+ if (!AI.Partial)
+ Solver.markFunctionUnreachable(AI.Fn);
}
- /// Compute the cost of specializing function \p F.
+ /// Compute and return the cost of specializing function \p F.
InstructionCost getSpecializationCost(Function *F) {
// Compute the code metrics for the function.
SmallPtrSet<const Value *, 32> EphValues;
@@ -578,9 +609,7 @@ private:
///
/// \returns true if the function should be specialized on the given
/// argument.
- bool isArgumentInteresting(Argument *A,
- SmallVectorImpl<Constant *> &Constants,
- const InstructionCost &FnSpecCost,
+ bool isArgumentInteresting(Argument *A, ConstList &Constants,
bool &IsPartial) {
// For now, don't attempt to specialize functions based on the values of
// composite types.
@@ -608,42 +637,8 @@ private:
//
// TODO 2: this currently does not support constants, i.e. integer ranges.
//
- SmallVector<Constant *, 4> PossibleConstants;
- bool AllConstant = getPossibleConstants(A, PossibleConstants);
- if (PossibleConstants.empty()) {
- LLVM_DEBUG(dbgs() << "FnSpecialization: no possible constants found\n");
- return false;
- }
- if (PossibleConstants.size() > MaxConstantsThreshold) {
- LLVM_DEBUG(dbgs() << "FnSpecialization: number of constants found exceed "
- << "the maximum number of constants threshold.\n");
- return false;
- }
-
- for (auto *C : PossibleConstants) {
- LLVM_DEBUG(dbgs() << "FnSpecialization: Constant: " << *C << "\n");
- if (ForceFunctionSpecialization) {
- LLVM_DEBUG(dbgs() << "FnSpecialization: Forced!\n");
- Constants.push_back(C);
- continue;
- }
- if (getSpecializationBonus(A, C) > FnSpecCost) {
- LLVM_DEBUG(dbgs() << "FnSpecialization: profitable!\n");
- Constants.push_back(C);
- } else {
- LLVM_DEBUG(dbgs() << "FnSpecialization: not profitable\n");
- }
- }
-
- // None of the constant values the argument can take on were deemed good
- // candidates on which to specialize the function.
- if (Constants.empty())
- return false;
-
- // This will be a partial specialization if some of the constants were
- // rejected due to their profitability.
- IsPartial = !AllConstant || PossibleConstants.size() != Constants.size();
-
+ IsPartial = !getPossibleConstants(A, Constants);
+ LLVM_DEBUG(dbgs() << "FnSpecialization: interesting arg: " << *A << "\n");
return true;
}
@@ -653,8 +648,7 @@ private:
/// \returns true if all of the values the argument can take on are constant
/// (e.g., the argument's parent function cannot be called with an
/// overdefined value).
- bool getPossibleConstants(Argument *A,
- SmallVectorImpl<Constant *> &Constants) {
+ bool getPossibleConstants(Argument *A, ConstList &Constants) {
Function *F = A->getParent();
bool AllConstant = true;
@@ -681,7 +675,7 @@ private:
// For now, constant expressions are fine but only if they are function
// calls.
- if (auto *CE = dyn_cast<ConstantExpr>(V))
+ if (auto *CE = dyn_cast<ConstantExpr>(V))
if (!isa<Function>(CE->getOperand(0)))
return false;
@@ -737,6 +731,29 @@ private:
}
}
}
+
+ void updateSpecializedFuncs(FuncList &FuncDecls,
+ FuncList &CurrentSpecializations) {
+ for (auto *SpecializedFunc : CurrentSpecializations) {
+ SpecializedFuncs.insert(SpecializedFunc);
+
+ // Initialize the state of the newly created functions, marking them
+ // argument-tracked and executable.
+ if (SpecializedFunc->hasExactDefinition() &&
+ !SpecializedFunc->hasFnAttribute(Attribute::Naked))
+ Solver.addTrackedFunction(SpecializedFunc);
+
+ Solver.addArgumentTrackedFunction(SpecializedFunc);
+ FuncDecls.push_back(SpecializedFunc);
+ Solver.markBlockExecutable(&SpecializedFunc->front());
+
+ // Replace the function arguments for the specialized functions.
+ for (Argument &Arg : SpecializedFunc->args())
+ if (!Arg.use_empty() && tryToReplaceWithConstant(&Arg))
+ LLVM_DEBUG(dbgs() << "FnSpecialization: Replaced constant argument: "
+ << Arg.getName() << "\n");
+ }
+ }
};
} // namespace
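
The candidate handling in calculateGains boils down to a filter-sort-truncate step; a standalone sketch with hypothetical types (Candidate and MaxClones stand in for ArgInfo and MaxClonesThreshold):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    struct Candidate {
      long Gain; // stands in for InstructionCost Gain = Bonus - Cost
    };

    static void selectCandidates(std::vector<Candidate> &Worklist,
                                 std::size_t MaxClones) {
      // Drop unprofitable candidates (Gain <= 0).
      Worklist.erase(std::remove_if(Worklist.begin(), Worklist.end(),
                                    [](const Candidate &C) { return C.Gain <= 0; }),
                     Worklist.end());
      // Sort by descending gain, keeping the original order for ties.
      std::stable_sort(Worklist.begin(), Worklist.end(),
                       [](const Candidate &L, const Candidate &R) {
                         return L.Gain > R.Gain;
                       });
      // Keep at most MaxClones specializations.
      if (Worklist.size() > MaxClones)
        Worklist.resize(MaxClones);
    }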
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index ba7589c2bf60..b1f3ff15c97b 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -305,8 +305,9 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
else if (auto *LI = dyn_cast<LoadInst>(U)) {
// A load from zeroinitializer is always zeroinitializer, regardless of
// any applied offset.
- if (Init->isNullValue()) {
- LI->replaceAllUsesWith(Constant::getNullValue(LI->getType()));
+ Type *Ty = LI->getType();
+ if (Init->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy()) {
+ LI->replaceAllUsesWith(Constant::getNullValue(Ty));
EraseFromParent(LI);
continue;
}
@@ -316,8 +317,7 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
PtrOp = PtrOp->stripAndAccumulateConstantOffsets(
DL, Offset, /* AllowNonInbounds */ true);
if (PtrOp == GV) {
- if (auto *Value = ConstantFoldLoadFromConst(Init, LI->getType(),
- Offset, DL)) {
+ if (auto *Value = ConstantFoldLoadFromConst(Init, Ty, Offset, DL)) {
LI->replaceAllUsesWith(Value);
EraseFromParent(LI);
}
@@ -368,8 +368,7 @@ static bool isSafeSROAGEP(User *U) {
return false;
}
- return llvm::all_of(U->users(),
- [](User *UU) { return isSafeSROAElementUse(UU); });
+ return llvm::all_of(U->users(), isSafeSROAElementUse);
}
/// Return true if the specified instruction is a safe user of a derived
diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
index 833049d6896f..a964fcde0396 100644
--- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -294,7 +294,7 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
// Find all incoming values from the outlining region.
int NumIncomingVals = 0;
for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
- if (find(Region, PN.getIncomingBlock(i)) != Region.end()) {
+ if (llvm::is_contained(Region, PN.getIncomingBlock(i))) {
++NumIncomingVals;
if (NumIncomingVals > 1) {
++NumSplitExitPhis;
diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp
index 992c2b292e1e..4e3689f09536 100644
--- a/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -856,6 +856,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
if (InlineHistoryID != -1 &&
inlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) {
+ LLVM_DEBUG(dbgs() << "Skipping inlining due to history: "
+ << F.getName() << " -> " << Callee.getName() << "\n");
setInlineRemark(*CB, "recursive");
continue;
}
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index f78971f0e586..c0bb19e184d6 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1774,8 +1774,9 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New,
bool IsJumpTableCanonical) {
SmallSetVector<Constant *, 4> Constants;
for (Use &U : llvm::make_early_inc_range(Old->uses())) {
- // Skip block addresses
- if (isa<BlockAddress>(U.getUser()))
+ // Skip block addresses and no_cfi values, which refer to the function
+ // body instead of the jump table.
+ if (isa<BlockAddress, NoCFIValue>(U.getUser()))
continue;
// Skip direct calls to externally defined or non-dso_local functions
@@ -1802,7 +1803,7 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New,
}
void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) {
- Old->replaceUsesWithIf(New, [](Use &U) { return isDirectCall(U); });
+ Old->replaceUsesWithIf(New, isDirectCall);
}
bool LowerTypeTestsModule::lower() {
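
A hedged sketch (assuming LLVM headers; redirectToJumpTable is an illustrative name, not part of the patch) combining the predicate form of replaceUsesWithIf with the multi-type isa<> check used above:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Value.h"

    // Redirect uses of Old to New, except uses that refer to the function body
    // itself (block addresses and no_cfi values) rather than the jump table.
    static void redirectToJumpTable(llvm::Value *Old, llvm::Value *New) {
      Old->replaceUsesWithIf(New, [](llvm::Use &U) {
        return !llvm::isa<llvm::BlockAddress, llvm::NoCFIValue>(U.getUser());
      });
    }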
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 055ee6b50296..f289e3ecc979 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -3964,6 +3964,9 @@ struct AAKernelInfoCallSite : AAKernelInfo {
case OMPRTL___kmpc_master:
case OMPRTL___kmpc_end_master:
case OMPRTL___kmpc_barrier:
+ case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
+ case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
+ case OMPRTL___kmpc_nvptx_end_reduce_nowait:
break;
case OMPRTL___kmpc_distribute_static_init_4:
case OMPRTL___kmpc_distribute_static_init_4u:
@@ -4010,6 +4013,7 @@ struct AAKernelInfoCallSite : AAKernelInfo {
break;
case OMPRTL___kmpc_omp_task:
// We do not look into tasks right now, just give up.
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint();
SPMDCompatibilityTracker.insert(&CB);
ReachedUnknownParallelRegions.insert(&CB);
break;
@@ -4020,6 +4024,7 @@ struct AAKernelInfoCallSite : AAKernelInfo {
default:
// Unknown OpenMP runtime calls cannot be executed in SPMD-mode,
// generally. However, they do not hide parallel regions.
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint();
SPMDCompatibilityTracker.insert(&CB);
break;
}
@@ -4079,6 +4084,7 @@ struct AAKernelInfoCallSite : AAKernelInfo {
SPMDCompatibilityTracker.insert(&CB);
break;
default:
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint();
SPMDCompatibilityTracker.insert(&CB);
}
diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
index bae9a1e27e75..7334bf695b67 100644
--- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
+++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
@@ -32,7 +32,7 @@ ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite,
if (CalleeName.empty())
return getHottestChildContext(CallSite);
- uint64_t Hash = nodeHash(CalleeName, CallSite);
+ uint64_t Hash = FunctionSamples::getCallSiteHash(CalleeName, CallSite);
auto It = AllChildContext.find(Hash);
if (It != AllChildContext.end())
return &It->second;
@@ -65,7 +65,8 @@ ContextTrieNode::getHottestChildContext(const LineLocation &CallSite) {
ContextTrieNode &ContextTrieNode::moveToChildContext(
const LineLocation &CallSite, ContextTrieNode &&NodeToMove,
uint32_t ContextFramesToRemove, bool DeleteNode) {
- uint64_t Hash = nodeHash(NodeToMove.getFuncName(), CallSite);
+ uint64_t Hash =
+ FunctionSamples::getCallSiteHash(NodeToMove.getFuncName(), CallSite);
assert(!AllChildContext.count(Hash) && "Node to remove must exist");
LineLocation OldCallSite = NodeToMove.CallSiteLoc;
ContextTrieNode &OldParentContext = *NodeToMove.getParentContext();
@@ -108,7 +109,7 @@ ContextTrieNode &ContextTrieNode::moveToChildContext(
void ContextTrieNode::removeChildContext(const LineLocation &CallSite,
StringRef CalleeName) {
- uint64_t Hash = nodeHash(CalleeName, CallSite);
+ uint64_t Hash = FunctionSamples::getCallSiteHash(CalleeName, CallSite);
// Note this essentially calls dtor and destroys that child context
AllChildContext.erase(Hash);
}
@@ -174,21 +175,9 @@ void ContextTrieNode::dumpTree() {
}
}
-uint64_t ContextTrieNode::nodeHash(StringRef ChildName,
- const LineLocation &Callsite) {
- // We still use child's name for child hash, this is
- // because for children of root node, we don't have
- // different line/discriminator, and we'll rely on name
- // to differentiate children.
- uint64_t NameHash = std::hash<std::string>{}(ChildName.str());
- uint64_t LocId =
- (((uint64_t)Callsite.LineOffset) << 32) | Callsite.Discriminator;
- return NameHash + (LocId << 5) + LocId;
-}
-
ContextTrieNode *ContextTrieNode::getOrCreateChildContext(
const LineLocation &CallSite, StringRef CalleeName, bool AllowCreate) {
- uint64_t Hash = nodeHash(CalleeName, CallSite);
+ uint64_t Hash = FunctionSamples::getCallSiteHash(CalleeName, CallSite);
auto It = AllChildContext.find(Hash);
if (It != AllChildContext.end()) {
assert(It->second.getFuncName() == CalleeName &&
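
For reference, a standalone sketch of the hash combination the deleted nodeHash performed (plain std types only); FunctionSamples::getCallSiteHash presumably centralizes an equivalent computation so the trie and the profile reader share one definition:

    #include <cstdint>
    #include <functional>
    #include <string>

    static uint64_t callSiteHash(const std::string &CalleeName,
                                 uint32_t LineOffset, uint32_t Discriminator) {
      // Children of the root share the same location, so the callee name still
      // participates in the hash to keep them distinct.
      uint64_t NameHash = std::hash<std::string>{}(CalleeName);
      uint64_t LocId = (uint64_t(LineOffset) << 32) | Discriminator;
      return NameHash + (LocId << 5) + LocId;
    }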
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index b8fac9d47763..bc6051de90c4 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -467,6 +467,9 @@ protected:
void emitOptimizationRemarksForInlineCandidates(
const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
bool Hot);
+ void promoteMergeNotInlinedContextSamples(
+ DenseMap<CallBase *, const FunctionSamples *> NonInlinedCallSites,
+ const Function &F);
std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG);
std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(CallGraph &CG);
void generateMDProfMetadata(Function &F);
@@ -485,7 +488,7 @@ protected:
std::unique_ptr<SampleContextTracker> ContextTracker;
/// Flag indicating whether input profile is context-sensitive
- bool ProfileIsCS = false;
+ bool ProfileIsCSFlat = false;
/// Flag indicating which LTO/ThinLTO phase the pass is invoked in.
///
@@ -602,7 +605,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
// call instruction should have 0 count.
// For CS profile, the callsite count of previously inlined callees is
// populated with the entry count of the callees.
- if (!ProfileIsCS)
+ if (!ProfileIsCSFlat)
if (const auto *CB = dyn_cast<CallBase>(&Inst))
if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
return 0;
@@ -641,7 +644,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) {
// call instruction should have 0 count.
// For CS profile, the callsite count of previously inlined callees is
// populated with the entry count of the callees.
- if (!ProfileIsCS)
+ if (!ProfileIsCSFlat)
if (const auto *CB = dyn_cast<CallBase>(&Inst))
if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
return 0;
@@ -695,7 +698,7 @@ SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
if (Function *Callee = Inst.getCalledFunction())
CalleeName = Callee->getName();
- if (ProfileIsCS)
+ if (ProfileIsCSFlat)
return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);
const FunctionSamples *FS = findFunctionSamples(Inst);
@@ -727,7 +730,7 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
FunctionSamples::getGUID(R->getName());
};
- if (ProfileIsCS) {
+ if (ProfileIsCSFlat) {
auto CalleeSamples =
ContextTracker->getIndirectCalleeContextSamplesFor(DIL);
if (CalleeSamples.empty())
@@ -780,7 +783,7 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
if (it.second) {
- if (ProfileIsCS)
+ if (ProfileIsCSFlat)
it.first->second = ContextTracker->getContextSamplesFor(DIL);
else
it.first->second =
@@ -1039,7 +1042,7 @@ void SampleProfileLoader::findExternalInlineCandidate(
// For AutoFDO profile, retrieve candidate profiles by walking over
// the nested inlinee profiles.
- if (!ProfileIsCS) {
+ if (!ProfileIsCSFlat) {
Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold);
return;
}
@@ -1134,7 +1137,7 @@ bool SampleProfileLoader::inlineHotFunctions(
assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
"GUIDToFuncNameMap has to be populated");
AllCandidates.push_back(CB);
- if (FS->getEntrySamples() > 0 || ProfileIsCS)
+ if (FS->getEntrySamples() > 0 || ProfileIsCSFlat)
LocalNotInlinedCallSites.try_emplace(CB, FS);
if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))
Hot = true;
@@ -1156,11 +1159,9 @@ bool SampleProfileLoader::inlineHotFunctions(
}
for (CallBase *I : CIS) {
Function *CalledFunction = I->getCalledFunction();
- InlineCandidate Candidate = {
- I,
- LocalNotInlinedCallSites.count(I) ? LocalNotInlinedCallSites[I]
- : nullptr,
- 0 /* dummy count */, 1.0 /* dummy distribution factor */};
+ InlineCandidate Candidate = {I, LocalNotInlinedCallSites.lookup(I),
+ 0 /* dummy count */,
+ 1.0 /* dummy distribution factor */};
// Do not inline recursive calls.
if (CalledFunction == &F)
continue;
@@ -1198,53 +1199,9 @@ bool SampleProfileLoader::inlineHotFunctions(
}
// For CS profile, profile for not inlined context will be merged when
- // base profile is being trieved
- if (ProfileIsCS)
- return Changed;
-
- // Accumulate not inlined callsite information into notInlinedSamples
- for (const auto &Pair : LocalNotInlinedCallSites) {
- CallBase *I = Pair.getFirst();
- Function *Callee = I->getCalledFunction();
- if (!Callee || Callee->isDeclaration())
- continue;
-
- ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline",
- I->getDebugLoc(), I->getParent())
- << "previous inlining not repeated: '"
- << ore::NV("Callee", Callee) << "' into '"
- << ore::NV("Caller", &F) << "'");
-
- ++NumCSNotInlined;
- const FunctionSamples *FS = Pair.getSecond();
- if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) {
- continue;
- }
-
- if (ProfileMergeInlinee) {
- // A function call can be replicated by optimizations like callsite
- // splitting or jump threading and the replicates end up sharing the
- // sample nested callee profile instead of slicing the original inlinee's
- // profile. We want to do merge exactly once by filtering out callee
- // profiles with a non-zero head sample count.
- if (FS->getHeadSamples() == 0) {
- // Use entry samples as head samples during the merge, as inlinees
- // don't have head samples.
- const_cast<FunctionSamples *>(FS)->addHeadSamples(
- FS->getEntrySamples());
-
- // Note that we have to do the merge right after processing function.
- // This allows OutlineFS's profile to be used for annotation during
- // top-down processing of functions' annotation.
- FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee);
- OutlineFS->merge(*FS);
- }
- } else {
- auto pair =
- notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
- pair.first->second.entryCount += FS->getEntrySamples();
- }
- }
+ // base profile is being retrieved.
+ if (!FunctionSamples::ProfileIsCSFlat)
+ promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
return Changed;
}
@@ -1285,7 +1242,7 @@ bool SampleProfileLoader::tryInlineCandidate(
InlinedCallSites->push_back(I);
}
- if (ProfileIsCS)
+ if (ProfileIsCSFlat)
ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
++NumCSInlined;
@@ -1430,7 +1387,6 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
bool SampleProfileLoader::inlineHotFunctionsWithPriority(
Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
- assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now");
// ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
// Profile symbol list is ignored when profile-sample-accurate is on.
@@ -1467,6 +1423,8 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority(
if (ExternalInlineAdvisor)
SizeLimit = std::numeric_limits<unsigned>::max();
+ DenseMap<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
+
// Perform iterative BFS call site prioritized inlining
bool Changed = false;
while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
@@ -1521,6 +1479,8 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority(
}
ICPCount++;
Changed = true;
+ } else if (!ContextTracker) {
+ LocalNotInlinedCallSites.try_emplace(I, FS);
}
}
} else if (CalledFunction && CalledFunction->getSubprogram() &&
@@ -1532,6 +1492,8 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority(
CQueue.emplace(NewCandidate);
}
Changed = true;
+ } else if (!ContextTracker) {
+ LocalNotInlinedCallSites.try_emplace(I, Candidate.CalleeSamples);
}
} else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
@@ -1549,9 +1511,63 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority(
++NumCSInlinedHitGrowthLimit;
}
+ // For CS profile, profile for not inlined context will be merged when
+ // base profile is being retrieved.
+ if (!FunctionSamples::ProfileIsCSFlat)
+ promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
return Changed;
}
+void SampleProfileLoader::promoteMergeNotInlinedContextSamples(
+ DenseMap<CallBase *, const FunctionSamples *> NonInlinedCallSites,
+ const Function &F) {
+ // Accumulate not inlined callsite information into notInlinedSamples
+ for (const auto &Pair : NonInlinedCallSites) {
+ CallBase *I = Pair.getFirst();
+ Function *Callee = I->getCalledFunction();
+ if (!Callee || Callee->isDeclaration())
+ continue;
+
+ ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline",
+ I->getDebugLoc(), I->getParent())
+ << "previous inlining not repeated: '"
+ << ore::NV("Callee", Callee) << "' into '"
+ << ore::NV("Caller", &F) << "'");
+
+ ++NumCSNotInlined;
+ const FunctionSamples *FS = Pair.getSecond();
+ if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) {
+ continue;
+ }
+
+ if (ProfileMergeInlinee) {
+ // A function call can be replicated by optimizations like callsite
+ // splitting or jump threading and the replicates end up sharing the
+ // sample nested callee profile instead of slicing the original
+ // inlinee's profile. We want to do merge exactly once by filtering out
+ // callee profiles with a non-zero head sample count.
+ if (FS->getHeadSamples() == 0) {
+ // Use entry samples as head samples during the merge, as inlinees
+ // don't have head samples.
+ const_cast<FunctionSamples *>(FS)->addHeadSamples(
+ FS->getEntrySamples());
+
+ // Note that we have to do the merge right after processing function.
+ // This allows OutlineFS's profile to be used for annotation during
+ // top-down processing of functions' annotation.
+ FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee);
+ OutlineFS->merge(*FS, 1);
+ // Set outlined profile to be synthetic to not bias the inliner.
+ OutlineFS->SetContextSynthetic();
+ }
+ } else {
+ auto pair =
+ notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
+ pair.first->second.entryCount += FS->getEntrySamples();
+ }
+ }
+}
+
/// Returns the sorted CallTargetMap \p M by count in descending order.
static SmallVector<InstrProfValueData, 2>
GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M) {
@@ -1607,7 +1623,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
// With CSSPGO all indirect call targets are counted towards the
// original indirect call site in the profile, including both
// inlined and non-inlined targets.
- if (!FunctionSamples::ProfileIsCS) {
+ if (!FunctionSamples::ProfileIsCSFlat) {
if (const FunctionSamplesMap *M =
FS->findFunctionSamplesMapAt(CallSite)) {
for (const auto &NameFS : *M)
@@ -1754,7 +1770,7 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
}
DenseSet<GlobalValue::GUID> InlinedGUIDs;
- if (ProfileIsCS && CallsitePrioritizedInline)
+ if (CallsitePrioritizedInline)
Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
else
Changed |= inlineHotFunctions(F, InlinedGUIDs);
@@ -1782,7 +1798,7 @@ INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
std::unique_ptr<ProfiledCallGraph>
SampleProfileLoader::buildProfiledCallGraph(CallGraph &CG) {
std::unique_ptr<ProfiledCallGraph> ProfiledCG;
- if (ProfileIsCS)
+ if (ProfileIsCSFlat)
ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker);
else
ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles());
@@ -1828,7 +1844,7 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
assert(&CG->getModule() == &M);
if (UseProfiledCallGraph ||
- (ProfileIsCS && !UseProfiledCallGraph.getNumOccurrences())) {
+ (ProfileIsCSFlat && !UseProfiledCallGraph.getNumOccurrences())) {
// Use profiled call edges to augment the top-down order. There are cases
// that the top-down order computed based on the static call graph doesn't
// reflect real execution order. For example
@@ -1961,10 +1977,8 @@ bool SampleProfileLoader::doInitialization(Module &M,
}
// Apply tweaks if context-sensitive profile is available.
- if (Reader->profileIsCS()) {
- ProfileIsCS = true;
- FunctionSamples::ProfileIsCS = true;
-
+ if (Reader->profileIsCSFlat() || Reader->profileIsCSNested()) {
+ ProfileIsCSFlat = Reader->profileIsCSFlat();
// Enable priority-base inliner and size inline by default for CSSPGO.
if (!ProfileSizeInline.getNumOccurrences())
ProfileSizeInline = true;
@@ -1982,10 +1996,15 @@ bool SampleProfileLoader::doInitialization(Module &M,
// Enable iterative-BFI by default for CSSPGO.
if (!UseIterativeBFIInference.getNumOccurrences())
UseIterativeBFIInference = true;
-
- // Tracker for profiles under different context
- ContextTracker = std::make_unique<SampleContextTracker>(
- Reader->getProfiles(), &GUIDToFuncNameMap);
+ // Enable Profi by default for CSSPGO.
+ if (!SampleProfileUseProfi.getNumOccurrences())
+ SampleProfileUseProfi = true;
+
+ if (FunctionSamples::ProfileIsCSFlat) {
+    // Tracker for profiles under different contexts
+ ContextTracker = std::make_unique<SampleContextTracker>(
+ Reader->getProfiles(), &GUIDToFuncNameMap);
+ }
}
// Load pseudo probe descriptors for probe-based function samples.
@@ -1994,7 +2013,8 @@ bool SampleProfileLoader::doInitialization(Module &M,
if (!ProbeManager->moduleIsProbed(M)) {
const char *Msg =
"Pseudo-probe-based profile requires SampleProfileProbePass";
- Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
+ Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg,
+ DS_Warning));
return false;
}
}
@@ -2062,7 +2082,7 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
}
// Account for cold calls not inlined....
- if (!ProfileIsCS)
+ if (!ProfileIsCSFlat)
for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
notInlinedCallInfo)
updateProfileCallee(pair.first, pair.second.entryCount);
@@ -2138,7 +2158,7 @@ bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM)
ORE = OwnedORE.get();
}
- if (ProfileIsCS)
+ if (ProfileIsCSFlat)
Samples = ContextTracker->getBaseSamplesFor(F);
else
Samples = Reader->getSamplesFor(F);
diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 0cc1b37844f6..daaf6cbeb3fd 100644
--- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -87,7 +87,8 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId,
if (isa<Function>(&ExportGV) && allowPromotionAlias(OldName)) {
// Create a local alias with the original name to avoid breaking
// references from inline assembly.
- std::string Alias = ".set " + OldName + "," + NewName + "\n";
+ std::string Alias =
+ ".lto_set_conditional " + OldName + "," + NewName + "\n";
ExportM.appendModuleInlineAsm(Alias);
}
}
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 61054e7ae46f..6acace1d9fd4 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -359,6 +359,36 @@ template <> struct DenseMapInfo<VTableSlotSummary> {
namespace {
+// Returns true if the function must be unreachable based on ValueInfo.
+//
+// In particular, identifies a function as unreachable when both of the
+// following conditions hold:
+// 1) All summaries are live.
+// 2) All function summaries indicate it is unreachable.
+bool mustBeUnreachableFunction(ValueInfo TheFnVI) {
+ if ((!TheFnVI) || TheFnVI.getSummaryList().empty()) {
+ // Returns false if ValueInfo is absent, or the summary list is empty
+ // (e.g., function declarations).
+ return false;
+ }
+
+ for (auto &Summary : TheFnVI.getSummaryList()) {
+ // Conservatively returns false if any non-live functions are seen.
+ // In general either all summaries should be live or all should be dead.
+ if (!Summary->isLive())
+ return false;
+ if (auto *FS = dyn_cast<FunctionSummary>(Summary.get())) {
+ if (!FS->fflags().MustBeUnreachable)
+ return false;
+ }
+ // Do nothing if a non-function has the same GUID (which is rare).
+ // This is correct since non-function summaries are not relevant.
+ }
+ // All function summaries are live and all of them agree that the function is
+  // unreachable.
+ return true;
+}
+
// A virtual call site. VTable is the loaded virtual table pointer, and CS is
// the indirect virtual call.
struct VirtualCallSite {
@@ -562,10 +592,12 @@ struct DevirtModule {
void buildTypeIdentifierMap(
std::vector<VTableBits> &Bits,
DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap);
+
bool
tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot,
const std::set<TypeMemberInfo> &TypeMemberInfos,
- uint64_t ByteOffset);
+ uint64_t ByteOffset,
+ ModuleSummaryIndex *ExportSummary);
void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn,
bool &IsExported);
@@ -640,6 +672,23 @@ struct DevirtModule {
bool run();
+ // Look up the corresponding ValueInfo entry of `TheFn` in `ExportSummary`.
+ //
+ // Caller guarantees that `ExportSummary` is not nullptr.
+ static ValueInfo lookUpFunctionValueInfo(Function *TheFn,
+ ModuleSummaryIndex *ExportSummary);
+
+ // Returns true if the function definition must be unreachable.
+ //
+ // Note if this helper function returns true, `F` is guaranteed
+ // to be unreachable; if it returns false, `F` might still
+ // be unreachable but not covered by this helper function.
+ //
+  // Implementation-wise, if the function definition is present, its IR is
+  // analyzed; if not, function flags are looked up from ExportSummary as a
+  // fallback.
+ static bool mustBeUnreachableFunction(Function *const F,
+ ModuleSummaryIndex *ExportSummary);
+
// Lower the module using the action and summary passed as command line
// arguments. For testing purposes only.
static bool
@@ -969,7 +1018,8 @@ void DevirtModule::buildTypeIdentifierMap(
bool DevirtModule::tryFindVirtualCallTargets(
std::vector<VirtualCallTarget> &TargetsForSlot,
- const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset) {
+ const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset,
+ ModuleSummaryIndex *ExportSummary) {
for (const TypeMemberInfo &TM : TypeMemberInfos) {
if (!TM.Bits->GV->isConstant())
return false;
@@ -997,6 +1047,11 @@ bool DevirtModule::tryFindVirtualCallTargets(
if (Fn->getName() == "__cxa_pure_virtual")
continue;
+ // We can disregard unreachable functions as possible call targets, as
+ // unreachable functions shouldn't be called.
+ if (mustBeUnreachableFunction(Fn, ExportSummary))
+ continue;
+
TargetsForSlot.push_back({Fn, &TM});
}
@@ -1053,6 +1108,9 @@ bool DevirtIndex::tryFindVirtualCallTargets(
if (VTP.VTableOffset != P.AddressPointOffset + ByteOffset)
continue;
+ if (mustBeUnreachableFunction(VTP.FuncVI))
+ continue;
+
TargetsForSlot.push_back(VTP.FuncVI);
}
}
@@ -1744,7 +1802,7 @@ void DevirtModule::rebuildGlobal(VTableBits &B) {
GlobalVariable::PrivateLinkage, NewInit, "", B.GV);
NewGV->setSection(B.GV->getSection());
NewGV->setComdat(B.GV->getComdat());
- NewGV->setAlignment(MaybeAlign(B.GV->getAlignment()));
+ NewGV->setAlignment(B.GV->getAlign());
// Copy the original vtable's metadata to the anonymous global, adjusting
// offsets as required.
@@ -2014,6 +2072,44 @@ void DevirtModule::removeRedundantTypeTests() {
}
}
+ValueInfo
+DevirtModule::lookUpFunctionValueInfo(Function *TheFn,
+ ModuleSummaryIndex *ExportSummary) {
+ assert((ExportSummary != nullptr) &&
+ "Caller guarantees ExportSummary is not nullptr");
+
+ const auto TheFnGUID = TheFn->getGUID();
+ const auto TheFnGUIDWithExportedName = GlobalValue::getGUID(TheFn->getName());
+ // Look up ValueInfo with the GUID in the current linkage.
+ ValueInfo TheFnVI = ExportSummary->getValueInfo(TheFnGUID);
+ // If no entry is found and GUID is different from GUID computed using
+ // exported name, look up ValueInfo with the exported name unconditionally.
+ // This is a fallback.
+ //
+ // The reason to have a fallback:
+ // 1. LTO could enable global value internalization via
+ // `enable-lto-internalization`.
+  // 2. The GUID in ExportSummary is computed using the exported name.
+ if ((!TheFnVI) && (TheFnGUID != TheFnGUIDWithExportedName)) {
+ TheFnVI = ExportSummary->getValueInfo(TheFnGUIDWithExportedName);
+ }
+ return TheFnVI;
+}
+
+bool DevirtModule::mustBeUnreachableFunction(
+ Function *const F, ModuleSummaryIndex *ExportSummary) {
+ // First, learn unreachability by analyzing function IR.
+ if (!F->isDeclaration()) {
+ // A function must be unreachable if its entry block ends with an
+ // 'unreachable'.
+ return isa<UnreachableInst>(F->getEntryBlock().getTerminator());
+ }
+ // Learn unreachability from ExportSummary if ExportSummary is present.
+ return ExportSummary &&
+ ::mustBeUnreachableFunction(
+ DevirtModule::lookUpFunctionValueInfo(F, ExportSummary));
+}
+
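As a rough C++-level illustration (the type names are invented and the snippet is not part of this patch), an override whose body is only __builtin_unreachable() is lowered to a definition whose entry block terminates in 'unreachable', which is exactly what the IR-based check above looks for:

// Hypothetical example: 'Impossible::run' compiles to a function whose entry
// block ends in 'unreachable', so mustBeUnreachableFunction() returns true and
// whole-program devirtualization can drop it as a candidate call target.
struct Base {
  virtual int run() = 0;
};
struct Impossible : Base {
  int run() override { __builtin_unreachable(); }
};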
bool DevirtModule::run() {
// If only some of the modules were split, we cannot correctly perform
// this transformation. We already checked for the presence of type tests
@@ -2137,7 +2233,7 @@ bool DevirtModule::run() {
cast<MDString>(S.first.TypeID)->getString())
.WPDRes[S.first.ByteOffset];
if (tryFindVirtualCallTargets(TargetsForSlot, TypeMemberInfos,
- S.first.ByteOffset)) {
+ S.first.ByteOffset, ExportSummary)) {
if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) {
DidVirtualConstProp |=
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index dc55b5a31596..de1034c910d5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1795,6 +1795,55 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I,
}
}
+  // (~A & B & C) | ... --> ...
+  // (~A | B | C) & ... --> ...
+  // TODO: One-use checks are conservative. We only need to check that the
+  // total number of multiply-used values does not exceed the reduction in
+  // operations.
+ if (match(Op0,
+ m_OneUse(m_c_BinOp(FlippedOpcode,
+ m_BinOp(FlippedOpcode, m_Value(B), m_Value(C)),
+ m_CombineAnd(m_Value(X), m_Not(m_Value(A)))))) ||
+ match(Op0, m_OneUse(m_c_BinOp(
+ FlippedOpcode,
+ m_c_BinOp(FlippedOpcode, m_Value(C),
+ m_CombineAnd(m_Value(X), m_Not(m_Value(A)))),
+ m_Value(B))))) {
+ // X = ~A
+ // (~A & B & C) | ~(A | B | C) --> ~(A | (B ^ C))
+ // (~A | B | C) & ~(A & B & C) --> (~A | (B ^ C))
+ if (match(Op1, m_OneUse(m_Not(m_c_BinOp(
+ Opcode, m_c_BinOp(Opcode, m_Specific(A), m_Specific(B)),
+ m_Specific(C))))) ||
+ match(Op1, m_OneUse(m_Not(m_c_BinOp(
+ Opcode, m_c_BinOp(Opcode, m_Specific(B), m_Specific(C)),
+ m_Specific(A))))) ||
+ match(Op1, m_OneUse(m_Not(m_c_BinOp(
+ Opcode, m_c_BinOp(Opcode, m_Specific(A), m_Specific(C)),
+ m_Specific(B)))))) {
+ Value *Xor = Builder.CreateXor(B, C);
+ return (Opcode == Instruction::Or)
+ ? BinaryOperator::CreateNot(Builder.CreateOr(Xor, A))
+ : BinaryOperator::CreateOr(Xor, X);
+ }
+
+ // (~A & B & C) | ~(A | B) --> (C | ~B) & ~A
+ // (~A | B | C) & ~(A & B) --> (C & ~B) | ~A
+ if (match(Op1, m_OneUse(m_Not(m_OneUse(
+ m_c_BinOp(Opcode, m_Specific(A), m_Specific(B)))))))
+ return BinaryOperator::Create(
+ FlippedOpcode, Builder.CreateBinOp(Opcode, C, Builder.CreateNot(B)),
+ X);
+
+ // (~A & B & C) | ~(A | C) --> (B | ~C) & ~A
+ // (~A | B | C) & ~(A & C) --> (B & ~C) | ~A
+ if (match(Op1, m_OneUse(m_Not(m_OneUse(
+ m_c_BinOp(Opcode, m_Specific(A), m_Specific(C)))))))
+ return BinaryOperator::Create(
+ FlippedOpcode, Builder.CreateBinOp(Opcode, B, Builder.CreateNot(C)),
+ X);
+ }
+
return nullptr;
}
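A standalone sanity check, not part of this patch, that brute-forces the four identities named in the comments above over single bits; plain unsigned arithmetic stands in for the IR values:

#include <cassert>

int main() {
  for (unsigned A = 0; A <= 1; ++A)
    for (unsigned B = 0; B <= 1; ++B)
      for (unsigned C = 0; C <= 1; ++C) {
        unsigned NotA = ~A & 1, NotB = ~B & 1;
        // (~A & B & C) | ~(A | B | C) --> ~(A | (B ^ C))
        assert(((NotA & B & C) | (~(A | B | C) & 1)) ==
               ((~(A | (B ^ C))) & 1));
        // (~A | B | C) & ~(A & B & C) --> ~A | (B ^ C)
        assert(((NotA | B | C) & (~(A & B & C) & 1)) == (NotA | (B ^ C)));
        // (~A & B & C) | ~(A | B) --> (C | ~B) & ~A
        assert(((NotA & B & C) | (~(A | B) & 1)) == ((C | NotB) & NotA));
        // (~A | B | C) & ~(A & B) --> (C & ~B) | ~A
        assert(((NotA | B | C) & (~(A & B) & 1)) == ((C & NotB) | NotA));
      }
  return 0;
}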
@@ -2102,6 +2151,15 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
Value *Cmp = Builder.CreateICmpSLT(X, Zero, "isneg");
return SelectInst::Create(Cmp, Y, Zero);
}
+ // If there's a 'not' of the shifted value, swap the select operands:
+ // ~(iN X s>> (N-1)) & Y --> (X s< 0) ? 0 : Y
+ if (match(&I, m_c_And(m_OneUse(m_Not(
+ m_AShr(m_Value(X), m_SpecificInt(FullShift)))),
+ m_Value(Y)))) {
+ Constant *Zero = ConstantInt::getNullValue(Ty);
+ Value *Cmp = Builder.CreateICmpSLT(X, Zero, "isneg");
+ return SelectInst::Create(Cmp, Zero, Y);
+ }
// (~x) & y --> ~(x | (~y)) iff that gets rid of inversions
if (sinkNotIntoOtherHandOfAndOrOr(I))
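Worked through for i8: in ~(X s>> 7) & Y, a negative X makes the arithmetic shift produce all ones, the 'not' turns that into zero, and the 'and' yields zero; a non-negative X makes the shift produce zero, the 'not' produce all ones, and the 'and' yield Y. That is exactly (X s< 0) ? 0 : Y, i.e. the earlier fold with the select arms swapped.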
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 7da2669e1d13..14427bd1f2f4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2472,6 +2472,12 @@ static bool isSafeToEliminateVarargsCast(const CallBase &Call,
Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) {
if (!CI->getCalledFunction()) return nullptr;
+ // Skip optimizing notail and musttail calls so
+ // LibCallSimplifier::optimizeCall doesn't have to preserve those invariants.
+  // LibCallSimplifier::optimizeCall should try to preserve tail calls though.
+ if (CI->isMustTailCall() || CI->isNoTailCall())
+ return nullptr;
+
auto InstCombineRAUW = [this](Instruction *From, Value *With) {
replaceInstUsesWith(*From, With);
};
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 33f217659c01..8df4a4529f47 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -157,7 +157,7 @@ Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI,
Amt = Builder.CreateAdd(Amt, Off);
}
- AllocaInst *New = Builder.CreateAlloca(CastElTy, Amt);
+ AllocaInst *New = Builder.CreateAlloca(CastElTy, AI.getAddressSpace(), Amt);
New->setAlignment(AI.getAlign());
New->takeName(&AI);
New->setUsedWithInAlloca(AI.isUsedWithInAlloca());
@@ -965,13 +965,13 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
if (match(Src, m_VScale(DL))) {
if (Trunc.getFunction() &&
Trunc.getFunction()->hasFnAttribute(Attribute::VScaleRange)) {
- unsigned MaxVScale = Trunc.getFunction()
- ->getFnAttribute(Attribute::VScaleRange)
- .getVScaleRangeArgs()
- .second;
- if (MaxVScale > 0 && Log2_32(MaxVScale) < DestWidth) {
- Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1));
- return replaceInstUsesWith(Trunc, VScale);
+ Attribute Attr =
+ Trunc.getFunction()->getFnAttribute(Attribute::VScaleRange);
+ if (Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax()) {
+ if (Log2_32(MaxVScale.getValue()) < DestWidth) {
+ Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1));
+ return replaceInstUsesWith(Trunc, VScale);
+ }
}
}
}
@@ -1337,14 +1337,13 @@ Instruction *InstCombinerImpl::visitZExt(ZExtInst &CI) {
if (match(Src, m_VScale(DL))) {
if (CI.getFunction() &&
CI.getFunction()->hasFnAttribute(Attribute::VScaleRange)) {
- unsigned MaxVScale = CI.getFunction()
- ->getFnAttribute(Attribute::VScaleRange)
- .getVScaleRangeArgs()
- .second;
- unsigned TypeWidth = Src->getType()->getScalarSizeInBits();
- if (MaxVScale > 0 && Log2_32(MaxVScale) < TypeWidth) {
- Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1));
- return replaceInstUsesWith(CI, VScale);
+ Attribute Attr = CI.getFunction()->getFnAttribute(Attribute::VScaleRange);
+ if (Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax()) {
+ unsigned TypeWidth = Src->getType()->getScalarSizeInBits();
+ if (Log2_32(MaxVScale.getValue()) < TypeWidth) {
+ Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1));
+ return replaceInstUsesWith(CI, VScale);
+ }
}
}
}
@@ -1608,13 +1607,12 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) {
if (match(Src, m_VScale(DL))) {
if (CI.getFunction() &&
CI.getFunction()->hasFnAttribute(Attribute::VScaleRange)) {
- unsigned MaxVScale = CI.getFunction()
- ->getFnAttribute(Attribute::VScaleRange)
- .getVScaleRangeArgs()
- .second;
- if (MaxVScale > 0 && Log2_32(MaxVScale) < (SrcBitSize - 1)) {
- Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1));
- return replaceInstUsesWith(CI, VScale);
+ Attribute Attr = CI.getFunction()->getFnAttribute(Attribute::VScaleRange);
+ if (Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax()) {
+ if (Log2_32(MaxVScale.getValue()) < (SrcBitSize - 1)) {
+ Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1));
+ return replaceInstUsesWith(CI, VScale);
+ }
}
}
}
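As a concrete instance of the new attribute-based checks: for a function carrying vscale_range(1,16), getVScaleRangeMax() returns 16 and Log2_32(16) is 4, so a trunc of vscale to any destination wider than 4 bits (and, with their own width checks, the zext and sext cases) is rewritten to compute vscale directly in the destination type.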
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 20c75188ec9f..39b55b028110 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -600,6 +600,7 @@ public:
/// Canonicalize the position of binops relative to shufflevector.
Instruction *foldVectorBinop(BinaryOperator &Inst);
Instruction *foldVectorSelect(SelectInst &Sel);
+ Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf);
/// Given a binary operator, cast instruction, or select which has a PHI node
/// as operand #0, see if we can fold the instruction into the PHI (which is
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 79a8a065d02a..0dbfdba353c4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -163,7 +163,7 @@ static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI,
uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType());
if (!AllocaSize)
return false;
- return isDereferenceableAndAlignedPointer(V, Align(AI->getAlignment()),
+ return isDereferenceableAndAlignedPointer(V, AI->getAlign(),
APInt(64, AllocaSize), DL);
}
@@ -183,7 +183,8 @@ static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC,
if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
if (C->getValue().getActiveBits() <= 64) {
Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
- AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName());
+ AllocaInst *New = IC.Builder.CreateAlloca(NewTy, AI.getAddressSpace(),
+ nullptr, AI.getName());
New->setAlignment(AI.getAlign());
// Scan to the end of the allocation instructions, to skip over a block of
@@ -199,21 +200,13 @@ static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC,
Type *IdxTy = IC.getDataLayout().getIntPtrType(AI.getType());
Value *NullIdx = Constant::getNullValue(IdxTy);
Value *Idx[2] = {NullIdx, NullIdx};
- Instruction *NewI = GetElementPtrInst::CreateInBounds(
+ Instruction *GEP = GetElementPtrInst::CreateInBounds(
NewTy, New, Idx, New->getName() + ".sub");
- IC.InsertNewInstBefore(NewI, *It);
-
- // Gracefully handle allocas in other address spaces.
- if (AI.getType()->getPointerAddressSpace() !=
- NewI->getType()->getPointerAddressSpace()) {
- NewI =
- CastInst::CreatePointerBitCastOrAddrSpaceCast(NewI, AI.getType());
- IC.InsertNewInstBefore(NewI, *It);
- }
+ IC.InsertNewInstBefore(GEP, *It);
// Now make everything use the getelementptr instead of the original
// allocation.
- return IC.replaceInstUsesWith(AI, NewI);
+ return IC.replaceInstUsesWith(AI, GEP);
}
}
@@ -640,7 +633,6 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
return nullptr;
StringRef Name = LI.getName();
- assert(LI.getAlignment() && "Alignment must be set at this point");
if (auto *ST = dyn_cast<StructType>(T)) {
// If the struct only have one element, we unpack.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 779d298da7a4..aca7ec8d7325 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -755,6 +755,15 @@ Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) {
if (simplifyDivRemOfSelectWithZeroOp(I))
return &I;
+ // If the divisor is a select-of-constants, try to constant fold all div ops:
+ // C / (select Cond, TrueC, FalseC) --> select Cond, (C / TrueC), (C / FalseC)
+ // TODO: Adapt simplifyDivRemOfSelectWithZeroOp to allow this and other folds.
+ if (match(Op0, m_ImmConstant()) &&
+ match(Op1, m_Select(m_Value(), m_ImmConstant(), m_ImmConstant()))) {
+ if (Instruction *R = FoldOpIntoSelect(I, cast<SelectInst>(Op1)))
+ return R;
+ }
+
const APInt *C2;
if (match(Op1, m_APInt(C2))) {
Value *X;
@@ -1461,6 +1470,15 @@ Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) {
if (simplifyDivRemOfSelectWithZeroOp(I))
return &I;
+ // If the divisor is a select-of-constants, try to constant fold all rem ops:
+ // C % (select Cond, TrueC, FalseC) --> select Cond, (C % TrueC), (C % FalseC)
+ // TODO: Adapt simplifyDivRemOfSelectWithZeroOp to allow this and other folds.
+ if (match(Op0, m_ImmConstant()) &&
+ match(Op1, m_Select(m_Value(), m_ImmConstant(), m_ImmConstant()))) {
+ if (Instruction *R = FoldOpIntoSelect(I, cast<SelectInst>(Op1)))
+ return R;
+ }
+
if (isa<Constant>(Op1)) {
if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) {
if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
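A small source-level sketch of what the two new folds amount to; the function names are invented and the constants arbitrary:

// 100 / (Cond ? 5 : 4) folds to Cond ? 20 : 25, and
// 100 % (Cond ? 7 : 9) folds to Cond ? 2 : 1, since the dividend and both
// arms of the divisor are immediate constants.
int divOfSelect(bool Cond) { return 100 / (Cond ? 5 : 4); }
int remOfSelect(bool Cond) { return 100 % (Cond ? 7 : 9); }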
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 35739c3b9a21..30f6aab2114b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -664,10 +664,7 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) {
return nullptr;
// When processing loads, we need to propagate two bits of information to the
- // sunk load: whether it is volatile, and what its alignment is. We currently
- // don't sink loads when some have their alignment specified and some don't.
- // visitLoadInst will propagate an alignment onto the load when TD is around,
- // and if TD isn't around, we can't handle the mixed case.
+ // sunk load: whether it is volatile, and what its alignment is.
bool isVolatile = FirstLI->isVolatile();
Align LoadAlignment = FirstLI->getAlign();
unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace();
@@ -699,7 +696,7 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) {
!isSafeAndProfitableToSinkLoad(LI))
return nullptr;
- LoadAlignment = std::min(LoadAlignment, Align(LI->getAlign()));
+ LoadAlignment = std::min(LoadAlignment, LI->getAlign());
// If the PHI is of volatile loads and the load block has multiple
// successors, sinking it would remove a load of the volatile value from
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 518d3952dce5..a6d6b5199105 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1482,7 +1482,12 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp,
if (C0->getType() != Sel.getType())
return nullptr;
- // FIXME: are there any magic icmp predicate+constant pairs we must not touch?
+ // ULT with 'add' of a constant is canonical. See foldICmpAddConstant().
+ // FIXME: Are there more magic icmp predicate+constant pairs we must avoid?
+ // Or should we just abandon this transform entirely?
+ if (Pred == CmpInst::ICMP_ULT && match(X, m_Add(m_Value(), m_Constant())))
+ return nullptr;
+
Value *SelVal0, *SelVal1; // We do not care which one is from where.
match(&Sel, m_Select(m_Value(), m_Value(SelVal0), m_Value(SelVal1)));
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index e357a9da8b12..4dc712f32536 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1595,12 +1595,6 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
simplifyAndSetOp(I, 1, DemandedElts, UndefElts2);
- // Any change to an instruction with potential poison must clear those flags
- // because we can not guarantee those constraints now. Other analysis may
- // determine that it is safe to re-apply the flags.
- if (MadeChange)
- BO->dropPoisonGeneratingFlags();
-
// Output elements are undefined if both are undefined. Consider things
// like undef & 0. The result is known zero, not undef.
UndefElts &= UndefElts2;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 32e537897140..c6a4602e59e3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -363,6 +363,18 @@ static APInt findDemandedEltsByAllUsers(Value *V) {
return UnionUsedElts;
}
+/// Given a constant index for an extractelement or insertelement instruction,
+/// return it with the canonical type if it isn't already canonical. We
+/// arbitrarily pick 64-bit as our canonical type. The actual bitwidth doesn't
+/// matter; we just want a consistent type to simplify CSE.
+ConstantInt *getPreferredVectorIndex(ConstantInt *IndexC) {
+ const unsigned IndexBW = IndexC->getType()->getBitWidth();
+ if (IndexBW == 64 || IndexC->getValue().getActiveBits() > 64)
+ return nullptr;
+ return ConstantInt::get(IndexC->getContext(),
+ IndexC->getValue().zextOrTrunc(64));
+}
+
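For instance, extractelement <4 x float> %v, i32 1 and extractelement <4 x float> %v, i16 1 are both rewritten to use an i64 1 index, so later passes see identical instructions and can CSE them.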
Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
Value *SrcVec = EI.getVectorOperand();
Value *Index = EI.getIndexOperand();
@@ -374,6 +386,10 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
// find a previously computed scalar that was inserted into the vector.
auto *IndexC = dyn_cast<ConstantInt>(Index);
if (IndexC) {
+ // Canonicalize type of constant indices to i64 to simplify CSE
+ if (auto *NewIdx = getPreferredVectorIndex(IndexC))
+ return replaceOperand(EI, 1, NewIdx);
+
ElementCount EC = EI.getVectorOperandType()->getElementCount();
unsigned NumElts = EC.getKnownMinValue();
@@ -401,37 +417,6 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
if (!EC.isScalable() && IndexC->getValue().uge(NumElts))
return nullptr;
- // This instruction only demands the single element from the input vector.
- // Skip for scalable type, the number of elements is unknown at
- // compile-time.
- if (!EC.isScalable() && NumElts != 1) {
- // If the input vector has a single use, simplify it based on this use
- // property.
- if (SrcVec->hasOneUse()) {
- APInt UndefElts(NumElts, 0);
- APInt DemandedElts(NumElts, 0);
- DemandedElts.setBit(IndexC->getZExtValue());
- if (Value *V =
- SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts))
- return replaceOperand(EI, 0, V);
- } else {
- // If the input vector has multiple uses, simplify it based on a union
- // of all elements used.
- APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec);
- if (!DemandedElts.isAllOnes()) {
- APInt UndefElts(NumElts, 0);
- if (Value *V = SimplifyDemandedVectorElts(
- SrcVec, DemandedElts, UndefElts, 0 /* Depth */,
- true /* AllowMultipleUsers */)) {
- if (V != SrcVec) {
- SrcVec->replaceAllUsesWith(V);
- return &EI;
- }
- }
- }
- }
- }
-
if (Instruction *I = foldBitcastExtElt(EI))
return I;
@@ -473,11 +458,9 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
if (auto *I = dyn_cast<Instruction>(SrcVec)) {
if (auto *IE = dyn_cast<InsertElementInst>(I)) {
- // Extracting the inserted element?
- if (IE->getOperand(2) == Index)
- return replaceInstUsesWith(EI, IE->getOperand(1));
- // If the inserted and extracted elements are constants, they must not
- // be the same value, extract from the pre-inserted value instead.
+      // InstSimplify already handled the case where the indices are constants
+      // and equal, so if both are constants here they must differ; extract
+      // from the pre-inserted value instead.
if (isa<Constant>(IE->getOperand(2)) && IndexC)
return replaceOperand(EI, 0, IE->getOperand(0));
} else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
@@ -497,30 +480,27 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
llvm::count_if(GEP->operands(), [](const Value *V) {
return isa<VectorType>(V->getType());
});
- if (VectorOps > 1)
- return nullptr;
- assert(VectorOps == 1 && "Expected exactly one vector GEP operand!");
-
- Value *NewPtr = GEP->getPointerOperand();
- if (isa<VectorType>(NewPtr->getType()))
- NewPtr = Builder.CreateExtractElement(NewPtr, IndexC);
-
- SmallVector<Value *> NewOps;
- for (unsigned I = 1; I != GEP->getNumOperands(); ++I) {
- Value *Op = GEP->getOperand(I);
- if (isa<VectorType>(Op->getType()))
- NewOps.push_back(Builder.CreateExtractElement(Op, IndexC));
- else
- NewOps.push_back(Op);
- }
+ if (VectorOps == 1) {
+ Value *NewPtr = GEP->getPointerOperand();
+ if (isa<VectorType>(NewPtr->getType()))
+ NewPtr = Builder.CreateExtractElement(NewPtr, IndexC);
+
+ SmallVector<Value *> NewOps;
+ for (unsigned I = 1; I != GEP->getNumOperands(); ++I) {
+ Value *Op = GEP->getOperand(I);
+ if (isa<VectorType>(Op->getType()))
+ NewOps.push_back(Builder.CreateExtractElement(Op, IndexC));
+ else
+ NewOps.push_back(Op);
+ }
- GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
- cast<PointerType>(NewPtr->getType())->getElementType(), NewPtr,
- NewOps);
- NewGEP->setIsInBounds(GEP->isInBounds());
- return NewGEP;
+ GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+ cast<PointerType>(NewPtr->getType())->getElementType(), NewPtr,
+ NewOps);
+ NewGEP->setIsInBounds(GEP->isInBounds());
+ return NewGEP;
+ }
}
- return nullptr;
} else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
// If this is extracting an element from a shufflevector, figure out where
// it came from and extract from the appropriate input element instead.
@@ -554,6 +534,44 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
}
}
}
+
+ // Run demanded elements after other transforms as this can drop flags on
+  // binops. If there are two paths to the same final result, we prefer the
+ // one which doesn't force us to drop flags.
+ if (IndexC) {
+ ElementCount EC = EI.getVectorOperandType()->getElementCount();
+ unsigned NumElts = EC.getKnownMinValue();
+ // This instruction only demands the single element from the input vector.
+    // Skip for scalable types; the number of elements is unknown at
+    // compile-time.
+ if (!EC.isScalable() && NumElts != 1) {
+ // If the input vector has a single use, simplify it based on this use
+ // property.
+ if (SrcVec->hasOneUse()) {
+ APInt UndefElts(NumElts, 0);
+ APInt DemandedElts(NumElts, 0);
+ DemandedElts.setBit(IndexC->getZExtValue());
+ if (Value *V =
+ SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts))
+ return replaceOperand(EI, 0, V);
+ } else {
+ // If the input vector has multiple uses, simplify it based on a union
+ // of all elements used.
+ APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec);
+ if (!DemandedElts.isAllOnes()) {
+ APInt UndefElts(NumElts, 0);
+ if (Value *V = SimplifyDemandedVectorElts(
+ SrcVec, DemandedElts, UndefElts, 0 /* Depth */,
+ true /* AllowMultipleUsers */)) {
+ if (V != SrcVec) {
+ SrcVec->replaceAllUsesWith(V);
+ return &EI;
+ }
+ }
+ }
+ }
+ }
+ }
return nullptr;
}
@@ -1476,6 +1494,11 @@ Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
VecOp, ScalarOp, IdxOp, SQ.getWithInstruction(&IE)))
return replaceInstUsesWith(IE, V);
+ // Canonicalize type of constant indices to i64 to simplify CSE
+ if (auto *IndexC = dyn_cast<ConstantInt>(IdxOp))
+ if (auto *NewIdx = getPreferredVectorIndex(IndexC))
+ return replaceOperand(IE, 2, NewIdx);
+
// If the scalar is bitcast and inserted into undef, do the insert in the
// source type followed by bitcast.
// TODO: Generalize for insert into any constant, not just undef?
@@ -2008,9 +2031,7 @@ static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf,
}
/// Try to fold shuffles that are the equivalent of a vector select.
-static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
- InstCombiner::BuilderTy &Builder,
- const DataLayout &DL) {
+Instruction *InstCombinerImpl::foldSelectShuffle(ShuffleVectorInst &Shuf) {
if (!Shuf.isSelect())
return nullptr;
@@ -2118,21 +2139,23 @@ static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
V = Builder.CreateShuffleVector(X, Y, Mask);
}
- Instruction *NewBO = ConstantsAreOp1 ? BinaryOperator::Create(BOpc, V, NewC) :
- BinaryOperator::Create(BOpc, NewC, V);
+ Value *NewBO = ConstantsAreOp1 ? Builder.CreateBinOp(BOpc, V, NewC) :
+ Builder.CreateBinOp(BOpc, NewC, V);
// Flags are intersected from the 2 source binops. But there are 2 exceptions:
// 1. If we changed an opcode, poison conditions might have changed.
// 2. If the shuffle had undef mask elements, the new binop might have undefs
// where the original code did not. But if we already made a safe constant,
// then there's no danger.
- NewBO->copyIRFlags(B0);
- NewBO->andIRFlags(B1);
- if (DropNSW)
- NewBO->setHasNoSignedWrap(false);
- if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
- NewBO->dropPoisonGeneratingFlags();
- return NewBO;
+ if (auto *NewI = dyn_cast<Instruction>(NewBO)) {
+ NewI->copyIRFlags(B0);
+ NewI->andIRFlags(B1);
+ if (DropNSW)
+ NewI->setHasNoSignedWrap(false);
+ if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
+ NewI->dropPoisonGeneratingFlags();
+ }
+ return replaceInstUsesWith(Shuf, NewBO);
}
/// Convert a narrowing shuffle of a bitcasted vector into a vector truncate.
@@ -2497,7 +2520,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
if (Instruction *I = canonicalizeInsertSplat(SVI, Builder))
return I;
- if (Instruction *I = foldSelectShuffle(SVI, Builder, DL))
+ if (Instruction *I = foldSelectShuffle(SVI))
return I;
if (Instruction *I = foldTruncShuffle(SVI, DL.isBigEndian()))
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 1f81624f79e7..eb5eadba194d 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2546,7 +2546,7 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return nullptr;
}
-static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI,
+static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo &TLI,
Instruction *AI) {
if (isa<ConstantPointerNull>(V))
return true;
@@ -2557,12 +2557,34 @@ static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI,
// through bitcasts of V can cause
// the result statement below to be true, even when AI and V (ex:
// i8* ->i32* ->i8* of AI) are the same allocations.
- return isAllocLikeFn(V, TLI) && V != AI;
+ return isAllocLikeFn(V, &TLI) && V != AI;
+}
+
+/// Given a call CB which uses an address UsedV, return true if we can prove the
+/// call's only possible effect is storing to UsedV.
+static bool isRemovableWrite(CallBase &CB, Value *UsedV,
+ const TargetLibraryInfo &TLI) {
+ if (!CB.use_empty())
+ // TODO: add recursion if returned attribute is present
+ return false;
+
+ if (CB.isTerminator())
+ // TODO: remove implementation restriction
+ return false;
+
+ if (!CB.willReturn() || !CB.doesNotThrow())
+ return false;
+
+ // If the only possible side effect of the call is writing to the alloca,
+ // and the result isn't used, we can safely remove any reads implied by the
+ // call including those which might read the alloca itself.
+ Optional<MemoryLocation> Dest = MemoryLocation::getForDest(&CB, TLI);
+ return Dest && Dest->Ptr == UsedV;
}
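A sketch of the kind of pattern this enables; it is not taken from the tests and assumes the memset is recognized as writing only to Buf:

#include <cstring>

// Hypothetical example: the memset's only possible effect is storing into Buf,
// its result is unused, and Buf is never read, so isRemovableWrite() lets
// InstCombine treat the call as a removable user and delete the allocation
// together with the write.
int onlyWritesLocal() {
  char Buf[16];
  std::memset(Buf, 0, sizeof(Buf));
  return 0;
}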
static bool isAllocSiteRemovable(Instruction *AI,
SmallVectorImpl<WeakTrackingVH> &Users,
- const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo &TLI) {
SmallVector<Instruction*, 4> Worklist;
Worklist.push_back(AI);
@@ -2627,12 +2649,17 @@ static bool isAllocSiteRemovable(Instruction *AI,
}
}
- if (isFreeCall(I, TLI)) {
+ if (isRemovableWrite(*cast<CallBase>(I), PI, TLI)) {
+ Users.emplace_back(I);
+ continue;
+ }
+
+ if (isFreeCall(I, &TLI)) {
Users.emplace_back(I);
continue;
}
- if (isReallocLikeFn(I, TLI, true)) {
+ if (isReallocLikeFn(I, &TLI, true)) {
Users.emplace_back(I);
Worklist.push_back(I);
continue;
@@ -2676,7 +2703,7 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
}
- if (isAllocSiteRemovable(&MI, Users, &TLI)) {
+ if (isAllocSiteRemovable(&MI, Users, TLI)) {
for (unsigned i = 0, e = Users.size(); i != e; ++i) {
// Lowering all @llvm.objectsize calls first because they may
// use a bitcast/GEP of the alloca we are removing.
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 38c219ce3465..9f26b37bbc79 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -232,6 +232,12 @@ static cl::opt<int> ClTrackOrigins("dfsan-track-origins",
cl::desc("Track origins of labels"),
cl::Hidden, cl::init(0));
+static cl::opt<bool> ClIgnorePersonalityRoutine(
+ "dfsan-ignore-personality-routine",
+ cl::desc("If a personality routine is marked uninstrumented from the ABI "
+ "list, do not create a wrapper for it."),
+ cl::Hidden, cl::init(false));
+
static StringRef getGlobalTypeString(const GlobalValue &G) {
// Types of GlobalVariables are always pointer types.
Type *GType = G.getValueType();
@@ -1115,7 +1121,7 @@ DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName,
BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF);
if (F->isVarArg()) {
- NewF->removeFnAttrs(AttrBuilder().addAttribute("split-stack"));
+ NewF->removeFnAttr("split-stack");
CallInst::Create(DFSanVarargWrapperFn,
IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "",
BB);
@@ -1357,9 +1363,24 @@ bool DataFlowSanitizer::runImpl(Module &M) {
std::vector<Function *> FnsToInstrument;
SmallPtrSet<Function *, 2> FnsWithNativeABI;
SmallPtrSet<Function *, 2> FnsWithForceZeroLabel;
+ SmallPtrSet<Constant *, 1> PersonalityFns;
for (Function &F : M)
- if (!F.isIntrinsic() && !DFSanRuntimeFunctions.contains(&F))
+ if (!F.isIntrinsic() && !DFSanRuntimeFunctions.contains(&F)) {
FnsToInstrument.push_back(&F);
+ if (F.hasPersonalityFn())
+ PersonalityFns.insert(F.getPersonalityFn()->stripPointerCasts());
+ }
+
+ if (ClIgnorePersonalityRoutine) {
+ for (auto *C : PersonalityFns) {
+ assert(isa<Function>(C) && "Personality routine is not a function!");
+ Function *F = cast<Function>(C);
+ if (!isInstrumented(F))
+ FnsToInstrument.erase(
+ std::remove(FnsToInstrument.begin(), FnsToInstrument.end(), F),
+ FnsToInstrument.end());
+ }
+ }
// Give function aliases prefixes when necessary, and build wrappers where the
// instrumentedness is inconsistent.
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index d1d3b8ffdf7a..de34348606ef 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -26,7 +26,9 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
@@ -40,6 +42,7 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/InstrProfCorrelator.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
@@ -57,6 +60,13 @@ using namespace llvm;
#define DEBUG_TYPE "instrprof"
+namespace llvm {
+cl::opt<bool>
+ DebugInfoCorrelate("debug-info-correlate", cl::ZeroOrMore,
+ cl::desc("Use debug info to correlate profiles."),
+ cl::init(false));
+} // namespace llvm
+
namespace {
cl::opt<bool> DoHashBasedCounterSplit(
@@ -641,6 +651,12 @@ void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) {
}
void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
+ // TODO: Value profiling heavily depends on the data section which is omitted
+ // in lightweight mode. We need to move the value profile pointer to the
+ // Counter struct to get this working.
+ assert(
+ !DebugInfoCorrelate &&
+ "Value profiling is not yet supported with lightweight instrumentation");
GlobalVariable *Name = Ind->getName();
auto It = ProfileDataMap.find(Name);
assert(It != ProfileDataMap.end() && It->second.DataVar &&
@@ -855,6 +871,12 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
GlobalValue::LinkageTypes Linkage = NamePtr->getLinkage();
GlobalValue::VisibilityTypes Visibility = NamePtr->getVisibility();
+ // Use internal rather than private linkage so the counter variable shows up
+ // in the symbol table when using debug info for correlation.
+ if (DebugInfoCorrelate && TT.isOSBinFormatMachO() &&
+ Linkage == GlobalValue::PrivateLinkage)
+ Linkage = GlobalValue::InternalLinkage;
+
// Due to the limitation of binder as of 2021/09/28, the duplicate weak
// symbols in the same csect won't be discarded. When there are duplicate weak
// symbols, we can NOT guarantee that the relocations get resolved to the
@@ -916,6 +938,42 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
MaybeSetComdat(CounterPtr);
CounterPtr->setLinkage(Linkage);
PD.RegionCounters = CounterPtr;
+ if (DebugInfoCorrelate) {
+ if (auto *SP = Fn->getSubprogram()) {
+ DIBuilder DB(*M, true, SP->getUnit());
+ Metadata *FunctionNameAnnotation[] = {
+ MDString::get(Ctx, InstrProfCorrelator::FunctionNameAttributeName),
+ MDString::get(Ctx, getPGOFuncNameVarInitializer(NamePtr)),
+ };
+ Metadata *CFGHashAnnotation[] = {
+ MDString::get(Ctx, InstrProfCorrelator::CFGHashAttributeName),
+ ConstantAsMetadata::get(Inc->getHash()),
+ };
+ Metadata *NumCountersAnnotation[] = {
+ MDString::get(Ctx, InstrProfCorrelator::NumCountersAttributeName),
+ ConstantAsMetadata::get(Inc->getNumCounters()),
+ };
+ auto Annotations = DB.getOrCreateArray({
+ MDNode::get(Ctx, FunctionNameAnnotation),
+ MDNode::get(Ctx, CFGHashAnnotation),
+ MDNode::get(Ctx, NumCountersAnnotation),
+ });
+ auto *DICounter = DB.createGlobalVariableExpression(
+ SP, CounterPtr->getName(), /*LinkageName=*/StringRef(), SP->getFile(),
+ /*LineNo=*/0, DB.createUnspecifiedType("Profile Data Type"),
+ CounterPtr->hasLocalLinkage(), /*IsDefined=*/true, /*Expr=*/nullptr,
+ /*Decl=*/nullptr, /*TemplateParams=*/nullptr, /*AlignInBits=*/0,
+ Annotations);
+ CounterPtr->addDebugInfo(DICounter);
+ DB.finalize();
+ } else {
+ std::string Msg = ("Missing debug info for function " + Fn->getName() +
+ "; required for profile correlation.")
+ .str();
+ Ctx.diagnose(
+ DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
+ }
+ }
auto *Int8PtrTy = Type::getInt8PtrTy(Ctx);
// Allocate statically the array of pointers to value profile nodes for
@@ -939,6 +997,9 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx));
}
+ if (DebugInfoCorrelate)
+ return PD.RegionCounters;
+
// Create data variable.
auto *IntPtrTy = M->getDataLayout().getIntPtrType(M->getContext());
auto *Int16Ty = Type::getInt16Ty(Ctx);
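In rough terms, the annotations attached above record the function name, the CFG hash, and the number of counters in the counter variable's debug info, so a later profile-correlation step can reconstruct what would otherwise live in the omitted profile data section; the exact consumer-side workflow is outside this patch.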
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 4d15b784f486..446e601cd4d7 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -307,6 +307,11 @@ static cl::opt<bool>
cl::desc("Enable KernelMemorySanitizer instrumentation"),
cl::Hidden, cl::init(false));
+static cl::opt<bool>
+ ClDisableChecks("msan-disable-checks",
+ cl::desc("Apply no_sanitize to the whole file"), cl::Hidden,
+ cl::init(false));
+
// This is an experiment to enable handling of cases where shadow is a non-zero
// compile-time constant. For some unexplainable reason they were silently
// ignored in the instrumentation.
@@ -1095,7 +1100,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
MemorySanitizerVisitor(Function &F, MemorySanitizer &MS,
const TargetLibraryInfo &TLI)
: F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)), TLI(&TLI) {
- bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeMemory);
+ bool SanitizeFunction =
+ F.hasFnAttribute(Attribute::SanitizeMemory) && !ClDisableChecks;
InsertChecks = SanitizeFunction;
PropagateShadow = SanitizeFunction;
PoisonStack = SanitizeFunction && ClPoisonStack;
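Roughly speaking (the function below is made up), building with -mllvm -msan-disable-checks makes every function behave as if it carried the attribute in this sketch: no checks are inserted and no shadow is propagated:

// Hypothetical equivalent of compiling the whole file with
// -mllvm -msan-disable-checks: MSan treats the function as not sanitized.
__attribute__((no_sanitize("memory")))
int readWithoutChecks(const int *P) {
  return *P; // no shadow check emitted for this load
}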
@@ -1214,7 +1220,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *Shadow = SI->isAtomic() ? getCleanShadow(Val) : getShadow(Val);
Value *ShadowPtr, *OriginPtr;
Type *ShadowTy = Shadow->getType();
- const Align Alignment = assumeAligned(SI->getAlignment());
+ const Align Alignment = SI->getAlign();
const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment);
std::tie(ShadowPtr, OriginPtr) =
getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ true);
@@ -3887,8 +3893,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
&I, IRB, IRB.getInt8Ty(), Align(1), /*isStore*/ true);
Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0);
- IRB.CreateMemSet(ShadowBase, PoisonValue, Len,
- MaybeAlign(I.getAlignment()));
+ IRB.CreateMemSet(ShadowBase, PoisonValue, Len, I.getAlign());
}
if (PoisonStack && MS.TrackOrigins) {
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index af5946325bbb..b6ba1fc2132c 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -273,14 +273,14 @@ static cl::opt<bool> PGOVerifyBFI(
"internal option -pass-remakrs-analysis=pgo."));
static cl::opt<unsigned> PGOVerifyBFIRatio(
- "pgo-verify-bfi-ratio", cl::init(5), cl::Hidden,
- cl::desc("Set the threshold for pgo-verify-big -- only print out "
+ "pgo-verify-bfi-ratio", cl::init(2), cl::Hidden,
+ cl::desc("Set the threshold for pgo-verify-bfi: only print out "
"mismatched BFI if the difference percentage is greater than "
"this value (in percentage)."));
static cl::opt<unsigned> PGOVerifyBFICutoff(
- "pgo-verify-bfi-cutoff", cl::init(1), cl::Hidden,
- cl::desc("Set the threshold for pgo-verify-bfi -- skip the counts whose "
+ "pgo-verify-bfi-cutoff", cl::init(5), cl::Hidden,
+ cl::desc("Set the threshold for pgo-verify-bfi: skip the counts whose "
"profile count value is below."));
namespace llvm {
@@ -291,6 +291,8 @@ extern cl::opt<PGOViewCountsType> PGOViewCounts;
// Command line option to specify the name of the function for CFG dump
// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
extern cl::opt<std::string> ViewBlockFreqFuncName;
+
+extern cl::opt<bool> DebugInfoCorrelate;
} // namespace llvm
static cl::opt<bool>
@@ -467,8 +469,9 @@ private:
createProfileFileNameVar(M, InstrProfileOutput);
// The variable in a comdat may be discarded by LTO. Ensure the
// declaration will be retained.
- appendToCompilerUsed(
- M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, PGOInstrumentEntry));
+ appendToCompilerUsed(M, createIRLevelProfileFlagVar(M, /*IsCS=*/true,
+ PGOInstrumentEntry,
+ DebugInfoCorrelate));
return false;
}
std::string InstrProfileOutput;
@@ -1616,7 +1619,8 @@ static bool InstrumentAllFunctions(
// For the context-sensitive instrumentation, we should have a separate pass
// (before LTO/ThinLTO linking) to create these variables.
if (!IsCS)
- createIRLevelProfileFlagVar(M, /*IsCS=*/false, PGOInstrumentEntry);
+ createIRLevelProfileFlagVar(M, /*IsCS=*/false, PGOInstrumentEntry,
+ DebugInfoCorrelate);
std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
collectComdatMembers(M, ComdatMembers);
@@ -1638,8 +1642,9 @@ PGOInstrumentationGenCreateVar::run(Module &M, ModuleAnalysisManager &AM) {
createProfileFileNameVar(M, CSInstrName);
// The variable in a comdat may be discarded by LTO. Ensure the declaration
// will be retained.
- appendToCompilerUsed(
- M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, PGOInstrumentEntry));
+ appendToCompilerUsed(M, createIRLevelProfileFlagVar(M, /*IsCS=*/true,
+ PGOInstrumentEntry,
+ DebugInfoCorrelate));
return PreservedAnalyses::all();
}
@@ -1774,7 +1779,7 @@ static void verifyFuncBFI(PGOUseFunc &Func, LoopInfo &LI,
uint64_t Diff = (BFICountValue >= CountValue)
? BFICountValue - CountValue
: CountValue - BFICountValue;
- if (Diff < CountValue / 100 * PGOVerifyBFIRatio)
+ if (Diff <= CountValue / 100 * PGOVerifyBFIRatio)
continue;
}
BBMisMatchNum++;
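
With the new defaults (pgo-verify-bfi-ratio=2, pgo-verify-bfi-cutoff=5), a block is reported only when its profile count is at least the cutoff and its BFI-derived count differs from it by more than 2%. A minimal standalone sketch of that check, based on the option descriptions above rather than on the pass internals (the helper name is made up):

#include <cstdint>

// Hypothetical helper mirroring the pgo-verify-bfi option semantics.
static bool reportBFIMismatch(uint64_t CountValue, uint64_t BFICountValue) {
  const unsigned Ratio = 2;  // pgo-verify-bfi-ratio, in percent
  const unsigned Cutoff = 5; // pgo-verify-bfi-cutoff
  if (CountValue < Cutoff)
    return false; // skip low-count blocks entirely
  uint64_t Diff = BFICountValue >= CountValue ? BFICountValue - CountValue
                                              : CountValue - BFICountValue;
  // For CountValue == 1000, differences up to 20 are tolerated.
  return Diff > CountValue / 100 * Ratio;
}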
diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 27f54f8026e1..37a7053d778e 100644
--- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -271,8 +271,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
// subtree of BB (subtree not including the BB itself).
DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap;
InsertPtsMap.reserve(Orders.size() + 1);
- for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) {
- BasicBlock *Node = *RIt;
+ for (BasicBlock *Node : llvm::reverse(Orders)) {
bool NodeInBBs = BBs.count(Node);
auto &InsertPts = InsertPtsMap[Node].first;
BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second;
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 8c4523206070..dda1a2f08076 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -588,7 +588,7 @@ struct AllSwitchPaths {
PrevBB = BB;
}
- if (TPath.isExitValueSet())
+ if (TPath.isExitValueSet() && isSupported(TPath))
TPaths.push_back(TPath);
}
}
@@ -683,6 +683,62 @@ private:
return Res;
}
+ /// The determinator BB should precede the switch-defining BB.
+ ///
+ /// Otherwise, it is possible that the state defined in the determinator block
+ /// defines the state for the next iteration of the loop, rather than for the
+ /// current one.
+ ///
+ /// Currently supported paths:
+ /// \code
+ /// < switch bb1 determ def > [ 42, determ ]
+ /// < switch_and_def bb1 determ > [ 42, determ ]
+ /// < switch_and_def_and_determ bb1 > [ 42, switch_and_def_and_determ ]
+ /// \endcode
+ ///
+ /// Unsupported paths:
+ /// \code
+ /// < switch bb1 def determ > [ 43, determ ]
+ /// < switch_and_determ bb1 def > [ 43, switch_and_determ ]
+ /// \endcode
+ bool isSupported(const ThreadingPath &TPath) {
+ Instruction *SwitchCondI = dyn_cast<Instruction>(Switch->getCondition());
+ assert(SwitchCondI);
+ if (!SwitchCondI)
+ return false;
+
+ const BasicBlock *SwitchCondDefBB = SwitchCondI->getParent();
+ const BasicBlock *SwitchCondUseBB = Switch->getParent();
+ const BasicBlock *DeterminatorBB = TPath.getDeterminatorBB();
+
+ assert(
+ SwitchCondUseBB == TPath.getPath().front() &&
+ "The first BB in a threading path should have the switch instruction");
+ if (SwitchCondUseBB != TPath.getPath().front())
+ return false;
+
+ // Make DeterminatorBB the first element in Path.
+ PathType Path = TPath.getPath();
+ auto ItDet = std::find(Path.begin(), Path.end(), DeterminatorBB);
+ std::rotate(Path.begin(), ItDet, Path.end());
+
+ bool IsDetBBSeen = false;
+ bool IsDefBBSeen = false;
+ bool IsUseBBSeen = false;
+ for (BasicBlock *BB : Path) {
+ if (BB == DeterminatorBB)
+ IsDetBBSeen = true;
+ if (BB == SwitchCondDefBB)
+ IsDefBBSeen = true;
+ if (BB == SwitchCondUseBB)
+ IsUseBBSeen = true;
+ if (IsDetBBSeen && IsUseBBSeen && !IsDefBBSeen)
+ return false;
+ }
+
+ return true;
+ }
+
SwitchInst *Switch;
BasicBlock *SwitchBlock;
OptimizationRemarkEmitter *ORE;
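
The rule documented for isSupported() can be exercised on plain block identifiers: rotate the path so the determinator comes first, then require that the block defining the switch condition is seen before the block using it. A minimal sketch, using integers as stand-ins for BasicBlock pointers (all names below are illustrative, not taken from the pass):

#include <algorithm>
#include <vector>

// Returns false when, walking the rotated path, the switch block is reached
// before the condition has been (re)defined -- i.e. the determinator would
// feed the next loop iteration rather than the current one.
static bool determinatorPrecedesDef(std::vector<int> Path, int Determ,
                                    int DefBB, int UseBB) {
  auto It = std::find(Path.begin(), Path.end(), Determ);
  std::rotate(Path.begin(), It, Path.end()); // determinator becomes Path[0]
  bool SeenDet = false, SeenDef = false, SeenUse = false;
  for (int BB : Path) {
    SeenDet |= (BB == Determ);
    SeenDef |= (BB == DefBB);
    SeenUse |= (BB == UseBB);
    if (SeenDet && SeenUse && !SeenDef)
      return false;
  }
  return true;
}

On the supported path from the comment, < switch bb1 determ def >, the rotated walk sees the definition before the switch; on < switch bb1 def determ > it does not, matching the unsupported case.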
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index e0d3a6accadd..eadbb4293539 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -175,44 +175,6 @@ static cl::opt<bool>
using OverlapIntervalsTy = std::map<int64_t, int64_t>;
using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>;
-/// If the value of this instruction and the memory it writes to is unused, may
-/// we delete this instruction?
-static bool isRemovable(Instruction *I) {
- // Don't remove volatile/atomic stores.
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isUnordered();
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default: llvm_unreachable("Does not have LocForWrite");
- case Intrinsic::lifetime_end:
- // Never remove dead lifetime_end's, e.g. because it is followed by a
- // free.
- return false;
- case Intrinsic::init_trampoline:
- // Always safe to remove init_trampoline.
- return true;
- case Intrinsic::memset:
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
- case Intrinsic::memcpy_inline:
- // Don't remove volatile memory intrinsics.
- return !cast<MemIntrinsic>(II)->isVolatile();
- case Intrinsic::memcpy_element_unordered_atomic:
- case Intrinsic::memmove_element_unordered_atomic:
- case Intrinsic::memset_element_unordered_atomic:
- case Intrinsic::masked_store:
- return true;
- }
- }
-
- // note: only get here for calls with analyzable writes - i.e. libcalls
- if (auto *CB = dyn_cast<CallBase>(I))
- return CB->use_empty();
-
- return false;
-}
-
/// Returns true if the end of this instruction can be safely shortened in
/// length.
static bool isShortenableAtTheEnd(Instruction *I) {
@@ -835,7 +797,7 @@ struct DSEState {
auto *MD = dyn_cast_or_null<MemoryDef>(MA);
if (MD && MemDefs.size() < MemorySSADefsPerBlockLimit &&
- (getLocForWriteEx(&I) || isMemTerminatorInst(&I)))
+ (getLocForWrite(&I) || isMemTerminatorInst(&I)))
MemDefs.push_back(MD);
}
}
@@ -1022,48 +984,39 @@ struct DSEState {
return I.first->second;
}
- Optional<MemoryLocation> getLocForWriteEx(Instruction *I) const {
+ Optional<MemoryLocation> getLocForWrite(Instruction *I) const {
if (!I->mayWriteToMemory())
return None;
- if (auto *MTI = dyn_cast<AnyMemIntrinsic>(I))
- return {MemoryLocation::getForDest(MTI)};
+ if (auto *CB = dyn_cast<CallBase>(I))
+ return MemoryLocation::getForDest(CB, TLI);
+
+ return MemoryLocation::getOrNone(I);
+ }
+
+ /// Assuming this instruction has a dead analyzable write, can we delete
+ /// this instruction?
+ bool isRemovable(Instruction *I) {
+ assert(getLocForWrite(I) && "Must have analyzable write");
+
+ // Don't remove volatile/atomic stores.
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isUnordered();
if (auto *CB = dyn_cast<CallBase>(I)) {
- // If the functions may write to memory we do not know about, bail out.
- if (!CB->onlyAccessesArgMemory() &&
- !CB->onlyAccessesInaccessibleMemOrArgMem())
- return None;
+ // Don't remove volatile memory intrinsics.
+ if (auto *MI = dyn_cast<MemIntrinsic>(CB))
+ return !MI->isVolatile();
- LibFunc LF;
- if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
- switch (LF) {
- case LibFunc_strncpy:
- if (const auto *Len = dyn_cast<ConstantInt>(CB->getArgOperand(2)))
- return MemoryLocation(CB->getArgOperand(0),
- LocationSize::precise(Len->getZExtValue()),
- CB->getAAMetadata());
- LLVM_FALLTHROUGH;
- case LibFunc_strcpy:
- case LibFunc_strcat:
- case LibFunc_strncat:
- return {MemoryLocation::getAfter(CB->getArgOperand(0))};
- default:
- break;
- }
- }
- switch (CB->getIntrinsicID()) {
- case Intrinsic::init_trampoline:
- return {MemoryLocation::getAfter(CB->getArgOperand(0))};
- case Intrinsic::masked_store:
- return {MemoryLocation::getForArgument(CB, 1, TLI)};
- default:
- break;
- }
- return None;
+ // Never remove dead lifetime intrinsics, e.g. because they are followed
+ // by a free.
+ if (CB->isLifetimeStartOrEnd())
+ return false;
+
+ return CB->use_empty() && CB->willReturn() && CB->doesNotThrow();
}
- return MemoryLocation::getOrNone(I);
+ return false;
}
/// Returns true if \p UseInst completely overwrites \p DefLoc
@@ -1081,7 +1034,7 @@ struct DSEState {
return false;
int64_t InstWriteOffset, DepWriteOffset;
- if (auto CC = getLocForWriteEx(UseInst))
+ if (auto CC = getLocForWrite(UseInst))
return isOverwrite(UseInst, DefInst, *CC, DefLoc, InstWriteOffset,
DepWriteOffset) == OW_Complete;
return false;
@@ -1093,7 +1046,7 @@ struct DSEState {
<< *Def->getMemoryInst()
<< ") is at the end the function \n");
- auto MaybeLoc = getLocForWriteEx(Def->getMemoryInst());
+ auto MaybeLoc = getLocForWrite(Def->getMemoryInst());
if (!MaybeLoc) {
LLVM_DEBUG(dbgs() << " ... could not get location for write.\n");
return false;
@@ -1237,30 +1190,14 @@ struct DSEState {
/// loop. In particular, this guarantees that it only references a single
/// MemoryLocation during execution of the containing function.
bool isGuaranteedLoopInvariant(const Value *Ptr) {
- auto IsGuaranteedLoopInvariantBase = [this](const Value *Ptr) {
- Ptr = Ptr->stripPointerCasts();
- if (auto *I = dyn_cast<Instruction>(Ptr)) {
- if (isa<AllocaInst>(Ptr))
- return true;
-
- if (isAllocLikeFn(I, &TLI))
- return true;
-
- return false;
- }
- return true;
- };
-
Ptr = Ptr->stripPointerCasts();
- if (auto *I = dyn_cast<Instruction>(Ptr)) {
- if (I->getParent()->isEntryBlock())
- return true;
- }
- if (auto *GEP = dyn_cast<GEPOperator>(Ptr)) {
- return IsGuaranteedLoopInvariantBase(GEP->getPointerOperand()) &&
- GEP->hasAllConstantIndices();
- }
- return IsGuaranteedLoopInvariantBase(Ptr);
+ if (auto *GEP = dyn_cast<GEPOperator>(Ptr))
+ if (GEP->hasAllConstantIndices())
+ Ptr = GEP->getPointerOperand()->stripPointerCasts();
+
+ if (auto *I = dyn_cast<Instruction>(Ptr))
+ return I->getParent()->isEntryBlock();
+ return true;
}
// Find a MemoryDef writing to \p KillingLoc and dominating \p StartAccess,
@@ -1372,7 +1309,7 @@ struct DSEState {
// If Current does not have an analyzable write location or is not
// removable, skip it.
- CurrentLoc = getLocForWriteEx(CurrentI);
+ CurrentLoc = getLocForWrite(CurrentI);
if (!CurrentLoc || !isRemovable(CurrentI)) {
CanOptimize = false;
continue;
@@ -1729,14 +1666,13 @@ struct DSEState {
LLVM_DEBUG(
dbgs()
<< "Trying to eliminate MemoryDefs at the end of the function\n");
- for (int I = MemDefs.size() - 1; I >= 0; I--) {
- MemoryDef *Def = MemDefs[I];
- if (SkipStores.contains(Def) || !isRemovable(Def->getMemoryInst()))
+ for (MemoryDef *Def : llvm::reverse(MemDefs)) {
+ if (SkipStores.contains(Def))
continue;
Instruction *DefI = Def->getMemoryInst();
- auto DefLoc = getLocForWriteEx(DefI);
- if (!DefLoc)
+ auto DefLoc = getLocForWrite(DefI);
+ if (!DefLoc || !isRemovable(DefI))
continue;
// NOTE: Currently eliminating writes at the end of a function is limited
@@ -1763,13 +1699,19 @@ struct DSEState {
/// \returns true if \p Def is a no-op store, either because it
/// directly stores back a loaded value or stores zero to a calloced object.
bool storeIsNoop(MemoryDef *Def, const Value *DefUO) {
- StoreInst *Store = dyn_cast<StoreInst>(Def->getMemoryInst());
- MemSetInst *MemSet = dyn_cast<MemSetInst>(Def->getMemoryInst());
+ Instruction *DefI = Def->getMemoryInst();
+ StoreInst *Store = dyn_cast<StoreInst>(DefI);
+ MemSetInst *MemSet = dyn_cast<MemSetInst>(DefI);
Constant *StoredConstant = nullptr;
if (Store)
StoredConstant = dyn_cast<Constant>(Store->getOperand(0));
- if (MemSet)
+ else if (MemSet)
StoredConstant = dyn_cast<Constant>(MemSet->getValue());
+ else
+ return false;
+
+ if (!isRemovable(DefI))
+ return false;
if (StoredConstant && StoredConstant->isNullValue()) {
auto *DefUOInst = dyn_cast<Instruction>(DefUO);
@@ -1902,7 +1844,7 @@ struct DSEState {
bool Changed = false;
for (auto OI : IOL) {
Instruction *DeadI = OI.first;
- MemoryLocation Loc = *getLocForWriteEx(DeadI);
+ MemoryLocation Loc = *getLocForWrite(DeadI);
assert(isRemovable(DeadI) && "Expect only removable instruction");
const Value *Ptr = Loc.Ptr->stripPointerCasts();
@@ -1925,9 +1867,14 @@ struct DSEState {
LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs that write the "
"already existing value\n");
for (auto *Def : MemDefs) {
- if (SkipStores.contains(Def) || MSSA.isLiveOnEntryDef(Def) ||
- !isRemovable(Def->getMemoryInst()))
+ if (SkipStores.contains(Def) || MSSA.isLiveOnEntryDef(Def))
continue;
+
+ Instruction *DefInst = Def->getMemoryInst();
+ auto MaybeDefLoc = getLocForWrite(DefInst);
+ if (!MaybeDefLoc || !isRemovable(DefInst))
+ continue;
+
MemoryDef *UpperDef;
// To conserve compile-time, we avoid walking to the next clobbering def.
// Instead, we just try to get the optimized access, if it exists. DSE
@@ -1939,17 +1886,14 @@ struct DSEState {
if (!UpperDef || MSSA.isLiveOnEntryDef(UpperDef))
continue;
- Instruction *DefInst = Def->getMemoryInst();
Instruction *UpperInst = UpperDef->getMemoryInst();
- auto IsRedundantStore = [this, DefInst,
- UpperInst](MemoryLocation UpperLoc) {
+ auto IsRedundantStore = [&]() {
if (DefInst->isIdenticalTo(UpperInst))
return true;
if (auto *MemSetI = dyn_cast<MemSetInst>(UpperInst)) {
if (auto *SI = dyn_cast<StoreInst>(DefInst)) {
- auto MaybeDefLoc = getLocForWriteEx(DefInst);
- if (!MaybeDefLoc)
- return false;
+ // MemSetInst must have a write location.
+ MemoryLocation UpperLoc = *getLocForWrite(UpperInst);
int64_t InstWriteOffset = 0;
int64_t DepWriteOffset = 0;
auto OR = isOverwrite(UpperInst, DefInst, UpperLoc, *MaybeDefLoc,
@@ -1962,9 +1906,7 @@ struct DSEState {
return false;
};
- auto MaybeUpperLoc = getLocForWriteEx(UpperInst);
- if (!MaybeUpperLoc || !IsRedundantStore(*MaybeUpperLoc) ||
- isReadClobber(*MaybeUpperLoc, DefInst))
+ if (!IsRedundantStore() || isReadClobber(*MaybeDefLoc, DefInst))
continue;
LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *DefInst
<< '\n');
@@ -1995,7 +1937,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
MaybeKillingLoc = State.getLocForTerminator(KillingI).map(
[](const std::pair<MemoryLocation, bool> &P) { return P.first; });
else
- MaybeKillingLoc = State.getLocForWriteEx(KillingI);
+ MaybeKillingLoc = State.getLocForWrite(KillingI);
if (!MaybeKillingLoc) {
LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for "
@@ -2059,7 +2001,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
if (!DebugCounter::shouldExecute(MemorySSACounter))
continue;
- MemoryLocation DeadLoc = *State.getLocForWriteEx(DeadI);
+ MemoryLocation DeadLoc = *State.getLocForWrite(DeadI);
if (IsMemTerm) {
const Value *DeadUndObj = getUnderlyingObject(DeadLoc.Ptr);
@@ -2124,8 +2066,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
}
// Check if the store is a no-op.
- if (!Shortend && isRemovable(KillingI) &&
- State.storeIsNoop(KillingDef, KillingUndObj)) {
+ if (!Shortend && State.storeIsNoop(KillingDef, KillingUndObj)) {
LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *KillingI
<< '\n');
State.deleteDeadInstruction(KillingI);
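
One consequence of folding isRemovable() into DSEState shows up in the call handling: a library call whose written memory is dead may only be deleted when dropping it cannot change observable control flow, so the call must be otherwise unused, guaranteed to return, and unable to throw. A condensed restatement of that predicate, illustrative only (it mirrors the new branch rather than adding to it):

#include "llvm/IR/InstrTypes.h"

// Mirrors the call case of the new DSEState::isRemovable.
static bool deadWriteCallIsDeletable(const llvm::CallBase &CB) {
  return CB.use_empty() && CB.willReturn() && CB.doesNotThrow();
}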
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 90f71f7729a7..a24997dd3fd4 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -1366,8 +1366,16 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
continue;
}
- if (auto *I = dyn_cast<Instruction>(V))
- I->andIRFlags(&Inst);
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ // If I being poison triggers UB, there is no need to drop those
+ // flags. Otherwise, only retain flags present on both I and Inst.
+ // TODO: Currently some fast-math flags are not treated as
+ // poison-generating even though they should. Until this is fixed,
+ // always retain flags present on both I and Inst for floating point
+ // instructions.
+ if (isa<FPMathOperator>(I) || (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I)))
+ I->andIRFlags(&Inst);
+ }
Inst.replaceAllUsesWith(V);
salvageKnowledge(&Inst, &AC);
removeMSSA(Inst);
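
Instruction::andIRFlags keeps only the flags common to both instructions, which is why the code above intersects flags onto the surviving value I only when that can matter. A small self-contained sketch of the intersection, assuming a standard LLVM build; the module, function and value names are made up for illustration:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("andirflags-demo", Ctx);
  auto *I32 = Type::getInt32Ty(Ctx);
  auto *FT = FunctionType::get(I32, {I32, I32}, /*isVarArg=*/false);
  auto *F = Function::Create(FT, Function::ExternalLinkage, "f", &M);
  auto *BB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> B(BB);
  // Two equivalent adds: one carries nsw, the other does not.
  auto *WithNSW =
      cast<BinaryOperator>(B.CreateNSWAdd(F->getArg(0), F->getArg(1)));
  auto *Plain = cast<BinaryOperator>(B.CreateAdd(F->getArg(0), F->getArg(1)));
  WithNSW->andIRFlags(Plain); // keep only common flags: nsw is dropped
  B.CreateRet(WithNSW);
  outs() << (WithNSW->hasNoSignedWrap() ? "nsw kept\n" : "nsw dropped\n");
  return verifyModule(M, &errs()); // non-zero exit if the module is broken
}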
diff --git a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
index e54a270fb276..44017b555769 100644
--- a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -13,10 +13,12 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/FlattenCFG.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -24,11 +26,11 @@ using namespace llvm;
#define DEBUG_TYPE "flattencfg"
namespace {
-struct FlattenCFGPass : public FunctionPass {
+struct FlattenCFGLegacyPass : public FunctionPass {
static char ID; // Pass identification, replacement for typeid
public:
- FlattenCFGPass() : FunctionPass(ID) {
- initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry());
+ FlattenCFGLegacyPass() : FunctionPass(ID) {
+ initializeFlattenCFGLegacyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
@@ -39,21 +41,10 @@ public:
private:
AliasAnalysis *AA;
};
-}
-
-char FlattenCFGPass::ID = 0;
-INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
- false)
-
-// Public interface to the FlattenCFG pass
-FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); }
/// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function,
/// iterating until no more changes are made.
-static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
+bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
bool Changed = false;
bool LocalChange = true;
@@ -78,8 +69,22 @@ static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
}
return Changed;
}
+} // namespace
-bool FlattenCFGPass::runOnFunction(Function &F) {
+char FlattenCFGLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(FlattenCFGLegacyPass, "flattencfg", "Flatten the CFG",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(FlattenCFGLegacyPass, "flattencfg", "Flatten the CFG",
+ false, false)
+
+// Public interface to the FlattenCFG pass
+FunctionPass *llvm::createFlattenCFGPass() {
+ return new FlattenCFGLegacyPass();
+}
+
+bool FlattenCFGLegacyPass::runOnFunction(Function &F) {
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
bool EverChanged = false;
// iterativelyFlattenCFG can make some blocks dead.
@@ -89,3 +94,15 @@ bool FlattenCFGPass::runOnFunction(Function &F) {
}
return EverChanged;
}
+
+PreservedAnalyses FlattenCFGPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ bool EverChanged = false;
+ AliasAnalysis *AA = &AM.getResult<AAManager>(F);
+ // iterativelyFlattenCFG can make some blocks dead.
+ while (iterativelyFlattenCFG(F, AA)) {
+ removeUnreachableBlocks(F);
+ EverChanged = true;
+ }
+ return EverChanged ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
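
With the port to the new pass manager, FlattenCFG can be scheduled like any other function pass. A minimal usage sketch under the usual new-PM boilerplate, assuming the accompanying FlattenCFG.h header declares FlattenCFGPass as the include above suggests (the helper name is made up; nothing below comes from this patch):

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/FlattenCFG.h"

// Hypothetical helper: run FlattenCFGPass over a single function.
static void runFlattenCFG(llvm::Function &F) {
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  llvm::PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM); // provides the AAManager the pass queries
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::FlattenCFGPass());
  FPM.run(F, FAM);
}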
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 6f97f3e93123..bc792ca3d8da 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -107,11 +107,6 @@ static cl::opt<bool> ControlFlowHoisting(
"licm-control-flow-hoisting", cl::Hidden, cl::init(false),
cl::desc("Enable control flow (and PHI) hoisting in LICM"));
-static cl::opt<unsigned> HoistSinkColdnessThreshold(
- "licm-coldness-threshold", cl::Hidden, cl::init(4),
- cl::desc("Relative coldness Threshold of hoisting/sinking destination "
- "block for LICM to be considered beneficial"));
-
static cl::opt<uint32_t> MaxNumUsesTraversed(
"licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
cl::desc("Max num uses visited for identifying load "
@@ -819,35 +814,6 @@ public:
};
} // namespace
-// Hoisting/sinking instruction out of a loop isn't always beneficial. It's only
-// only worthwhile if the destination block is actually colder than current
-// block.
-static bool worthSinkOrHoistInst(Instruction &I, BasicBlock *DstBlock,
- OptimizationRemarkEmitter *ORE,
- BlockFrequencyInfo *BFI) {
- // Check block frequency only when runtime profile is available
- // to avoid pathological cases. With static profile, lean towards
- // hosting because it helps canonicalize the loop for vectorizer.
- if (!DstBlock->getParent()->hasProfileData())
- return true;
-
- if (!HoistSinkColdnessThreshold || !BFI)
- return true;
-
- BasicBlock *SrcBlock = I.getParent();
- if (BFI->getBlockFreq(DstBlock).getFrequency() / HoistSinkColdnessThreshold >
- BFI->getBlockFreq(SrcBlock).getFrequency()) {
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "SinkHoistInst", &I)
- << "failed to sink or hoist instruction because containing block "
- "has lower frequency than destination block";
- });
- return false;
- }
-
- return true;
-}
-
/// Walk the specified region of the CFG (defined by all blocks dominated by
/// the specified block, and that are in the current loop) in depth first
/// order w.r.t the DominatorTree. This allows us to visit definitions before
@@ -909,7 +875,6 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
if (CurLoop->hasLoopInvariantOperands(&I) &&
canSinkOrHoistInst(I, AA, DT, CurLoop, /*CurAST*/ nullptr, MSSAU,
true, &Flags, ORE) &&
- worthSinkOrHoistInst(I, CurLoop->getLoopPreheader(), ORE, BFI) &&
isSafeToExecuteUnconditionally(
I, DT, TLI, CurLoop, SafetyInfo, ORE,
CurLoop->getLoopPreheader()->getTerminator())) {
@@ -1741,7 +1706,6 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
// First check if I is worth sinking for all uses. Sink only when it is worth
// across all uses.
SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end());
- SmallVector<PHINode *, 8> ExitPNs;
for (auto *UI : Users) {
auto *User = cast<Instruction>(UI);
@@ -1751,14 +1715,6 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
PHINode *PN = cast<PHINode>(User);
assert(ExitBlockSet.count(PN->getParent()) &&
"The LCSSA PHI is not in an exit block!");
- if (!worthSinkOrHoistInst(I, PN->getParent(), ORE, BFI)) {
- return Changed;
- }
-
- ExitPNs.push_back(PN);
- }
-
- for (auto *PN : ExitPNs) {
// The PHI must be trivially replaceable.
Instruction *New = sinkThroughTriviallyReplaceablePHI(
diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index 77d76609c926..57e36e5b9b90 100644
--- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -224,8 +224,8 @@ bool LoopDataPrefetch::run() {
bool MadeChange = false;
for (Loop *I : *LI)
- for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
- MadeChange |= runOnLoop(*L);
+ for (Loop *L : depth_first(I))
+ MadeChange |= runOnLoop(L);
return MadeChange;
}
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 42da86a9ecf5..5d00fa56e888 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -786,9 +786,9 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
- MaybeAlign(HeadStore->getAlignment()),
- StoredVal, HeadStore, AdjacentStores, StoreEv,
- BECount, IsNegStride)) {
+ MaybeAlign(HeadStore->getAlign()), StoredVal,
+ HeadStore, AdjacentStores, StoreEv, BECount,
+ IsNegStride)) {
TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
Changed = true;
}
@@ -967,12 +967,22 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
<< "\n");
if (PositiveStrideSCEV != MemsetSizeSCEV) {
- // TODO: folding can be done to the SCEVs
- // The folding is to fold expressions that is covered by the loop guard
- // at loop entry. After the folding, compare again and proceed
- // optimization if equal.
- LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n");
- return false;
+ // If an expression is covered by the loop guard, compare again and
+ // proceed with optimization if equal.
+ const SCEV *FoldedPositiveStride =
+ SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
+ const SCEV *FoldedMemsetSize =
+ SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);
+
+ LLVM_DEBUG(dbgs() << " Try to fold SCEV based on loop guard\n"
+ << " FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
+ << " FoldedPositiveStride: " << *FoldedPositiveStride
+ << "\n");
+
+ if (FoldedPositiveStride != FoldedMemsetSize) {
+ LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n");
+ return false;
+ }
}
}
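
applyLoopGuards folds facts established by the loop guard (for example an entry check on the trip count) into a SCEV; because SCEVs are uniqued, pointer equality after folding is exactly the retry the hunk above performs when the raw stride and memset-size expressions differ. A small sketch of that pattern as a hypothetical helper, not code from this patch:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"

// Compare two SCEVs, retrying after folding both with the loop-guard facts.
static bool scevsMatchUnderGuards(llvm::ScalarEvolution &SE,
                                  const llvm::SCEV *A, const llvm::SCEV *B,
                                  const llvm::Loop *L) {
  if (A == B)
    return true;
  return SE.applyLoopGuards(A, L) == SE.applyLoopGuards(B, L);
}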
diff --git a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
index 56d66b93dd69..9d22eceb987f 100644
--- a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -1456,16 +1456,12 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) {
}
// Remove instructions associated with non-base iterations.
- for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend();
- J != JE;) {
- unsigned I = Uses[&*J].find_first();
+ for (Instruction &Inst : llvm::make_early_inc_range(llvm::reverse(*Header))) {
+ unsigned I = Uses[&Inst].find_first();
if (I > 0 && I < IL_All) {
- LLVM_DEBUG(dbgs() << "LRR: removing: " << *J << "\n");
- J++->eraseFromParent();
- continue;
+ LLVM_DEBUG(dbgs() << "LRR: removing: " << Inst << "\n");
+ Inst.eraseFromParent();
}
-
- ++J;
}
// Rewrite each BaseInst using SCEV.
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index a9a2266e1196..798af48c2337 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -6011,7 +6011,7 @@ struct SCEVDbgValueBuilder {
// See setFinalExpression: prepend our opcodes on the start of any old
// expression opcodes.
assert(!DI.hasArgList());
- llvm::SmallVector<uint64_t, 6> FinalExpr(Expr.begin() + 2, Expr.end());
+ llvm::SmallVector<uint64_t, 6> FinalExpr(llvm::drop_begin(Expr, 2));
auto *NewExpr =
DIExpression::prependOpcodes(OldExpr, FinalExpr, /*StackValue*/ true);
DI.setExpression(NewExpr);
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 39c8b65968aa..893928fb0560 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1136,6 +1136,31 @@ static LoopUnrollResult tryToUnrollLoop(
TransformationMode TM = hasUnrollTransformation(L);
if (TM & TM_Disable)
return LoopUnrollResult::Unmodified;
+
+ // If this loop isn't forced to be unrolled, avoid unrolling it when the
+ // parent loop has an explicit unroll-and-jam pragma. This is to prevent
+ // automatic unrolling from interfering with the user requested
+ // transformation.
+ Loop *ParentL = L->getParentLoop();
+ if (ParentL != NULL &&
+ hasUnrollAndJamTransformation(ParentL) == TM_ForcedByUser &&
+ hasUnrollTransformation(L) != TM_ForcedByUser) {
+ LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has"
+ << " llvm.loop.unroll_and_jam.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // If this loop isn't forced to be unrolled, avoid unrolling it when the
+ // loop has an explicit unroll-and-jam pragma. This is to prevent automatic
+ // unrolling from interfering with the user requested transformation.
+ if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser &&
+ hasUnrollTransformation(L) != TM_ForcedByUser) {
+ LLVM_DEBUG(
+ dbgs()
+ << " Not unrolling loop since it has llvm.loop.unroll_and_jam.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
if (!L->isLoopSimplifyForm()) {
LLVM_DEBUG(
dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 91215cd19e2b..10a8742940b1 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -638,6 +638,7 @@ class NewGVN {
BitVector TouchedInstructions;
DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
+ mutable DenseMap<const IntrinsicInst *, const Value *> IntrinsicInstPred;
#ifndef NDEBUG
// Debugging for how many times each block and instruction got processed.
@@ -794,7 +795,7 @@ private:
BasicBlock *PHIBlock) const;
const Expression *performSymbolicAggrValueEvaluation(Instruction *) const;
ExprResult performSymbolicCmpEvaluation(Instruction *) const;
- ExprResult performSymbolicPredicateInfoEvaluation(Instruction *) const;
+ ExprResult performSymbolicPredicateInfoEvaluation(IntrinsicInst *) const;
// Congruence finding.
bool someEquivalentDominates(const Instruction *, const Instruction *) const;
@@ -815,6 +816,8 @@ private:
// Ranking
unsigned int getRank(const Value *) const;
bool shouldSwapOperands(const Value *, const Value *) const;
+ bool shouldSwapOperandsForIntrinsic(const Value *, const Value *,
+ const IntrinsicInst *I) const;
// Reachability handling.
void updateReachableEdge(BasicBlock *, BasicBlock *);
@@ -1552,7 +1555,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
}
NewGVN::ExprResult
-NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
+NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const {
auto *PI = PredInfo->getPredicateInfoFor(I);
if (!PI)
return ExprResult::none();
@@ -1572,7 +1575,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
Value *AdditionallyUsedValue = CmpOp0;
// Sort the ops.
- if (shouldSwapOperands(FirstOp, SecondOp)) {
+ if (shouldSwapOperandsForIntrinsic(FirstOp, SecondOp, I)) {
std::swap(FirstOp, SecondOp);
Predicate = CmpInst::getSwappedPredicate(Predicate);
AdditionallyUsedValue = CmpOp1;
@@ -1598,7 +1601,7 @@ NewGVN::ExprResult NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
// Intrinsics with the returned attribute are copies of arguments.
if (auto *ReturnedValue = II->getReturnedArgOperand()) {
if (II->getIntrinsicID() == Intrinsic::ssa_copy)
- if (auto Res = performSymbolicPredicateInfoEvaluation(I))
+ if (auto Res = performSymbolicPredicateInfoEvaluation(II))
return Res;
return ExprResult::some(createVariableOrConstant(ReturnedValue));
}
@@ -2951,6 +2954,7 @@ void NewGVN::cleanupTables() {
PredicateToUsers.clear();
MemoryToUsers.clear();
RevisitOnReachabilityChange.clear();
+ IntrinsicInstPred.clear();
}
// Assign local DFS number mapping to instructions, and leave space for Value
@@ -4152,6 +4156,29 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
}
+bool NewGVN::shouldSwapOperandsForIntrinsic(const Value *A, const Value *B,
+ const IntrinsicInst *I) const {
+ auto LookupResult = IntrinsicInstPred.find(I);
+ if (shouldSwapOperands(A, B)) {
+ if (LookupResult == IntrinsicInstPred.end())
+ IntrinsicInstPred.insert({I, B});
+ else
+ LookupResult->second = B;
+ return true;
+ }
+
+ if (LookupResult != IntrinsicInstPred.end()) {
+ auto *SeenPredicate = LookupResult->second;
+ if (SeenPredicate) {
+ if (SeenPredicate == B)
+ return true;
+ else
+ LookupResult->second = nullptr;
+ }
+ }
+ return false;
+}
+
namespace {
class NewGVNLegacyPass : public FunctionPass {
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 2d3490b2d29e..e12eca0ed287 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1359,16 +1359,6 @@ static constexpr Attribute::AttrKind FnAttrsToStrip[] =
Attribute::InaccessibleMemOrArgMemOnly,
Attribute::NoSync, Attribute::NoFree};
-// List of all parameter and return attributes which must be stripped when
-// lowering from the abstract machine model. Note that we list attributes
-// here which aren't valid as return attributes, that is okay. There are
-// also some additional attributes with arguments which are handled
-// explicitly and are not in this list.
-static constexpr Attribute::AttrKind ParamAttrsToStrip[] =
- {Attribute::ReadNone, Attribute::ReadOnly, Attribute::WriteOnly,
- Attribute::NoAlias, Attribute::NoFree};
-
-
// Create new attribute set containing only attributes which can be transferred
// from original call to the safepoint.
static AttributeList legalizeCallAttributes(LLVMContext &Ctx,
@@ -2650,24 +2640,19 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
return !Records.empty();
}
-// Handles both return values and arguments for Functions and calls.
-template <typename AttrHolder>
-static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
- unsigned Index) {
+// List of all parameter and return attributes which must be stripped when
+// lowering from the abstract machine model. Note that we list attributes
+// here which aren't valid as return attributes; that is okay.
+static AttrBuilder getParamAndReturnAttributesToRemove() {
AttrBuilder R;
- AttributeSet AS = AH.getAttributes().getAttributes(Index);
- if (AS.getDereferenceableBytes())
- R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable,
- AS.getDereferenceableBytes()));
- if (AS.getDereferenceableOrNullBytes())
- R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull,
- AS.getDereferenceableOrNullBytes()));
- for (auto Attr : ParamAttrsToStrip)
- if (AS.hasAttribute(Attr))
- R.addAttribute(Attr);
-
- if (!R.empty())
- AH.setAttributes(AH.getAttributes().removeAttributesAtIndex(Ctx, Index, R));
+ R.addDereferenceableAttr(1);
+ R.addDereferenceableOrNullAttr(1);
+ R.addAttribute(Attribute::ReadNone);
+ R.addAttribute(Attribute::ReadOnly);
+ R.addAttribute(Attribute::WriteOnly);
+ R.addAttribute(Attribute::NoAlias);
+ R.addAttribute(Attribute::NoFree);
+ return R;
}
static void stripNonValidAttributesFromPrototype(Function &F) {
@@ -2683,13 +2668,13 @@ static void stripNonValidAttributesFromPrototype(Function &F) {
return;
}
+ AttrBuilder R = getParamAndReturnAttributesToRemove();
for (Argument &A : F.args())
if (isa<PointerType>(A.getType()))
- RemoveNonValidAttrAtIndex(Ctx, F,
- A.getArgNo() + AttributeList::FirstArgIndex);
+ F.removeParamAttrs(A.getArgNo(), R);
if (isa<PointerType>(F.getReturnType()))
- RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex);
+ F.removeRetAttrs(R);
for (auto Attr : FnAttrsToStrip)
F.removeFnAttr(Attr);
@@ -2757,13 +2742,13 @@ static void stripNonValidDataFromBody(Function &F) {
stripInvalidMetadataFromInstruction(I);
+ AttrBuilder R = getParamAndReturnAttributesToRemove();
if (auto *Call = dyn_cast<CallBase>(&I)) {
for (int i = 0, e = Call->arg_size(); i != e; i++)
if (isa<PointerType>(Call->getArgOperand(i)->getType()))
- RemoveNonValidAttrAtIndex(Ctx, *Call,
- i + AttributeList::FirstArgIndex);
+ Call->removeParamAttrs(i, R);
if (isa<PointerType>(Call->getType()))
- RemoveNonValidAttrAtIndex(Ctx, *Call, AttributeList::ReturnIndex);
+ Call->removeRetAttrs(R);
}
}
diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp
index 28e00c873361..ff2f8a25f379 100644
--- a/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -101,8 +101,7 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
Constant *Const = nullptr;
if (V->getType()->isStructTy()) {
std::vector<ValueLatticeElement> IVs = Solver.getStructLatticeValueFor(V);
- if (any_of(IVs,
- [](const ValueLatticeElement &LV) { return isOverdefined(LV); }))
+ if (llvm::any_of(IVs, isOverdefined))
return false;
std::vector<Constant *> ConstVals;
auto *ST = cast<StructType>(V->getType());
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index a041af0d70d0..f9650efc051f 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -54,7 +54,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeMakeGuardsExplicitLegacyPassPass(Registry);
initializeGVNHoistLegacyPassPass(Registry);
initializeGVNSinkLegacyPassPass(Registry);
- initializeFlattenCFGPassPass(Registry);
+ initializeFlattenCFGLegacyPassPass(Registry);
initializeIRCELegacyPassPass(Registry);
initializeIndVarSimplifyLegacyPassPass(Registry);
initializeInferAddressSpacesPass(Registry);
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index ffa2f9adb978..d23925042b0a 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -648,13 +648,13 @@ Value *ConstantOffsetExtractor::applyExts(Value *V) {
Value *Current = V;
// ExtInsts is built in the use-def order. Therefore, we apply them to V
// in the reversed order.
- for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) {
+ for (CastInst *I : llvm::reverse(ExtInsts)) {
if (Constant *C = dyn_cast<Constant>(Current)) {
// If Current is a constant, apply s/zext using ConstantExpr::getCast.
// ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
- Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType());
+ Current = ConstantExpr::getCast(I->getOpcode(), C, I->getType());
} else {
- Instruction *Ext = (*I)->clone();
+ Instruction *Ext = I->clone();
Ext->setOperand(0, Current);
Ext->insertBefore(IP);
Current = Ext;
diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp
new file mode 100644
index 000000000000..dfb9f608eab2
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp
@@ -0,0 +1,942 @@
+//===- CodeLayout.cpp - Implementation of code layout algorithms ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// ExtTSP - layout of basic blocks with i-cache optimization.
+//
+// The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
+// optimizing jump locality and thus processor I-cache utilization. This is
+// achieved via increasing the number of fall-through jumps and co-locating
+// frequently executed nodes together. The name follows the underlying
+// optimization problem, Extended-TSP, which is a generalization of the
+// classical (maximum) Traveling Salesman Problem.
+//
+// The algorithm is a greedy heuristic that works with chains (ordered lists)
+// of basic blocks. Initially all chains are isolated basic blocks. On every
+// iteration, we pick a pair of chains whose merging yields the biggest increase
+// in the ExtTSP score, which models how i-cache "friendly" a specific chain is.
+// A pair of chains giving the maximum gain is merged into a new chain. The
+// procedure stops when there is only one chain left, or when merging does not
+// increase ExtTSP. In the latter case, the remaining chains are sorted by
+// density in decreasing order.
+//
+// An important aspect is the way two chains are merged. Unlike earlier
+// algorithms (e.g., based on the approach of Pettis-Hansen), two
+// chains, X and Y, are first split into three, X1, X2, and Y. Then we
+// consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,
+// X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
+// This improves the quality of the final result (the search space is larger)
+// while keeping the implementation sufficiently fast.
+//
+// Reference:
+// * A. Newell and S. Pupyrev, Improved Basic Block Reordering,
+// IEEE Transactions on Computers, 2020
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CodeLayout.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+#define DEBUG_TYPE "code-layout"
+
+// Algorithm-specific constants. The values are tuned for the best performance
+// of large-scale front-end bound binaries.
+static cl::opt<double>
+ ForwardWeight("ext-tsp-forward-weight", cl::Hidden, cl::init(0.1),
+ cl::desc("The weight of forward jumps for ExtTSP value"));
+
+static cl::opt<double>
+ BackwardWeight("ext-tsp-backward-weight", cl::Hidden, cl::init(0.1),
+ cl::desc("The weight of backward jumps for ExtTSP value"));
+
+static cl::opt<unsigned> ForwardDistance(
+ "ext-tsp-forward-distance", cl::Hidden, cl::init(1024),
+ cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
+
+static cl::opt<unsigned> BackwardDistance(
+ "ext-tsp-backward-distance", cl::Hidden, cl::init(640),
+ cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
+
+// The maximum size of a chain for splitting. Larger values of the threshold
+// may yield better quality at the cost of worse run-time.
+static cl::opt<unsigned> ChainSplitThreshold(
+ "ext-tsp-chain-split-threshold", cl::Hidden, cl::init(128),
+ cl::desc("The maximum size of a chain to apply splitting"));
+
+// The option enables splitting (large) chains along incoming and outgoing
+// jumps. This typically results in better quality.
+static cl::opt<bool> EnableChainSplitAlongJumps(
+ "ext-tsp-enable-chain-split-along-jumps", cl::Hidden, cl::init(true),
+    cl::desc("Enable splitting chains along incoming and outgoing jumps"));
+
+namespace {
+
+// Epsilon for comparison of doubles.
+constexpr double EPS = 1e-8;
+
+// Compute the Ext-TSP score for a jump between a given pair of blocks,
+// using their sizes, (estimated) addresses and the jump execution count.
+double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
+ uint64_t Count) {
+ // Fallthrough
+ if (SrcAddr + SrcSize == DstAddr) {
+ // Assume that FallthroughWeight = 1.0 after normalization
+ return static_cast<double>(Count);
+ }
+ // Forward
+ if (SrcAddr + SrcSize < DstAddr) {
+ const auto Dist = DstAddr - (SrcAddr + SrcSize);
+ if (Dist <= ForwardDistance) {
+ double Prob = 1.0 - static_cast<double>(Dist) / ForwardDistance;
+ return ForwardWeight * Prob * Count;
+ }
+ return 0;
+ }
+ // Backward
+ const auto Dist = SrcAddr + SrcSize - DstAddr;
+ if (Dist <= BackwardDistance) {
+ double Prob = 1.0 - static_cast<double>(Dist) / BackwardDistance;
+ return BackwardWeight * Prob * Count;
+ }
+ return 0;
+}
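
With the default weights and distances, the score strongly favors fall-throughs: a fall-through contributes its full execution count, while a forward or backward jump contributes only a small, distance-scaled fraction of it. A standalone sketch that reproduces the formula with those defaults and evaluates a few concrete layouts (the numbers are illustrative only):

#include <cstdint>
#include <cstdio>

// Re-implementation of the scoring rule above with the default parameters.
static double score(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
                    uint64_t Count) {
  const double FwdWeight = 0.1, BwdWeight = 0.1; // ext-tsp-*-weight defaults
  const double FwdDist = 1024, BwdDist = 640;    // ext-tsp-*-distance defaults
  if (SrcAddr + SrcSize == DstAddr)
    return static_cast<double>(Count); // fall-through, weight 1.0
  if (SrcAddr + SrcSize < DstAddr) {
    double Dist = DstAddr - (SrcAddr + SrcSize);
    return Dist <= FwdDist ? FwdWeight * (1.0 - Dist / FwdDist) * Count : 0.0;
  }
  double Dist = SrcAddr + SrcSize - DstAddr;
  return Dist <= BwdDist ? BwdWeight * (1.0 - Dist / BwdDist) * Count : 0.0;
}

int main() {
  std::printf("%.1f\n", score(0, 64, 64, 100));    // fall-through       -> 100.0
  std::printf("%.1f\n", score(0, 64, 576, 100));   // 512B forward jump  -> 5.0
  std::printf("%.1f\n", score(576, 64, 320, 100)); // 320B backward jump -> 5.0
}

This is why merging chains that create new fall-throughs dominates the gain computation, while distant jumps contribute almost nothing.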
+
+/// A type of merging two chains, X and Y. The former chain is split into
+/// X1 and X2 and then concatenated with Y in the order specified by the type.
+enum class MergeTypeTy : int { X_Y, X1_Y_X2, Y_X2_X1, X2_X1_Y };
+
+/// The gain of merging two chains, that is, the Ext-TSP score of the merge
+/// together with the corresponding merge 'type' and 'offset'.
+class MergeGainTy {
+public:
+ explicit MergeGainTy() {}
+ explicit MergeGainTy(double Score, size_t MergeOffset, MergeTypeTy MergeType)
+ : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
+
+ double score() const { return Score; }
+
+ size_t mergeOffset() const { return MergeOffset; }
+
+ MergeTypeTy mergeType() const { return MergeType; }
+
+ // Returns 'true' iff Other is preferred over this.
+ bool operator<(const MergeGainTy &Other) const {
+ return (Other.Score > EPS && Other.Score > Score + EPS);
+ }
+
+ // Update the current gain if Other is preferred over this.
+ void updateIfLessThan(const MergeGainTy &Other) {
+ if (*this < Other)
+ *this = Other;
+ }
+
+private:
+ double Score{-1.0};
+ size_t MergeOffset{0};
+ MergeTypeTy MergeType{MergeTypeTy::X_Y};
+};
+
+class Block;
+class Jump;
+class Chain;
+class ChainEdge;
+
+/// A node in the graph, typically corresponding to a basic block in CFG.
+class Block {
+public:
+ Block(const Block &) = delete;
+ Block(Block &&) = default;
+ Block &operator=(const Block &) = delete;
+ Block &operator=(Block &&) = default;
+
+ // The original index of the block in CFG.
+ size_t Index{0};
+ // The index of the block in the current chain.
+ size_t CurIndex{0};
+ // Size of the block in the binary.
+ uint64_t Size{0};
+ // Execution count of the block in the profile data.
+ uint64_t ExecutionCount{0};
+ // Current chain of the node.
+ Chain *CurChain{nullptr};
+ // An offset of the block in the current chain.
+ mutable uint64_t EstimatedAddr{0};
+ // Forced successor of the block in CFG.
+ Block *ForcedSucc{nullptr};
+ // Forced predecessor of the block in CFG.
+ Block *ForcedPred{nullptr};
+ // Outgoing jumps from the block.
+ std::vector<Jump *> OutJumps;
+ // Incoming jumps to the block.
+ std::vector<Jump *> InJumps;
+
+public:
+ explicit Block(size_t Index, uint64_t Size_, uint64_t EC)
+ : Index(Index), Size(Size_), ExecutionCount(EC) {}
+ bool isEntry() const { return Index == 0; }
+};
+
+/// An arc in the graph, typically corresponding to a jump between two blocks.
+class Jump {
+public:
+ Jump(const Jump &) = delete;
+ Jump(Jump &&) = default;
+ Jump &operator=(const Jump &) = delete;
+ Jump &operator=(Jump &&) = default;
+
+ // Source block of the jump.
+ Block *Source;
+ // Target block of the jump.
+ Block *Target;
+ // Execution count of the arc in the profile data.
+ uint64_t ExecutionCount{0};
+
+public:
+ explicit Jump(Block *Source, Block *Target, uint64_t ExecutionCount)
+ : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {}
+};
+
+/// A chain (ordered sequence) of blocks.
+class Chain {
+public:
+ Chain(const Chain &) = delete;
+ Chain(Chain &&) = default;
+ Chain &operator=(const Chain &) = delete;
+ Chain &operator=(Chain &&) = default;
+
+ explicit Chain(uint64_t Id, Block *Block)
+ : Id(Id), Score(0), Blocks(1, Block) {}
+
+ uint64_t id() const { return Id; }
+
+ bool isEntry() const { return Blocks[0]->Index == 0; }
+
+ double score() const { return Score; }
+
+ void setScore(double NewScore) { Score = NewScore; }
+
+ const std::vector<Block *> &blocks() const { return Blocks; }
+
+ const std::vector<std::pair<Chain *, ChainEdge *>> &edges() const {
+ return Edges;
+ }
+
+ ChainEdge *getEdge(Chain *Other) const {
+ for (auto It : Edges) {
+ if (It.first == Other)
+ return It.second;
+ }
+ return nullptr;
+ }
+
+ void removeEdge(Chain *Other) {
+ auto It = Edges.begin();
+ while (It != Edges.end()) {
+ if (It->first == Other) {
+ Edges.erase(It);
+ return;
+ }
+ It++;
+ }
+ }
+
+ void addEdge(Chain *Other, ChainEdge *Edge) {
+ Edges.push_back(std::make_pair(Other, Edge));
+ }
+
+ void merge(Chain *Other, const std::vector<Block *> &MergedBlocks) {
+ Blocks = MergedBlocks;
+ // Update the block's chains
+ for (size_t Idx = 0; Idx < Blocks.size(); Idx++) {
+ Blocks[Idx]->CurChain = this;
+ Blocks[Idx]->CurIndex = Idx;
+ }
+ }
+
+ void mergeEdges(Chain *Other);
+
+ void clear() {
+ Blocks.clear();
+ Blocks.shrink_to_fit();
+ Edges.clear();
+ Edges.shrink_to_fit();
+ }
+
+private:
+ // Unique chain identifier.
+ uint64_t Id;
+ // Cached ext-tsp score for the chain.
+ double Score;
+ // Blocks of the chain.
+ std::vector<Block *> Blocks;
+ // Adjacent chains and corresponding edges (lists of jumps).
+ std::vector<std::pair<Chain *, ChainEdge *>> Edges;
+};
+
+/// An edge in CFG representing jumps between two chains.
+/// When blocks are merged into chains, the edges are combined too so that
+/// there is always at most one edge between a pair of chains.
+class ChainEdge {
+public:
+ ChainEdge(const ChainEdge &) = delete;
+ ChainEdge(ChainEdge &&) = default;
+ ChainEdge &operator=(const ChainEdge &) = delete;
+ ChainEdge &operator=(ChainEdge &&) = default;
+
+ explicit ChainEdge(Jump *Jump)
+ : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),
+ Jumps(1, Jump) {}
+
+ const std::vector<Jump *> &jumps() const { return Jumps; }
+
+ void changeEndpoint(Chain *From, Chain *To) {
+ if (From == SrcChain)
+ SrcChain = To;
+ if (From == DstChain)
+ DstChain = To;
+ }
+
+ void appendJump(Jump *Jump) { Jumps.push_back(Jump); }
+
+ void moveJumps(ChainEdge *Other) {
+ Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());
+ Other->Jumps.clear();
+ Other->Jumps.shrink_to_fit();
+ }
+
+ bool hasCachedMergeGain(Chain *Src, Chain *Dst) const {
+ return Src == SrcChain ? CacheValidForward : CacheValidBackward;
+ }
+
+ MergeGainTy getCachedMergeGain(Chain *Src, Chain *Dst) const {
+ return Src == SrcChain ? CachedGainForward : CachedGainBackward;
+ }
+
+ void setCachedMergeGain(Chain *Src, Chain *Dst, MergeGainTy MergeGain) {
+ if (Src == SrcChain) {
+ CachedGainForward = MergeGain;
+ CacheValidForward = true;
+ } else {
+ CachedGainBackward = MergeGain;
+ CacheValidBackward = true;
+ }
+ }
+
+ void invalidateCache() {
+ CacheValidForward = false;
+ CacheValidBackward = false;
+ }
+
+private:
+ // Source chain.
+ Chain *SrcChain{nullptr};
+ // Destination chain.
+ Chain *DstChain{nullptr};
+  // Original jumps in the binary with corresponding execution counts.
+ std::vector<Jump *> Jumps;
+ // Cached ext-tsp value for merging the pair of chains.
+ // Since the gain of merging (Src, Dst) and (Dst, Src) might be different,
+ // we store both values here.
+ MergeGainTy CachedGainForward;
+ MergeGainTy CachedGainBackward;
+ // Whether the cached value must be recomputed.
+ bool CacheValidForward{false};
+ bool CacheValidBackward{false};
+};
+
+void Chain::mergeEdges(Chain *Other) {
+ assert(this != Other && "cannot merge a chain with itself");
+
+ // Update edges adjacent to chain Other
+ for (auto EdgeIt : Other->Edges) {
+ const auto DstChain = EdgeIt.first;
+ const auto DstEdge = EdgeIt.second;
+ const auto TargetChain = DstChain == Other ? this : DstChain;
+ auto CurEdge = getEdge(TargetChain);
+ if (CurEdge == nullptr) {
+ DstEdge->changeEndpoint(Other, this);
+ this->addEdge(TargetChain, DstEdge);
+ if (DstChain != this && DstChain != Other) {
+ DstChain->addEdge(this, DstEdge);
+ }
+ } else {
+ CurEdge->moveJumps(DstEdge);
+ }
+ // Cleanup leftover edge
+ if (DstChain != Other) {
+ DstChain->removeEdge(Other);
+ }
+ }
+}
+
+using BlockIter = std::vector<Block *>::const_iterator;
+
+/// A wrapper around three chains of blocks; it is used to avoid extra
+/// instantiation of the vectors.
+class MergedChain {
+public:
+ MergedChain(BlockIter Begin1, BlockIter End1, BlockIter Begin2 = BlockIter(),
+ BlockIter End2 = BlockIter(), BlockIter Begin3 = BlockIter(),
+ BlockIter End3 = BlockIter())
+ : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3),
+ End3(End3) {}
+
+ template <typename F> void forEach(const F &Func) const {
+ for (auto It = Begin1; It != End1; It++)
+ Func(*It);
+ for (auto It = Begin2; It != End2; It++)
+ Func(*It);
+ for (auto It = Begin3; It != End3; It++)
+ Func(*It);
+ }
+
+ std::vector<Block *> getBlocks() const {
+ std::vector<Block *> Result;
+ Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) +
+ std::distance(Begin3, End3));
+ Result.insert(Result.end(), Begin1, End1);
+ Result.insert(Result.end(), Begin2, End2);
+ Result.insert(Result.end(), Begin3, End3);
+ return Result;
+ }
+
+ const Block *getFirstBlock() const { return *Begin1; }
+
+private:
+ BlockIter Begin1;
+ BlockIter End1;
+ BlockIter Begin2;
+ BlockIter End2;
+ BlockIter Begin3;
+ BlockIter End3;
+};
+
+/// The implementation of the ExtTSP algorithm.
+class ExtTSPImpl {
+ using EdgeT = std::pair<uint64_t, uint64_t>;
+ using EdgeCountMap = DenseMap<EdgeT, uint64_t>;
+
+public:
+ ExtTSPImpl(size_t NumNodes, const std::vector<uint64_t> &NodeSizes,
+ const std::vector<uint64_t> &NodeCounts,
+ const EdgeCountMap &EdgeCounts)
+ : NumNodes(NumNodes) {
+ initialize(NodeSizes, NodeCounts, EdgeCounts);
+ }
+
+ /// Run the algorithm and return an optimized ordering of blocks.
+ void run(std::vector<uint64_t> &Result) {
+ // Pass 1: Merge blocks with their mutually forced successors
+ mergeForcedPairs();
+
+ // Pass 2: Merge pairs of chains while improving the ExtTSP objective
+ mergeChainPairs();
+
+ // Pass 3: Merge cold blocks to reduce code size
+ mergeColdChains();
+
+ // Collect blocks from all chains
+ concatChains(Result);
+ }
+
+private:
+ /// Initialize the algorithm's data structures.
+ void initialize(const std::vector<uint64_t> &NodeSizes,
+ const std::vector<uint64_t> &NodeCounts,
+ const EdgeCountMap &EdgeCounts) {
+ // Initialize blocks
+ AllBlocks.reserve(NumNodes);
+ for (uint64_t Node = 0; Node < NumNodes; Node++) {
+ uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
+ uint64_t ExecutionCount = NodeCounts[Node];
+ // The execution count of the entry block is set to at least 1
+ if (Node == 0 && ExecutionCount == 0)
+ ExecutionCount = 1;
+ AllBlocks.emplace_back(Node, Size, ExecutionCount);
+ }
+
+ // Initialize jumps between blocks
+ SuccNodes = std::vector<std::vector<uint64_t>>(NumNodes);
+ PredNodes = std::vector<std::vector<uint64_t>>(NumNodes);
+ AllJumps.reserve(EdgeCounts.size());
+ for (auto It : EdgeCounts) {
+ auto Pred = It.first.first;
+ auto Succ = It.first.second;
+ // Ignore self-edges
+ if (Pred == Succ)
+ continue;
+
+ SuccNodes[Pred].push_back(Succ);
+ PredNodes[Succ].push_back(Pred);
+ auto ExecutionCount = It.second;
+ if (ExecutionCount > 0) {
+ auto &Block = AllBlocks[Pred];
+ auto &SuccBlock = AllBlocks[Succ];
+ AllJumps.emplace_back(&Block, &SuccBlock, ExecutionCount);
+ SuccBlock.InJumps.push_back(&AllJumps.back());
+ Block.OutJumps.push_back(&AllJumps.back());
+ }
+ }
+
+ // Initialize chains
+ AllChains.reserve(NumNodes);
+ HotChains.reserve(NumNodes);
+ for (auto &Block : AllBlocks) {
+ AllChains.emplace_back(Block.Index, &Block);
+ Block.CurChain = &AllChains.back();
+ if (Block.ExecutionCount > 0) {
+ HotChains.push_back(&AllChains.back());
+ }
+ }
+
+ // Initialize chain edges
+ AllEdges.reserve(AllJumps.size());
+ for (auto &Block : AllBlocks) {
+ for (auto &Jump : Block.OutJumps) {
+ const auto SuccBlock = Jump->Target;
+ auto CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain);
+ // this edge is already present in the graph
+ if (CurEdge != nullptr) {
+ assert(SuccBlock->CurChain->getEdge(Block.CurChain) != nullptr);
+ CurEdge->appendJump(Jump);
+ continue;
+ }
+ // this is a new edge
+ AllEdges.emplace_back(Jump);
+ Block.CurChain->addEdge(SuccBlock->CurChain, &AllEdges.back());
+ SuccBlock->CurChain->addEdge(Block.CurChain, &AllEdges.back());
+ }
+ }
+ }
+
+ /// For a pair of blocks, A and B, block B is the forced successor of A,
+  /// if (i) all jumps (based on profile) from A go to B and (ii) all jumps
+ /// to B are from A. Such blocks should be adjacent in the optimal ordering;
+ /// the method finds and merges such pairs of blocks.
+ void mergeForcedPairs() {
+ // Find fallthroughs based on edge weights
+ for (auto &Block : AllBlocks) {
+ if (SuccNodes[Block.Index].size() == 1 &&
+ PredNodes[SuccNodes[Block.Index][0]].size() == 1 &&
+ SuccNodes[Block.Index][0] != 0) {
+ size_t SuccIndex = SuccNodes[Block.Index][0];
+ Block.ForcedSucc = &AllBlocks[SuccIndex];
+ AllBlocks[SuccIndex].ForcedPred = &Block;
+ }
+ }
+
+ // There might be 'cycles' in the forced dependencies, since profile
+ // data isn't 100% accurate. Typically this is observed in loops, when the
+ // loop edges are the hottest successors for the basic blocks of the loop.
+ // Break the cycles by choosing the block with the smallest index as the
+ // head. This helps to keep the original order of the loops, which likely
+ // have already been rotated in the optimized manner.
+ for (auto &Block : AllBlocks) {
+ if (Block.ForcedSucc == nullptr || Block.ForcedPred == nullptr)
+ continue;
+
+ auto SuccBlock = Block.ForcedSucc;
+ while (SuccBlock != nullptr && SuccBlock != &Block) {
+ SuccBlock = SuccBlock->ForcedSucc;
+ }
+ if (SuccBlock == nullptr)
+ continue;
+ // Break the cycle
+ AllBlocks[Block.ForcedPred->Index].ForcedSucc = nullptr;
+ Block.ForcedPred = nullptr;
+ }
+
+ // Merge blocks with their fallthrough successors
+ for (auto &Block : AllBlocks) {
+ if (Block.ForcedPred == nullptr && Block.ForcedSucc != nullptr) {
+ auto CurBlock = &Block;
+ while (CurBlock->ForcedSucc != nullptr) {
+ const auto NextBlock = CurBlock->ForcedSucc;
+ mergeChains(Block.CurChain, NextBlock->CurChain, 0, MergeTypeTy::X_Y);
+ CurBlock = NextBlock;
+ }
+ }
+ }
+ }
+
+ /// Merge pairs of chains while improving the ExtTSP objective.
+ void mergeChainPairs() {
+ /// Deterministically compare pairs of chains
+ auto compareChainPairs = [](const Chain *A1, const Chain *B1,
+ const Chain *A2, const Chain *B2) {
+ if (A1 != A2)
+ return A1->id() < A2->id();
+ return B1->id() < B2->id();
+ };
+
+ while (HotChains.size() > 1) {
+ Chain *BestChainPred = nullptr;
+ Chain *BestChainSucc = nullptr;
+ auto BestGain = MergeGainTy();
+ // Iterate over all pairs of chains
+ for (auto ChainPred : HotChains) {
+ // Get candidates for merging with the current chain
+ for (auto EdgeIter : ChainPred->edges()) {
+ auto ChainSucc = EdgeIter.first;
+ auto ChainEdge = EdgeIter.second;
+ // Ignore loop edges
+ if (ChainPred == ChainSucc)
+ continue;
+
+ // Compute the gain of merging the two chains
+ auto CurGain = getBestMergeGain(ChainPred, ChainSucc, ChainEdge);
+ if (CurGain.score() <= EPS)
+ continue;
+
+ if (BestGain < CurGain ||
+ (std::abs(CurGain.score() - BestGain.score()) < EPS &&
+ compareChainPairs(ChainPred, ChainSucc, BestChainPred,
+ BestChainSucc))) {
+ BestGain = CurGain;
+ BestChainPred = ChainPred;
+ BestChainSucc = ChainSucc;
+ }
+ }
+ }
+
+ // Stop merging when there is no improvement
+ if (BestGain.score() <= EPS)
+ break;
+
+ // Merge the best pair of chains
+ mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
+ BestGain.mergeType());
+ }
+ }
+
+ /// Merge cold blocks to reduce code size.
+ void mergeColdChains() {
+ for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
+ // Iterating over neighbors in the reverse order to make sure original
+ // fallthrough jumps are merged first
+ size_t NumSuccs = SuccNodes[SrcBB].size();
+ for (size_t Idx = 0; Idx < NumSuccs; Idx++) {
+ auto DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
+ auto SrcChain = AllBlocks[SrcBB].CurChain;
+ auto DstChain = AllBlocks[DstBB].CurChain;
+ if (SrcChain != DstChain && !DstChain->isEntry() &&
+ SrcChain->blocks().back()->Index == SrcBB &&
+ DstChain->blocks().front()->Index == DstBB) {
+ mergeChains(SrcChain, DstChain, 0, MergeTypeTy::X_Y);
+ }
+ }
+ }
+ }
+
+ /// Compute the Ext-TSP score for a given block order and a list of jumps.
+ double extTSPScore(const MergedChain &MergedBlocks,
+ const std::vector<Jump *> &Jumps) const {
+ if (Jumps.empty())
+ return 0.0;
+ uint64_t CurAddr = 0;
+ MergedBlocks.forEach([&](const Block *BB) {
+ BB->EstimatedAddr = CurAddr;
+ CurAddr += BB->Size;
+ });
+
+ double Score = 0;
+ for (auto &Jump : Jumps) {
+ const auto SrcBlock = Jump->Source;
+ const auto DstBlock = Jump->Target;
+ Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
+ DstBlock->EstimatedAddr, Jump->ExecutionCount);
+ }
+ return Score;
+ }
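As a tiny worked example of the address assignment above (hypothetical block sizes; the per-jump weighting performed by the free function ::extTSPScore is defined earlier in this file and not restated here):

// MergedBlocks = [B0 (Size = 16), B1 (Size = 32), B2 (Size = 8)]
//   B0.EstimatedAddr = 0
//   B1.EstimatedAddr = 16
//   B2.EstimatedAddr = 48
// A jump B0 -> B1 starts at address 0 with source size 16 and targets
// address 16, i.e. a fallthrough; a jump B2 -> B0 targets a smaller address
// and is scored as a backward jump.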
+
+ /// Compute the gain of merging two chains.
+ ///
+  /// The function considers all possible ways of merging two chains and
+  /// chooses the one with the largest increase in the ExtTSP objective. The
+  /// result records the gain together with the corresponding merge offset and
+  /// merge type.
+ MergeGainTy getBestMergeGain(Chain *ChainPred, Chain *ChainSucc,
+ ChainEdge *Edge) const {
+ if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) {
+ return Edge->getCachedMergeGain(ChainPred, ChainSucc);
+ }
+
+ // Precompute jumps between ChainPred and ChainSucc
+ auto Jumps = Edge->jumps();
+ auto EdgePP = ChainPred->getEdge(ChainPred);
+ if (EdgePP != nullptr) {
+ Jumps.insert(Jumps.end(), EdgePP->jumps().begin(), EdgePP->jumps().end());
+ }
+ assert(!Jumps.empty() && "trying to merge chains w/o jumps");
+
+ // The object holds the best currently chosen gain of merging the two chains
+ MergeGainTy Gain = MergeGainTy();
+
+ /// Given a merge offset and a list of merge types, try to merge two chains
+ /// and update Gain with a better alternative
+ auto tryChainMerging = [&](size_t Offset,
+ const std::vector<MergeTypeTy> &MergeTypes) {
+ // Skip merging corresponding to concatenation w/o splitting
+ if (Offset == 0 || Offset == ChainPred->blocks().size())
+ return;
+ // Skip merging if it breaks Forced successors
+ auto BB = ChainPred->blocks()[Offset - 1];
+ if (BB->ForcedSucc != nullptr)
+ return;
+ // Apply the merge, compute the corresponding gain, and update the best
+ // value, if the merge is beneficial
+ for (auto &MergeType : MergeTypes) {
+ Gain.updateIfLessThan(
+ computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));
+ }
+ };
+
+ // Try to concatenate two chains w/o splitting
+ Gain.updateIfLessThan(
+ computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeTy::X_Y));
+
+ if (EnableChainSplitAlongJumps) {
+ // Attach (a part of) ChainPred before the first block of ChainSucc
+ for (auto &Jump : ChainSucc->blocks().front()->InJumps) {
+ const auto SrcBlock = Jump->Source;
+ if (SrcBlock->CurChain != ChainPred)
+ continue;
+ size_t Offset = SrcBlock->CurIndex + 1;
+ tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::X2_X1_Y});
+ }
+
+ // Attach (a part of) ChainPred after the last block of ChainSucc
+ for (auto &Jump : ChainSucc->blocks().back()->OutJumps) {
+ const auto DstBlock = Jump->Source;
+ if (DstBlock->CurChain != ChainPred)
+ continue;
+ size_t Offset = DstBlock->CurIndex;
+ tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1});
+ }
+ }
+
+ // Try to break ChainPred in various ways and concatenate with ChainSucc
+ if (ChainPred->blocks().size() <= ChainSplitThreshold) {
+ for (size_t Offset = 1; Offset < ChainPred->blocks().size(); Offset++) {
+ // Try to split the chain in different ways. In practice, applying
+        // X2_Y_X1 merging almost never provides benefits; thus, we exclude
+ // it from consideration to reduce the search space
+ tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1,
+ MergeTypeTy::X2_X1_Y});
+ }
+ }
+ Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
+ return Gain;
+ }
+
+ /// Compute the score gain of merging two chains, respecting a given
+ /// merge 'type' and 'offset'.
+ ///
+ /// The two chains are not modified in the method.
+ MergeGainTy computeMergeGain(const Chain *ChainPred, const Chain *ChainSucc,
+ const std::vector<Jump *> &Jumps,
+ size_t MergeOffset,
+ MergeTypeTy MergeType) const {
+ auto MergedBlocks = mergeBlocks(ChainPred->blocks(), ChainSucc->blocks(),
+ MergeOffset, MergeType);
+
+ // Do not allow a merge that does not preserve the original entry block
+ if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
+ !MergedBlocks.getFirstBlock()->isEntry())
+ return MergeGainTy();
+
+ // The gain for the new chain
+ auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->score();
+ return MergeGainTy(NewGainScore, MergeOffset, MergeType);
+ }
+
+ /// Merge two chains of blocks respecting a given merge 'type' and 'offset'.
+ ///
+  /// If MergeType == X_Y, the result is a concatenation of the two chains.
+ /// Otherwise, the first chain is cut into two sub-chains at the offset,
+ /// and merged using all possible ways of concatenating three chains.
+ MergedChain mergeBlocks(const std::vector<Block *> &X,
+ const std::vector<Block *> &Y, size_t MergeOffset,
+ MergeTypeTy MergeType) const {
+ // Split the first chain, X, into X1 and X2
+ BlockIter BeginX1 = X.begin();
+ BlockIter EndX1 = X.begin() + MergeOffset;
+ BlockIter BeginX2 = X.begin() + MergeOffset;
+ BlockIter EndX2 = X.end();
+ BlockIter BeginY = Y.begin();
+ BlockIter EndY = Y.end();
+
+ // Construct a new chain from the three existing ones
+ switch (MergeType) {
+ case MergeTypeTy::X_Y:
+ return MergedChain(BeginX1, EndX2, BeginY, EndY);
+ case MergeTypeTy::X1_Y_X2:
+ return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
+ case MergeTypeTy::Y_X2_X1:
+ return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
+ case MergeTypeTy::X2_X1_Y:
+ return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
+ }
+ llvm_unreachable("unexpected chain merge type");
+ }
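The four merge types can be read off the switch above with a small hypothetical example: X = {x1, x2, x3, x4} split at MergeOffset = 2 (so X1 = {x1, x2}, X2 = {x3, x4}) and Y = {y1, y2}:

// X_Y     : x1 x2 x3 x4 y1 y2
// X1_Y_X2 : x1 x2 y1 y2 x3 x4
// Y_X2_X1 : y1 y2 x3 x4 x1 x2
// X2_X1_Y : x3 x4 x1 x2 y1 y2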
+
+ /// Merge chain From into chain Into, update the list of active chains,
+ /// adjacency information, and the corresponding cached values.
+ void mergeChains(Chain *Into, Chain *From, size_t MergeOffset,
+ MergeTypeTy MergeType) {
+ assert(Into != From && "a chain cannot be merged with itself");
+
+ // Merge the blocks
+ auto MergedBlocks =
+ mergeBlocks(Into->blocks(), From->blocks(), MergeOffset, MergeType);
+ Into->merge(From, MergedBlocks.getBlocks());
+ Into->mergeEdges(From);
+ From->clear();
+
+ // Update cached ext-tsp score for the new chain
+ auto SelfEdge = Into->getEdge(Into);
+ if (SelfEdge != nullptr) {
+ MergedBlocks = MergedChain(Into->blocks().begin(), Into->blocks().end());
+ Into->setScore(extTSPScore(MergedBlocks, SelfEdge->jumps()));
+ }
+
+ // Remove chain From from the list of active chains
+ auto Iter = std::remove(HotChains.begin(), HotChains.end(), From);
+ HotChains.erase(Iter, HotChains.end());
+
+ // Invalidate caches
+ for (auto EdgeIter : Into->edges()) {
+ EdgeIter.second->invalidateCache();
+ }
+ }
+
+ /// Concatenate all chains into a final order of blocks.
+ void concatChains(std::vector<uint64_t> &Order) {
+ // Collect chains and calculate some stats for their sorting
+ std::vector<Chain *> SortedChains;
+ DenseMap<const Chain *, double> ChainDensity;
+ for (auto &Chain : AllChains) {
+ if (!Chain.blocks().empty()) {
+ SortedChains.push_back(&Chain);
+ // Using doubles to avoid overflow of ExecutionCount
+ double Size = 0;
+ double ExecutionCount = 0;
+ for (auto Block : Chain.blocks()) {
+ Size += static_cast<double>(Block->Size);
+ ExecutionCount += static_cast<double>(Block->ExecutionCount);
+ }
+ assert(Size > 0 && "a chain of zero size");
+ ChainDensity[&Chain] = ExecutionCount / Size;
+ }
+ }
+
+    // Sort chains by density in decreasing order
+ std::stable_sort(SortedChains.begin(), SortedChains.end(),
+ [&](const Chain *C1, const Chain *C2) {
+                     // Make sure the original entry block is at the
+ // beginning of the order
+ if (C1->isEntry() != C2->isEntry()) {
+ return C1->isEntry();
+ }
+
+ const double D1 = ChainDensity[C1];
+ const double D2 = ChainDensity[C2];
+ // Compare by density and break ties by chain identifiers
+ return (D1 != D2) ? (D1 > D2) : (C1->id() < C2->id());
+ });
+
+ // Collect the blocks in the order specified by their chains
+ Order.reserve(NumNodes);
+ for (auto Chain : SortedChains) {
+ for (auto Block : Chain->blocks()) {
+ Order.push_back(Block->Index);
+ }
+ }
+ }
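A quick numeric sketch of the density-based ordering (made-up sizes and counts):

// Chain A: total Size = 40 bytes, total ExecutionCount = 4000 -> density = 100
// Chain B: total Size = 200 bytes, total ExecutionCount = 1000 -> density = 5
// Chain A is emitted before Chain B, unless B contains the original entry
// block, which is always placed first regardless of density.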
+
+private:
+ /// The number of nodes in the graph.
+ const size_t NumNodes;
+
+ /// Successors of each node.
+ std::vector<std::vector<uint64_t>> SuccNodes;
+
+ /// Predecessors of each node.
+ std::vector<std::vector<uint64_t>> PredNodes;
+
+ /// All basic blocks.
+ std::vector<Block> AllBlocks;
+
+ /// All jumps between blocks.
+ std::vector<Jump> AllJumps;
+
+ /// All chains of basic blocks.
+ std::vector<Chain> AllChains;
+
+ /// All edges between chains.
+ std::vector<ChainEdge> AllEdges;
+
+ /// Active chains. The vector gets updated at runtime when chains are merged.
+ std::vector<Chain *> HotChains;
+};
+
+} // end of anonymous namespace
+
+std::vector<uint64_t> llvm::applyExtTspLayout(
+ const std::vector<uint64_t> &NodeSizes,
+ const std::vector<uint64_t> &NodeCounts,
+ const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) {
+ size_t NumNodes = NodeSizes.size();
+
+ // Verify correctness of the input data.
+ assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
+ assert(NumNodes > 2 && "Incorrect input");
+
+ // Apply the reordering algorithm.
+ auto Alg = ExtTSPImpl(NumNodes, NodeSizes, NodeCounts, EdgeCounts);
+ std::vector<uint64_t> Result;
+ Alg.run(Result);
+
+ // Verify correctness of the output.
+ assert(Result.front() == 0 && "Original entry point is not preserved");
+ assert(Result.size() == NumNodes && "Incorrect size of reordered layout");
+ return Result;
+}
+
+double llvm::calcExtTspScore(
+ const std::vector<uint64_t> &Order, const std::vector<uint64_t> &NodeSizes,
+ const std::vector<uint64_t> &NodeCounts,
+ const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) {
+ // Estimate addresses of the blocks in memory
+ auto Addr = std::vector<uint64_t>(NodeSizes.size(), 0);
+ for (size_t Idx = 1; Idx < Order.size(); Idx++) {
+ Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
+ }
+
+ // Increase the score for each jump
+ double Score = 0;
+ for (auto It : EdgeCounts) {
+ auto Pred = It.first.first;
+ auto Succ = It.first.second;
+ uint64_t Count = It.second;
+ Score += extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count);
+ }
+ return Score;
+}
+
+double llvm::calcExtTspScore(
+ const std::vector<uint64_t> &NodeSizes,
+ const std::vector<uint64_t> &NodeCounts,
+ const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) {
+ auto Order = std::vector<uint64_t>(NodeSizes.size());
+ for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
+ Order[Idx] = Idx;
+ }
+ return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
+}
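A minimal sketch of how the two entry points above might be exercised from a standalone driver or unit test. The node sizes, counts, and edge profile below are made up, and the declarations are assumed to live in a CodeLayout.h header that is not part of this diff:

#include "llvm/ADT/DenseMap.h"
#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical driver for a 4-block function with a hot path 0 -> 1 -> 3.
static void runToyExtTspLayout() {
  std::vector<uint64_t> NodeSizes = {16, 32, 32, 8};     // bytes per block
  std::vector<uint64_t> NodeCounts = {100, 90, 10, 100}; // block frequencies
  llvm::DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> EdgeCounts;
  EdgeCounts[{0, 1}] = 90;
  EdgeCounts[{0, 2}] = 10;
  EdgeCounts[{1, 3}] = 90;
  EdgeCounts[{2, 3}] = 10;

  // Reorder the blocks; the entry block (index 0) stays first.
  std::vector<uint64_t> Order =
      llvm::applyExtTspLayout(NodeSizes, NodeCounts, EdgeCounts);
  // Evaluate the quality of the computed layout.
  double Score =
      llvm::calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
  (void)Score;
}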
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index fc7083b0c30d..589622d69578 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -596,7 +596,7 @@ bool llvm::checkDebugInfoMetadata(Module &M,
auto DILocsBefore = DIPreservationMap[NameOfWrappedPass].DILocations;
auto DILocsAfter = DIPreservationAfter[NameOfWrappedPass].DILocations;
- auto InstToDelete = DIPreservationAfter[NameOfWrappedPass].InstToDelete;
+ auto InstToDelete = DIPreservationMap[NameOfWrappedPass].InstToDelete;
auto DIVarsBefore = DIPreservationMap[NameOfWrappedPass].DIVariables;
auto DIVarsAfter = DIPreservationAfter[NameOfWrappedPass].DIVariables;
diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
index 326864803d7c..06596f7b04e1 100644
--- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -58,6 +58,14 @@ int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const {
return 0;
}
+int FunctionComparator::cmpAligns(Align L, Align R) const {
+ if (L.value() < R.value())
+ return -1;
+ if (L.value() > R.value())
+ return 1;
+ return 0;
+}
+
int FunctionComparator::cmpOrderings(AtomicOrdering L, AtomicOrdering R) const {
if ((int)L < (int)R)
return -1;
@@ -556,13 +564,12 @@ int FunctionComparator::cmpOperations(const Instruction *L,
if (int Res = cmpTypes(AI->getAllocatedType(),
cast<AllocaInst>(R)->getAllocatedType()))
return Res;
- return cmpNumbers(AI->getAlignment(), cast<AllocaInst>(R)->getAlignment());
+ return cmpAligns(AI->getAlign(), cast<AllocaInst>(R)->getAlign());
}
if (const LoadInst *LI = dyn_cast<LoadInst>(L)) {
if (int Res = cmpNumbers(LI->isVolatile(), cast<LoadInst>(R)->isVolatile()))
return Res;
- if (int Res =
- cmpNumbers(LI->getAlignment(), cast<LoadInst>(R)->getAlignment()))
+ if (int Res = cmpAligns(LI->getAlign(), cast<LoadInst>(R)->getAlign()))
return Res;
if (int Res =
cmpOrderings(LI->getOrdering(), cast<LoadInst>(R)->getOrdering()))
@@ -578,8 +585,7 @@ int FunctionComparator::cmpOperations(const Instruction *L,
if (int Res =
cmpNumbers(SI->isVolatile(), cast<StoreInst>(R)->isVolatile()))
return Res;
- if (int Res =
- cmpNumbers(SI->getAlignment(), cast<StoreInst>(R)->getAlignment()))
+ if (int Res = cmpAligns(SI->getAlign(), cast<StoreInst>(R)->getAlign()))
return Res;
if (int Res =
cmpOrderings(SI->getOrdering(), cast<StoreInst>(R)->getOrdering()))
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index ec926b1f5a94..ecad79b68185 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -402,6 +402,18 @@ bool llvm::isInstructionTriviallyDead(Instruction *I,
return wouldInstructionBeTriviallyDead(I, TLI);
}
+bool llvm::wouldInstructionBeTriviallyDeadOnUnusedPaths(
+ Instruction *I, const TargetLibraryInfo *TLI) {
+ // Instructions that are "markers" and have implied meaning on code around
+ // them (without explicit uses), are not dead on unused paths.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ if (II->getIntrinsicID() == Intrinsic::stacksave ||
+ II->getIntrinsicID() == Intrinsic::launder_invariant_group ||
+ II->isLifetimeStartOrEnd())
+ return false;
+ return wouldInstructionBeTriviallyDead(I, TLI);
+}
+
bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
const TargetLibraryInfo *TLI) {
if (I->isTerminator())
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index f3cf42be8ba1..69fd110dc3c2 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -104,9 +104,7 @@ bool llvm::canPeel(Loop *L) {
// note that LoopPeeling currently can only update the branch weights of latch
// blocks and branch weights to blocks with deopt or unreachable do not need
// updating.
- return all_of(Exits, [](const BasicBlock *BB) {
- return IsBlockFollowedByDeoptOrUnreachable(BB);
- });
+ return llvm::all_of(Exits, IsBlockFollowedByDeoptOrUnreachable);
}
// This function calculates the number of iterations after which the given Phi
@@ -333,6 +331,31 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
return DesiredPeelCount;
}
+/// This "heuristic" exactly matches the implicit behavior that used to exist
+/// inside getLoopEstimatedTripCount. It was added here to keep an
+/// improvement inside that API from causing peeling to become more aggressive.
+/// This should probably be removed.
+static bool violatesLegacyMultiExitLoopCheck(Loop *L) {
+ BasicBlock *Latch = L->getLoopLatch();
+ if (!Latch)
+ return true;
+
+ BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
+ return true;
+
+ assert((LatchBR->getSuccessor(0) == L->getHeader() ||
+ LatchBR->getSuccessor(1) == L->getHeader()) &&
+ "At least one edge out of the latch must go to the header");
+
+ SmallVector<BasicBlock *, 4> ExitBlocks;
+ L->getUniqueNonLatchExitBlocks(ExitBlocks);
+ return any_of(ExitBlocks, [](const BasicBlock *EB) {
+ return !EB->getTerminatingDeoptimizeCall();
+ });
+}
+
+
// Return the number of iterations we want to peel off.
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
TargetTransformInfo::PeelingPreferences &PP,
@@ -436,6 +459,8 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
// We only do this in the presence of profile information, since otherwise
// our estimates of the trip count are not reliable enough.
if (L->getHeader()->getParent()->hasProfileData()) {
+ if (violatesLegacyMultiExitLoopCheck(L))
+ return;
Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L);
if (!PeelCount)
return;
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index c8e42acdffb3..93157bd87c34 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -773,8 +773,8 @@ void llvm::breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
}
-/// Checks if \p L has single exit through latch block except possibly
-/// "deoptimizing" exits. Returns branch instruction terminating the loop
+/// Checks if \p L has an exiting latch branch. There may also be other
+/// exiting blocks. Returns branch instruction terminating the loop
/// latch if above check is successful, nullptr otherwise.
static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
BasicBlock *Latch = L->getLoopLatch();
@@ -789,53 +789,61 @@ static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
LatchBR->getSuccessor(1) == L->getHeader()) &&
"At least one edge out of the latch must go to the header");
- SmallVector<BasicBlock *, 4> ExitBlocks;
- L->getUniqueNonLatchExitBlocks(ExitBlocks);
- if (any_of(ExitBlocks, [](const BasicBlock *EB) {
- return !EB->getTerminatingDeoptimizeCall();
- }))
- return nullptr;
-
return LatchBR;
}
-Optional<unsigned>
-llvm::getLoopEstimatedTripCount(Loop *L,
- unsigned *EstimatedLoopInvocationWeight) {
- // Support loops with an exiting latch and other existing exists only
- // deoptimize.
- BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
- if (!LatchBranch)
- return None;
-
+/// Return the estimated trip count for any exiting branch which dominates
+/// the loop latch.
+static Optional<uint64_t>
+getEstimatedTripCount(BranchInst *ExitingBranch, Loop *L,
+ uint64_t &OrigExitWeight) {
// To estimate the number of times the loop body was executed, we want to
// know the number of times the backedge was taken, vs. the number of times
// we exited the loop.
- uint64_t BackedgeTakenWeight, LatchExitWeight;
- if (!LatchBranch->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight))
+ uint64_t LoopWeight, ExitWeight;
+ if (!ExitingBranch->extractProfMetadata(LoopWeight, ExitWeight))
return None;
- if (LatchBranch->getSuccessor(0) != L->getHeader())
- std::swap(BackedgeTakenWeight, LatchExitWeight);
+ if (L->contains(ExitingBranch->getSuccessor(1)))
+ std::swap(LoopWeight, ExitWeight);
- if (!LatchExitWeight)
+ if (!ExitWeight)
+    // We don't have a way to return a predicated infinite trip count.
return None;
- if (EstimatedLoopInvocationWeight)
- *EstimatedLoopInvocationWeight = LatchExitWeight;
+ OrigExitWeight = ExitWeight;
- // Estimated backedge taken count is a ratio of the backedge taken weight by
- // the weight of the edge exiting the loop, rounded to nearest.
- uint64_t BackedgeTakenCount =
- llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight);
- // Estimated trip count is one plus estimated backedge taken count.
- return BackedgeTakenCount + 1;
+ // Estimated exit count is a ratio of the loop weight by the weight of the
+ // edge exiting the loop, rounded to nearest.
+ uint64_t ExitCount = llvm::divideNearest(LoopWeight, ExitWeight);
+ // Estimated trip count is one plus estimated exit count.
+ return ExitCount + 1;
+}
+
+Optional<unsigned>
+llvm::getLoopEstimatedTripCount(Loop *L,
+ unsigned *EstimatedLoopInvocationWeight) {
+  // Currently we take the estimated exit count only from the loop latch,
+ // ignoring other exiting blocks. This can overestimate the trip count
+ // if we exit through another exit, but can never underestimate it.
+ // TODO: incorporate information from other exits
+ if (BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L)) {
+ uint64_t ExitWeight;
+ if (Optional<uint64_t> EstTripCount =
+ getEstimatedTripCount(LatchBranch, L, ExitWeight)) {
+ if (EstimatedLoopInvocationWeight)
+ *EstimatedLoopInvocationWeight = ExitWeight;
+ return *EstTripCount;
+ }
+ }
+ return None;
}
bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
unsigned EstimatedloopInvocationWeight) {
- // Support loops with an exiting latch and other existing exists only
- // deoptimize.
+  // At the moment, we only support changing the estimated trip count of
+ // the latch branch only. We could extend this API to manipulate estimated
+ // trip counts for any exit.
BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
if (!LatchBranch)
return false;
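For concreteness, a small numeric sketch of the estimate above, using hypothetical branch weights on an exiting latch:

// !prof branch_weights on the latch: in-loop edge = 397, exit edge = 4
//   ExitCount = divideNearest(397, 4) = 99
//   estimated trip count = ExitCount + 1 = 100
// When requested, *EstimatedLoopInvocationWeight receives the exit weight (4).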
@@ -923,8 +931,7 @@ Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
// Helper to generate an ordered reduction.
Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src,
- unsigned Op, RecurKind RdxKind,
- ArrayRef<Value *> RedOps) {
+ unsigned Op, RecurKind RdxKind) {
unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements();
// Extract and apply reduction ops in ascending order:
@@ -942,9 +949,6 @@ Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src,
"Invalid min/max");
Result = createMinMaxOp(Builder, RdxKind, Result, Ext);
}
-
- if (!RedOps.empty())
- propagateIRFlags(Result, RedOps);
}
return Result;
@@ -952,14 +956,20 @@ Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src,
// Helper to generate a log2 shuffle reduction.
Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src,
- unsigned Op, RecurKind RdxKind,
- ArrayRef<Value *> RedOps) {
+ unsigned Op, RecurKind RdxKind) {
unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements();
// VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
// and vector ops, reducing the set of values being computed by half each
// round.
assert(isPowerOf2_32(VF) &&
"Reduction emission only supported for pow2 vectors!");
+  // Note: fast-math flags are controlled by the builder configuration and are
+  // assumed to apply to all generated arithmetic instructions. Other
+  // poison-generating flags (nsw/nuw/inbounds/inrange/exact) are not part
+  // of the builder configuration, and since they're not passed explicitly,
+  // will never be relevant here. Note that it would be generally unsound to
+  // propagate these from an intrinsic call to the expansion anyway, as we
+  // change the order of operations.
Value *TmpVec = Src;
SmallVector<int, 32> ShuffleMask(VF);
for (unsigned i = VF; i != 1; i >>= 1) {
@@ -973,7 +983,6 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src,
Value *Shuf = Builder.CreateShuffleVector(TmpVec, ShuffleMask, "rdx.shuf");
if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
- // The builder propagates its fast-math-flags setting.
TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
"bin.rdx");
} else {
@@ -981,13 +990,6 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src,
"Invalid min/max");
TmpVec = createMinMaxOp(Builder, RdxKind, TmpVec, Shuf);
}
- if (!RedOps.empty())
- propagateIRFlags(TmpVec, RedOps);
-
- // We may compute the reassociated scalar ops in a way that does not
- // preserve nsw/nuw etc. Conservatively, drop those flags.
- if (auto *ReductionInst = dyn_cast<Instruction>(TmpVec))
- ReductionInst->dropPoisonGeneratingFlags();
}
// The result is in the first element of the vector.
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
@@ -1035,8 +1037,7 @@ Value *llvm::createSelectCmpTargetReduction(IRBuilderBase &Builder,
Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
const TargetTransformInfo *TTI,
- Value *Src, RecurKind RdxKind,
- ArrayRef<Value *> RedOps) {
+ Value *Src, RecurKind RdxKind) {
auto *SrcVecEltTy = cast<VectorType>(Src->getType())->getElementType();
switch (RdxKind) {
case RecurKind::Add:
diff --git a/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/llvm/lib/Transforms/Utils/MetaRenamer.cpp
index 3ce10535d45f..9fba2f3f86b5 100644
--- a/llvm/lib/Transforms/Utils/MetaRenamer.cpp
+++ b/llvm/lib/Transforms/Utils/MetaRenamer.cpp
@@ -15,6 +15,7 @@
#include "llvm/Transforms/Utils/MetaRenamer.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -31,10 +32,36 @@
#include "llvm/IR/TypeFinder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils.h"
using namespace llvm;
+static cl::opt<std::string> RenameExcludeFunctionPrefixes(
+ "rename-exclude-function-prefixes",
+ cl::desc("Prefixes for functions that don't need to be renamed, separated "
+ "by a comma"),
+ cl::Hidden);
+
+static cl::opt<std::string> RenameExcludeAliasPrefixes(
+ "rename-exclude-alias-prefixes",
+ cl::desc("Prefixes for aliases that don't need to be renamed, separated "
+ "by a comma"),
+ cl::Hidden);
+
+static cl::opt<std::string> RenameExcludeGlobalPrefixes(
+ "rename-exclude-global-prefixes",
+ cl::desc(
+ "Prefixes for global values that don't need to be renamed, separated "
+ "by a comma"),
+ cl::Hidden);
+
+static cl::opt<std::string> RenameExcludeStructPrefixes(
+ "rename-exclude-struct-prefixes",
+ cl::desc("Prefixes for structs that don't need to be renamed, separated "
+ "by a comma"),
+ cl::Hidden);
+
static const char *const metaNames[] = {
// See http://en.wikipedia.org/wiki/Metasyntactic_variable
"foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge",
@@ -66,6 +93,18 @@ struct Renamer {
PRNG prng;
};
+static void
+parseExcludedPrefixes(StringRef PrefixesStr,
+ SmallVectorImpl<StringRef> &ExcludedPrefixes) {
+ for (;;) {
+ auto PrefixesSplit = PrefixesStr.split(',');
+ if (PrefixesSplit.first.empty())
+ break;
+ ExcludedPrefixes.push_back(PrefixesSplit.first);
+ PrefixesStr = PrefixesSplit.second;
+ }
+}
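A sketch of what the comma-separated parsing above produces (the option value below is hypothetical):

// Example: -rename-exclude-function-prefixes=init_,boot_
//   parseExcludedPrefixes("init_,boot_", ExcludedFuncPrefixes)
//   => ExcludedFuncPrefixes == {"init_", "boot_"}
// so a function named "init_devices" keeps its name while "computeStuff" is
// still given a metasyntactic name.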
+
void MetaRename(Function &F) {
for (Argument &Arg : F.args())
if (!Arg.getType()->isVoidTy())
@@ -91,10 +130,26 @@ void MetaRename(Module &M,
Renamer renamer(randSeed);
+ SmallVector<StringRef, 8> ExcludedAliasesPrefixes;
+ SmallVector<StringRef, 8> ExcludedGlobalsPrefixes;
+ SmallVector<StringRef, 8> ExcludedStructsPrefixes;
+ SmallVector<StringRef, 8> ExcludedFuncPrefixes;
+ parseExcludedPrefixes(RenameExcludeAliasPrefixes, ExcludedAliasesPrefixes);
+ parseExcludedPrefixes(RenameExcludeGlobalPrefixes, ExcludedGlobalsPrefixes);
+ parseExcludedPrefixes(RenameExcludeStructPrefixes, ExcludedStructsPrefixes);
+ parseExcludedPrefixes(RenameExcludeFunctionPrefixes, ExcludedFuncPrefixes);
+
+ auto IsNameExcluded = [](StringRef &Name,
+ SmallVectorImpl<StringRef> &ExcludedPrefixes) {
+ return any_of(ExcludedPrefixes,
+ [&Name](auto &Prefix) { return Name.startswith(Prefix); });
+ };
+
// Rename all aliases
for (GlobalAlias &GA : M.aliases()) {
StringRef Name = GA.getName();
- if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
+ if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) ||
+ IsNameExcluded(Name, ExcludedAliasesPrefixes))
continue;
GA.setName("alias");
@@ -103,7 +158,8 @@ void MetaRename(Module &M,
// Rename all global variables
for (GlobalVariable &GV : M.globals()) {
StringRef Name = GV.getName();
- if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
+ if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) ||
+ IsNameExcluded(Name, ExcludedGlobalsPrefixes))
continue;
GV.setName("global");
@@ -113,7 +169,9 @@ void MetaRename(Module &M,
TypeFinder StructTypes;
StructTypes.run(M, true);
for (StructType *STy : StructTypes) {
- if (STy->isLiteral() || STy->getName().empty())
+ StringRef Name = STy->getName();
+ if (STy->isLiteral() || Name.empty() ||
+ IsNameExcluded(Name, ExcludedStructsPrefixes))
continue;
SmallString<128> NameStorage;
@@ -128,7 +186,8 @@ void MetaRename(Module &M,
// Leave library functions alone because their presence or absence could
// affect the behavior of other passes.
if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) ||
- GetTLI(F).getLibFunc(F, Tmp))
+ GetTLI(F).getLibFunc(F, Tmp) ||
+ IsNameExcluded(Name, ExcludedFuncPrefixes))
continue;
// Leave @main alone. The output of -metarenamer might be passed to
diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp
index 3ebc89158173..65207056a3f4 100644
--- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp
+++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp
@@ -144,6 +144,10 @@ static void convertToRelLookupTable(GlobalVariable &LookupTable) {
Value *Offset =
Builder.CreateShl(Index, ConstantInt::get(IntTy, 2), "reltable.shift");
+  // Insert the call to the load.relative intrinsic before LOAD.
+  // GEP might not be immediately followed by a LOAD; e.g., it can be hoisted
+  // outside the loop, or another instruction might be inserted between them.
+ Builder.SetInsertPoint(Load);
Function *LoadRelIntrinsic = llvm::Intrinsic::getDeclaration(
&M, Intrinsic::load_relative, {Index->getType()});
Value *Base = Builder.CreateBitCast(RelLookupTable, Builder.getInt8PtrTy());
diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
index 9495e442e0bf..2f2dff6b5f0b 100644
--- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
+++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
@@ -220,7 +220,7 @@ private:
Now = Pred;
}
- assert(PathCapacity > 0 && "found incorrect augmenting path");
+ assert(PathCapacity > 0 && "found an incorrect augmenting path");
// Update the flow along the path
Now = Target;
@@ -271,6 +271,352 @@ private:
uint64_t Target;
};
+/// A post-processing adjustment of the control flow. It applies two steps that
+/// reroute some flow to make it more realistic:
+///
+/// - First, it removes all isolated components ("islands") with a positive flow
+/// that are unreachable from the entry block. For every such component, we
+/// find the shortest path from the entry to an exit passing through the
+/// component, and increase the flow by one unit along the path.
+///
+/// - Second, it identifies all "unknown subgraphs" consisting of basic blocks
+/// with no sampled counts. Then it rebalances the flow that goes through such
+/// a subgraph so that each branch is taken with probability 50%.
+/// An unknown subgraph is such that for every two nodes u and v:
+/// - u dominates v and u is not unknown;
+/// - v post-dominates u; and
+/// - all inner-nodes of all (u,v)-paths are unknown.
+///
+class FlowAdjuster {
+public:
+ FlowAdjuster(FlowFunction &Func) : Func(Func) {
+ assert(Func.Blocks[Func.Entry].isEntry() &&
+ "incorrect index of the entry block");
+ }
+
+ // Run the post-processing
+ void run() {
+ /// Adjust the flow to get rid of isolated components.
+ joinIsolatedComponents();
+
+ /// Rebalance the flow inside unknown subgraphs.
+ rebalanceUnknownSubgraphs();
+ }
+
+  /// The probability for the first successor of an unknown subgraph
+ static constexpr double UnknownFirstSuccProbability = 0.5;
+
+private:
+ void joinIsolatedComponents() {
+ // Find blocks that are reachable from the source
+ auto Visited = std::vector<bool>(NumBlocks(), false);
+ findReachable(Func.Entry, Visited);
+
+ // Iterate over all non-reachable blocks and adjust their weights
+ for (uint64_t I = 0; I < NumBlocks(); I++) {
+ auto &Block = Func.Blocks[I];
+ if (Block.Flow > 0 && !Visited[I]) {
+ // Find a path from the entry to an exit passing through the block I
+ auto Path = findShortestPath(I);
+ // Increase the flow along the path
+ assert(Path.size() > 0 && Path[0]->Source == Func.Entry &&
+ "incorrectly computed path adjusting control flow");
+ Func.Blocks[Func.Entry].Flow += 1;
+ for (auto &Jump : Path) {
+ Jump->Flow += 1;
+ Func.Blocks[Jump->Target].Flow += 1;
+ // Update reachability
+ findReachable(Jump->Target, Visited);
+ }
+ }
+ }
+ }
+
+ /// Run BFS from a given block along the jumps with a positive flow and mark
+ /// all reachable blocks.
+ void findReachable(uint64_t Src, std::vector<bool> &Visited) {
+ if (Visited[Src])
+ return;
+ std::queue<uint64_t> Queue;
+ Queue.push(Src);
+ Visited[Src] = true;
+ while (!Queue.empty()) {
+ Src = Queue.front();
+ Queue.pop();
+ for (auto Jump : Func.Blocks[Src].SuccJumps) {
+ uint64_t Dst = Jump->Target;
+ if (Jump->Flow > 0 && !Visited[Dst]) {
+ Queue.push(Dst);
+ Visited[Dst] = true;
+ }
+ }
+ }
+ }
+
+ /// Find the shortest path from the entry block to an exit block passing
+ /// through a given block.
+ std::vector<FlowJump *> findShortestPath(uint64_t BlockIdx) {
+ // A path from the entry block to BlockIdx
+ auto ForwardPath = findShortestPath(Func.Entry, BlockIdx);
+ // A path from BlockIdx to an exit block
+ auto BackwardPath = findShortestPath(BlockIdx, AnyExitBlock);
+
+ // Concatenate the two paths
+ std::vector<FlowJump *> Result;
+ Result.insert(Result.end(), ForwardPath.begin(), ForwardPath.end());
+ Result.insert(Result.end(), BackwardPath.begin(), BackwardPath.end());
+ return Result;
+ }
+
+ /// Apply the Dijkstra algorithm to find the shortest path from a given
+ /// Source to a given Target block.
+ /// If Target == -1, then the path ends at an exit block.
+ std::vector<FlowJump *> findShortestPath(uint64_t Source, uint64_t Target) {
+ // Quit early, if possible
+ if (Source == Target)
+ return std::vector<FlowJump *>();
+ if (Func.Blocks[Source].isExit() && Target == AnyExitBlock)
+ return std::vector<FlowJump *>();
+
+ // Initialize data structures
+ auto Distance = std::vector<int64_t>(NumBlocks(), INF);
+ auto Parent = std::vector<FlowJump *>(NumBlocks(), nullptr);
+ Distance[Source] = 0;
+ std::set<std::pair<uint64_t, uint64_t>> Queue;
+ Queue.insert(std::make_pair(Distance[Source], Source));
+
+ // Run the Dijkstra algorithm
+ while (!Queue.empty()) {
+ uint64_t Src = Queue.begin()->second;
+ Queue.erase(Queue.begin());
+ // If we found a solution, quit early
+ if (Src == Target ||
+ (Func.Blocks[Src].isExit() && Target == AnyExitBlock))
+ break;
+
+ for (auto Jump : Func.Blocks[Src].SuccJumps) {
+ uint64_t Dst = Jump->Target;
+ int64_t JumpDist = jumpDistance(Jump);
+ if (Distance[Dst] > Distance[Src] + JumpDist) {
+ Queue.erase(std::make_pair(Distance[Dst], Dst));
+
+ Distance[Dst] = Distance[Src] + JumpDist;
+ Parent[Dst] = Jump;
+
+ Queue.insert(std::make_pair(Distance[Dst], Dst));
+ }
+ }
+ }
+ // If Target is not provided, find the closest exit block
+ if (Target == AnyExitBlock) {
+ for (uint64_t I = 0; I < NumBlocks(); I++) {
+ if (Func.Blocks[I].isExit() && Parent[I] != nullptr) {
+ if (Target == AnyExitBlock || Distance[Target] > Distance[I]) {
+ Target = I;
+ }
+ }
+ }
+ }
+ assert(Parent[Target] != nullptr && "a path does not exist");
+
+ // Extract the constructed path
+ std::vector<FlowJump *> Result;
+ uint64_t Now = Target;
+ while (Now != Source) {
+ assert(Now == Parent[Now]->Target && "incorrect parent jump");
+ Result.push_back(Parent[Now]);
+ Now = Parent[Now]->Source;
+ }
+ // Reverse the path, since it is extracted from Target to Source
+ std::reverse(Result.begin(), Result.end());
+ return Result;
+ }
+
+  /// The distance assigned to a given jump in the shortest-path computation.
+  /// In order to encourage the path to use blocks/jumps with large positive
+  /// flow, and to avoid drastically changing the branch probabilities of
+  /// outgoing edges, set the distance as follows:
+ /// if Jump.Flow > 0, then distance = max(100 - Jump->Flow, 0)
+ /// if Block.Weight > 0, then distance = 1
+ /// otherwise distance >> 1
+ int64_t jumpDistance(FlowJump *Jump) const {
+ int64_t BaseDistance = 100;
+ if (Jump->IsUnlikely)
+ return MinCostMaxFlow::AuxCostUnlikely;
+ if (Jump->Flow > 0)
+ return std::max(BaseDistance - (int64_t)Jump->Flow, (int64_t)0);
+ if (Func.Blocks[Jump->Target].Weight > 0)
+ return BaseDistance;
+ return BaseDistance * (NumBlocks() + 1);
+ };
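A quick numeric reading of the distance function above, with BaseDistance = 100 and, say, 20 blocks in the function (flow values are hypothetical):

// jump marked IsUnlikely                      -> MinCostMaxFlow::AuxCostUnlikely
// jump carrying Flow = 30                     -> max(100 - 30, 0) = 70
// zero-flow jump into a block with Weight > 0 -> 100
// zero-flow jump into a zero-weight block     -> 100 * (20 + 1) = 2100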
+
+ uint64_t NumBlocks() const { return Func.Blocks.size(); }
+
+  /// Rebalance unknown subgraphs so that each branch splits with probabilities
+ /// UnknownFirstSuccProbability and 1 - UnknownFirstSuccProbability
+ void rebalanceUnknownSubgraphs() {
+ assert(UnknownFirstSuccProbability >= 0.0 &&
+ UnknownFirstSuccProbability <= 1.0 &&
+ "the share of the unknown successor should be between 0 and 1");
+ // Try to find unknown subgraphs from each non-unknown block
+ for (uint64_t I = 0; I < Func.Blocks.size(); I++) {
+ auto SrcBlock = &Func.Blocks[I];
+      // Do not attempt to find unknown successors from an unknown or a
+ // zero-flow block
+ if (SrcBlock->UnknownWeight || SrcBlock->Flow == 0)
+ continue;
+
+ std::vector<FlowBlock *> UnknownSuccs;
+ FlowBlock *DstBlock = nullptr;
+      // Find an unknown subgraph starting at block SrcBlock
+ if (!findUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs))
+ continue;
+ // At the moment, we do not rebalance subgraphs containing cycles among
+ // unknown blocks
+ if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownSuccs))
+ continue;
+
+ // Rebalance the flow
+ rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs);
+ }
+ }
+
+  /// Find an unknown subgraph starting at block SrcBlock.
+ /// If the search is successful, the method sets DstBlock and UnknownSuccs.
+ bool findUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *&DstBlock,
+ std::vector<FlowBlock *> &UnknownSuccs) {
+ // Run BFS from SrcBlock and make sure all paths are going through unknown
+ // blocks and end at a non-unknown DstBlock
+ auto Visited = std::vector<bool>(NumBlocks(), false);
+ std::queue<uint64_t> Queue;
+ DstBlock = nullptr;
+
+ Queue.push(SrcBlock->Index);
+ Visited[SrcBlock->Index] = true;
+ while (!Queue.empty()) {
+ auto &Block = Func.Blocks[Queue.front()];
+ Queue.pop();
+ // Process blocks reachable from Block
+ for (auto Jump : Block.SuccJumps) {
+ uint64_t Dst = Jump->Target;
+ if (Visited[Dst])
+ continue;
+ Visited[Dst] = true;
+ if (!Func.Blocks[Dst].UnknownWeight) {
+          // If we see a second, distinct non-unknown block reachable from
+          // SrcBlock, stop processing and skip rebalancing
+ FlowBlock *CandidateDstBlock = &Func.Blocks[Dst];
+ if (DstBlock != nullptr && DstBlock != CandidateDstBlock)
+ return false;
+ DstBlock = CandidateDstBlock;
+ } else {
+ Queue.push(Dst);
+ UnknownSuccs.push_back(&Func.Blocks[Dst]);
+ }
+ }
+ }
+
+ // If the list of unknown blocks is empty, we don't need rebalancing
+ if (UnknownSuccs.empty())
+ return false;
+ // If all reachable nodes from SrcBlock are unknown, skip rebalancing
+ if (DstBlock == nullptr)
+ return false;
+ // If any of the unknown blocks is an exit block, skip rebalancing
+ for (auto Block : UnknownSuccs) {
+ if (Block->isExit())
+ return false;
+ }
+
+ return true;
+ }
+
+  /// Check whether the given unknown subgraph is acyclic and, if so, reorder
+ /// UnknownSuccs in the topological order (so that all jumps are "forward").
+ bool isAcyclicSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock,
+ std::vector<FlowBlock *> &UnknownSuccs) {
+ // Extract local in-degrees in the considered subgraph
+ auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0);
+ for (auto Jump : SrcBlock->SuccJumps) {
+ LocalInDegree[Jump->Target]++;
+ }
+ for (uint64_t I = 0; I < UnknownSuccs.size(); I++) {
+ for (auto Jump : UnknownSuccs[I]->SuccJumps) {
+ LocalInDegree[Jump->Target]++;
+ }
+ }
+ // A loop containing SrcBlock
+ if (LocalInDegree[SrcBlock->Index] > 0)
+ return false;
+
+ std::vector<FlowBlock *> AcyclicOrder;
+ std::queue<uint64_t> Queue;
+ Queue.push(SrcBlock->Index);
+ while (!Queue.empty()) {
+ auto &Block = Func.Blocks[Queue.front()];
+ Queue.pop();
+ // Stop propagation once we reach DstBlock
+ if (Block.Index == DstBlock->Index)
+ break;
+
+ AcyclicOrder.push_back(&Block);
+ // Add to the queue all successors with zero local in-degree
+ for (auto Jump : Block.SuccJumps) {
+ uint64_t Dst = Jump->Target;
+ LocalInDegree[Dst]--;
+ if (LocalInDegree[Dst] == 0) {
+ Queue.push(Dst);
+ }
+ }
+ }
+
+ // If there is a cycle in the subgraph, AcyclicOrder contains only a subset
+ // of all blocks
+ if (UnknownSuccs.size() + 1 != AcyclicOrder.size())
+ return false;
+ UnknownSuccs = AcyclicOrder;
+ return true;
+ }
+
+ /// Rebalance a given subgraph.
+ void rebalanceUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock,
+ std::vector<FlowBlock *> &UnknownSuccs) {
+ assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph");
+ assert(UnknownSuccs.front() == SrcBlock && "incorrect order of unknowns");
+
+ for (auto Block : UnknownSuccs) {
+ // Block's flow is the sum of incoming flows
+ uint64_t TotalFlow = 0;
+ if (Block == SrcBlock) {
+ TotalFlow = Block->Flow;
+ } else {
+ for (auto Jump : Block->PredJumps) {
+ TotalFlow += Jump->Flow;
+ }
+ Block->Flow = TotalFlow;
+ }
+
+ // Process all successor jumps and update corresponding flow values
+ for (uint64_t I = 0; I < Block->SuccJumps.size(); I++) {
+ auto Jump = Block->SuccJumps[I];
+ if (I + 1 == Block->SuccJumps.size()) {
+ Jump->Flow = TotalFlow;
+ continue;
+ }
+ uint64_t Flow = uint64_t(TotalFlow * UnknownFirstSuccProbability);
+ Jump->Flow = Flow;
+ TotalFlow -= Flow;
+ }
+ }
+ }
+
+ /// A constant indicating an arbitrary exit block of a function.
+ static constexpr uint64_t AnyExitBlock = uint64_t(-1);
+
+ /// The function.
+ FlowFunction &Func;
+};
+
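A numeric sketch of the rebalancing loop above, with UnknownFirstSuccProbability = 0.5 and a hypothetical 10 units of flow entering a block with three successor jumps:

// TotalFlow = 10
//   jump #0: Flow = uint64_t(10 * 0.5) = 5, remaining TotalFlow = 5
//   jump #1: Flow = uint64_t(5 * 0.5)  = 2, remaining TotalFlow = 3
//   jump #2 (last): Flow = 3 (whatever remains)
// Successor blocks then recompute their own Flow as the sum of incoming jump
// flows as they are visited in topological order.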
/// Initializing flow network for a given function.
///
/// Every block is split into three nodes that are responsible for (i) an
@@ -440,6 +786,39 @@ void verifyWeights(const FlowFunction &Func) {
}
}
assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow");
+
+  // Verify that there are no isolated flow components.
+  // One could modify FlowFunction to hold edges indexed by their sources,
+  // which would avoid creating this object.
+ auto PositiveFlowEdges = std::vector<std::vector<uint64_t>>(NumBlocks);
+ for (auto &Jump : Func.Jumps) {
+ if (Jump.Flow > 0) {
+ PositiveFlowEdges[Jump.Source].push_back(Jump.Target);
+ }
+ }
+
+ // Run BFS from the source along edges with positive flow
+ std::queue<uint64_t> Queue;
+ auto Visited = std::vector<bool>(NumBlocks, false);
+ Queue.push(Func.Entry);
+ Visited[Func.Entry] = true;
+ while (!Queue.empty()) {
+ uint64_t Src = Queue.front();
+ Queue.pop();
+ for (uint64_t Dst : PositiveFlowEdges[Src]) {
+ if (!Visited[Dst]) {
+ Queue.push(Dst);
+ Visited[Dst] = true;
+ }
+ }
+ }
+
+ // Verify that every block that has a positive flow is reached from the source
+ // along edges with a positive flow
+ for (uint64_t I = 0; I < NumBlocks; I++) {
+ auto &Block = Func.Blocks[I];
+ assert((Visited[I] || Block.Flow == 0) && "an isolated flow component");
+ }
}
#endif
@@ -455,6 +834,10 @@ void llvm::applyFlowInference(FlowFunction &Func) {
// Extract flow values for every block and every edge
extractWeights(InferenceNetwork, Func);
+ // Post-processing adjustments to the flow
+ auto Adjuster = FlowAdjuster(Func);
+ Adjuster.run();
+
#ifndef NDEBUG
// Verify the result
verifyWeights(Func);
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 71c15d5c51fc..c840ee85795f 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1047,9 +1047,9 @@ bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) {
if (SE.DT.dominates(IncV, InsertPos))
break;
}
- for (auto I = IVIncs.rbegin(), E = IVIncs.rend(); I != E; ++I) {
- fixupInsertPoints(*I);
- (*I)->moveBefore(InsertPos);
+ for (Instruction *I : llvm::reverse(IVIncs)) {
+ fixupInsertPoints(I);
+ I->moveBefore(InsertPos);
}
return true;
}
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index afa3ecde77f9..1046998c26de 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3629,7 +3629,7 @@ static bool tryWidenCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
return false; // TODO
// Use lambda to lazily compute expensive condition after cheap ones.
auto NoSideEffects = [](BasicBlock &BB) {
- return !llvm::any_of(BB, [](const Instruction &I) {
+ return llvm::none_of(BB, [](const Instruction &I) {
return I.mayWriteToMemory() || I.mayHaveSideEffects();
});
};
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index e190a1294eb3..02727a3dbf9c 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -193,6 +193,19 @@ static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef<unsigned> A
}
}
+// Copy CallInst "flags" like musttail, notail, and tail. Return New param for
+// easier chaining. Calls to emit* and B.CreateCall should probably be wrapped
+// in this function when New is created to replace Old. Callers should take
+// care to check Old.isMustTailCall() if they aren't replacing Old directly
+// with New.
+static Value *copyFlags(const CallInst &Old, Value *New) {
+ assert(!Old.isMustTailCall() && "do not copy musttail call flags");
+ assert(!Old.isNoTailCall() && "do not copy notail call flags");
+ if (auto *NewCI = dyn_cast_or_null<CallInst>(New))
+ NewCI->setTailCallKind(Old.getTailCallKind());
+ return New;
+}
+
//===----------------------------------------------------------------------===//
// String and Memory Library Call Optimizations
//===----------------------------------------------------------------------===//
@@ -215,7 +228,7 @@ Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilderBase &B) {
if (Len == 0)
return Dst;
- return emitStrLenMemCpy(Src, Dst, Len, B);
+ return copyFlags(*CI, emitStrLenMemCpy(Src, Dst, Len, B));
}
Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
@@ -279,7 +292,7 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilderBase &B) {
// strncat(x, s, c) -> strcat(x, s)
// s is constant so the strcat can be optimized further.
- return emitStrLenMemCpy(Src, Dst, SrcLen, B);
+ return copyFlags(*CI, emitStrLenMemCpy(Src, Dst, SrcLen, B));
}
Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) {
@@ -300,9 +313,11 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) {
if (!FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32.
return nullptr;
- return emitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len),
- B, DL, TLI);
+ return copyFlags(
+ *CI,
+ emitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), B,
+ DL, TLI));
}
// Otherwise, the character is a constant, see if the first argument is
@@ -340,7 +355,7 @@ Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilderBase &B) {
if (!getConstantStringInfo(SrcStr, Str)) {
// strrchr(s, 0) -> strchr(s, 0)
if (CharC->isZero())
- return emitStrChr(SrcStr, '\0', B, TLI);
+ return copyFlags(*CI, emitStrChr(SrcStr, '\0', B, TLI));
return nullptr;
}
@@ -385,25 +400,28 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilderBase &B) {
annotateDereferenceableBytes(CI, 1, Len2);
if (Len1 && Len2) {
- return emitMemCmp(Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()),
- std::min(Len1, Len2)),
- B, DL, TLI);
+ return copyFlags(
+ *CI, emitMemCmp(Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()),
+ std::min(Len1, Len2)),
+ B, DL, TLI));
}
// strcmp to memcmp
if (!HasStr1 && HasStr2) {
if (canTransformToMemCmp(CI, Str1P, Len2, DL))
- return emitMemCmp(
- Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL,
- TLI);
+ return copyFlags(
+ *CI,
+ emitMemCmp(Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2),
+ B, DL, TLI));
} else if (HasStr1 && !HasStr2) {
if (canTransformToMemCmp(CI, Str2P, Len1, DL))
- return emitMemCmp(
- Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL,
- TLI);
+ return copyFlags(
+ *CI,
+ emitMemCmp(Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1),
+ B, DL, TLI));
}
annotateNonNullNoUndefBasedOnAccess(CI, {0, 1});
@@ -430,7 +448,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) {
return ConstantInt::get(CI->getType(), 0);
if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
- return emitMemCmp(Str1P, Str2P, Size, B, DL, TLI);
+ return copyFlags(*CI, emitMemCmp(Str1P, Str2P, Size, B, DL, TLI));
StringRef Str1, Str2;
bool HasStr1 = getConstantStringInfo(Str1P, Str1);
@@ -462,17 +480,19 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) {
if (!HasStr1 && HasStr2) {
Len2 = std::min(Len2, Length);
if (canTransformToMemCmp(CI, Str1P, Len2, DL))
- return emitMemCmp(
- Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL,
- TLI);
+ return copyFlags(
+ *CI,
+ emitMemCmp(Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2),
+ B, DL, TLI));
} else if (HasStr1 && !HasStr2) {
Len1 = std::min(Len1, Length);
if (canTransformToMemCmp(CI, Str2P, Len1, DL))
- return emitMemCmp(
- Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL,
- TLI);
+ return copyFlags(
+ *CI,
+ emitMemCmp(Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1),
+ B, DL, TLI));
}
return nullptr;
@@ -485,7 +505,7 @@ Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilderBase &B) {
if (SrcLen && Size) {
annotateDereferenceableBytes(CI, 0, SrcLen);
if (SrcLen <= Size->getZExtValue() + 1)
- return emitStrDup(Src, B, TLI);
+ return copyFlags(*CI, emitStrDup(Src, B, TLI));
}
return nullptr;
@@ -495,7 +515,7 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) {
Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
if (Dst == Src) // strcpy(x,x) -> x
return Src;
-
+
annotateNonNullNoUndefBasedOnAccess(CI, {0, 1});
// See if we can get the length of the input string.
uint64_t Len = GetStringLength(Src);
@@ -511,6 +531,7 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) {
ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len));
NewCI->setAttributes(CI->getAttributes());
NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
+ copyFlags(*CI, NewCI);
return Dst;
}
@@ -520,7 +541,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) {
// stpcpy(d,s) -> strcpy(d,s) if the result is not used.
if (CI->use_empty())
- return emitStrCpy(Dst, Src, B, TLI);
+ return copyFlags(*CI, emitStrCpy(Dst, Src, B, TLI));
if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x)
Value *StrLen = emitStrLen(Src, B, DL, TLI);
@@ -544,6 +565,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) {
CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), LenV);
NewCI->setAttributes(CI->getAttributes());
NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
+ copyFlags(*CI, NewCI);
return DstEnd;
}
@@ -583,6 +605,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) {
AttrBuilder ArgAttrs(CI->getAttributes().getParamAttrs(0));
NewCI->setAttributes(NewCI->getAttributes().addParamAttributes(
CI->getContext(), 0, ArgAttrs));
+ copyFlags(*CI, NewCI);
return Dst;
}
@@ -606,6 +629,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) {
ConstantInt::get(DL.getIntPtrType(PT), Len));
NewCI->setAttributes(CI->getAttributes());
NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
+ copyFlags(*CI, NewCI);
return Dst;
}
@@ -737,7 +761,7 @@ Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilderBase &B) {
// strpbrk(s, "a") -> strchr(s, 'a')
if (HasS2 && S2.size() == 1)
- return emitStrChr(CI->getArgOperand(0), S2[0], B, TLI);
+ return copyFlags(*CI, emitStrChr(CI->getArgOperand(0), S2[0], B, TLI));
return nullptr;
}
@@ -793,7 +817,7 @@ Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilderBase &B) {
// strcspn(s, "") -> strlen(s)
if (HasS2 && S2.empty())
- return emitStrLen(CI->getArgOperand(0), B, DL, TLI);
+ return copyFlags(*CI, emitStrLen(CI->getArgOperand(0), B, DL, TLI));
return nullptr;
}
@@ -1062,7 +1086,7 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilderBase &B) {
Value *LHS = CI->getArgOperand(0);
Value *RHS = CI->getArgOperand(1);
Value *Size = CI->getArgOperand(2);
- return emitBCmp(LHS, RHS, Size, B, DL, TLI);
+ return copyFlags(*CI, emitBCmp(LHS, RHS, Size, B, DL, TLI));
}
return nullptr;
@@ -1083,6 +1107,7 @@ Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilderBase &B) {
CI->getArgOperand(1), Align(1), Size);
NewCI->setAttributes(CI->getAttributes());
NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
+ copyFlags(*CI, NewCI);
return CI->getArgOperand(0);
}
@@ -1110,7 +1135,8 @@ Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilderBase &B) {
size_t Pos = SrcStr.find(StopChar->getSExtValue() & 0xFF);
if (Pos == StringRef::npos) {
if (N->getZExtValue() <= SrcStr.size()) {
- B.CreateMemCpy(Dst, Align(1), Src, Align(1), CI->getArgOperand(3));
+ copyFlags(*CI, B.CreateMemCpy(Dst, Align(1), Src, Align(1),
+ CI->getArgOperand(3)));
return Constant::getNullValue(CI->getType());
}
return nullptr;
@@ -1119,7 +1145,7 @@ Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilderBase &B) {
Value *NewN =
ConstantInt::get(N->getType(), std::min(uint64_t(Pos + 1), N->getZExtValue()));
// memccpy -> llvm.memcpy
- B.CreateMemCpy(Dst, Align(1), Src, Align(1), NewN);
+ copyFlags(*CI, B.CreateMemCpy(Dst, Align(1), Src, Align(1), NewN));
return Pos + 1 <= N->getZExtValue()
? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, NewN)
: Constant::getNullValue(CI->getType());
@@ -1136,6 +1162,7 @@ Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilderBase &B) {
// TODO: Attach return value attributes to the 1st operand to preserve them?
NewCI->setAttributes(CI->getAttributes());
NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
+ copyFlags(*CI, NewCI);
return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, N);
}
@@ -1150,6 +1177,7 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilderBase &B) {
CI->getArgOperand(1), Align(1), Size);
NewCI->setAttributes(CI->getAttributes());
NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
+ copyFlags(*CI, NewCI);
return CI->getArgOperand(0);
}
@@ -1164,12 +1192,13 @@ Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilderBase &B) {
CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align(1));
NewCI->setAttributes(CI->getAttributes());
NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
+ copyFlags(*CI, NewCI);
return CI->getArgOperand(0);
}
Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) {
if (isa<ConstantPointerNull>(CI->getArgOperand(0)))
- return emitMalloc(CI->getArgOperand(1), B, DL, TLI);
+ return copyFlags(*CI, emitMalloc(CI->getArgOperand(1), B, DL, TLI));
return nullptr;
}
@@ -1190,7 +1219,7 @@ static Value *replaceUnaryCall(CallInst *CI, IRBuilderBase &B,
Function *F = Intrinsic::getDeclaration(M, IID, CI->getType());
CallInst *NewCall = B.CreateCall(F, V);
NewCall->takeName(CI);
- return NewCall;
+ return copyFlags(*CI, NewCall);
}
/// Return a variant of Val with float type.
@@ -1311,7 +1340,8 @@ Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilderBase &B) {
Function *FSqrt = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::sqrt,
CI->getType());
- return B.CreateCall(FSqrt, B.CreateFAdd(RealReal, ImagImag), "cabs");
+ return copyFlags(
+ *CI, B.CreateCall(FSqrt, B.CreateFAdd(RealReal, ImagImag), "cabs"));
}
static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func,
@@ -1334,14 +1364,16 @@ static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func,
// sin(-X) --> -sin(X)
// tan(-X) --> -tan(X)
if (match(Call->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X)))))
- return B.CreateFNeg(B.CreateCall(Call->getCalledFunction(), X));
+ return B.CreateFNeg(
+ copyFlags(*Call, B.CreateCall(Call->getCalledFunction(), X)));
break;
case LibFunc_cos:
case LibFunc_cosf:
case LibFunc_cosl:
// cos(-X) --> cos(X)
if (match(Call->getArgOperand(0), m_FNeg(m_Value(X))))
- return B.CreateCall(Call->getCalledFunction(), X, "cos");
+ return copyFlags(*Call,
+ B.CreateCall(Call->getCalledFunction(), X, "cos"));
break;
default:
break;
@@ -1476,9 +1508,10 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
(isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo)) &&
hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
if (Value *ExpoI = getIntToFPVal(Expo, B, TLI->getIntSize()))
- return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, TLI,
- LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl,
- B, Attrs);
+ return copyFlags(*Pow,
+ emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI,
+ TLI, LibFunc_ldexp, LibFunc_ldexpf,
+ LibFunc_ldexpl, B, Attrs));
}
// pow(2.0 ** n, x) -> exp2(n * x)
@@ -1496,11 +1529,13 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
double N = NI.logBase2() * (IsReciprocal ? -1.0 : 1.0);
Value *FMul = B.CreateFMul(Expo, ConstantFP::get(Ty, N), "mul");
if (Pow->doesNotAccessMemory())
- return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty),
- FMul, "exp2");
+ return copyFlags(*Pow, B.CreateCall(Intrinsic::getDeclaration(
+ Mod, Intrinsic::exp2, Ty),
+ FMul, "exp2"));
else
- return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f,
- LibFunc_exp2l, B, Attrs);
+ return copyFlags(*Pow, emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2,
+ LibFunc_exp2f,
+ LibFunc_exp2l, B, Attrs));
}
}
@@ -1508,8 +1543,9 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
// TODO: There is no exp10() intrinsic yet, but some day there shall be one.
if (match(Base, m_SpecificFP(10.0)) &&
hasFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l))
- return emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f,
- LibFunc_exp10l, B, Attrs);
+ return copyFlags(*Pow, emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10,
+ LibFunc_exp10f, LibFunc_exp10l,
+ B, Attrs));
// pow(x, y) -> exp2(log2(x) * y)
if (Pow->hasApproxFunc() && Pow->hasNoNaNs() && BaseF->isFiniteNonZero() &&
@@ -1528,11 +1564,13 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
if (Log) {
Value *FMul = B.CreateFMul(Log, Expo, "mul");
if (Pow->doesNotAccessMemory())
- return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty),
- FMul, "exp2");
+ return copyFlags(*Pow, B.CreateCall(Intrinsic::getDeclaration(
+ Mod, Intrinsic::exp2, Ty),
+ FMul, "exp2"));
else if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l))
- return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f,
- LibFunc_exp2l, B, Attrs);
+ return copyFlags(*Pow, emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2,
+ LibFunc_exp2f,
+ LibFunc_exp2l, B, Attrs));
}
}
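The pow folds above rely on the identities pow(2**n, x) == exp2(n * x) and pow(x, y) == exp2(log2(x) * y), the latter only under the approx-func/no-NaNs fast-math conditions checked in the code. A standalone numeric sanity check of those identities (plain C++, unrelated to the LLVM API):

// Quick check of the math behind replacePowWithExp; illustrative only.
#include <cassert>
#include <cmath>

int main() {
  double x = 1.7;
  // pow(2^n, x) == exp2(n * x); here n = 3, i.e. base 8.0.
  assert(std::fabs(std::pow(8.0, x) - std::exp2(3.0 * x)) < 1e-9);
  // pow(base, x) == exp2(log2(base) * x) for a finite positive base.
  assert(std::fabs(std::pow(10.0, x) - std::exp2(std::log2(10.0) * x)) < 1e-9);
  return 0;
}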
@@ -1595,6 +1633,8 @@ Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilderBase &B) {
Sqrt = B.CreateCall(FAbsFn, Sqrt, "abs");
}
+ Sqrt = copyFlags(*Pow, Sqrt);
+
// Handle non finite base by expanding to
// (x == -infinity ? +infinity : sqrt(x)).
if (!Pow->hasNoInfs()) {
@@ -1721,15 +1761,18 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) {
if (ExpoF->isInteger() &&
ExpoF->convertToInteger(IntExpo, APFloat::rmTowardZero, &Ignored) ==
APFloat::opOK) {
- return createPowWithIntegerExponent(
- Base, ConstantInt::get(B.getIntNTy(TLI->getIntSize()), IntExpo), M, B);
+ return copyFlags(
+ *Pow,
+ createPowWithIntegerExponent(
+ Base, ConstantInt::get(B.getIntNTy(TLI->getIntSize()), IntExpo),
+ M, B));
}
}
// powf(x, itofp(y)) -> powi(x, y)
if (AllowApprox && (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo))) {
if (Value *ExpoI = getIntToFPVal(Expo, B, TLI->getIntSize()))
- return createPowWithIntegerExponent(Base, ExpoI, M, B);
+ return copyFlags(*Pow, createPowWithIntegerExponent(Base, ExpoI, M, B));
}
// Shrink pow() to powf() if the arguments are single precision,
@@ -1792,7 +1835,8 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) {
Intrinsic::ID IID = Callee->getName().startswith("fmin") ? Intrinsic::minnum
: Intrinsic::maxnum;
Function *F = Intrinsic::getDeclaration(CI->getModule(), IID, CI->getType());
- return B.CreateCall(F, { CI->getArgOperand(0), CI->getArgOperand(1) });
+ return copyFlags(
+ *CI, B.CreateCall(F, {CI->getArgOperand(0), CI->getArgOperand(1)}));
}
Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) {
@@ -2010,9 +2054,9 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) {
// of the square root calculation.
Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, ArgType);
Value *SqrtCall = B.CreateCall(Sqrt, OtherOp, "sqrt");
- return B.CreateFMul(FabsCall, SqrtCall);
+ return copyFlags(*CI, B.CreateFMul(FabsCall, SqrtCall));
}
- return FabsCall;
+ return copyFlags(*CI, FabsCall);
}
// TODO: Generalize to handle any trig function and its inverse.
@@ -2327,7 +2371,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) {
// printf("x") -> putchar('x'), even for "%" and "%%".
if (FormatStr.size() == 1 || FormatStr == "%%")
- return emitPutChar(B.getInt32(FormatStr[0]), B, TLI);
+ return copyFlags(*CI, emitPutChar(B.getInt32(FormatStr[0]), B, TLI));
// Try to remove call or emit putchar/puts.
if (FormatStr == "%s" && CI->arg_size() > 1) {
@@ -2339,12 +2383,12 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) {
return (Value *)CI;
// printf("%s", "a") --> putchar('a')
if (OperandStr.size() == 1)
- return emitPutChar(B.getInt32(OperandStr[0]), B, TLI);
+ return copyFlags(*CI, emitPutChar(B.getInt32(OperandStr[0]), B, TLI));
// printf("%s", str"\n") --> puts(str)
if (OperandStr.back() == '\n') {
OperandStr = OperandStr.drop_back();
Value *GV = B.CreateGlobalString(OperandStr, "str");
- return emitPutS(GV, B, TLI);
+ return copyFlags(*CI, emitPutS(GV, B, TLI));
}
return nullptr;
}
@@ -2356,19 +2400,19 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) {
// pass to be run after this pass, to merge duplicate strings.
FormatStr = FormatStr.drop_back();
Value *GV = B.CreateGlobalString(FormatStr, "str");
- return emitPutS(GV, B, TLI);
+ return copyFlags(*CI, emitPutS(GV, B, TLI));
}
// Optimize specific format strings.
// printf("%c", chr) --> putchar(chr)
if (FormatStr == "%c" && CI->arg_size() > 1 &&
CI->getArgOperand(1)->getType()->isIntegerTy())
- return emitPutChar(CI->getArgOperand(1), B, TLI);
+ return copyFlags(*CI, emitPutChar(CI->getArgOperand(1), B, TLI));
// printf("%s\n", str) --> puts(str)
if (FormatStr == "%s\n" && CI->arg_size() > 1 &&
CI->getArgOperand(1)->getType()->isPointerTy())
- return emitPutS(CI->getArgOperand(1), B, TLI);
+ return copyFlags(*CI, emitPutS(CI->getArgOperand(1), B, TLI));
return nullptr;
}
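A source-level picture of the printf folds handled in this function; note that several of them are only performed when the printf return value is unused, since putchar/puts do not return the character count (hedged illustration, not taken from this patch):

// Roughly how the folds above rewrite simple printf calls, assuming the
// return values are ignored where the transform requires it.
#include <cstdio>

void demo(const char *str) {
  printf("x");          // -> putchar('x')
  printf("hello\n");    // -> puts("hello"); puts appends the newline itself
  printf("%c", 'q');    // -> putchar('q')
  printf("%s\n", str);  // -> puts(str)
}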
@@ -2459,7 +2503,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI,
if (CI->use_empty())
// sprintf(dest, "%s", str) -> strcpy(dest, str)
- return emitStrCpy(Dest, CI->getArgOperand(2), B, TLI);
+ return copyFlags(*CI, emitStrCpy(Dest, CI->getArgOperand(2), B, TLI));
uint64_t SrcLen = GetStringLength(CI->getArgOperand(2));
if (SrcLen) {
@@ -2558,10 +2602,12 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI,
// snprintf(dst, size, fmt) -> llvm.memcpy(align 1 dst, align 1 fmt,
// strlen(fmt)+1)
- B.CreateMemCpy(
- CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()),
- FormatStr.size() + 1)); // Copy the null byte.
+ copyFlags(
+ *CI,
+ B.CreateMemCpy(
+ CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()),
+ FormatStr.size() + 1))); // Copy the null byte.
return ConstantInt::get(CI->getType(), FormatStr.size());
}
@@ -2599,8 +2645,10 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI,
else if (N < Str.size() + 1)
return nullptr;
- B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(3),
- Align(1), ConstantInt::get(CI->getType(), Str.size() + 1));
+ copyFlags(
+ *CI, B.CreateMemCpy(CI->getArgOperand(0), Align(1),
+ CI->getArgOperand(3), Align(1),
+ ConstantInt::get(CI->getType(), Str.size() + 1)));
// The snprintf result is the unincremented number of bytes in the string.
return ConstantInt::get(CI->getType(), Str.size());
@@ -2640,10 +2688,11 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI,
if (FormatStr.contains('%'))
return nullptr; // We found a format specifier.
- return emitFWrite(
- CI->getArgOperand(1),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size()),
- CI->getArgOperand(0), B, DL, TLI);
+ return copyFlags(
+ *CI, emitFWrite(CI->getArgOperand(1),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()),
+ FormatStr.size()),
+ CI->getArgOperand(0), B, DL, TLI));
}
// The remaining optimizations require the format string to be "%s" or "%c"
@@ -2656,14 +2705,16 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI,
// fprintf(F, "%c", chr) --> fputc(chr, F)
if (!CI->getArgOperand(2)->getType()->isIntegerTy())
return nullptr;
- return emitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
+ return copyFlags(
+ *CI, emitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI));
}
if (FormatStr[1] == 's') {
// fprintf(F, "%s", str) --> fputs(str, F)
if (!CI->getArgOperand(2)->getType()->isPointerTy())
return nullptr;
- return emitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
+ return copyFlags(
+ *CI, emitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI));
}
return nullptr;
}
@@ -2750,10 +2801,11 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilderBase &B) {
return nullptr;
// Known to have no uses (see above).
- return emitFWrite(
- CI->getArgOperand(0),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1),
- CI->getArgOperand(1), B, DL, TLI);
+ return copyFlags(
+ *CI,
+ emitFWrite(CI->getArgOperand(0),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1),
+ CI->getArgOperand(1), B, DL, TLI));
}
Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) {
@@ -2765,15 +2817,16 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) {
// puts("") -> putchar('\n')
StringRef Str;
if (getConstantStringInfo(CI->getArgOperand(0), Str) && Str.empty())
- return emitPutChar(B.getInt32('\n'), B, TLI);
+ return copyFlags(*CI, emitPutChar(B.getInt32('\n'), B, TLI));
return nullptr;
}
Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) {
// bcopy(src, dst, n) -> llvm.memmove(dst, src, n)
- return B.CreateMemMove(CI->getArgOperand(1), Align(1), CI->getArgOperand(0),
- Align(1), CI->getArgOperand(2));
+ return copyFlags(*CI, B.CreateMemMove(CI->getArgOperand(1), Align(1),
+ CI->getArgOperand(0), Align(1),
+ CI->getArgOperand(2)));
}
bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) {
@@ -2971,6 +3024,8 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI,
}
Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) {
+ assert(!CI->isMustTailCall() && "These transforms aren't musttail safe.");
+
// TODO: Split out the code below that operates on FP calls so that
// we can allow all non-FP calls with the StrictFP attribute to be
// optimized.
@@ -3212,6 +3267,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
Align(1), CI->getArgOperand(2));
NewCI->setAttributes(CI->getAttributes());
NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
+ copyFlags(*CI, NewCI);
return CI->getArgOperand(0);
}
return nullptr;
@@ -3225,6 +3281,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,
Align(1), CI->getArgOperand(2));
NewCI->setAttributes(CI->getAttributes());
NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
+ copyFlags(*CI, NewCI);
return CI->getArgOperand(0);
}
return nullptr;
@@ -3238,6 +3295,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI,
CI->getArgOperand(2), Align(1));
NewCI->setAttributes(CI->getAttributes());
NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
+ copyFlags(*CI, NewCI);
return CI->getArgOperand(0);
}
return nullptr;
@@ -3252,7 +3310,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemPCpyChk(CallInst *CI,
CallInst *NewCI = cast<CallInst>(Call);
NewCI->setAttributes(CI->getAttributes());
NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
- return NewCI;
+ return copyFlags(*CI, NewCI);
}
return nullptr;
}
@@ -3277,9 +3335,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
// string lengths for varying.
if (isFortifiedCallFoldable(CI, 2, None, 1)) {
if (Func == LibFunc_strcpy_chk)
- return emitStrCpy(Dst, Src, B, TLI);
+ return copyFlags(*CI, emitStrCpy(Dst, Src, B, TLI));
else
- return emitStpCpy(Dst, Src, B, TLI);
+ return copyFlags(*CI, emitStpCpy(Dst, Src, B, TLI));
}
if (OnlyLowerUnknownSize)
@@ -3303,14 +3361,14 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
// a __memcpy_chk, we still need to return the correct end pointer.
if (Ret && Func == LibFunc_stpcpy_chk)
return B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(SizeTTy, Len - 1));
- return Ret;
+ return copyFlags(*CI, cast<CallInst>(Ret));
}
Value *FortifiedLibCallSimplifier::optimizeStrLenChk(CallInst *CI,
IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 1, None, 0))
- return emitStrLen(CI->getArgOperand(0), B, CI->getModule()->getDataLayout(),
- TLI);
+ return copyFlags(*CI, emitStrLen(CI->getArgOperand(0), B,
+ CI->getModule()->getDataLayout(), TLI));
return nullptr;
}
@@ -3319,11 +3377,13 @@ Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
LibFunc Func) {
if (isFortifiedCallFoldable(CI, 3, 2)) {
if (Func == LibFunc_strncpy_chk)
- return emitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
+ return copyFlags(*CI,
+ emitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI));
else
- return emitStpNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
+ return copyFlags(*CI,
+ emitStpNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI));
}
return nullptr;
@@ -3332,8 +3392,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeMemCCpyChk(CallInst *CI,
IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 4, 3))
- return emitMemCCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), CI->getArgOperand(3), B, TLI);
+ return copyFlags(
+ *CI, emitMemCCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), CI->getArgOperand(3), B, TLI));
return nullptr;
}
@@ -3342,8 +3403,9 @@ Value *FortifiedLibCallSimplifier::optimizeSNPrintfChk(CallInst *CI,
IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 3, 1, None, 2)) {
SmallVector<Value *, 8> VariadicArgs(drop_begin(CI->args(), 5));
- return emitSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(4), VariadicArgs, B, TLI);
+ return copyFlags(*CI,
+ emitSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(4), VariadicArgs, B, TLI));
}
return nullptr;
@@ -3353,8 +3415,9 @@ Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI,
IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 2, None, None, 1)) {
SmallVector<Value *, 8> VariadicArgs(drop_begin(CI->args(), 4));
- return emitSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), VariadicArgs,
- B, TLI);
+ return copyFlags(*CI,
+ emitSPrintf(CI->getArgOperand(0), CI->getArgOperand(3),
+ VariadicArgs, B, TLI));
}
return nullptr;
@@ -3363,7 +3426,8 @@ Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI,
IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 2))
- return emitStrCat(CI->getArgOperand(0), CI->getArgOperand(1), B, TLI);
+ return copyFlags(
+ *CI, emitStrCat(CI->getArgOperand(0), CI->getArgOperand(1), B, TLI));
return nullptr;
}
@@ -3371,8 +3435,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI,
IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 3))
- return emitStrLCat(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
+ return copyFlags(*CI,
+ emitStrLCat(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI));
return nullptr;
}
@@ -3380,8 +3445,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI,
IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 3))
- return emitStrNCat(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
+ return copyFlags(*CI,
+ emitStrNCat(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI));
return nullptr;
}
@@ -3389,8 +3455,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI,
IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 3))
- return emitStrLCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
+ return copyFlags(*CI,
+ emitStrLCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI));
return nullptr;
}
@@ -3398,8 +3465,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI,
IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 3, 1, None, 2))
- return emitVSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(4), CI->getArgOperand(5), B, TLI);
+ return copyFlags(
+ *CI, emitVSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(4), CI->getArgOperand(5), B, TLI));
return nullptr;
}
@@ -3407,8 +3475,9 @@ Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeVSPrintfChk(CallInst *CI,
IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 2, None, None, 1))
- return emitVSPrintf(CI->getArgOperand(0), CI->getArgOperand(3),
- CI->getArgOperand(4), B, TLI);
+ return copyFlags(*CI,
+ emitVSPrintf(CI->getArgOperand(0), CI->getArgOperand(3),
+ CI->getArgOperand(4), B, TLI));
return nullptr;
}
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index c3eafd6b2492..b822db938af8 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -450,6 +450,12 @@ Value *Mapper::mapValue(const Value *V) {
DSOLocalEquivalent::get(Func), NewTy);
}
+ if (const auto *NC = dyn_cast<NoCFIValue>(C)) {
+ auto *Val = mapValue(NC->getGlobalValue());
+ GlobalValue *GV = cast<GlobalValue>(Val);
+ return getVM()[NC] = NoCFIValue::get(GV);
+ }
+
auto mapValueOrNull = [this](Value *V) {
auto Mapped = mapValue(V);
assert((Mapped || (Flags & RF_NullMapMissingGlobalValues)) &&
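The new NoCFIValue case mirrors the DSOLocalEquivalent handling just above it: unwrap the constant, remap the underlying global, and re-wrap the result. Seen in isolation as a hypothetical helper (illustrative only):

#include "llvm/IR/Constants.h"                 // llvm::NoCFIValue, llvm::GlobalValue
#include "llvm/Transforms/Utils/ValueMapper.h" // llvm::MapValue, ValueToValueMapTy

// Unwrap a no_cfi constant, remap its global through the value map, and
// re-wrap it; after this patch, MapValue() performs the same steps internally.
llvm::Constant *remapNoCFI(llvm::NoCFIValue *NC, llvm::ValueToValueMapTy &VM) {
  llvm::Value *Mapped = llvm::MapValue(NC->getGlobalValue(), VM);
  return llvm::NoCFIValue::get(llvm::cast<llvm::GlobalValue>(Mapped));
}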
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 805011191da0..81e5aa223c07 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -55,22 +55,23 @@ static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
cl::desc("The maximum number of SCEV checks allowed with a "
"vectorize(enable) pragma"));
-// FIXME: When scalable vectorization is stable enough, change the default
-// to SK_PreferFixedWidth.
-static cl::opt<LoopVectorizeHints::ScalableForceKind> ScalableVectorization(
- "scalable-vectorization", cl::init(LoopVectorizeHints::SK_FixedWidthOnly),
- cl::Hidden,
- cl::desc("Control whether the compiler can use scalable vectors to "
- "vectorize a loop"),
- cl::values(
- clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
- "Scalable vectorization is disabled."),
- clEnumValN(LoopVectorizeHints::SK_PreferFixedWidth, "on",
- "Scalable vectorization is available, but favor fixed-width "
- "vectorization when the cost is inconclusive."),
- clEnumValN(LoopVectorizeHints::SK_PreferScalable, "preferred",
- "Scalable vectorization is available and favored when the "
- "cost is inconclusive.")));
+static cl::opt<LoopVectorizeHints::ScalableForceKind>
+ ForceScalableVectorization(
+ "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified),
+ cl::Hidden,
+ cl::desc("Control whether the compiler can use scalable vectors to "
+ "vectorize a loop"),
+ cl::values(
+ clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
+ "Scalable vectorization is disabled."),
+ clEnumValN(
+ LoopVectorizeHints::SK_PreferScalable, "preferred",
+ "Scalable vectorization is available and favored when the "
+ "cost is inconclusive."),
+ clEnumValN(
+ LoopVectorizeHints::SK_PreferScalable, "on",
+ "Scalable vectorization is available and favored when the "
+ "cost is inconclusive.")));
/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
@@ -95,7 +96,8 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) {
LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
bool InterleaveOnlyWhenForced,
- OptimizationRemarkEmitter &ORE)
+ OptimizationRemarkEmitter &ORE,
+ const TargetTransformInfo *TTI)
: Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
Force("vectorize.enable", FK_Undefined, HK_FORCE),
@@ -110,14 +112,32 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
if (VectorizerParams::isInterleaveForced())
Interleave.Value = VectorizerParams::VectorizationInterleave;
+ // If the metadata doesn't explicitly specify whether to enable scalable
+ // vectorization, then decide based on the following criteria (increasing
+ // level of priority):
+ // - Target default
+ // - Metadata width
+ // - Force option (always overrides)
+ if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
+ if (TTI)
+ Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
+ : SK_FixedWidthOnly;
+
+ if (Width.Value)
+ // If the width is set, but the metadata says nothing about the scalable
+ // property, then assume it concerns only a fixed-width UserVF.
+ // If width is not set, the flag takes precedence.
+ Scalable.Value = SK_FixedWidthOnly;
+ }
+
+ // If the flag is set to force any use of scalable vectors, override the loop
+ // hints.
+ if (ForceScalableVectorization.getValue() !=
+ LoopVectorizeHints::SK_Unspecified)
+ Scalable.Value = ForceScalableVectorization.getValue();
+
+ // Scalable vectorization is disabled if no preference is specified.
if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
- // If the width is set, but the metadata says nothing about the scalable
- // property, then assume it concerns only a fixed-width UserVF.
- // If width is not set, the flag takes precedence.
- Scalable.Value = Width.Value ? SK_FixedWidthOnly : ScalableVectorization;
- else if (ScalableVectorization == SK_FixedWidthOnly)
- // If the flag is set to disable any use of scalable vectors, override the
- // loop hint.
Scalable.Value = SK_FixedWidthOnly;
if (IsVectorized.Value != 1)
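A condensed restatement of the precedence this hunk implements, as a self-contained sketch (the enum and function names are made up, not the real LoopVectorizeHints API):

// Precedence: command-line flag > explicit scalable metadata > bare metadata
// width (implies fixed-width) > target default > fixed-width fallback.
enum ScalableKind { Unspecified, FixedWidthOnly, PreferScalable };

ScalableKind decideScalable(ScalableKind FromMetadata, bool TargetPrefers,
                            bool WidthGivenInMetadata, ScalableKind FromFlag) {
  ScalableKind K = FromMetadata;
  if (K == Unspecified) {
    K = TargetPrefers ? PreferScalable : FixedWidthOnly; // target default
    if (WidthGivenInMetadata)
      K = FixedWidthOnly; // a bare width is treated as a fixed-width request
  }
  if (FromFlag != Unspecified)
    K = FromFlag; // -scalable-vectorization=... always wins
  return K == Unspecified ? FixedWidthOnly : K; // final fixed-width fallback
}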
@@ -929,7 +949,7 @@ bool LoopVectorizationLegality::canVectorizeFPMath(
}));
}
-bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
+bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
Value *In0 = const_cast<Value *>(V);
PHINode *PN = dyn_cast_or_null<PHINode>(In0);
if (!PN)
@@ -938,16 +958,29 @@ bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
return Inductions.count(PN);
}
-bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
+const InductionDescriptor *
+LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
+ if (!isInductionPhi(Phi))
+ return nullptr;
+ auto &ID = getInductionVars().find(Phi)->second;
+ if (ID.getKind() == InductionDescriptor::IK_IntInduction ||
+ ID.getKind() == InductionDescriptor::IK_FpInduction)
+ return &ID;
+ return nullptr;
+}
+
+bool LoopVectorizationLegality::isCastedInductionVariable(
+ const Value *V) const {
auto *Inst = dyn_cast<Instruction>(V);
return (Inst && InductionCastsToIgnore.count(Inst));
}
-bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+bool LoopVectorizationLegality::isInductionVariable(const Value *V) const {
return isInductionPhi(V) || isCastedInductionVariable(V);
}
-bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
+bool LoopVectorizationLegality::isFirstOrderRecurrence(
+ const PHINode *Phi) const {
return FirstOrderRecurrences.count(Phi);
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index a7d6609f8c56..71eb39a18d2f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -45,16 +45,17 @@ class VPBuilder {
VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
VPInstruction *createInstruction(unsigned Opcode,
- ArrayRef<VPValue *> Operands) {
- VPInstruction *Instr = new VPInstruction(Opcode, Operands);
+ ArrayRef<VPValue *> Operands, DebugLoc DL) {
+ VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL);
if (BB)
BB->insert(Instr, InsertPt);
return Instr;
}
VPInstruction *createInstruction(unsigned Opcode,
- std::initializer_list<VPValue *> Operands) {
- return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
+ std::initializer_list<VPValue *> Operands,
+ DebugLoc DL) {
+ return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL);
}
public:
@@ -123,30 +124,33 @@ public:
/// its underlying Instruction.
VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
Instruction *Inst = nullptr) {
- VPInstruction *NewVPInst = createInstruction(Opcode, Operands);
+ DebugLoc DL;
+ if (Inst)
+ DL = Inst->getDebugLoc();
+ VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL);
NewVPInst->setUnderlyingValue(Inst);
return NewVPInst;
}
- VPValue *createNaryOp(unsigned Opcode,
- std::initializer_list<VPValue *> Operands,
- Instruction *Inst = nullptr) {
- return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst);
+ VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
+ DebugLoc DL) {
+ return createInstruction(Opcode, Operands, DL);
}
- VPValue *createNot(VPValue *Operand) {
- return createInstruction(VPInstruction::Not, {Operand});
+ VPValue *createNot(VPValue *Operand, DebugLoc DL) {
+ return createInstruction(VPInstruction::Not, {Operand}, DL);
}
- VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
- return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
+ VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL) {
+ return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL);
}
- VPValue *createOr(VPValue *LHS, VPValue *RHS) {
- return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
+ VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL) {
+ return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL);
}
- VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) {
- return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal});
+ VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal,
+ DebugLoc DL) {
+ return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL);
}
//===--------------------------------------------------------------------===//
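A hypothetical caller-side view of the DebugLoc-threading change in this header: code that builds masks or negations now passes the originating instruction's debug location through explicitly (the function name below is made up for illustration and assumes the usual VPlan headers are in scope):

VPValue *buildNegatedMask(VPBuilder &Builder, VPValue *Mask,
                          Instruction *Src) {
  // Preserve the source location of the instruction the mask derives from,
  // or use an empty location when there is none.
  DebugLoc DL = Src ? Src->getDebugLoc() : DebugLoc();
  return Builder.createNot(Mask, DL);
}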
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5ca0adb4242c..4747f34fcc62 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -428,6 +428,8 @@ class GeneratedRTChecks;
namespace llvm {
+AnalysisKey ShouldRunExtraVectorPasses::Key;
+
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
@@ -506,8 +508,8 @@ public:
/// Widen an integer or floating-point induction variable \p IV. If \p Trunc
/// is provided, the integer induction variable will first be truncated to
/// the corresponding type.
- void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
- VPValue *Def, VPValue *CastDef,
+ void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
+ Value *Start, TruncInst *Trunc, VPValue *Def,
VPTransformState &State);
/// Construct the vector value of a scalarized value \p V one lane at a time.
@@ -534,7 +536,7 @@ public:
/// Returns true if the reordering of FP operations is not allowed, but we are
/// able to vectorize with strict in-order reductions for the given RdxDesc.
- bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);
+ bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
/// Create a broadcast instruction. This method generates a broadcast
/// instruction (shuffle) for loop invariant values and for the induction
@@ -619,7 +621,7 @@ protected:
/// can also be a truncate instruction.
void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
const InductionDescriptor &ID, VPValue *Def,
- VPValue *CastDef, VPTransformState &State);
+ VPTransformState &State);
/// Create a vector induction phi node based on an existing scalar one. \p
/// EntryVal is the value from the original loop that maps to the vector phi
@@ -629,7 +631,6 @@ protected:
void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
Value *Step, Value *Start,
Instruction *EntryVal, VPValue *Def,
- VPValue *CastDef,
VPTransformState &State);
/// Returns true if an instruction \p I should be scalarized instead of
@@ -639,29 +640,6 @@ protected:
/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;
- /// If there is a cast involved in the induction variable \p ID, which should
- /// be ignored in the vectorized loop body, this function records the
- /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
- /// cast. We had already proved that the casted Phi is equal to the uncasted
- /// Phi in the vectorized loop (under a runtime guard), and therefore
- /// there is no need to vectorize the cast - the same value can be used in the
- /// vector loop for both the Phi and the cast.
- /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
- /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
- ///
- /// \p EntryVal is the value from the original loop that maps to the vector
- /// phi node and is used to distinguish what is the IV currently being
- /// processed - original one (if \p EntryVal is a phi corresponding to the
- /// original IV) or the "newly-created" one based on the proof mentioned above
- /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
- /// latter case \p EntryVal is a TruncInst and we must not record anything for
- /// that IV, but it's error-prone to expect callers of this routine to care
- /// about that, hence this explicit parameter.
- void recordVectorLoopValueForInductionCast(
- const InductionDescriptor &ID, const Instruction *EntryVal,
- Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
- unsigned Part, unsigned Lane = UINT_MAX);
-
/// Generate a shuffle sequence that will reverse the vector Vec.
virtual Value *reverseVector(Value *Vec);
@@ -698,7 +676,8 @@ protected:
/// flags, which can be found from the original scalar operations.
Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
const DataLayout &DL,
- const InductionDescriptor &ID) const;
+ const InductionDescriptor &ID,
+ BasicBlock *VectorHeader) const;
/// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
/// vector loop preheader, middle block and scalar preheader. Also
@@ -1728,7 +1707,8 @@ private:
/// disabled or unsupported, then the scalable part will be equal to
/// ElementCount::getScalable(0).
FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
- ElementCount UserVF);
+ ElementCount UserVF,
+ bool FoldTailByMasking);
/// \return the maximized element count based on the targets vector
/// registers and the loop trip-count, but limited to a maximum safe VF.
@@ -1741,7 +1721,8 @@ private:
ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
unsigned SmallestType,
unsigned WidestType,
- const ElementCount &MaxSafeVF);
+ const ElementCount &MaxSafeVF,
+ bool FoldTailByMasking);
/// \return the maximum legal scalable VF, based on the safe max number
/// of elements.
@@ -2356,8 +2337,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
const InductionDescriptor &II, Value *Step, Value *Start,
- Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
- VPTransformState &State) {
+ Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
+ IRBuilder<> &Builder = State.Builder;
assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
"Expected either an induction phi-node or a truncate of it!");
@@ -2373,7 +2354,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
}
Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
- Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
+ Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
Value *SteppedStart =
getStepVector(SplatStart, Zero, Step, II.getInductionOpcode());
@@ -2394,9 +2375,9 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
Type *StepType = Step->getType();
Value *RuntimeVF;
if (Step->getType()->isFloatingPointTy())
- RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF);
+ RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
else
- RuntimeVF = getRuntimeVF(Builder, StepType, VF);
+ RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
// Create a vector splat to use in the induction update.
@@ -2405,8 +2386,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
// handle a constant vector splat.
Value *SplatVF = isa<Constant>(Mul)
- ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
- : Builder.CreateVectorSplat(VF, Mul);
+ ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
+ : Builder.CreateVectorSplat(State.VF, Mul);
Builder.restoreIP(CurrIP);
// We may need to add the step a number of times, depending on the unroll
@@ -2420,8 +2401,6 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
if (isa<TruncInst>(EntryVal))
addMetadata(LastInduction, EntryVal);
- recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
- State, Part);
LastInduction = cast<Instruction>(
Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
@@ -2455,56 +2434,21 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
return llvm::any_of(IV->users(), isScalarInst);
}
-void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
- const InductionDescriptor &ID, const Instruction *EntryVal,
- Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
- unsigned Part, unsigned Lane) {
- assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
- "Expected either an induction phi-node or a truncate of it!");
-
- // This induction variable is not the phi from the original loop but the
- // newly-created IV based on the proof that casted Phi is equal to the
- // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
- // re-uses the same InductionDescriptor that original IV uses but we don't
- // have to do any recording in this case - that is done when original IV is
- // processed.
- if (isa<TruncInst>(EntryVal))
- return;
-
- if (!CastDef) {
- assert(ID.getCastInsts().empty() &&
- "there are casts for ID, but no CastDef");
- return;
- }
- assert(!ID.getCastInsts().empty() &&
- "there is a CastDef, but no casts for ID");
- // Only the first Cast instruction in the Casts vector is of interest.
- // The rest of the Casts (if exist) have no uses outside the
- // induction update chain itself.
- if (Lane < UINT_MAX)
- State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
- else
- State.set(CastDef, VectorLoopVal, Part);
-}
-
-void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
- TruncInst *Trunc, VPValue *Def,
- VPValue *CastDef,
+void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
+ const InductionDescriptor &ID,
+ Value *Start, TruncInst *Trunc,
+ VPValue *Def,
VPTransformState &State) {
+ IRBuilder<> &Builder = State.Builder;
assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
"Primary induction variable must have an integer type");
-
- auto II = Legal->getInductionVars().find(IV);
- assert(II != Legal->getInductionVars().end() && "IV is not an induction");
-
- auto ID = II->second;
assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
// The value from the original loop to which we are mapping the new induction
// variable.
Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
- auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ auto &DL = EntryVal->getModule()->getDataLayout();
// Generate code for the induction step. Note that induction steps are
// required to be loop-invariant
@@ -2514,7 +2458,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
if (PSE.getSE()->isSCEVable(IV->getType())) {
SCEVExpander Exp(*PSE.getSE(), DL, "induction");
return Exp.expandCodeFor(Step, Step->getType(),
- LoopVectorPreHeader->getTerminator());
+ State.CFG.VectorPreHeader->getTerminator());
}
return cast<SCEVUnknown>(Step)->getValue();
};
@@ -2530,7 +2474,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
? Builder.CreateSExtOrTrunc(Induction, IV->getType())
: Builder.CreateCast(Instruction::SIToFP, Induction,
IV->getType());
- ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
+ ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
+ State.CFG.PrevBB);
ScalarIV->setName("offset.idx");
}
if (Trunc) {
@@ -2548,20 +2493,19 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part) {
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
Value *StartIdx;
if (Step->getType()->isFloatingPointTy())
- StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part);
+ StartIdx =
+ getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part);
else
- StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part);
+ StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part);
Value *EntryPart =
getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode());
State.set(Def, EntryPart, Part);
if (Trunc)
addMetadata(EntryPart, Trunc);
- recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
- State, Part);
}
};
@@ -2572,7 +2516,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
// Now do the actual transformations, and start with creating the step value.
Value *Step = CreateStepValue(ID.getStep());
- if (VF.isZero() || VF.isScalar()) {
+ if (State.VF.isZero() || State.VF.isScalar()) {
Value *ScalarIV = CreateScalarIV(Step);
CreateSplatIV(ScalarIV, Step);
return;
@@ -2583,8 +2527,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
// least one user in the loop that is not widened.
auto NeedsScalarIV = needsScalarInduction(EntryVal);
if (!NeedsScalarIV) {
- createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
- State);
+ createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
return;
}
@@ -2592,14 +2535,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
if (!shouldScalarizeInstruction(EntryVal)) {
- createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
- State);
+ createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
Value *ScalarIV = CreateScalarIV(Step);
// Create scalar steps that can be used by instructions we will later
// scalarize. Note that the addition of the scalar steps will not increase
// the number of instructions in the loop in the common case prior to
// InstCombine. We will be trading one vector extract for each scalar step.
- buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
return;
}
@@ -2609,7 +2551,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
Value *ScalarIV = CreateScalarIV(Step);
if (!Cost->isScalarEpilogueAllowed())
CreateSplatIV(ScalarIV, Step);
- buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
}
Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
@@ -2663,10 +2605,11 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
Instruction *EntryVal,
const InductionDescriptor &ID,
- VPValue *Def, VPValue *CastDef,
+ VPValue *Def,
VPTransformState &State) {
+ IRBuilder<> &Builder = State.Builder;
// We shouldn't have to build scalar steps if we aren't vectorizing.
- assert(VF.isVector() && "VF should be greater than one");
+ assert(State.VF.isVector() && "VF should be greater than one");
// Get the value type and ensure it and the step have the same integer type.
Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
assert(ScalarIVTy == Step->getType() &&
@@ -2688,33 +2631,32 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
// iteration. If EntryVal is uniform, we only need to generate the first
// lane. Otherwise, we generate all VF values.
bool IsUniform =
- Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
- unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
+ Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF);
+ unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
// Compute the scalar steps and save the results in State.
Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
ScalarIVTy->getScalarSizeInBits());
Type *VecIVTy = nullptr;
Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
- if (!IsUniform && VF.isScalable()) {
- VecIVTy = VectorType::get(ScalarIVTy, VF);
- UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
- SplatStep = Builder.CreateVectorSplat(VF, Step);
- SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
+ if (!IsUniform && State.VF.isScalable()) {
+ VecIVTy = VectorType::get(ScalarIVTy, State.VF);
+ UnitStepVec =
+ Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
+ SplatStep = Builder.CreateVectorSplat(State.VF, Step);
+ SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
}
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part);
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
- if (!IsUniform && VF.isScalable()) {
- auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
+ if (!IsUniform && State.VF.isScalable()) {
+ auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
if (ScalarIVTy->isFloatingPointTy())
InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
State.set(Def, Add, Part);
- recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
- Part);
// It's useful to record the lane values too for the known minimum number
// of elements so we do those below. This improves the code quality when
// trying to extract the first element, for example.
@@ -2728,14 +2670,12 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
// The step returned by `createStepForVF` is a runtime-evaluated value
// when VF is scalable. Otherwise, it should be folded into a Constant.
- assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
+ assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
"Expected StartIdx to be folded to a constant when VF is not "
"scalable");
auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
State.set(Def, Add, VPIteration(Part, Lane));
- recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
- Part, Lane);
}
}
}
@@ -3023,21 +2963,19 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
// instruction could feed a poison value to the base address of the widen
// load/store.
- if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0)
+ if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
Cloned->dropPoisonGeneratingFlags();
State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
Builder.GetInsertPoint());
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
- for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) {
- auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
+ for (auto &I : enumerate(RepRecipe->operands())) {
auto InputInstance = Instance;
- if (!Operand || !OrigLoop->contains(Operand) ||
- (Cost->isUniformAfterVectorization(Operand, State.VF)))
+ VPValue *Operand = I.value();
+ if (State.Plan->isUniformAfterVectorization(Operand))
InputInstance.Lane = VPLane::getFirstLane();
- auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance);
- Cloned->setOperand(op, NewOp);
+ Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
}
addNewMetadata(Cloned, Instr);
@@ -3339,7 +3277,7 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
Value *InnerLoopVectorizer::emitTransformedIndex(
IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
- const InductionDescriptor &ID) const {
+ const InductionDescriptor &ID, BasicBlock *VectorHeader) const {
SCEVExpander Exp(*SE, DL, "induction");
auto Step = ID.getStep();
@@ -3382,15 +3320,15 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
};
// Get a suitable insert point for SCEV expansion. For blocks in the vector
- // loop, choose the end of the vector loop header (=LoopVectorBody), because
+ // loop, choose the end of the vector loop header (=VectorHeader), because
// the DomTree is not kept up-to-date for additional blocks generated in the
// vector loop. By using the header as insertion point, we guarantee that the
// expanded instructions dominate all their uses.
- auto GetInsertPoint = [this, &B]() {
+ auto GetInsertPoint = [this, &B, VectorHeader]() {
BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
if (InsertBB != LoopVectorBody &&
- LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
- return LoopVectorBody->getTerminator();
+ LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB))
+ return VectorHeader->getTerminator();
return &*B.GetInsertPoint();
};
@@ -3538,7 +3476,8 @@ void InnerLoopVectorizer::createInductionResumeValues(
CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
- EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
+ EndValue =
+ emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
EndValue->setName("ind.end");
// Compute the end value for the additional bypass (if applicable).
@@ -3549,7 +3488,7 @@ void InnerLoopVectorizer::createInductionResumeValues(
CRD =
B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
EndValueFromAdditionalBypass =
- emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
+ emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
EndValueFromAdditionalBypass->setName("ind.end");
}
}
@@ -3623,7 +3562,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
if (MDNode *LID = OrigLoop->getLoopID())
L->setLoopID(LID);
- LoopVectorizeHints Hints(L, true, *ORE);
+ LoopVectorizeHints Hints(L, true, *ORE, TTI);
Hints.setAlreadyVectorized();
#ifdef EXPENSIVE_CHECKS
@@ -3780,7 +3719,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
II.getStep()->getType())
: B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
CMO->setName("cast.cmo");
- Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
+ Value *Escape =
+ emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody);
Escape->setName("ind.escape");
MissingVals[UI] = Escape;
}
@@ -4573,7 +4513,8 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
}
}
-bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
+bool InnerLoopVectorizer::useOrderedReductions(
+ const RecurrenceDescriptor &RdxDesc) {
return Cost->useOrderedReductions(RdxDesc);
}
@@ -4648,8 +4589,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
Value *Idx = Builder.CreateAdd(
PartStart, ConstantInt::get(PtrInd->getType(), Lane));
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
- Value *SclrGep =
- emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
+ Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(),
+ DL, II, State.CFG.PrevBB);
SclrGep->setName("next.gep");
State.set(PhiR, SclrGep, VPIteration(Part, Lane));
}
@@ -5368,13 +5309,9 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
// Limit MaxScalableVF by the maximum safe dependence distance.
Optional<unsigned> MaxVScale = TTI.getMaxVScale();
- if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
- unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange)
- .getVScaleRangeArgs()
- .second;
- if (VScaleMax > 0)
- MaxVScale = VScaleMax;
- }
+ if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
+ MaxVScale =
+ TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
MaxScalableVF = ElementCount::getScalable(
MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
if (!MaxScalableVF)
@@ -5386,9 +5323,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
return MaxScalableVF;
}
-FixedScalableVFPair
-LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
- ElementCount UserVF) {
+FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
+ unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -5475,12 +5411,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
FixedScalableVFPair Result(ElementCount::getFixed(1),
ElementCount::getScalable(0));
- if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
- WidestType, MaxSafeFixedVF))
+ if (auto MaxVF =
+ getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
+ MaxSafeFixedVF, FoldTailByMasking))
Result.FixedVF = MaxVF;
- if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
- WidestType, MaxSafeScalableVF))
+ if (auto MaxVF =
+ getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
+ MaxSafeScalableVF, FoldTailByMasking))
if (MaxVF.isScalable()) {
Result.ScalableVF = MaxVF;
LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
@@ -5513,7 +5451,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed:
- return computeFeasibleMaxVF(TC, UserVF);
+ return computeFeasibleMaxVF(TC, UserVF, false);
case CM_ScalarEpilogueNotAllowedUsePredicate:
LLVM_FALLTHROUGH;
case CM_ScalarEpilogueNotNeededUsePredicate:
@@ -5551,7 +5489,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n");
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
- return computeFeasibleMaxVF(TC, UserVF);
+ return computeFeasibleMaxVF(TC, UserVF, false);
}
return FixedScalableVFPair::getNone();
}
@@ -5568,7 +5506,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
- FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
+ FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
// Avoid tail folding if the trip count is known to be a multiple of any VF
// we chose.
// FIXME: The condition below pessimises the case for fixed-width vectors,
@@ -5641,7 +5579,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
- const ElementCount &MaxSafeVF) {
+ const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
TypeSize WidestRegister = TTI.getRegisterBitWidth(
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
@@ -5673,14 +5611,17 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
if (ConstTripCount &&
ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
- isPowerOf2_32(ConstTripCount)) {
- // We need to clamp the VF to be the ConstTripCount. There is no point in
- // choosing a higher viable VF as done in the loop below. If
- // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
- // the TC is less than or equal to the known number of lanes.
- LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
- << ConstTripCount << "\n");
- return TripCountEC;
+ (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
+ // If loop trip count (TC) is known at compile time there is no point in
+ // choosing VF greater than TC (as done in the loop below). Select maximum
+ // power of two which doesn't exceed TC.
+ // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
+ // when the TC is less than or equal to the known number of lanes.
+ auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
+ LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
+ "exceeding the constant trip count: "
+ << ClampedConstTripCount << "\n");
+ return ElementCount::getFixed(ClampedConstTripCount);
}
ElementCount MaxVF = MaxVectorElementCount;
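As a minimal standalone sketch of the new clamping behaviour (not the LLVM implementation; powerOf2Floor below stands in for llvm::PowerOf2Floor), a constant trip count of 20 now clamps the fixed VF to 16 whenever the tail is not folded by masking, whereas previously only power-of-two trip counts were clamped:

#include <cassert>
#include <cstdint>

// Stand-in for llvm::PowerOf2Floor: largest power of two not exceeding X.
static uint64_t powerOf2Floor(uint64_t X) {
  uint64_t R = 1;
  while (R * 2 <= X)
    R *= 2;
  return X ? R : 0;
}

int main() {
  // Old behaviour: a trip count of 20 was not clamped (not a power of two).
  // New behaviour: the fixed VF is clamped to powerOf2Floor(20) == 16 as long
  // as the tail is not folded by masking.
  assert(powerOf2Floor(20) == 16);
  assert(powerOf2Floor(16) == 16);
  return 0;
}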
@@ -5758,12 +5699,11 @@ bool LoopVectorizationCostModel::isMoreProfitable(
EstimatedWidthB *= VScale.getValue();
}
- // When set to preferred, for now assume vscale may be larger than 1 (or the
- // one being tuned for), so that scalable vectorization is slightly favorable
- // over fixed-width vectorization.
- if (Hints->isScalableVectorizationPreferred())
- if (A.Width.isScalable() && !B.Width.isScalable())
- return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
+ // Assume vscale may be larger than 1 (or the value being tuned for),
+ // so that scalable vectorization is slightly favorable over fixed-width
+ // vectorization.
+ if (A.Width.isScalable() && !B.Width.isScalable())
+ return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
// To avoid the need for FP division:
// (CostA / A.Width) < (CostB / B.Width)
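A minimal sketch of the cross-multiplication trick referenced in the comment above, with made-up integer costs and widths rather than the cost model's types:

#include <cassert>
#include <cstdint>

// (CostA / WidthA) < (CostB / WidthB)  <=>  CostA * WidthB < CostB * WidthA,
// so per-lane costs can be compared without floating-point division.
static bool cheaperPerLane(uint64_t CostA, uint64_t WidthA, uint64_t CostB,
                           uint64_t WidthB) {
  return CostA * WidthB < CostB * WidthA;
}

int main() {
  // Cost 20 over 8 lanes (2.5 per lane) beats cost 12 over 4 lanes (3 per lane).
  assert(cheaperPerLane(20, 8, 12, 4));
  assert(!cheaperPerLane(12, 4, 20, 8));
  return 0;
}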
@@ -6068,7 +6008,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
if (auto *PN = dyn_cast<PHINode>(&I)) {
if (!Legal->isReductionVariable(PN))
continue;
- const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN];
+ const RecurrenceDescriptor &RdxDesc =
+ Legal->getReductionVars().find(PN)->second;
if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
TTI.preferInLoopReduction(RdxDesc.getOpcode(),
RdxDesc.getRecurrenceType(),
@@ -7002,7 +6943,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
const RecurrenceDescriptor &RdxDesc =
- Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
+ Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
InstructionCost BaseCost = TTI.getArithmeticReductionCost(
RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
@@ -7079,22 +7020,41 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
if (match(Op0, m_ZExtOrSExt(m_Value())) &&
Op0->getOpcode() == Op1->getOpcode() &&
- Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
!TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
bool IsUnsigned = isa<ZExtInst>(Op0);
- auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
- // Matched reduce(mul(ext, ext))
- InstructionCost ExtCost =
- TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
- TTI::CastContextHint::None, CostKind, Op0);
+ Type *Op0Ty = Op0->getOperand(0)->getType();
+ Type *Op1Ty = Op1->getOperand(0)->getType();
+ Type *LargestOpTy =
+ Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
+ : Op0Ty;
+ auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
+
+ // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
+      // different sizes. We take the largest type as the ext to reduce, and add
+      // the cost of the remaining extend, e.g. reduce(mul(ext(ext(A)), ext(B))).
+ InstructionCost ExtCost0 = TTI.getCastInstrCost(
+ Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
+ TTI::CastContextHint::None, CostKind, Op0);
+ InstructionCost ExtCost1 = TTI.getCastInstrCost(
+ Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
+ TTI::CastContextHint::None, CostKind, Op1);
InstructionCost MulCost =
TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
InstructionCost RedCost = TTI.getExtendedAddReductionCost(
/*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
CostKind);
+ InstructionCost ExtraExtCost = 0;
+ if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
+ Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
+ ExtraExtCost = TTI.getCastInstrCost(
+ ExtraExtOp->getOpcode(), ExtType,
+ VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
+ TTI::CastContextHint::None, CostKind, ExtraExtOp);
+ }
- if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
+ if (RedCost.isValid() &&
+ (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
return I == RetI ? RedCost : 0;
} else if (!match(I, m_ZExtOrSExt(m_Value()))) {
// Matched reduce(mul())
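The decision above can be pictured with hypothetical unit costs; the numbers below are purely illustrative, not values TTI would return:

#include <cassert>

int main() {
  // Naive lowering: two extends, a vector multiply, and the plain reduction.
  int ExtCost0 = 1, ExtCost1 = 1, MulCost = 1, BaseCost = 2;
  // Fused lowering: one extended-MLA reduction plus the single extra extend
  // needed to widen the narrower operand up to the larger source type.
  int RedCost = 2, ExtraExtCost = 1;
  bool UseFused =
      (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost);
  assert(UseFused); // 3 < 5, so the pattern is costed as the fused reduction.
  return 0;
}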
@@ -7570,8 +7530,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Type *CondTy = SI->getCondition()->getType();
if (!ScalarCond)
CondTy = VectorType::get(CondTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
+
+ CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
+ Pred = Cmp->getPredicate();
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
+ CostKind, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
@@ -7581,7 +7545,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
VectorTy = ToVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
- CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
+ cast<CmpInst>(I)->getPredicate(), CostKind,
+ I);
}
case Instruction::Store:
case Instruction::Load: {
@@ -7762,14 +7727,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore type-promoting instructions we identified during reduction
// detection.
for (auto &Reduction : Legal->getReductionVars()) {
- RecurrenceDescriptor &RedDes = Reduction.second;
+ const RecurrenceDescriptor &RedDes = Reduction.second;
const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
// Ignore type-casting instructions we identified during induction
// detection.
for (auto &Induction : Legal->getInductionVars()) {
- InductionDescriptor &IndDes = Induction.second;
+ const InductionDescriptor &IndDes = Induction.second;
const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
@@ -7778,7 +7743,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
void LoopVectorizationCostModel::collectInLoopReductions() {
for (auto &Reduction : Legal->getReductionVars()) {
PHINode *Phi = Reduction.first;
- RecurrenceDescriptor &RdxDesc = Reduction.second;
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
// We don't collect reductions that are type promoted (yet).
if (RdxDesc.getRecurrenceType() != Phi->getType())
@@ -8064,18 +8029,6 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
return U == Ind || DeadInstructions.count(cast<Instruction>(U));
}))
DeadInstructions.insert(IndUpdate);
-
- // We record as "Dead" also the type-casting instructions we had identified
- // during induction analysis. We don't need any handling for them in the
- // vectorized loop because we have proven that, under a proper runtime
- // test guarding the vectorized loop, the value of the phi, and the casted
- // value of the phi, are the same. The last instruction in this casting chain
- // will get its scalar/vector/widened def from the scalar/vector/widened def
- // of the respective phi node. Any other casts in the induction def-use chain
- // have no other uses outside the phi update chain, and will be ignored.
- InductionDescriptor &IndDes = Induction.second;
- const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
- DeadInstructions.insert(Casts.begin(), Casts.end());
}
}
@@ -8461,7 +8414,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
assert(EdgeMask && "No Edge Mask found for condition");
if (BI->getSuccessor(0) != Dst)
- EdgeMask = Builder.createNot(EdgeMask);
+ EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
// The condition is 'SrcMask && EdgeMask', which is equivalent to
@@ -8470,7 +8423,8 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
// EdgeMask is poison. Using 'and' here introduces undefined behavior.
VPValue *False = Plan->getOrAddVPValue(
ConstantInt::getFalse(BI->getCondition()->getType()));
- EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
+ EdgeMask =
+ Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
}
return EdgeMaskCache[Edge] = EdgeMask;
@@ -8492,22 +8446,24 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
if (!CM.blockNeedsPredicationForAnyReason(BB))
return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
- // Create the block in mask as the first non-phi instruction in the block.
- VPBuilder::InsertPointGuard Guard(Builder);
- auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
- Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
-
// Introduce the early-exit compare IV <= BTC to form header block mask.
// This is used instead of IV < TC because TC may wrap, unlike BTC.
- // Start by constructing the desired canonical IV.
+ // Start by constructing the desired canonical IV in the header block.
VPValue *IV = nullptr;
if (Legal->getPrimaryInduction())
IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
else {
+ VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
auto *IVRecipe = new VPWidenCanonicalIVRecipe();
- Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
+ HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi());
IV = IVRecipe;
}
+
+ // Create the block in mask as the first non-phi instruction in the block.
+ VPBuilder::InsertPointGuard Guard(Builder);
+ auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
+ Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
+
VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
bool TailFolded = !CM.isScalarEpilogueAllowed();
@@ -8534,7 +8490,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
continue;
}
- BlockMask = Builder.createOr(BlockMask, EdgeMask);
+ BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
}
return BlockMaskCache[BB] = BlockMask;
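The reason the header mask compares IV <= BTC rather than IV < TC (noted in the comment above) can be seen with narrow integer arithmetic; this is a standalone 8-bit illustration, not VPlan code:

#include <cassert>
#include <cstdint>

int main() {
  uint8_t TC = 0;    // 256 iterations wrap to 0 in an 8-bit type.
  uint8_t BTC = 255; // The backedge-taken count (TC - 1) still fits.
  uint8_t IV = 42;
  assert(!(IV < TC)); // Wrong test: the mask would disable every lane.
  assert(IV <= BTC);  // Right test: lanes 0..255 all stay active.
  (void)TC; (void)BTC; (void)IV;
  return 0;
}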
@@ -8591,14 +8547,10 @@ VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
ArrayRef<VPValue *> Operands) const {
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
- InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
- if (II.getKind() == InductionDescriptor::IK_IntInduction ||
- II.getKind() == InductionDescriptor::IK_FpInduction) {
- assert(II.getStartValue() ==
+ if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) {
+ assert(II->getStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
- const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
- return new VPWidenIntOrFpInductionRecipe(
- Phi, Operands[0], Casts.empty() ? nullptr : Casts.front());
+ return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II);
}
return nullptr;
@@ -8624,11 +8576,10 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
if (LoopVectorizationPlanner::getDecisionAndClampRange(
isOptimizableIVTruncate(I), Range)) {
- InductionDescriptor II =
- Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
+ auto *Phi = cast<PHINode>(I->getOperand(0));
+ const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
- return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
- Start, nullptr, I);
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I);
}
return nullptr;
}
@@ -8844,13 +8795,17 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
return VPBB;
}
LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
- assert(VPBB->getSuccessors().empty() &&
- "VPBB has successors when handling predicated replication.");
+
+ VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
+ assert(SingleSucc && "VPBB must have a single successor when handling "
+ "predicated replication.");
+ VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
// Record predicated instructions for above packing optimizations.
VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
VPBlockUtils::insertBlockAfter(Region, VPBB);
auto *RegSucc = new VPBasicBlock();
VPBlockUtils::insertBlockAfter(RegSucc, Region);
+ VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
return RegSucc;
}
@@ -8910,7 +8865,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
VPValue *StartV = Operands[0];
if (Legal->isReductionVariable(Phi)) {
- RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
+ const RecurrenceDescriptor &RdxDesc =
+ Legal->getReductionVars().find(Phi)->second;
assert(RdxDesc.getRecurrenceStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
@@ -9031,7 +8987,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
for (auto &Reduction : CM.getInLoopReductionChains()) {
PHINode *Phi = Reduction.first;
- RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
+ RecurKind Kind =
+ Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
RecipeBuilder.recordRecipeOf(Phi);
@@ -9069,30 +9026,25 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// visit each basic block after having visited its predecessor basic blocks.
// ---------------------------------------------------------------------------
- auto Plan = std::make_unique<VPlan>();
+ // Create initial VPlan skeleton, with separate header and latch blocks.
+ VPBasicBlock *HeaderVPBB = new VPBasicBlock();
+ VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
+ VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
+ auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
+ auto Plan = std::make_unique<VPlan>(TopRegion);
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);
- VPBasicBlock *VPBB = nullptr;
- VPBasicBlock *HeaderVPBB = nullptr;
+ VPBasicBlock *VPBB = HeaderVPBB;
SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
unsigned VPBBsForBB = 0;
- auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
- if (VPBB)
- VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
- else {
- auto *TopRegion = new VPRegionBlock("vector loop");
- TopRegion->setEntry(FirstVPBBForBB);
- Plan->setEntry(TopRegion);
- HeaderVPBB = FirstVPBBForBB;
- }
- VPBB = FirstVPBBForBB;
+ VPBB->setName(BB->getName());
Builder.setInsertPoint(VPBB);
// Introduce each ingredient into VPlan.
@@ -9159,13 +9111,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
: "");
}
}
+
+ VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
+ VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
}
+ // Fold the last, empty block into its predecessor.
+ VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
+ assert(VPBB && "expected to fold last (empty) block");
+ // After here, VPBB should not be used.
+ VPBB = nullptr;
+
assert(isa<VPRegionBlock>(Plan->getEntry()) &&
!Plan->getEntry()->getEntryBasicBlock()->empty() &&
"entry block must be set to a VPRegionBlock having a non-empty entry "
"VPBasicBlock");
- cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB);
RecipeBuilder.fixHeaderPhis();
// ---------------------------------------------------------------------------
@@ -9231,18 +9191,19 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
- if (VPBB == SplitPred)
- VPBB = SplitBlock;
}
}
+ VPlanTransforms::removeRedundantInductionCasts(*Plan);
+
// Now that sink-after is done, move induction recipes for optimized truncates
// to the phi section of the header block.
for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
// Adjust the recipes for any inloop reductions.
- adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start);
+ adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan,
+ RecipeBuilder, Range.Start);
// Introduce a recipe to combine the incoming and previous values of a
// first-order recurrence.
@@ -9322,6 +9283,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
RSO.flush();
Plan->setName(PlanName);
+ // Fold Exit block into its predecessor if possible.
+ // TODO: Fold block earlier once all VPlan transforms properly maintain a
+ // VPBasicBlock as exit.
+ VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
+
assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
}
@@ -9355,9 +9321,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
}
SmallPtrSet<Instruction *, 1> DeadInstructions;
- VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
- Legal->getInductionVars(),
- DeadInstructions, *PSE.getSE());
+ VPlanTransforms::VPInstructionsToVPRecipes(
+ OrigLoop, Plan,
+ [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
+ DeadInstructions, *PSE.getSE());
return Plan;
}
@@ -9371,7 +9338,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
ElementCount MinVF) {
for (auto &Reduction : CM.getInLoopReductionChains()) {
PHINode *Phi = Reduction.first;
- RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
+ const RecurrenceDescriptor &RdxDesc =
+ Legal->getReductionVars().find(Phi)->second;
const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
@@ -9565,7 +9533,7 @@ void VPWidenRecipe::execute(VPTransformState &State) {
// exact, etc.). The control flow has been linearized and the
// instruction is no longer guarded by the predicate, which could make
// the flag properties to no longer hold.
- if (State.MayGeneratePoisonRecipes.count(this) > 0)
+ if (State.MayGeneratePoisonRecipes.contains(this))
VecOp->dropPoisonGeneratingFlags();
}
@@ -9714,9 +9682,9 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
- State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
- getTruncInst(), getVPValue(0),
- getCastValue(), State);
+ State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
+ getStartValue()->getLiveInIRValue(),
+ getTruncInst(), getVPValue(0), State);
}
void VPWidenPHIRecipe::execute(VPTransformState &State) {
@@ -10293,7 +10261,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
<< L->getHeader()->getParent()->getName() << "\" from "
<< DebugLocStr << "\n");
- LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
+ LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
LLVM_DEBUG(
dbgs() << "LV: Loop hints:"
@@ -10747,8 +10715,17 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
PA.preserve<LoopAnalysis>();
PA.preserve<DominatorTreeAnalysis>();
}
- if (!Result.MadeCFGChange)
+
+ if (Result.MadeCFGChange) {
+ // Making CFG changes likely means a loop got vectorized. Indicate that
+ // extra simplification passes should be run.
+    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
+ // be run if runtime checks have been added.
+ AM.getResult<ShouldRunExtraVectorPasses>(F);
+ PA.preserve<ShouldRunExtraVectorPasses>();
+ } else {
PA.preserveSet<CFGAnalyses>();
+ }
return PA;
}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 95061e9053fa..37ae13666f7a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -631,27 +631,26 @@ static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
/// after: 6 3 5 4 7 2 1 0
static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {
const unsigned Sz = Order.size();
- SmallBitVector UsedIndices(Sz);
- SmallVector<int> MaskedIndices;
+ SmallBitVector UnusedIndices(Sz, /*t=*/true);
+ SmallBitVector MaskedIndices(Sz);
for (unsigned I = 0; I < Sz; ++I) {
if (Order[I] < Sz)
- UsedIndices.set(Order[I]);
+ UnusedIndices.reset(Order[I]);
else
- MaskedIndices.push_back(I);
+ MaskedIndices.set(I);
}
- if (MaskedIndices.empty())
+ if (MaskedIndices.none())
return;
- SmallVector<int> AvailableIndices(MaskedIndices.size());
- unsigned Cnt = 0;
- int Idx = UsedIndices.find_first();
- do {
- AvailableIndices[Cnt] = Idx;
- Idx = UsedIndices.find_next(Idx);
- ++Cnt;
- } while (Idx > 0);
- assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices.");
- for (int I = 0, E = MaskedIndices.size(); I < E; ++I)
- Order[MaskedIndices[I]] = AvailableIndices[I];
+ assert(UnusedIndices.count() == MaskedIndices.count() &&
+ "Non-synced masked/available indices.");
+ int Idx = UnusedIndices.find_first();
+ int MIdx = MaskedIndices.find_first();
+ while (MIdx >= 0) {
+ assert(Idx >= 0 && "Indices must be synced.");
+ Order[MIdx] = Idx;
+ Idx = UnusedIndices.find_next(Idx);
+ MIdx = MaskedIndices.find_next(MIdx);
+ }
}
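A standalone restatement of the contract the rewritten fixupOrderingIndices keeps (the SmallBitVector version above is just a faster way to do the same assignment); the helper and the example values below are illustrative:

#include <cassert>
#include <vector>

// Entries >= Size are placeholders; they receive the still-unused indices in
// ascending order, so the result is a permutation of 0..Size-1.
static void fixupOrdering(std::vector<unsigned> &Order) {
  const unsigned Sz = Order.size();
  std::vector<bool> Used(Sz, false);
  for (unsigned V : Order)
    if (V < Sz)
      Used[V] = true;
  unsigned NextUnused = 0;
  for (unsigned &V : Order) {
    if (V < Sz)
      continue;
    while (Used[NextUnused])
      ++NextUnused;
    V = NextUnused;
    Used[NextUnused] = true;
  }
}

int main() {
  // Indices 1 and 3 are unused, so the two placeholder entries (5) become
  // 1 and 3 in ascending order: {2, 5, 0, 5} -> {2, 1, 0, 3}.
  std::vector<unsigned> Order = {2, 5, 0, 5};
  fixupOrdering(Order);
  assert((Order == std::vector<unsigned>{2, 1, 0, 3}));
  return 0;
}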
namespace llvm {
@@ -812,6 +811,13 @@ public:
/// ExtractElement, ExtractValue), which can be part of the graph.
Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
+  /// Gets reordering data for the given tree entry. If the entry is
+  /// vectorized, just return ReorderIndices; otherwise check if the scalars
+  /// can be reordered and return the most optimal order.
+ /// \param TopToBottom If true, include the order of vectorized stores and
+ /// insertelement nodes, otherwise skip them.
+ Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
+
/// Reorders the current graph to the most profitable order starting from the
/// root node to the leaf nodes. The best order is chosen only from the nodes
/// of the same size (vectorization factor). Smaller nodes are considered
@@ -1010,18 +1016,25 @@ public:
std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
}
- // The hard-coded scores listed here are not very important. When computing
- // the scores of matching one sub-tree with another, we are basically
- // counting the number of values that are matching. So even if all scores
- // are set to 1, we would still get a decent matching result.
+    // The hard-coded scores listed here are not very important, though they
+    // should be higher for better matches to improve the resulting cost. When
+ // computing the scores of matching one sub-tree with another, we are
+ // basically counting the number of values that are matching. So even if all
+ // scores are set to 1, we would still get a decent matching result.
// However, sometimes we have to break ties. For example we may have to
// choose between matching loads vs matching opcodes. This is what these
- // scores are helping us with: they provide the order of preference.
+ // scores are helping us with: they provide the order of preference. Also,
+ // this is important if the scalar is externally used or used in another
+    // tree entry node in a different lane.
/// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
- static const int ScoreConsecutiveLoads = 3;
+ static const int ScoreConsecutiveLoads = 4;
+ /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
+ static const int ScoreReversedLoads = 3;
/// ExtractElementInst from same vector and consecutive indexes.
- static const int ScoreConsecutiveExtracts = 3;
+ static const int ScoreConsecutiveExtracts = 4;
+ /// ExtractElementInst from same vector and reversed indices.
+ static const int ScoreReversedExtracts = 3;
/// Constants.
static const int ScoreConstants = 2;
/// Instructions with the same opcode.
@@ -1041,7 +1054,10 @@ public:
/// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
- ScalarEvolution &SE) {
+ ScalarEvolution &SE, int NumLanes) {
+ if (V1 == V2)
+ return VLOperands::ScoreSplat;
+
auto *LI1 = dyn_cast<LoadInst>(V1);
auto *LI2 = dyn_cast<LoadInst>(V2);
if (LI1 && LI2) {
@@ -1051,8 +1067,17 @@ public:
Optional<int> Dist = getPointersDiff(
LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
- return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads
- : VLOperands::ScoreFail;
+ if (!Dist)
+ return VLOperands::ScoreFail;
+ // The distance is too large - still may be profitable to use masked
+ // loads/gathers.
+ if (std::abs(*Dist) > NumLanes / 2)
+ return VLOperands::ScoreAltOpcodes;
+      // This will still detect consecutive loads, but we might have "holes"
+ // in some cases. It is ok for non-power-2 vectorization and may produce
+ // better results. It should not affect current vectorization.
+ return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads
+ : VLOperands::ScoreReversedLoads;
}
auto *C1 = dyn_cast<Constant>(V1);
@@ -1062,18 +1087,41 @@ public:
// Extracts from consecutive indexes of the same vector better score as
// the extracts could be optimized away.
- Value *EV;
- ConstantInt *Ex1Idx, *Ex2Idx;
- if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
- match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
- Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
- return VLOperands::ScoreConsecutiveExtracts;
+ Value *EV1;
+ ConstantInt *Ex1Idx;
+ if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
+ // Undefs are always profitable for extractelements.
+ if (isa<UndefValue>(V2))
+ return VLOperands::ScoreConsecutiveExtracts;
+ Value *EV2 = nullptr;
+ ConstantInt *Ex2Idx = nullptr;
+ if (match(V2,
+ m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
+ m_Undef())))) {
+ // Undefs are always profitable for extractelements.
+ if (!Ex2Idx)
+ return VLOperands::ScoreConsecutiveExtracts;
+ if (isUndefVector(EV2) && EV2->getType() == EV1->getType())
+ return VLOperands::ScoreConsecutiveExtracts;
+ if (EV2 == EV1) {
+ int Idx1 = Ex1Idx->getZExtValue();
+ int Idx2 = Ex2Idx->getZExtValue();
+ int Dist = Idx2 - Idx1;
+ // The distance is too large - still may be profitable to use
+ // shuffles.
+ if (std::abs(Dist) > NumLanes / 2)
+ return VLOperands::ScoreAltOpcodes;
+ return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts
+ : VLOperands::ScoreReversedExtracts;
+ }
+ }
+ }
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
if (I1 && I2) {
- if (I1 == I2)
- return VLOperands::ScoreSplat;
+ if (I1->getParent() != I2->getParent())
+ return VLOperands::ScoreFail;
InstructionsState S = getSameOpcode({I1, I2});
// Note: Only consider instructions with <= 2 operands to avoid
// complexity explosion.
@@ -1088,11 +1136,13 @@ public:
return VLOperands::ScoreFail;
}
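A compact restatement of the new distance-based scoring for a pair of loads (the same idea is applied to extractelement indices); ConsecutiveLoads and ReversedLoads mirror the constants added above, while the other values here are placeholders rather than the real VLOperands scores:

#include <cassert>
#include <cstdlib>

enum Score { Fail = 0, AltOpcodes = 1, ReversedLoads = 3, ConsecutiveLoads = 4 };

// Score a pair of loads by the distance between their addresses, in elements.
static int scoreLoadPair(int Dist, int NumLanes) {
  if (std::abs(Dist) > NumLanes / 2)
    return AltOpcodes; // far apart: maybe still a masked load / gather
  return Dist > 0 ? ConsecutiveLoads : ReversedLoads;
}

int main() {
  // With 4 lanes: A[i] then A[i+1] is consecutive, A[i+1] then A[i] is
  // reversed, and a distance of 3 elements counts as "far".
  assert(scoreLoadPair(+1, 4) == ConsecutiveLoads);
  assert(scoreLoadPair(-1, 4) == ReversedLoads);
  assert(scoreLoadPair(+3, 4) == AltOpcodes);
  return 0;
}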
- /// Holds the values and their lane that are taking part in the look-ahead
+ /// Holds the values and their lanes that are taking part in the look-ahead
/// score calculation. This is used in the external uses cost calculation.
- SmallDenseMap<Value *, int> InLookAheadValues;
+    /// We need to hold all the lanes, at least for splats/broadcasts, to
+    /// correctly check for a use in a different lane.
+ SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues;
- /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are
+ /// \returns the additional cost due to uses of \p LHS and \p RHS that are
/// either external to the vectorized code, or require shuffling.
int getExternalUsesCost(const std::pair<Value *, int> &LHS,
const std::pair<Value *, int> &RHS) {
@@ -1116,22 +1166,30 @@ public:
for (User *U : V->users()) {
if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
// The user is in the VectorizableTree. Check if we need to insert.
- auto It = llvm::find(UserTE->Scalars, U);
- assert(It != UserTE->Scalars.end() && "U is in UserTE");
- int UserLn = std::distance(UserTE->Scalars.begin(), It);
+ int UserLn = UserTE->findLaneForValue(U);
assert(UserLn >= 0 && "Bad lane");
- if (UserLn != Ln)
+          // If the values are different, check just the lane of the current
+          // value. If the values are the same, we need to add UserInDiffLaneCost
+          // only if UserLn matches neither of the two lane numbers.
+ if ((LHS.first != RHS.first && UserLn != Ln) ||
+ (LHS.first == RHS.first && UserLn != LHS.second &&
+ UserLn != RHS.second)) {
Cost += UserInDiffLaneCost;
+ break;
+ }
} else {
// Check if the user is in the look-ahead code.
auto It2 = InLookAheadValues.find(U);
if (It2 != InLookAheadValues.end()) {
// The user is in the look-ahead code. Check the lane.
- if (It2->second != Ln)
+ if (!It2->getSecond().contains(Ln)) {
Cost += UserInDiffLaneCost;
+ break;
+ }
} else {
// The user is neither in SLP tree nor in the look-ahead code.
Cost += ExternalUseCost;
+ break;
}
}
// Limit the number of visited uses to cap compilation time.
@@ -1170,32 +1228,36 @@ public:
Value *V1 = LHS.first;
Value *V2 = RHS.first;
// Get the shallow score of V1 and V2.
- int ShallowScoreAtThisLevel =
- std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
- getExternalUsesCost(LHS, RHS));
+ int ShallowScoreAtThisLevel = std::max(
+ (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) -
+ getExternalUsesCost(LHS, RHS));
int Lane1 = LHS.second;
int Lane2 = RHS.second;
// If reached MaxLevel,
// or if V1 and V2 are not instructions,
// or if they are SPLAT,
- // or if they are not consecutive, early return the current cost.
+ // or if they are not consecutive,
+ // or if profitable to vectorize loads or extractelements, early return
+ // the current cost.
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
- (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
+ (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
+ (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
+ ShallowScoreAtThisLevel))
return ShallowScoreAtThisLevel;
assert(I1 && I2 && "Should have early exited.");
// Keep track of in-tree values for determining the external-use cost.
- InLookAheadValues[V1] = Lane1;
- InLookAheadValues[V2] = Lane2;
+ InLookAheadValues[V1].insert(Lane1);
+ InLookAheadValues[V2].insert(Lane2);
// Contains the I2 operand indexes that got matched with I1 operands.
SmallSet<unsigned, 4> Op2Used;
- // Recursion towards the operands of I1 and I2. We are trying all possbile
+ // Recursion towards the operands of I1 and I2. We are trying all possible
// operand pairs, and keeping track of the best score.
for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
OpIdx1 != NumOperands1; ++OpIdx1) {
@@ -1319,27 +1381,79 @@ public:
return None;
}
- /// Helper for reorderOperandVecs. \Returns the lane that we should start
- /// reordering from. This is the one which has the least number of operands
- /// that can freely move about.
+ /// Helper for reorderOperandVecs.
+ /// \returns the lane that we should start reordering from. This is the one
+    /// which has the least number of operands that can freely move about, or
+    /// that is less profitable because it already has the most optimal set of
+    /// operands.
unsigned getBestLaneToStartReordering() const {
- unsigned BestLane = 0;
unsigned Min = UINT_MAX;
- for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
- ++Lane) {
- unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
- if (NumFreeOps < Min) {
- Min = NumFreeOps;
- BestLane = Lane;
+ unsigned SameOpNumber = 0;
+ // std::pair<unsigned, unsigned> is used to implement a simple voting
+      // algorithm and choose the lane with the least number of operands that
+      // can freely move about, or that is less profitable because it already
+      // has the most optimal set of operands. The first unsigned is a counter
+      // for voting, the second unsigned is the counter of lanes with
+      // instructions with same/alternate opcodes and same parent basic block.
+ MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
+ // Try to be closer to the original results, if we have multiple lanes
+ // with same cost. If 2 lanes have the same cost, use the one with the
+ // lowest index.
+ for (int I = getNumLanes(); I > 0; --I) {
+ unsigned Lane = I - 1;
+ OperandsOrderData NumFreeOpsHash =
+ getMaxNumOperandsThatCanBeReordered(Lane);
+ // Compare the number of operands that can move and choose the one with
+ // the least number.
+ if (NumFreeOpsHash.NumOfAPOs < Min) {
+ Min = NumFreeOpsHash.NumOfAPOs;
+ SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
+ HashMap.clear();
+ HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
+ } else if (NumFreeOpsHash.NumOfAPOs == Min &&
+ NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
+ // Select the most optimal lane in terms of number of operands that
+ // should be moved around.
+ SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
+ HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
+ } else if (NumFreeOpsHash.NumOfAPOs == Min &&
+ NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
+ ++HashMap[NumFreeOpsHash.Hash].first;
+ }
+ }
+ // Select the lane with the minimum counter.
+ unsigned BestLane = 0;
+ unsigned CntMin = UINT_MAX;
+ for (const auto &Data : reverse(HashMap)) {
+ if (Data.second.first < CntMin) {
+ CntMin = Data.second.first;
+ BestLane = Data.second.second;
}
}
return BestLane;
}
- /// \Returns the maximum number of operands that are allowed to be reordered
- /// for \p Lane. This is used as a heuristic for selecting the first lane to
- /// start operand reordering.
- unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
+ /// Data structure that helps to reorder operands.
+ struct OperandsOrderData {
+ /// The best number of operands with the same APOs, which can be
+ /// reordered.
+ unsigned NumOfAPOs = UINT_MAX;
+ /// Number of operands with the same/alternate instruction opcode and
+ /// parent.
+ unsigned NumOpsWithSameOpcodeParent = 0;
+ /// Hash for the actual operands ordering.
+ /// Used to count operands, actually their position id and opcode
+ /// value. It is used in the voting mechanism to find the lane with the
+    /// least number of operands that can freely move about, or that is less
+    /// profitable because it already has the most optimal set of operands. It
+    /// could be replaced with a SmallVector<unsigned>, but the hash code is
+    /// faster and requires less memory.
+ unsigned Hash = 0;
+ };
+ /// \returns the maximum number of operands that are allowed to be reordered
+  /// for \p Lane and the number of compatible instructions (with the same
+ /// parent/opcode). This is used as a heuristic for selecting the first lane
+ /// to start operand reordering.
+ OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
unsigned CntTrue = 0;
unsigned NumOperands = getNumOperands();
// Operands with the same APO can be reordered. We therefore need to count
@@ -1348,11 +1462,45 @@ public:
// a map. Instead we can simply count the number of operands that
// correspond to one of them (in this case the 'true' APO), and calculate
// the other by subtracting it from the total number of operands.
- for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
- if (getData(OpIdx, Lane).APO)
+ // Operands with the same instruction opcode and parent are more
+ // profitable since we don't need to move them in many cases, with a high
+ // probability such lane already can be vectorized effectively.
+ bool AllUndefs = true;
+ unsigned NumOpsWithSameOpcodeParent = 0;
+ Instruction *OpcodeI = nullptr;
+ BasicBlock *Parent = nullptr;
+ unsigned Hash = 0;
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ const OperandData &OpData = getData(OpIdx, Lane);
+ if (OpData.APO)
++CntTrue;
- unsigned CntFalse = NumOperands - CntTrue;
- return std::max(CntTrue, CntFalse);
+ // Use Boyer-Moore majority voting for finding the majority opcode and
+ // the number of times it occurs.
+ if (auto *I = dyn_cast<Instruction>(OpData.V)) {
+ if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() ||
+ I->getParent() != Parent) {
+ if (NumOpsWithSameOpcodeParent == 0) {
+ NumOpsWithSameOpcodeParent = 1;
+ OpcodeI = I;
+ Parent = I->getParent();
+ } else {
+ --NumOpsWithSameOpcodeParent;
+ }
+ } else {
+ ++NumOpsWithSameOpcodeParent;
+ }
+ }
+ Hash = hash_combine(
+ Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
+ AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
+ }
+ if (AllUndefs)
+ return {};
+ OperandsOrderData Data;
+ Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
+ Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
+ Data.Hash = Hash;
+ return Data;
}
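The per-lane counting above is a Boyer-Moore majority vote over (opcode, parent) pairs; a minimal standalone analogue, with strings standing in for those pairs, looks like this:

#include <cassert>
#include <string>
#include <vector>

// Boyer-Moore majority vote: the surviving candidate's net count mirrors what
// NumOpsWithSameOpcodeParent ends up holding in the patch.
static unsigned countLeadingCandidate(const std::vector<std::string> &Ops) {
  unsigned Count = 0;
  std::string Candidate;
  for (const std::string &Op : Ops) {
    if (Count == 0) {
      Candidate = Op;
      Count = 1;
    } else if (Op == Candidate) {
      ++Count;
    } else {
      --Count;
    }
  }
  return Count; // net votes for the surviving candidate
}

int main() {
  // Three adds and one mul: the add "opcode" survives with a net count of 2.
  assert(countLeadingCandidate({"add", "add", "mul", "add"}) == 2);
  return 0;
}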
/// Go through the instructions in VL and append their operands.
@@ -1500,11 +1648,37 @@ public:
ReorderingModes[OpIdx] = ReorderingMode::Failed;
}
+    // Check that we don't have the same operands. There is no need to reorder
+    // if the operands are just a perfect diamond or shuffled diamond match;
+    // the only exceptions (just for now) are possible broadcasts and a
+    // non-power-of-2 number of scalars.
+ auto &&SkipReordering = [this]() {
+ SmallPtrSet<Value *, 4> UniqueValues;
+ ArrayRef<OperandData> Op0 = OpsVec.front();
+ for (const OperandData &Data : Op0)
+ UniqueValues.insert(Data.V);
+ for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
+ if (any_of(Op, [&UniqueValues](const OperandData &Data) {
+ return !UniqueValues.contains(Data.V);
+ }))
+ return false;
+ }
+ // TODO: Check if we can remove a check for non-power-2 number of
+ // scalars after full support of non-power-2 vectorization.
+ return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
+ };
+
// If the initial strategy fails for any of the operand indexes, then we
// perform reordering again in a second pass. This helps avoid assigning
// high priority to the failed strategy, and should improve reordering for
// the non-failed operand indexes.
for (int Pass = 0; Pass != 2; ++Pass) {
+      // Check if there is no need to reorder the operands because they are a
+      // perfect or shuffled diamond match.
+      // We need to do it to avoid extra external-use cost counting for
+      // shuffled matches, which may cause regressions.
+ if (SkipReordering())
+ break;
// Skip the second pass if the first pass did not fail.
bool StrategyFailed = false;
// Mark all operand data as free to use.
@@ -1792,9 +1966,10 @@ private:
if (Operands.size() < OpIdx + 1)
Operands.resize(OpIdx + 1);
assert(Operands[OpIdx].empty() && "Already resized?");
- Operands[OpIdx].resize(Scalars.size());
- for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
- Operands[OpIdx][Lane] = OpVL[Lane];
+ assert(OpVL.size() <= Scalars.size() &&
+ "Number of operands is greater than the number of scalars.");
+ Operands[OpIdx].resize(OpVL.size());
+ copy(OpVL, Operands[OpIdx].begin());
}
/// Set the operands of this bundle in their original order.
@@ -1944,7 +2119,7 @@ private:
if (ReuseShuffleIndices.empty())
dbgs() << "Empty";
else
- for (unsigned ReuseIdx : ReuseShuffleIndices)
+ for (int ReuseIdx : ReuseShuffleIndices)
dbgs() << ReuseIdx << ", ";
dbgs() << "\n";
dbgs() << "ReorderIndices: ";
@@ -2819,6 +2994,50 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
return None;
}
+Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
+ bool TopToBottom) {
+  // No need to reorder if we need to shuffle reuses; the node still has to be
+  // shuffled anyway.
+ if (!TE.ReuseShuffleIndices.empty())
+ return None;
+ if (TE.State == TreeEntry::Vectorize &&
+ (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
+ (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
+ !TE.isAltShuffle())
+ return TE.ReorderIndices;
+ if (TE.State == TreeEntry::NeedToGather) {
+ // TODO: add analysis of other gather nodes with extractelement
+ // instructions and other values/instructions, not only undefs.
+ if (((TE.getOpcode() == Instruction::ExtractElement &&
+ !TE.isAltShuffle()) ||
+ (all_of(TE.Scalars,
+ [](Value *V) {
+ return isa<UndefValue, ExtractElementInst>(V);
+ }) &&
+ any_of(TE.Scalars,
+ [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
+ all_of(TE.Scalars,
+ [](Value *V) {
+ auto *EE = dyn_cast<ExtractElementInst>(V);
+ return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
+ }) &&
+ allSameType(TE.Scalars)) {
+ // Check that gather of extractelements can be represented as
+ // just a shuffle of a single vector.
+ OrdersType CurrentOrder;
+ bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
+ if (Reuse || !CurrentOrder.empty()) {
+ if (!CurrentOrder.empty())
+ fixupOrderingIndices(CurrentOrder);
+ return CurrentOrder;
+ }
+ }
+ if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
+ return CurrentOrder;
+ }
+ return None;
+}
+
void BoUpSLP::reorderTopToBottom() {
// Maps VF to the graph nodes.
DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries;
@@ -2826,42 +3045,15 @@ void BoUpSLP::reorderTopToBottom() {
// their ordering.
DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
// Find all reorderable nodes with the given VF.
- // Currently the are vectorized loads,extracts + some gathering of extracts.
+  // Currently these are vectorized stores, loads, extracts + some gathering
+  // of extracts.
for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders](
const std::unique_ptr<TreeEntry> &TE) {
- // No need to reorder if need to shuffle reuses, still need to shuffle the
- // node.
- if (!TE->ReuseShuffleIndices.empty())
- return;
- if (TE->State == TreeEntry::Vectorize &&
- isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
- InsertElementInst>(TE->getMainOp()) &&
- !TE->isAltShuffle()) {
+ if (Optional<OrdersType> CurrentOrder =
+ getReorderingData(*TE.get(), /*TopToBottom=*/true)) {
VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
- return;
- }
- if (TE->State == TreeEntry::NeedToGather) {
- if (TE->getOpcode() == Instruction::ExtractElement &&
- !TE->isAltShuffle() &&
- isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
- ->getVectorOperandType()) &&
- allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
- // Check that gather of extractelements can be represented as
- // just a shuffle of a single vector.
- OrdersType CurrentOrder;
- bool Reuse =
- canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
- if (Reuse || !CurrentOrder.empty()) {
- VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
- GathersToOrders.try_emplace(TE.get(), CurrentOrder);
- return;
- }
- }
- if (Optional<OrdersType> CurrentOrder =
- findReusedOrderedScalars(*TE.get())) {
- VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+ if (TE->State != TreeEntry::Vectorize)
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
- }
}
});
@@ -2993,44 +3185,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
const std::unique_ptr<TreeEntry> &TE) {
if (TE->State != TreeEntry::Vectorize)
NonVectorized.push_back(TE.get());
- // No need to reorder if need to shuffle reuses, still need to shuffle the
- // node.
- if (!TE->ReuseShuffleIndices.empty())
- return;
- if (TE->State == TreeEntry::Vectorize &&
- isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE->getMainOp()) &&
- !TE->isAltShuffle()) {
+ if (Optional<OrdersType> CurrentOrder =
+ getReorderingData(*TE.get(), /*TopToBottom=*/false)) {
OrderedEntries.insert(TE.get());
- return;
- }
- if (TE->State == TreeEntry::NeedToGather) {
- if (TE->getOpcode() == Instruction::ExtractElement &&
- !TE->isAltShuffle() &&
- isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
- ->getVectorOperandType()) &&
- allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
- // Check that gather of extractelements can be represented as
- // just a shuffle of a single vector with a single user only.
- OrdersType CurrentOrder;
- bool Reuse =
- canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
- if ((Reuse || !CurrentOrder.empty()) &&
- !any_of(VectorizableTree,
- [&TE](const std::unique_ptr<TreeEntry> &Entry) {
- return Entry->State == TreeEntry::NeedToGather &&
- Entry.get() != TE.get() &&
- Entry->isSame(TE->Scalars);
- })) {
- OrderedEntries.insert(TE.get());
- GathersToOrders.try_emplace(TE.get(), CurrentOrder);
- return;
- }
- }
- if (Optional<OrdersType> CurrentOrder =
- findReusedOrderedScalars(*TE.get())) {
- OrderedEntries.insert(TE.get());
+ if (TE->State != TreeEntry::Vectorize)
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
- }
}
});
@@ -3392,9 +3551,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Check that every instruction appears once in this bundle.
DenseMap<Value *, unsigned> UniquePositions;
for (Value *V : VL) {
+ if (isConstant(V)) {
+ ReuseShuffleIndicies.emplace_back(
+ isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size());
+ UniqueValues.emplace_back(V);
+ continue;
+ }
auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
- ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? -1
- : Res.first->second);
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
if (Res.second)
UniqueValues.emplace_back(V);
}
@@ -3404,6 +3568,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
} else {
LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
if (NumUniqueScalarValues <= 1 ||
+ (UniquePositions.size() == 1 && all_of(UniqueValues,
+ [](Value *V) {
+ return isa<UndefValue>(V) ||
+ !isConstant(V);
+ })) ||
!llvm::isPowerOf2_32(NumUniqueScalarValues)) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
@@ -3508,11 +3677,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
}
- // If any of the scalars is marked as a value that needs to stay scalar, then
- // we need to gather the scalars.
// The reduction nodes (stored in UserIgnoreList) also should stay scalar.
for (Value *V : VL) {
- if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
+ if (is_contained(UserIgnoreList, V)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
if (TryToFindDuplicates(S))
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
@@ -4219,10 +4386,17 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
SmallVectorImpl<unsigned> &CurrentOrder) const {
- Instruction *E0 = cast<Instruction>(OpValue);
- assert(E0->getOpcode() == Instruction::ExtractElement ||
- E0->getOpcode() == Instruction::ExtractValue);
- assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
+ const auto *It = find_if(VL, [](Value *V) {
+ return isa<ExtractElementInst, ExtractValueInst>(V);
+ });
+ assert(It != VL.end() && "Expected at least one extract instruction.");
+ auto *E0 = cast<Instruction>(*It);
+ assert(all_of(VL,
+ [](Value *V) {
+ return isa<UndefValue, ExtractElementInst, ExtractValueInst>(
+ V);
+ }) &&
+ "Invalid opcode");
// Check if all of the extracts come from the same vector and from the
// correct offset.
Value *Vec = E0->getOperand(0);
@@ -4255,23 +4429,28 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
// Also, later we can check that all the indices are used and we have a
// consecutive access in the extract instructions, by checking that no
// element of CurrentOrder still has value E + 1.
- CurrentOrder.assign(E, E + 1);
+ CurrentOrder.assign(E, E);
unsigned I = 0;
for (; I < E; ++I) {
- auto *Inst = cast<Instruction>(VL[I]);
+ auto *Inst = dyn_cast<Instruction>(VL[I]);
+ if (!Inst)
+ continue;
if (Inst->getOperand(0) != Vec)
break;
+ if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
+ if (isa<UndefValue>(EE->getIndexOperand()))
+ continue;
Optional<unsigned> Idx = getExtractIndex(Inst);
if (!Idx)
break;
const unsigned ExtIdx = *Idx;
if (ExtIdx != I) {
- if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
+ if (ExtIdx >= E || CurrentOrder[ExtIdx] != E)
break;
ShouldKeepOrder = false;
CurrentOrder[ExtIdx] = I;
} else {
- if (CurrentOrder[I] != E + 1)
+ if (CurrentOrder[I] != E)
break;
CurrentOrder[I] = I;
}
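A simplified, self-contained restatement of how CurrentOrder is now filled (E, the number of scalars, is the sentinel, and undef lanes keep it); the helper, its -1 undef marker and the example indices are made up for illustration and do not mirror the exact control flow:

#include <cassert>
#include <vector>

static bool computeOrder(const std::vector<int> &ExtIdx,
                         std::vector<unsigned> &CurrentOrder) {
  const unsigned E = ExtIdx.size();
  CurrentOrder.assign(E, E); // E is the "unset" sentinel instead of E + 1.
  bool ShouldKeepOrder = true;
  for (unsigned I = 0; I < E; ++I) {
    if (ExtIdx[I] < 0)
      continue; // undef lane: leave the sentinel in place
    unsigned Idx = ExtIdx[I];
    if (Idx != I) {
      if (Idx >= E || CurrentOrder[Idx] != E)
        return false; // out of range or slot already taken
      ShouldKeepOrder = false;
      CurrentOrder[Idx] = I;
    } else {
      if (CurrentOrder[I] != E)
        return false;
      CurrentOrder[I] = I;
    }
  }
  return ShouldKeepOrder;
}

int main() {
  // Lanes extract elements 1, undef, 0, 3 of one source vector: reusable, but
  // only with a permutation, so "keep order" is false and the undef slot stays
  // at the sentinel value 4.
  std::vector<unsigned> Order;
  bool KeepOrder = computeOrder({1, -1, 0, 3}, Order);
  assert(!KeepOrder);
  assert((Order == std::vector<unsigned>{2, 0, 4, 3}));
  (void)KeepOrder;
  return 0;
}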
@@ -4287,8 +4466,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
bool BoUpSLP::areAllUsersVectorized(Instruction *I,
ArrayRef<Value *> VectorizedVals) const {
return (I->hasOneUse() && is_contained(VectorizedVals, I)) ||
- llvm::all_of(I->users(), [this](User *U) {
- return ScalarToTreeEntry.count(U) > 0;
+ all_of(I->users(), [this](User *U) {
+ return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U);
});
}
@@ -4348,6 +4527,10 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
for (auto *V : VL) {
++Idx;
+ // Need to exclude undefs from analysis.
+ if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
+ continue;
+
// Reached the start of a new vector registers.
if (Idx % EltsPerVector == 0) {
AllConsecutive = true;
@@ -4357,9 +4540,11 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
// Check all extracts for a vector register on the target directly
// extract values in order.
unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
- unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
- AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
- CurrentIdx % EltsPerVector == Idx % EltsPerVector;
+ if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) {
+ unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
+ AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
+ CurrentIdx % EltsPerVector == Idx % EltsPerVector;
+ }
if (AllConsecutive)
continue;
@@ -4442,9 +4627,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// FIXME: it tries to fix a problem with MSVC buildbots.
TargetTransformInfo &TTIRef = *TTI;
auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
- VectorizedVals](InstructionCost &Cost,
- bool IsGather) {
+ VectorizedVals, E](InstructionCost &Cost) {
DenseMap<Value *, int> ExtractVectorsTys;
+ SmallPtrSet<Value *, 4> CheckedExtracts;
for (auto *V : VL) {
if (isa<UndefValue>(V))
continue;
@@ -4452,7 +4637,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// instruction itself is not going to be vectorized, consider this
// instruction as dead and remove its cost from the final cost of the
// vectorized tree.
- if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals))
+ // Also, avoid adjusting the cost for extractelements with multiple uses
+ // in different graph entries.
+ const TreeEntry *VE = getTreeEntry(V);
+ if (!CheckedExtracts.insert(V).second ||
+ !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
+ (VE && VE != E))
continue;
auto *EE = cast<ExtractElementInst>(V);
Optional<unsigned> EEIdx = getExtractIndex(EE);
@@ -4549,11 +4739,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
}
return GatherCost;
}
- if (isSplat(VL)) {
- // Found the broadcasting of the single scalar, calculate the cost as the
- // broadcast.
- return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
- }
if ((E->getOpcode() == Instruction::ExtractElement ||
all_of(E->Scalars,
[](Value *V) {
@@ -4571,13 +4756,20 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// single input vector or of 2 input vectors.
InstructionCost Cost =
computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
- AdjustExtractsCost(Cost, /*IsGather=*/true);
+ AdjustExtractsCost(Cost);
if (NeedToShuffleReuses)
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
FinalVecTy, E->ReuseShuffleIndices);
return Cost;
}
}
+ if (isSplat(VL)) {
+ // Found the broadcasting of the single scalar, calculate the cost as the
+ // broadcast.
+ assert(VecTy == FinalVecTy &&
+ "No reused scalars expected for broadcast.");
+ return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
+ }
InstructionCost ReuseShuffleCost = 0;
if (NeedToShuffleReuses)
ReuseShuffleCost = TTI->getShuffleCost(
@@ -4755,7 +4947,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
}
} else {
- AdjustExtractsCost(CommonCost, /*IsGather=*/false);
+ AdjustExtractsCost(CommonCost);
}
return CommonCost;
}
@@ -5211,15 +5403,15 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
FoundOr = true;
}
// Check if the input is an extended load of the required or/shift expression.
- Value *LoadPtr;
+ Value *Load;
if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
- !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+ !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
return false;
// Require that the total load bit width is a legal integer type.
// For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
// But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
- Type *SrcTy = LoadPtr->getType()->getPointerElementType();
+ Type *SrcTy = Load->getType();
unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
return false;
@@ -9061,8 +9253,7 @@ private:
"A call to the llvm.fmuladd intrinsic is not handled yet");
++NumVectorInstructions;
- return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind,
- ReductionOps.back());
+ return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind);
}
};
@@ -9473,6 +9664,59 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
return Changed;
}
+/// Compare two cmp instructions. If IsCompatibility is true, the function
+/// returns true if the two cmps have the same/swapped predicates and the most
+/// compatible corresponding operands. If IsCompatibility is false, the
+/// function implements a strict weak ordering relation between two cmp
+/// instructions, returning true if the first instruction is "less" than the
+/// second, i.e. its predicate is less than the predicate of the second or the
+/// operand IDs are less than the operand IDs of the second cmp instruction.
+template <bool IsCompatibility>
+static bool compareCmp(Value *V, Value *V2,
+ function_ref<bool(Instruction *)> IsDeleted) {
+ auto *CI1 = cast<CmpInst>(V);
+ auto *CI2 = cast<CmpInst>(V2);
+ if (IsDeleted(CI2) || !isValidElementType(CI2->getType()))
+ return false;
+ if (CI1->getOperand(0)->getType()->getTypeID() <
+ CI2->getOperand(0)->getType()->getTypeID())
+ return !IsCompatibility;
+ if (CI1->getOperand(0)->getType()->getTypeID() >
+ CI2->getOperand(0)->getType()->getTypeID())
+ return false;
+ CmpInst::Predicate Pred1 = CI1->getPredicate();
+ CmpInst::Predicate Pred2 = CI2->getPredicate();
+ CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
+ CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
+ CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
+ CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
+ if (BasePred1 < BasePred2)
+ return !IsCompatibility;
+ if (BasePred1 > BasePred2)
+ return false;
+ // Compare operands.
+ bool LEPreds = Pred1 <= Pred2;
+ bool GEPreds = Pred1 >= Pred2;
+ for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
+ auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1);
+ auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1);
+ if (Op1->getValueID() < Op2->getValueID())
+ return !IsCompatibility;
+ if (Op1->getValueID() > Op2->getValueID())
+ return false;
+ if (auto *I1 = dyn_cast<Instruction>(Op1))
+ if (auto *I2 = dyn_cast<Instruction>(Op2)) {
+ if (I1->getParent() != I2->getParent())
+ return false;
+ InstructionsState S = getSameOpcode({I1, I2});
+ if (S.getOpcode())
+ continue;
+ return false;
+ }
+ }
+ return IsCompatibility;
+}
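A toy, self-contained analogue of how the two instantiations are meant to be used together (sort with the ordering flavour, then group with the compatibility flavour); ToyCmp and its fields are invented for illustration and are not llvm::CmpInst data:

#include <algorithm>
#include <cassert>
#include <vector>

struct ToyCmp { int TypeID; int Pred; };

// One helper serves both as a strict weak ordering (for sorting) and as a
// compatibility check (for grouping), selected by the template parameter.
template <bool IsCompatibility>
static bool compareToyCmp(const ToyCmp &A, const ToyCmp &B) {
  if (A.TypeID < B.TypeID)
    return !IsCompatibility;
  if (A.TypeID > B.TypeID)
    return false;
  if (A.Pred < B.Pred)
    return !IsCompatibility;
  if (A.Pred > B.Pred)
    return false;
  return IsCompatibility; // equal keys: not "less", but compatible
}

int main() {
  std::vector<ToyCmp> Cmps = {{2, 1}, {1, 3}, {1, 3}, {1, 2}};
  // Sort with the ordering flavour, then group with the compatibility flavour,
  // mirroring how CompareSorter/AreCompatibleCompares are wired up below.
  std::sort(Cmps.begin(), Cmps.end(), compareToyCmp</*IsCompatibility=*/false>);
  assert(Cmps[0].TypeID == 1 && Cmps[2].TypeID == 1 && Cmps[3].TypeID == 2);
  assert(compareToyCmp</*IsCompatibility=*/true>(Cmps[1], Cmps[2]));
  return 0;
}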
+
bool SLPVectorizerPass::vectorizeSimpleInstructions(
SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R,
bool AtTerminator) {
@@ -9504,37 +9748,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
}
// Try to vectorize list of compares.
// Sort by type, compare predicate, etc.
- // TODO: Add analysis on the operand opcodes (profitable to vectorize
- // instructions with same/alternate opcodes/const values).
auto &&CompareSorter = [&R](Value *V, Value *V2) {
- auto *CI1 = cast<CmpInst>(V);
- auto *CI2 = cast<CmpInst>(V2);
- if (R.isDeleted(CI2) || !isValidElementType(CI2->getType()))
- return false;
- if (CI1->getOperand(0)->getType()->getTypeID() <
- CI2->getOperand(0)->getType()->getTypeID())
- return true;
- if (CI1->getOperand(0)->getType()->getTypeID() >
- CI2->getOperand(0)->getType()->getTypeID())
- return false;
- return CI1->getPredicate() < CI2->getPredicate() ||
- (CI1->getPredicate() > CI2->getPredicate() &&
- CI1->getPredicate() <
- CmpInst::getSwappedPredicate(CI2->getPredicate()));
+ return compareCmp<false>(V, V2,
+ [&R](Instruction *I) { return R.isDeleted(I); });
};
auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) {
if (V1 == V2)
return true;
- auto *CI1 = cast<CmpInst>(V1);
- auto *CI2 = cast<CmpInst>(V2);
- if (R.isDeleted(CI2) || !isValidElementType(CI2->getType()))
- return false;
- if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType())
- return false;
- return CI1->getPredicate() == CI2->getPredicate() ||
- CI1->getPredicate() ==
- CmpInst::getSwappedPredicate(CI2->getPredicate());
+ return compareCmp<true>(V1, V2,
+ [&R](Instruction *I) { return R.isDeleted(I); });
};
auto Limit = [&R](Value *V) {
unsigned EltSize = R.getVectorElementSize(V);
@@ -9592,10 +9815,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
return true;
if (Opcodes1.size() > Opcodes2.size())
return false;
+ Optional<bool> ConstOrder;
for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
// Undefs are compatible with any other value.
- if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+ if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) {
+ if (!ConstOrder)
+ ConstOrder =
+ !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]);
continue;
+ }
if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
@@ -9614,14 +9842,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
return I1->getOpcode() < I2->getOpcode();
}
- if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+ if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) {
+ if (!ConstOrder)
+ ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID();
continue;
+ }
if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
return true;
if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
return false;
}
- return false;
+ return ConstOrder && *ConstOrder;
};
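The ConstOrder change above turns constant/undef mismatches into a deferred tie-breaker: the first such preference is recorded and only consulted when no instruction-based criterion decides the order. A minimal standalone sketch of that pattern, using vectors of ints in place of the operand lists; lessByIds and the negative-value convention are illustrative only.

  #include <cstddef>
  #include <iostream>
  #include <optional>
  #include <vector>

  // Hard criteria decide immediately; the first soft criterion is remembered
  // and only consulted if everything else ties.
  static bool lessByIds(const std::vector<int> &A, const std::vector<int> &B) {
    if (A.size() != B.size())
      return A.size() < B.size();
    std::optional<bool> SoftOrder;
    for (std::size_t I = 0; I < A.size(); ++I) {
      // Negative entries play the role of constants/undefs: they only
      // contribute a soft preference, recorded once.
      if (A[I] < 0 || B[I] < 0) {
        if (!SoftOrder)
          SoftOrder = A[I] < B[I];
        continue;
      }
      if (A[I] != B[I]) // hard criterion: decides right away
        return A[I] < B[I];
    }
    return SoftOrder.value_or(false); // fall back to the recorded preference
  }

  int main() {
    std::cout << lessByIds({1, -5, 3}, {1, -2, 3}) << '\n'; // 1: soft tie-break
    std::cout << lessByIds({1, -5, 4}, {1, -2, 3}) << '\n'; // 0: hard criterion wins
  }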
auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
if (V1 == V2)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 44b5e1df0839..1d9e71663cd2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -374,8 +374,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
assert((SplitAt == end() || SplitAt->getParent() == this) &&
"can only split at a position in the same block");
- SmallVector<VPBlockBase *, 2> Succs(getSuccessors().begin(),
- getSuccessors().end());
+ SmallVector<VPBlockBase *, 2> Succs(successors());
// First, disconnect the current block from its successors.
for (VPBlockBase *Succ : Succs)
VPBlockUtils::disconnectBlocks(this, Succ);
@@ -642,6 +641,7 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
void VPInstruction::generateInstruction(VPTransformState &State,
unsigned Part) {
IRBuilder<> &Builder = State.Builder;
+ Builder.SetCurrentDebugLocation(DL);
if (Instruction::isBinaryOp(getOpcode())) {
Value *A = State.get(getOperand(0), Part);
@@ -768,6 +768,11 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
O << " ";
Operand->printAsOperand(O, SlotTracker);
}
+
+ if (DL) {
+ O << ", !dbg ";
+ DL.print(O);
+ }
}
#endif
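The DebugLoc plumbing above follows a simple pattern: store the location at construction, set it on the builder before generating code, and print a ", !dbg" suffix only when a location is present. A toy, self-contained sketch of the printing half; ToyInstruction and the string-valued DL are stand-ins, not LLVM types.

  #include <iostream>
  #include <string>

  // Stand-in for a recipe carrying an optional debug location.
  struct ToyInstruction {
    unsigned Opcode;
    std::string DL; // empty means "no location"

    void print(std::ostream &OS) const {
      OS << "EMIT op" << Opcode;
      if (!DL.empty())
        OS << ", !dbg " << DL; // only printed when a location is attached
      OS << '\n';
    }
  };

  int main() {
    ToyInstruction WithLoc{1, "!7"}, WithoutLoc{2, ""};
    WithLoc.print(std::cout);    // EMIT op1, !dbg !7
    WithoutLoc.print(std::cout); // EMIT op2
  }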
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 810dd5030f95..f4a1883e35d5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -39,6 +39,7 @@
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/ilist_node.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/InstructionCost.h"
#include <algorithm>
@@ -51,6 +52,7 @@ namespace llvm {
class BasicBlock;
class DominatorTree;
+class InductionDescriptor;
class InnerLoopVectorizer;
class LoopInfo;
class raw_ostream;
@@ -500,6 +502,8 @@ public:
const VPBlocksTy &getSuccessors() const { return Successors; }
VPBlocksTy &getSuccessors() { return Successors; }
+ iterator_range<VPBlockBase **> successors() { return Successors; }
+
const VPBlocksTy &getPredecessors() const { return Predecessors; }
VPBlocksTy &getPredecessors() { return Predecessors; }
@@ -795,6 +799,7 @@ private:
typedef unsigned char OpcodeTy;
OpcodeTy Opcode;
FastMathFlags FMF;
+ DebugLoc DL;
/// Utility method serving execute(): generates a single instance of the
/// modeled instruction.
@@ -804,12 +809,14 @@ protected:
void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
public:
- VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
+ VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL)
: VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands),
- VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {}
+ VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode),
+ DL(DL) {}
- VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
- : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {}
+ VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
+ DebugLoc DL = {})
+ : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL) {}
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPValue *V) {
@@ -818,7 +825,7 @@ public:
VPInstruction *clone() const {
SmallVector<VPValue *, 2> Operands(operands());
- return new VPInstruction(Opcode, Operands);
+ return new VPInstruction(Opcode, Operands, DL);
}
/// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -1003,21 +1010,22 @@ public:
/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their vector and scalar values.
-class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
+class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue {
PHINode *IV;
+ const InductionDescriptor &IndDesc;
public:
- VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, Instruction *Cast,
- TruncInst *Trunc = nullptr)
- : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV) {
- if (Trunc)
- new VPValue(Trunc, this);
- else
- new VPValue(IV, this);
+ VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
+ const InductionDescriptor &IndDesc)
+ : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this),
+ IV(IV), IndDesc(IndDesc) {}
+
+ VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
+ const InductionDescriptor &IndDesc,
+ TruncInst *Trunc)
+ : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this),
+ IV(IV), IndDesc(IndDesc) {}
- if (Cast)
- new VPValue(Cast, this);
- }
~VPWidenIntOrFpInductionRecipe() override = default;
/// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -1038,13 +1046,6 @@ public:
/// Returns the start value of the induction.
VPValue *getStartValue() { return getOperand(0); }
- /// Returns the cast VPValue, if one is attached, or nullptr otherwise.
- VPValue *getCastValue() {
- if (getNumDefinedValues() != 2)
- return nullptr;
- return getVPValue(1);
- }
-
/// Returns the first defined value as a TruncInst, if it is one, or nullptr
/// otherwise.
TruncInst *getTruncInst() {
@@ -1053,6 +1054,9 @@ public:
const TruncInst *getTruncInst() const {
return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue());
}
+
+ /// Returns the induction descriptor for the recipe.
+ const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
};
/// A recipe for handling first order recurrences and pointer inductions. For
@@ -1169,7 +1173,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe {
/// operand.
class VPReductionPHIRecipe : public VPWidenPHIRecipe {
/// Descriptor for the reduction.
- RecurrenceDescriptor &RdxDesc;
+ const RecurrenceDescriptor &RdxDesc;
/// The phi is part of an in-loop reduction.
bool IsInLoop;
@@ -1180,7 +1184,7 @@ class VPReductionPHIRecipe : public VPWidenPHIRecipe {
public:
/// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
/// RdxDesc.
- VPReductionPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc,
+ VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
VPValue &Start, bool IsInLoop = false,
bool IsOrdered = false)
: VPWidenPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start),
@@ -1210,7 +1214,9 @@ public:
VPSlotTracker &SlotTracker) const override;
#endif
- RecurrenceDescriptor &getRecurrenceDescriptor() { return RdxDesc; }
+ const RecurrenceDescriptor &getRecurrenceDescriptor() const {
+ return RdxDesc;
+ }
/// Returns true, if the phi is part of an ordered reduction.
bool isOrdered() const { return IsOrdered; }
@@ -1340,13 +1346,13 @@ public:
/// The Operands are {ChainOp, VecOp, [Condition]}.
class VPReductionRecipe : public VPRecipeBase, public VPValue {
/// The recurrence descriptor for the reduction in question.
- RecurrenceDescriptor *RdxDesc;
+ const RecurrenceDescriptor *RdxDesc;
/// Pointer to the TTI, needed to create the target reduction
const TargetTransformInfo *TTI;
public:
- VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp,
- VPValue *VecOp, VPValue *CondOp,
+ VPReductionRecipe(const RecurrenceDescriptor *R, Instruction *I,
+ VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
const TargetTransformInfo *TTI)
: VPRecipeBase(VPRecipeBase::VPReductionSC, {ChainOp, VecOp}),
VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), TTI(TTI) {
@@ -2252,6 +2258,12 @@ public:
return map_range(Operands, Fn);
}
+ /// Returns true if \p VPV is uniform after vectorization.
+ bool isUniformAfterVectorization(VPValue *VPV) const {
+ auto RepR = dyn_cast_or_null<VPReplicateRecipe>(VPV->getDef());
+ return !VPV->getDef() || (RepR && RepR->isUniform());
+ }
+
private:
/// Add to the given dominator tree the header block and every new basic block
/// that was created between it and the latch block, inclusive.
@@ -2340,18 +2352,23 @@ public:
/// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
/// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
- /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. If \p BlockPtr
- /// has more than one successor, its conditional bit is propagated to \p
- /// NewBlock. \p NewBlock must have neither successors nor predecessors.
+ /// NewBlock, and propagate \p BlockPtr's parent to \p NewBlock. \p BlockPtr's
+ /// successors are moved from \p BlockPtr to \p NewBlock, and \p BlockPtr's
+ /// conditional bit is propagated to \p NewBlock. \p NewBlock must have
+ /// neither successors nor predecessors.
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
assert(NewBlock->getSuccessors().empty() &&
- "Can't insert new block with successors.");
- // TODO: move successors from BlockPtr to NewBlock when this functionality
- // is necessary. For now, setBlockSingleSuccessor will assert if BlockPtr
- // already has successors.
- BlockPtr->setOneSuccessor(NewBlock);
- NewBlock->setPredecessors({BlockPtr});
+ NewBlock->getPredecessors().empty() &&
+ "Can't insert new block with predecessors or successors.");
NewBlock->setParent(BlockPtr->getParent());
+ SmallVector<VPBlockBase *> Succs(BlockPtr->successors());
+ for (VPBlockBase *Succ : Succs) {
+ disconnectBlocks(BlockPtr, Succ);
+ connectBlocks(NewBlock, Succ);
+ }
+ NewBlock->setCondBit(BlockPtr->getCondBit());
+ BlockPtr->setCondBit(nullptr);
+ connectBlocks(BlockPtr, NewBlock);
}
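The rewritten insertBlockAfter splices NewBlock between BlockPtr and all of BlockPtr's former successors instead of asserting that there are none. A self-contained sketch of the same edge surgery on a toy adjacency-list graph; Block, connect, and disconnect are invented stand-ins, and the parent/conditional-bit propagation is omitted.

  #include <algorithm>
  #include <iostream>
  #include <string>
  #include <vector>

  // Toy block with successor/predecessor lists, standing in for VPBlockBase.
  struct Block {
    std::string Name;
    std::vector<Block *> Succs, Preds;
  };

  static void connect(Block *From, Block *To) {
    From->Succs.push_back(To);
    To->Preds.push_back(From);
  }

  static void disconnect(Block *From, Block *To) {
    // Assumes the edge exists, as in the usage below.
    From->Succs.erase(std::find(From->Succs.begin(), From->Succs.end(), To));
    To->Preds.erase(std::find(To->Preds.begin(), To->Preds.end(), From));
  }

  // Move every successor edge of BlockPtr onto NewBlock, then connect
  // BlockPtr -> NewBlock.
  static void insertBlockAfter(Block *NewBlock, Block *BlockPtr) {
    std::vector<Block *> Succs = BlockPtr->Succs; // copy: we mutate below
    for (Block *Succ : Succs) {
      disconnect(BlockPtr, Succ);
      connect(NewBlock, Succ);
    }
    connect(BlockPtr, NewBlock);
  }

  int main() {
    Block A{"A"}, B{"B"}, C{"C"}, N{"N"};
    connect(&A, &B);
    connect(&A, &C);
    insertBlockAfter(&N, &A);
    for (Block *S : A.Succs) std::cout << "A -> " << S->Name << '\n'; // A -> N
    for (Block *S : N.Succs) std::cout << "N -> " << S->Name << '\n'; // N -> B, N -> C
  }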
/// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
@@ -2394,6 +2411,31 @@ public:
To->removePredecessor(From);
}
+ /// Try to merge \p Block into its single predecessor, if \p Block is a
+ /// VPBasicBlock and its predecessor has a single successor. Returns a pointer
+ /// to the predecessor \p Block was merged into or nullptr otherwise.
+ static VPBasicBlock *tryToMergeBlockIntoPredecessor(VPBlockBase *Block) {
+ auto *VPBB = dyn_cast<VPBasicBlock>(Block);
+ auto *PredVPBB =
+ dyn_cast_or_null<VPBasicBlock>(Block->getSinglePredecessor());
+ if (!VPBB || !PredVPBB || PredVPBB->getNumSuccessors() != 1)
+ return nullptr;
+
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB))
+ R.moveBefore(*PredVPBB, PredVPBB->end());
+ VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
+ auto *ParentRegion = cast<VPRegionBlock>(Block->getParent());
+ if (ParentRegion->getExit() == Block)
+ ParentRegion->setExit(PredVPBB);
+ SmallVector<VPBlockBase *> Successors(Block->successors());
+ for (auto *Succ : Successors) {
+ VPBlockUtils::disconnectBlocks(Block, Succ);
+ VPBlockUtils::connectBlocks(PredVPBB, Succ);
+ }
+ delete Block;
+ return PredVPBB;
+ }
+
/// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge.
static bool isBackEdge(const VPBlockBase *FromBlock,
const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index ac3b3505dc34..86ecd6817873 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -50,14 +50,14 @@ VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB,
case EdgeType::FALSE_EDGE:
// CurrBB is the False successor of PredBB - compute not of CBV.
- IntermediateVal = Builder.createNot(CBV);
+ IntermediateVal = Builder.createNot(CBV, {});
break;
}
// Now AND intermediate value with PredBB's block predicate if it has one.
VPValue *BP = PredBB->getPredicate();
if (BP)
- return Builder.createAnd(BP, IntermediateVal);
+ return Builder.createAnd(BP, IntermediateVal, {});
else
return IntermediateVal;
}
@@ -96,7 +96,7 @@ VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) {
Worklist.pop_front();
// Create an OR of these values.
- VPValue *Or = Builder.createOr(LHS, RHS);
+ VPValue *Or = Builder.createOr(LHS, RHS, {});
// Push OR to the back of the worklist.
Worklist.push_back(Or);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
index c52c8a2229e8..9e19e172dea5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -467,8 +467,9 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
return markFailed();
assert(CombinedOperands.size() > 0 && "Need some operands");
- auto *VPI = new VPInstruction(Opcode, CombinedOperands);
- VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
+ auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr();
+ auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc());
+ VPI->setUnderlyingInstr(Inst);
LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
<< *cast<VPInstruction>(Values[0]) << "\n");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ded5bc04beb5..d2daf558c2c5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -18,7 +18,8 @@ using namespace llvm;
void VPlanTransforms::VPInstructionsToVPRecipes(
Loop *OrigLoop, VPlanPtr &Plan,
- LoopVectorizationLegality::InductionList &Inductions,
+ function_ref<const InductionDescriptor *(PHINode *)>
+ GetIntOrFpInductionDescriptor,
SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE) {
auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry());
@@ -44,11 +45,9 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
VPRecipeBase *NewRecipe = nullptr;
if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
- InductionDescriptor II = Inductions.lookup(Phi);
- if (II.getKind() == InductionDescriptor::IK_IntInduction ||
- II.getKind() == InductionDescriptor::IK_FpInduction) {
- VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
- NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, nullptr);
+ if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) {
+ VPValue *Start = Plan->getOrAddVPValue(II->getStartValue());
+ NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, *II);
} else {
Plan->addVPValue(Phi, VPPhi);
continue;
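The signature change replaces a whole InductionList with a callback that answers the narrower question the transform actually asks: is this phi an int/FP induction, and if so, what is its descriptor. A standalone sketch of that callback-based shape using std::function in place of function_ref; Descriptor, widenInductions, and the integer phi IDs are invented for illustration.

  #include <functional>
  #include <iostream>
  #include <map>
  #include <vector>

  // Stand-in for an induction descriptor.
  struct Descriptor { int StartValue; };

  // The transform no longer sees the map at all; it only calls the callback.
  static void widenInductions(
      const std::function<const Descriptor *(int /*PhiId*/)> &GetDesc,
      const std::vector<int> &Phis) {
    for (int Phi : Phis) {
      if (const Descriptor *D = GetDesc(Phi))
        std::cout << "widen phi " << Phi << " start " << D->StartValue << '\n';
      else
        std::cout << "keep phi " << Phi << " as-is\n";
    }
  }

  int main() {
    std::map<int, Descriptor> Inductions = {{1, {0}}, {3, {7}}};
    // The caller decides what counts as an induction; the transform does not.
    widenInductions(
        [&](int Phi) -> const Descriptor * {
          auto It = Inductions.find(Phi);
          return It == Inductions.end() ? nullptr : &It->second;
        },
        {1, 2, 3});
  }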
@@ -158,8 +157,7 @@ bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) {
// TODO: add ".cloned" suffix to name of Clone's VPValue.
Clone->insertBefore(SinkCandidate);
- SmallVector<VPUser *, 4> Users(SinkCandidate->user_begin(),
- SinkCandidate->user_end());
+ SmallVector<VPUser *, 4> Users(SinkCandidate->users());
for (auto *U : Users) {
auto *UI = cast<VPRecipeBase>(U);
if (UI->getParent() == SinkTo)
@@ -266,8 +264,7 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) {
VPValue *PredInst1 =
cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
- SmallVector<VPUser *> Users(Phi1ToMoveV->user_begin(),
- Phi1ToMoveV->user_end());
+ SmallVector<VPUser *> Users(Phi1ToMoveV->users());
for (VPUser *U : Users) {
auto *UI = dyn_cast<VPRecipeBase>(U);
if (!UI || UI->getParent() != Then2)
@@ -295,3 +292,35 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) {
delete ToDelete;
return Changed;
}
+
+void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) {
+ SmallVector<std::pair<VPRecipeBase *, VPValue *>> CastsToRemove;
+ for (auto &Phi : Plan.getEntry()->getEntryBasicBlock()->phis()) {
+ auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
+ if (!IV || IV->getTruncInst())
+ continue;
+
+ // Walk the chain of VPValues generated for the casts in Casts, starting
+ // from the IV, and remember each cast recipe for removal.
+ auto &Casts = IV->getInductionDescriptor().getCastInsts();
+ VPValue *FindMyCast = IV;
+ for (Instruction *IRCast : reverse(Casts)) {
+ VPRecipeBase *FoundUserCast = nullptr;
+ for (auto *U : FindMyCast->users()) {
+ auto *UserCast = cast<VPRecipeBase>(U);
+ if (UserCast->getNumDefinedValues() == 1 &&
+ UserCast->getVPSingleValue()->getUnderlyingValue() == IRCast) {
+ FoundUserCast = UserCast;
+ break;
+ }
+ }
+ assert(FoundUserCast && "Missing a cast to remove");
+ CastsToRemove.emplace_back(FoundUserCast, IV);
+ FindMyCast = FoundUserCast->getVPSingleValue();
+ }
+ }
+ for (auto &E : CastsToRemove) {
+ E.first->getVPSingleValue()->replaceAllUsesWith(E.second);
+ E.first->eraseFromParent();
+ }
+}
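removeRedundantInductionCasts walks the chain of casts generated for an induction and redirects every user of each cast back to the induction's own VPValue. A self-contained sketch of that redirect-the-users step on a toy def/use graph; Node and the local replaceAllUsesWith below are stand-ins, not the VPlan classes.

  #include <initializer_list>
  #include <iostream>
  #include <vector>

  // Toy node: Operands model the use edges; replaceAllUsesWith rewrites them
  // in place across the whole graph.
  struct Node {
    int Id;
    std::vector<Node *> Operands;
  };

  static void replaceAllUsesWith(std::vector<Node *> &Graph, Node *Old,
                                 Node *New) {
    for (Node *N : Graph)
      for (Node *&Op : N->Operands)
        if (Op == Old)
          Op = New;
  }

  int main() {
    // IV -> Cast1 -> Cast2 -> Use: every cast in the chain is redundant.
    Node IV{0, {}}, Cast1{1, {&IV}}, Cast2{2, {&Cast1}}, Use{3, {&Cast2}};
    std::vector<Node *> Graph = {&IV, &Cast1, &Cast2, &Use};
    // Redirect users of each cast to the IV itself.
    for (Node *Cast : {&Cast1, &Cast2})
      replaceAllUsesWith(Graph, Cast, &IV);
    std::cout << "Use now reads node " << Use.Operands[0]->Id << '\n'; // 0 (the IV)
  }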
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index c740f2c022da..a82a562d5e35 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -14,24 +14,37 @@
#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
#include "VPlan.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
namespace llvm {
+class InductionDescriptor;
class Instruction;
+class PHINode;
class ScalarEvolution;
struct VPlanTransforms {
/// Replaces the VPInstructions in \p Plan with corresponding
/// widen recipes.
- static void VPInstructionsToVPRecipes(
- Loop *OrigLoop, VPlanPtr &Plan,
- LoopVectorizationLegality::InductionList &Inductions,
- SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE);
+ static void
+ VPInstructionsToVPRecipes(Loop *OrigLoop, VPlanPtr &Plan,
+ function_ref<const InductionDescriptor *(PHINode *)>
+ GetIntOrFpInductionDescriptor,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions,
+ ScalarEvolution &SE);
static bool sinkScalarOperands(VPlan &Plan);
static bool mergeReplicateRegions(VPlan &Plan);
+
+ /// Remove redundant casts of inductions.
+ ///
+ /// Such redundant casts are casts of induction variables that can be ignored,
+ /// because we already proved that the casted phi is equal to the uncasted phi
+ /// in the vectorized loop. There is no need to vectorize the cast - the same
+ /// value can be used for both the phi and casts in the vector loop.
+ static void removeRedundantInductionCasts(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 6d6ea4eb30f1..7732d9367985 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -156,5 +156,31 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) {
RecipeI++;
}
}
+
+ const VPRegionBlock *TopRegion = cast<VPRegionBlock>(Plan.getEntry());
+ const VPBasicBlock *Entry = dyn_cast<VPBasicBlock>(TopRegion->getEntry());
+ if (!Entry) {
+ errs() << "VPlan entry block is not a VPBasicBlock\n";
+ return false;
+ }
+ const VPBasicBlock *Exit = dyn_cast<VPBasicBlock>(TopRegion->getExit());
+ if (!Exit) {
+ errs() << "VPlan exit block is not a VPBasicBlock\n";
+ return false;
+ }
+
+ for (const VPRegionBlock *Region :
+ VPBlockUtils::blocksOnly<const VPRegionBlock>(
+ depth_first(VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(
+ Plan.getEntry())))) {
+ if (Region->getEntry()->getNumPredecessors() != 0) {
+ errs() << "region entry block has predecessors\n";
+ return false;
+ }
+ if (Region->getExit()->getNumSuccessors() != 0) {
+ errs() << "region exit block has successors\n";
+ return false;
+ }
+ }
return true;
}
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 57b11e9414ba..c0aedab2fed0 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -989,9 +989,9 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
if (!FixedVT)
return false;
- InstructionCost OriginalCost = TTI.getMemoryOpCost(
- Instruction::Load, LI->getType(), Align(LI->getAlignment()),
- LI->getPointerAddressSpace());
+ InstructionCost OriginalCost =
+ TTI.getMemoryOpCost(Instruction::Load, LI->getType(), LI->getAlign(),
+ LI->getPointerAddressSpace());
InstructionCost ScalarizedCost = 0;
Instruction *LastCheckedInst = LI;