Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--  llvm/lib/Transforms/Coroutines/CoroEarly.cpp | 11
-rw-r--r--  llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 12
-rw-r--r--  llvm/lib/Transforms/Coroutines/CoroInstr.h | 12
-rw-r--r--  llvm/lib/Transforms/Coroutines/CoroInternal.h | 14
-rw-r--r--  llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 167
-rw-r--r--  llvm/lib/Transforms/Coroutines/Coroutines.cpp | 22
-rw-r--r--  llvm/lib/Transforms/IPO/AlwaysInliner.cpp | 18
-rw-r--r--  llvm/lib/Transforms/IPO/ArgumentPromotion.cpp | 7
-rw-r--r--  llvm/lib/Transforms/IPO/Attributor.cpp | 30
-rw-r--r--  llvm/lib/Transforms/IPO/AttributorAttributes.cpp | 305
-rw-r--r--  llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp | 9
-rw-r--r--  llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 18
-rw-r--r--  llvm/lib/Transforms/IPO/FunctionSpecialization.cpp | 29
-rw-r--r--  llvm/lib/Transforms/IPO/GlobalOpt.cpp | 727
-rw-r--r--  llvm/lib/Transforms/IPO/IROutliner.cpp | 758
-rw-r--r--  llvm/lib/Transforms/IPO/Inliner.cpp | 56
-rw-r--r--  llvm/lib/Transforms/IPO/ModuleInliner.cpp | 9
-rw-r--r--  llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 141
-rw-r--r--  llvm/lib/Transforms/IPO/PartialInlining.cpp | 10
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp | 63
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 82
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 177
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 13
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp | 100
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 14
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 12
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 30
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 11
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp | 49
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 21
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 3
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 651
-rw-r--r--  llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 16
-rw-r--r--  llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 68
-rw-r--r--  llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp | 17
-rw-r--r--  llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp | 52
-rw-r--r--  llvm/lib/Transforms/Instrumentation/MemProfiler.cpp | 28
-rw-r--r--  llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 102
-rw-r--r--  llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 5
-rw-r--r--  llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp | 7
-rw-r--r--  llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h | 10
-rw-r--r--  llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp | 3
-rw-r--r--  llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h | 3
-rw-r--r--  llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp | 2
-rw-r--r--  llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp | 30
-rw-r--r--  llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h | 1
-rw-r--r--  llvm/lib/Transforms/Scalar/ADCE.cpp | 9
-rw-r--r--  llvm/lib/Transforms/Scalar/ConstantHoisting.cpp | 12
-rw-r--r--  llvm/lib/Transforms/Scalar/ConstraintElimination.cpp | 87
-rw-r--r--  llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp | 264
-rw-r--r--  llvm/lib/Transforms/Scalar/EarlyCSE.cpp | 9
-rw-r--r--  llvm/lib/Transforms/Scalar/GVN.cpp | 25
-rw-r--r--  llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 11
-rw-r--r--  llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Scalar/JumpThreading.cpp | 11
-rw-r--r--  llvm/lib/Transforms/Scalar/LICM.cpp | 33
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopDeletion.cpp | 30
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 510
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 263
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 25
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp | 11
-rw-r--r--  llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 149
-rw-r--r--  llvm/lib/Transforms/Scalar/NewGVN.cpp | 89
-rw-r--r--  llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp | 1
-rw-r--r--  llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 16
-rw-r--r--  llvm/lib/Transforms/Scalar/SCCP.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Scalar/SROA.cpp | 128
-rw-r--r--  llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp | 7
-rw-r--r--  llvm/lib/Transforms/Scalar/Scalarizer.cpp | 5
-rw-r--r--  llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp | 144
-rw-r--r--  llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp | 17
-rw-r--r--  llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 44
-rw-r--r--  llvm/lib/Transforms/Utils/CallGraphUpdater.cpp | 3
-rw-r--r--  llvm/lib/Transforms/Utils/CallPromotionUtils.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Utils/CodeExtractor.cpp | 244
-rw-r--r--  llvm/lib/Transforms/Utils/Evaluator.cpp | 290
-rw-r--r--  llvm/lib/Transforms/Utils/GlobalStatus.cpp | 8
-rw-r--r--  llvm/lib/Transforms/Utils/InlineFunction.cpp | 12
-rw-r--r--  llvm/lib/Transforms/Utils/LCSSA.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Utils/Local.cpp | 13
-rw-r--r--  llvm/lib/Transforms/Utils/LoopPeel.cpp | 8
-rw-r--r--  llvm/lib/Transforms/Utils/LoopUnroll.cpp | 18
-rw-r--r--  llvm/lib/Transforms/Utils/LoopUtils.cpp | 5
-rw-r--r--  llvm/lib/Transforms/Utils/LoopVersioning.cpp | 21
-rw-r--r--  llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Utils/ModuleUtils.cpp | 74
-rw-r--r--  llvm/lib/Transforms/Utils/SampleProfileInference.cpp | 273
-rw-r--r--  llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp | 222
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 198
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp | 7
-rw-r--r--  llvm/lib/Transforms/Utils/ValueMapper.cpp | 8
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 78
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 831
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 324
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 2
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp | 244
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h | 247
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanPredicator.h | 2
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 27
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 4
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanValue.h | 13
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 20
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 3
107 files changed, 5344 insertions, 3708 deletions
diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
index 68a34bdcb1cd..1533e1805f17 100644
--- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -176,11 +176,14 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) {
lowerCoroNoop(cast<IntrinsicInst>(&I));
break;
case Intrinsic::coro_id:
- // Mark a function that comes out of the frontend that has a coro.id
- // with a coroutine attribute.
if (auto *CII = cast<CoroIdInst>(&I)) {
if (CII->getInfo().isPreSplit()) {
- F.addFnAttr(CORO_PRESPLIT_ATTR, UNPREPARED_FOR_SPLIT);
+ assert(F.hasFnAttribute(CORO_PRESPLIT_ATTR) &&
+ F.getFnAttribute(CORO_PRESPLIT_ATTR).getValueAsString() ==
+ UNPREPARED_FOR_SPLIT &&
+               "A frontend using the Switch-Resumed ABI should emit the "
+               "\"coroutine.presplit\" attribute with value \"0\" for the "
+               "coroutine.");
setCannotDuplicate(CII);
CII->setCoroutineSelf();
CoroId = cast<CoroIdInst>(&I);
@@ -190,6 +193,8 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) {
case Intrinsic::coro_id_retcon:
case Intrinsic::coro_id_retcon_once:
case Intrinsic::coro_id_async:
+ // TODO: Remove the line once we support it in the corresponding
+ // frontend.
F.addFnAttr(CORO_PRESPLIT_ATTR, PREPARED_FOR_SPLIT);
break;
case Intrinsic::coro_resume:
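
The assertion added in the first hunk expects the frontend to have already attached the presplit marker. A minimal standalone sketch of that check (not part of the patch; the literal strings are what CORO_PRESPLIT_ATTR and UNPREPARED_FOR_SPLIT expand to in CoroInternal.h):

#include "llvm/IR/Function.h"
using namespace llvm;

// Sketch only: mirrors the assertion added to lowerEarlyIntrinsics above.
static bool isMarkedPresplitByFrontend(const Function &F) {
  return F.hasFnAttribute("coroutine.presplit") &&
         F.getFnAttribute("coroutine.presplit").getValueAsString() == "0";
}
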
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index a0d12865bd3a..92acfb93057a 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -587,7 +587,7 @@ void FrameTypeBuilder::addFieldForAllocas(const Function &F,
}
});
- if (!Shape.ReuseFrameSlot && !EnableReuseStorageInFrame) {
+ if (!Shape.OptimizeFrame && !EnableReuseStorageInFrame) {
for (const auto &A : FrameData.Allocas) {
AllocaInst *Alloca = A.Alloca;
NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca));
@@ -808,7 +808,7 @@ static StringRef solveTypeName(Type *Ty) {
if (Ty->isPointerTy()) {
auto *PtrTy = cast<PointerType>(Ty);
- Type *PointeeTy = PtrTy->getElementType();
+ Type *PointeeTy = PtrTy->getPointerElementType();
auto Name = solveTypeName(PointeeTy);
if (Name == "UnknownType")
return "PointerType";
@@ -1659,7 +1659,7 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData,
&*Builder.GetInsertPoint());
// This dbg.declare is for the main function entry point. It
// will be deleted in all coro-split functions.
- coro::salvageDebugInfo(DbgPtrAllocaCache, DDI, Shape.ReuseFrameSlot);
+ coro::salvageDebugInfo(DbgPtrAllocaCache, DDI, Shape.OptimizeFrame);
}
}
@@ -2278,7 +2278,7 @@ static void eliminateSwiftErrorArgument(Function &F, Argument &Arg,
IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg());
auto ArgTy = cast<PointerType>(Arg.getType());
- auto ValueTy = ArgTy->getElementType();
+ auto ValueTy = ArgTy->getPointerElementType();
// Reduce to the alloca case:
@@ -2506,7 +2506,7 @@ static void collectFrameAllocas(Function &F, coro::Shape &Shape,
void coro::salvageDebugInfo(
SmallDenseMap<llvm::Value *, llvm::AllocaInst *, 4> &DbgPtrAllocaCache,
- DbgVariableIntrinsic *DVI, bool ReuseFrameSlot) {
+ DbgVariableIntrinsic *DVI, bool OptimizeFrame) {
Function *F = DVI->getFunction();
IRBuilder<> Builder(F->getContext());
auto InsertPt = F->getEntryBlock().getFirstInsertionPt();
@@ -2558,7 +2558,7 @@ void coro::salvageDebugInfo(
//
    // Avoid creating an alloca that would be eliminated by optimization
    // passes, which would leave the corresponding dbg.declares invalid.
- if (!ReuseFrameSlot && !EnableReuseStorageInFrame)
+ if (!OptimizeFrame && !EnableReuseStorageInFrame)
if (auto *Arg = dyn_cast<llvm::Argument>(Storage)) {
auto &Cached = DbgPtrAllocaCache[Storage];
if (!Cached) {
diff --git a/llvm/lib/Transforms/Coroutines/CoroInstr.h b/llvm/lib/Transforms/Coroutines/CoroInstr.h
index bf3d781ba43e..014938c15a0a 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInstr.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInstr.h
@@ -599,6 +599,18 @@ public:
}
};
+/// This represents the llvm.coro.align instruction.
+class LLVM_LIBRARY_VISIBILITY CoroAlignInst : public IntrinsicInst {
+public:
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_align;
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
class LLVM_LIBRARY_VISIBILITY AnyCoroEndInst : public IntrinsicInst {
enum { FrameArg, UnwindArg };
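
The new CoroAlignInst wrapper carries no state; it only adds the classof hooks, so the standard casting utilities recognize calls to llvm.coro.align. A small illustrative sketch (assuming the internal CoroInstr.h header is visible, as it is within the Coroutines library):

// Sketch only: isa/dyn_cast dispatch through the classof overloads above.
static bool isCoroAlignCall(const llvm::Value *V) {
  return llvm::isa<CoroAlignInst>(V);
}
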
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index 27ba8524f975..9a17068df3a9 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -36,6 +36,11 @@ void initializeCoroCleanupLegacyPass(PassRegistry &);
// adds coroutine subfunctions to the SCC to be processed by IPO pipeline.
// Async lowering similarly triggers a restart of the pipeline after it has
// split the coroutine.
+//
+// FIXME: Refactor these attributes as LLVM attributes instead of string
+// attributes since these attributes are already used outside LLVM's
+// coroutine module.
+// FIXME: Remove these values once we remove the Legacy PM.
#define CORO_PRESPLIT_ATTR "coroutine.presplit"
#define UNPREPARED_FOR_SPLIT "0"
#define PREPARED_FOR_SPLIT "1"
@@ -54,7 +59,7 @@ void updateCallGraph(Function &Caller, ArrayRef<Function *> Funcs,
/// holding a pointer to the coroutine frame.
void salvageDebugInfo(
SmallDenseMap<llvm::Value *, llvm::AllocaInst *, 4> &DbgPtrAllocaCache,
- DbgVariableIntrinsic *DVI, bool ReuseFrameSlot);
+ DbgVariableIntrinsic *DVI, bool OptimizeFrame);
// Keeps data and helper functions for lowering coroutine intrinsics.
struct LowererBase {
@@ -99,6 +104,7 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
CoroBeginInst *CoroBegin;
SmallVector<AnyCoroEndInst *, 4> CoroEnds;
SmallVector<CoroSizeInst *, 2> CoroSizes;
+ SmallVector<CoroAlignInst *, 2> CoroAligns;
SmallVector<AnyCoroSuspendInst *, 4> CoroSuspends;
SmallVector<CallInst*, 2> SwiftErrorOps;
@@ -126,7 +132,7 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
BasicBlock *AllocaSpillBlock;
/// This would only be true if optimization are enabled.
- bool ReuseFrameSlot;
+ bool OptimizeFrame;
struct SwitchLoweringStorage {
SwitchInst *ResumeSwitch;
@@ -272,8 +278,8 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
void emitDealloc(IRBuilder<> &Builder, Value *Ptr, CallGraph *CG) const;
Shape() = default;
- explicit Shape(Function &F, bool ReuseFrameSlot = false)
- : ReuseFrameSlot(ReuseFrameSlot) {
+ explicit Shape(Function &F, bool OptimizeFrame = false)
+ : OptimizeFrame(OptimizeFrame) {
buildFrom(F);
}
void buildFrom(Function &F);
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 12c1829524ef..b5129809c6a6 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -29,6 +29,7 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
@@ -617,7 +618,8 @@ static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape,
Value *CachedSlot = nullptr;
auto getSwiftErrorSlot = [&](Type *ValueTy) -> Value * {
if (CachedSlot) {
- assert(CachedSlot->getType()->getPointerElementType() == ValueTy &&
+ assert(cast<PointerType>(CachedSlot->getType())
+ ->isOpaqueOrPointeeTypeMatches(ValueTy) &&
"multiple swifterror slots in function with different types");
return CachedSlot;
}
@@ -626,7 +628,8 @@ static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape,
for (auto &Arg : F.args()) {
if (Arg.isSwiftError()) {
CachedSlot = &Arg;
- assert(Arg.getType()->getPointerElementType() == ValueTy &&
+ assert(cast<PointerType>(Arg.getType())
+ ->isOpaqueOrPointeeTypeMatches(ValueTy) &&
"swifterror argument does not have expected type");
return &Arg;
}
@@ -682,7 +685,7 @@ void CoroCloner::salvageDebugInfo() {
if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I))
Worklist.push_back(DVI);
for (DbgVariableIntrinsic *DVI : Worklist)
- coro::salvageDebugInfo(DbgPtrAllocaCache, DVI, Shape.ReuseFrameSlot);
+ coro::salvageDebugInfo(DbgPtrAllocaCache, DVI, Shape.OptimizeFrame);
// Remove all salvaged dbg.declare intrinsics that became
// either unreachable or stale due to the CoroSplit transformation.
@@ -835,7 +838,7 @@ Value *CoroCloner::deriveNewFramePointer() {
static void addFramePointerAttrs(AttributeList &Attrs, LLVMContext &Context,
unsigned ParamIndex,
uint64_t Size, Align Alignment) {
- AttrBuilder ParamAttrs;
+ AttrBuilder ParamAttrs(Context);
ParamAttrs.addAttribute(Attribute::NonNull);
ParamAttrs.addAttribute(Attribute::NoAlias);
ParamAttrs.addAlignmentAttr(Alignment);
@@ -845,14 +848,14 @@ static void addFramePointerAttrs(AttributeList &Attrs, LLVMContext &Context,
static void addAsyncContextAttrs(AttributeList &Attrs, LLVMContext &Context,
unsigned ParamIndex) {
- AttrBuilder ParamAttrs;
+ AttrBuilder ParamAttrs(Context);
ParamAttrs.addAttribute(Attribute::SwiftAsync);
Attrs = Attrs.addParamAttributes(Context, ParamIndex, ParamAttrs);
}
static void addSwiftSelfAttrs(AttributeList &Attrs, LLVMContext &Context,
unsigned ParamIndex) {
- AttrBuilder ParamAttrs;
+ AttrBuilder ParamAttrs(Context);
ParamAttrs.addAttribute(Attribute::SwiftSelf);
Attrs = Attrs.addParamAttributes(Context, ParamIndex, ParamAttrs);
}
@@ -929,7 +932,7 @@ void CoroCloner::create() {
case coro::ABI::Switch:
// Bootstrap attributes by copying function attributes from the
// original function. This should include optimization settings and so on.
- NewAttrs = NewAttrs.addFnAttributes(Context, OrigAttrs.getFnAttrs());
+ NewAttrs = NewAttrs.addFnAttributes(Context, AttrBuilder(Context, OrigAttrs.getFnAttrs()));
addFramePointerAttrs(NewAttrs, Context, 0,
Shape.FrameSize, Shape.FrameAlign);
@@ -952,7 +955,7 @@ void CoroCloner::create() {
// Transfer the original function's attributes.
auto FnAttrs = OrigF.getAttributes().getFnAttrs();
- NewAttrs = NewAttrs.addFnAttributes(Context, FnAttrs);
+ NewAttrs = NewAttrs.addFnAttributes(Context, AttrBuilder(Context, FnAttrs));
break;
}
case coro::ABI::Retcon:
@@ -1082,10 +1085,16 @@ static void updateAsyncFuncPointerContextSize(coro::Shape &Shape) {
Shape.AsyncLowering.AsyncFuncPointer->setInitializer(NewFuncPtrStruct);
}
-static void replaceFrameSize(coro::Shape &Shape) {
+static void replaceFrameSizeAndAlignment(coro::Shape &Shape) {
if (Shape.ABI == coro::ABI::Async)
updateAsyncFuncPointerContextSize(Shape);
+ for (CoroAlignInst *CA : Shape.CoroAligns) {
+ CA->replaceAllUsesWith(
+ ConstantInt::get(CA->getType(), Shape.FrameAlign.value()));
+ CA->eraseFromParent();
+ }
+
if (Shape.CoroSizes.empty())
return;
@@ -1197,10 +1206,34 @@ scanPHIsAndUpdateValueMap(Instruction *Prev, BasicBlock *NewBlock,
static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) {
DenseMap<Value *, Value *> ResolvedValues;
BasicBlock *UnconditionalSucc = nullptr;
+ assert(InitialInst->getModule());
+ const DataLayout &DL = InitialInst->getModule()->getDataLayout();
+
+ auto GetFirstValidInstruction = [](Instruction *I) {
+ while (I) {
+ // BitCastInst wouldn't generate actual code so that we could skip it.
+        // BitCastInst wouldn't generate actual code, so we can skip it.
+ I->isLifetimeStartOrEnd())
+ I = I->getNextNode();
+ else if (isInstructionTriviallyDead(I))
+        // Since we are in the middle of the transformation, we need to erase
+        // the dead instruction manually.
+ I = &*I->eraseFromParent();
+ else
+ break;
+ }
+ return I;
+ };
+
+ auto TryResolveConstant = [&ResolvedValues](Value *V) {
+ auto It = ResolvedValues.find(V);
+ if (It != ResolvedValues.end())
+ V = It->second;
+ return dyn_cast<ConstantInt>(V);
+ };
Instruction *I = InitialInst;
- while (I->isTerminator() ||
- (isa<CmpInst>(I) && I->getNextNode()->isTerminator())) {
+ while (I->isTerminator() || isa<CmpInst>(I)) {
if (isa<ReturnInst>(I)) {
if (I != InitialInst) {
// If InitialInst is an unconditional branch,
@@ -1213,48 +1246,68 @@ static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) {
}
if (auto *BR = dyn_cast<BranchInst>(I)) {
if (BR->isUnconditional()) {
- BasicBlock *BB = BR->getSuccessor(0);
+ BasicBlock *Succ = BR->getSuccessor(0);
if (I == InitialInst)
- UnconditionalSucc = BB;
- scanPHIsAndUpdateValueMap(I, BB, ResolvedValues);
- I = BB->getFirstNonPHIOrDbgOrLifetime();
+ UnconditionalSucc = Succ;
+ scanPHIsAndUpdateValueMap(I, Succ, ResolvedValues);
+ I = GetFirstValidInstruction(Succ->getFirstNonPHIOrDbgOrLifetime());
continue;
}
- } else if (auto *CondCmp = dyn_cast<CmpInst>(I)) {
- auto *BR = dyn_cast<BranchInst>(I->getNextNode());
- if (BR && BR->isConditional() && CondCmp == BR->getCondition()) {
- // If the case number of suspended switch instruction is reduced to
- // 1, then it is simplified to CmpInst in llvm::ConstantFoldTerminator.
- // And the comparsion looks like : %cond = icmp eq i8 %V, constant.
- ConstantInt *CondConst = dyn_cast<ConstantInt>(CondCmp->getOperand(1));
- if (CondConst && CondCmp->getPredicate() == CmpInst::ICMP_EQ) {
- Value *V = CondCmp->getOperand(0);
- auto it = ResolvedValues.find(V);
- if (it != ResolvedValues.end())
- V = it->second;
-
- if (ConstantInt *Cond0 = dyn_cast<ConstantInt>(V)) {
- BasicBlock *BB = Cond0->equalsInt(CondConst->getZExtValue())
- ? BR->getSuccessor(0)
- : BR->getSuccessor(1);
- scanPHIsAndUpdateValueMap(I, BB, ResolvedValues);
- I = BB->getFirstNonPHIOrDbgOrLifetime();
- continue;
- }
- }
- }
- } else if (auto *SI = dyn_cast<SwitchInst>(I)) {
- Value *V = SI->getCondition();
- auto it = ResolvedValues.find(V);
- if (it != ResolvedValues.end())
- V = it->second;
- if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) {
- BasicBlock *BB = SI->findCaseValue(Cond)->getCaseSuccessor();
- scanPHIsAndUpdateValueMap(I, BB, ResolvedValues);
- I = BB->getFirstNonPHIOrDbgOrLifetime();
+
+ BasicBlock *BB = BR->getParent();
+      // Handle the case where the condition of the conditional branch is
+      // constant.
+ // e.g.,
+ //
+ // br i1 false, label %cleanup, label %CoroEnd
+ //
+      // This can happen during the transformation. We can continue
+      // simplifying in this case.
+ if (ConstantFoldTerminator(BB, /*DeleteDeadConditions=*/true)) {
+ // Handle this branch in next iteration.
+ I = BB->getTerminator();
continue;
}
+ } else if (auto *CondCmp = dyn_cast<CmpInst>(I)) {
+ // If the case number of suspended switch instruction is reduced to
+ // 1, then it is simplified to CmpInst in llvm::ConstantFoldTerminator.
+ auto *BR = dyn_cast<BranchInst>(
+ GetFirstValidInstruction(CondCmp->getNextNode()));
+ if (!BR || !BR->isConditional() || CondCmp != BR->getCondition())
+ return false;
+
+      // And the comparison looks like: %cond = icmp eq i8 %V, constant.
+      // So we try to resolve a constant for the first operand only, since the
+      // second operand should be a literal constant by design.
+ ConstantInt *Cond0 = TryResolveConstant(CondCmp->getOperand(0));
+ auto *Cond1 = dyn_cast<ConstantInt>(CondCmp->getOperand(1));
+ if (!Cond0 || !Cond1)
+ return false;
+
+      // Both operands of the CmpInst are constants, so we can evaluate it
+      // immediately to get the destination.
+ auto *ConstResult =
+ dyn_cast_or_null<ConstantInt>(ConstantFoldCompareInstOperands(
+ CondCmp->getPredicate(), Cond0, Cond1, DL));
+ if (!ConstResult)
+ return false;
+
+ CondCmp->replaceAllUsesWith(ConstResult);
+ CondCmp->eraseFromParent();
+
+ // Handle this branch in next iteration.
+ I = BR;
+ continue;
+ } else if (auto *SI = dyn_cast<SwitchInst>(I)) {
+ ConstantInt *Cond = TryResolveConstant(SI->getCondition());
+ if (!Cond)
+ return false;
+
+ BasicBlock *BB = SI->findCaseValue(Cond)->getCaseSuccessor();
+ scanPHIsAndUpdateValueMap(I, BB, ResolvedValues);
+ I = GetFirstValidInstruction(BB->getFirstNonPHIOrDbgOrLifetime());
+ continue;
}
+
return false;
}
return false;
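
The CmpInst case above reduces to: resolve the compared value through the PHI-derived map, constant-fold the comparison, and let the now-constant conditional branch be handled on the next loop iteration. A self-contained sketch of just the folding step (helper name and signature are illustrative, not from the patch):

#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch only: resolve the LHS through a value map populated from PHIs, then
// fold the icmp; returns null unless both operands end up being ConstantInts.
static ConstantInt *foldResolvedCompare(CmpInst *Cmp,
                                        DenseMap<Value *, Value *> &Resolved,
                                        const DataLayout &DL) {
  Value *LHS = Cmp->getOperand(0);
  auto It = Resolved.find(LHS);
  if (It != Resolved.end())
    LHS = It->second;
  auto *C0 = dyn_cast<ConstantInt>(LHS);
  auto *C1 = dyn_cast<ConstantInt>(Cmp->getOperand(1));
  if (!C0 || !C1)
    return nullptr;
  return dyn_cast_or_null<ConstantInt>(
      ConstantFoldCompareInstOperands(Cmp->getPredicate(), C0, C1, DL));
}
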
@@ -1826,20 +1879,20 @@ namespace {
static coro::Shape splitCoroutine(Function &F,
SmallVectorImpl<Function *> &Clones,
- bool ReuseFrameSlot) {
+ bool OptimizeFrame) {
PrettyStackTraceFunction prettyStackTrace(F);
  // The suspend-crossing algorithm in buildCoroutineFrame gets tripped
// up by uses in unreachable blocks, so remove them as a first pass.
removeUnreachableBlocks(F);
- coro::Shape Shape(F, ReuseFrameSlot);
+ coro::Shape Shape(F, OptimizeFrame);
if (!Shape.CoroBegin)
return Shape;
simplifySuspendPoints(Shape);
buildCoroutineFrame(F, Shape);
- replaceFrameSize(Shape);
+ replaceFrameSizeAndAlignment(Shape);
// If there are no suspend points, no split required, just remove
// the allocation and deallocation blocks, they are not needed.
@@ -2165,7 +2218,7 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C,
F.removeFnAttr(CORO_PRESPLIT_ATTR);
SmallVector<Function *, 4> Clones;
- const coro::Shape Shape = splitCoroutine(F, Clones, ReuseFrameSlot);
+ const coro::Shape Shape = splitCoroutine(F, Clones, OptimizeFrame);
updateCallGraphAfterCoroutineSplit(*N, Shape, Clones, C, CG, AM, UR, FAM);
if (!Shape.CoroSuspends.empty()) {
@@ -2198,13 +2251,13 @@ namespace {
struct CoroSplitLegacy : public CallGraphSCCPass {
static char ID; // Pass identification, replacement for typeid
- CoroSplitLegacy(bool ReuseFrameSlot = false)
- : CallGraphSCCPass(ID), ReuseFrameSlot(ReuseFrameSlot) {
+ CoroSplitLegacy(bool OptimizeFrame = false)
+ : CallGraphSCCPass(ID), OptimizeFrame(OptimizeFrame) {
initializeCoroSplitLegacyPass(*PassRegistry::getPassRegistry());
}
bool Run = false;
- bool ReuseFrameSlot;
+ bool OptimizeFrame;
// A coroutine is identified by the presence of coro.begin intrinsic, if
// we don't have any, this pass has nothing to do.
@@ -2263,7 +2316,7 @@ struct CoroSplitLegacy : public CallGraphSCCPass {
F->removeFnAttr(CORO_PRESPLIT_ATTR);
SmallVector<Function *, 4> Clones;
- const coro::Shape Shape = splitCoroutine(*F, Clones, ReuseFrameSlot);
+ const coro::Shape Shape = splitCoroutine(*F, Clones, OptimizeFrame);
updateCallGraphAfterCoroutineSplit(*F, Shape, Clones, CG, SCC);
if (Shape.ABI == coro::ABI::Async) {
// Restart SCC passes.
@@ -2300,6 +2353,6 @@ INITIALIZE_PASS_END(
"Split coroutine into a set of functions driving its state machine", false,
false)
-Pass *llvm::createCoroSplitLegacyPass(bool ReuseFrameSlot) {
- return new CoroSplitLegacy(ReuseFrameSlot);
+Pass *llvm::createCoroSplitLegacyPass(bool OptimizeFrame) {
+ return new CoroSplitLegacy(OptimizeFrame);
}
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index fba8b03e44ba..965a146c143f 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -123,6 +123,7 @@ Value *coro::LowererBase::makeSubFnCall(Value *Arg, int Index,
static bool isCoroutineIntrinsicName(StringRef Name) {
// NOTE: Must be sorted!
static const char *const CoroIntrinsics[] = {
+ "llvm.coro.align",
"llvm.coro.alloc",
"llvm.coro.async.context.alloc",
"llvm.coro.async.context.dealloc",
@@ -268,6 +269,9 @@ void coro::Shape::buildFrom(Function &F) {
case Intrinsic::coro_size:
CoroSizes.push_back(cast<CoroSizeInst>(II));
break;
+ case Intrinsic::coro_align:
+ CoroAligns.push_back(cast<CoroAlignInst>(II));
+ break;
case Intrinsic::coro_frame:
CoroFrames.push_back(cast<CoroFrameInst>(II));
break;
@@ -672,8 +676,11 @@ static void checkAsyncFuncPointer(const Instruction *I, Value *V) {
if (!AsyncFuncPtrAddr)
fail(I, "llvm.coro.id.async async function pointer not a global", V);
- auto *StructTy =
- cast<StructType>(AsyncFuncPtrAddr->getType()->getPointerElementType());
+ if (AsyncFuncPtrAddr->getType()->isOpaquePointerTy())
+ return;
+
+ auto *StructTy = cast<StructType>(
+ AsyncFuncPtrAddr->getType()->getNonOpaquePointerElementType());
if (StructTy->isOpaque() || !StructTy->isPacked() ||
StructTy->getNumElements() != 2 ||
!StructTy->getElementType(0)->isIntegerTy(32) ||
@@ -697,14 +704,16 @@ void CoroIdAsyncInst::checkWellFormed() const {
static void checkAsyncContextProjectFunction(const Instruction *I,
Function *F) {
auto *FunTy = cast<FunctionType>(F->getValueType());
- if (!FunTy->getReturnType()->isPointerTy() ||
- !FunTy->getReturnType()->getPointerElementType()->isIntegerTy(8))
+ Type *Int8Ty = Type::getInt8Ty(F->getContext());
+ auto *RetPtrTy = dyn_cast<PointerType>(FunTy->getReturnType());
+ if (!RetPtrTy || !RetPtrTy->isOpaqueOrPointeeTypeMatches(Int8Ty))
fail(I,
"llvm.coro.suspend.async resume function projection function must "
"return an i8* type",
F);
if (FunTy->getNumParams() != 1 || !FunTy->getParamType(0)->isPointerTy() ||
- !FunTy->getParamType(0)->getPointerElementType()->isIntegerTy(8))
+ !cast<PointerType>(FunTy->getParamType(0))
+ ->isOpaqueOrPointeeTypeMatches(Int8Ty))
fail(I,
"llvm.coro.suspend.async resume function projection function must "
"take one i8* type as parameter",
@@ -719,8 +728,7 @@ void CoroAsyncEndInst::checkWellFormed() const {
auto *MustTailCallFunc = getMustTailCallFunction();
if (!MustTailCallFunc)
return;
- auto *FnTy =
- cast<FunctionType>(MustTailCallFunc->getType()->getPointerElementType());
+ auto *FnTy = MustTailCallFunc->getFunctionType();
if (FnTy->getNumParams() != (arg_size() - 3))
fail(this,
"llvm.coro.end.async must tail call function argument type must "
diff --git a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
index 01e724e22dcf..a6d9ce1033f3 100644
--- a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
+++ b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
@@ -54,13 +54,13 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
if (F.isPresplitCoroutine())
continue;
- if (!F.isDeclaration() && F.hasFnAttribute(Attribute::AlwaysInline) &&
- isInlineViable(F).isSuccess()) {
+ if (!F.isDeclaration() && isInlineViable(F).isSuccess()) {
Calls.clear();
for (User *U : F.users())
if (auto *CB = dyn_cast<CallBase>(U))
- if (CB->getCalledFunction() == &F)
+ if (CB->getCalledFunction() == &F &&
+ CB->hasFnAttr(Attribute::AlwaysInline))
Calls.insert(CB);
for (CallBase *CB : Calls) {
@@ -92,10 +92,12 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
Changed = true;
}
- // Remember to try and delete this function afterward. This both avoids
- // re-walking the rest of the module and avoids dealing with any iterator
- // invalidation issues while deleting functions.
- InlinedFunctions.push_back(&F);
+ if (F.hasFnAttribute(Attribute::AlwaysInline)) {
+ // Remember to try and delete this function afterward. This both avoids
+ // re-walking the rest of the module and avoids dealing with any
+ // iterator invalidation issues while deleting functions.
+ InlinedFunctions.push_back(&F);
+ }
}
}
@@ -117,7 +119,7 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
if (!InlinedFunctions.empty()) {
// Now we just have the comdat functions. Filter out the ones whose comdats
// are not actually dead.
- filterDeadComdatFunctions(M, InlinedFunctions);
+ filterDeadComdatFunctions(InlinedFunctions);
// The remaining functions are actually dead.
for (Function *F : InlinedFunctions) {
M.getFunctionList().erase(F);
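
The net effect of the changes above: inlining is now keyed on the call site's alwaysinline attribute rather than only the callee's, while the callee is queued for deletion only when the function itself carries the attribute. A hedged sketch of the call-site test:

#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Sketch only: a direct call to F with alwaysinline at the call site; note
// that CallBase::hasFnAttr also consults the callee's function attributes.
static bool isAlwaysInlineCallSite(const CallBase &CB, const Function &F) {
  return CB.getCalledFunction() == &F && CB.hasFnAttr(Attribute::AlwaysInline);
}
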
diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index 3a42a2cac928..ce3c5153bde2 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -196,8 +196,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
for (const auto &ArgIndex : ArgIndices) {
// not allowed to dereference ->begin() if size() is 0
Params.push_back(GetElementPtrInst::getIndexedType(
- cast<PointerType>(I->getType())->getElementType(),
- ArgIndex.second));
+ I->getType()->getPointerElementType(), ArgIndex.second));
ArgAttrVec.push_back(AttributeSet());
assert(Params.back());
}
@@ -298,7 +297,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
Ops.push_back(ConstantInt::get(IdxTy, II));
// Keep track of the type we're currently indexing.
if (auto *ElPTy = dyn_cast<PointerType>(ElTy))
- ElTy = ElPTy->getElementType();
+ ElTy = ElPTy->getPointerElementType();
else
ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, II);
}
@@ -928,7 +927,7 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
SmallPtrSet<Argument *, 8> ArgsToPromote;
SmallPtrSet<Argument *, 8> ByValArgsToTransform;
for (Argument *PtrArg : PointerArgs) {
- Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
+ Type *AgTy = PtrArg->getType()->getPointerElementType();
// Replace sret attribute with noalias. This reduces register pressure by
// avoiding a register copy.
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index 7e729e57153c..12b8a0ef9d00 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -202,9 +203,12 @@ bool AA::isDynamicallyUnique(Attributor &A, const AbstractAttribute &QueryingAA,
return NoRecurseAA.isAssumedNoRecurse();
}
-Constant *AA::getInitialValueForObj(Value &Obj, Type &Ty) {
+Constant *AA::getInitialValueForObj(Value &Obj, Type &Ty,
+ const TargetLibraryInfo *TLI) {
if (isa<AllocaInst>(Obj))
return UndefValue::get(&Ty);
+ if (isAllocationFn(&Obj, TLI))
+ return getInitialValueOfAllocation(&cast<CallBase>(Obj), TLI, &Ty);
auto *GV = dyn_cast<GlobalVariable>(&Obj);
if (!GV || !GV->hasLocalLinkage())
return nullptr;
@@ -316,7 +320,8 @@ bool AA::getPotentialCopiesOfStoredValue(
dbgs() << "Underlying object is a valid nullptr, giving up.\n";);
return false;
}
- if (!isa<AllocaInst>(Obj) && !isa<GlobalVariable>(Obj)) {
+ if (!isa<AllocaInst>(Obj) && !isa<GlobalVariable>(Obj) &&
+ !isNoAliasCall(Obj)) {
LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << *Obj
<< "\n";);
return false;
@@ -741,6 +746,7 @@ void IRPosition::verify() {
assert((CBContext == nullptr) &&
"'call site argument' position must not have CallBaseContext!");
Use *U = getAsUsePtr();
+ (void)U; // Silence unused variable warning.
assert(U && "Expected use for a 'call site argument' position!");
assert(isa<CallBase>(U->getUser()) &&
"Expected call base user for a 'call site argument' position!");
@@ -999,10 +1005,11 @@ bool Attributor::isAssumedDead(const BasicBlock &BB,
return false;
}
-bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred,
- const AbstractAttribute &QueryingAA,
- const Value &V, bool CheckBBLivenessOnly,
- DepClassTy LivenessDepClass) {
+bool Attributor::checkForAllUses(
+ function_ref<bool(const Use &, bool &)> Pred,
+ const AbstractAttribute &QueryingAA, const Value &V,
+ bool CheckBBLivenessOnly, DepClassTy LivenessDepClass,
+ function_ref<bool(const Use &OldU, const Use &NewU)> EquivalentUseCB) {
// Check the trivial case first as it catches void values.
if (V.use_empty())
@@ -1053,8 +1060,15 @@ bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred,
<< PotentialCopies.size()
<< " potential copies instead!\n");
for (Value *PotentialCopy : PotentialCopies)
- for (const Use &U : PotentialCopy->uses())
- Worklist.push_back(&U);
+ for (const Use &CopyUse : PotentialCopy->uses()) {
+ if (EquivalentUseCB && !EquivalentUseCB(*U, CopyUse)) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Potential copy was "
+ "rejected by the equivalence call back: "
+ << *CopyUse << "!\n");
+ return false;
+ }
+ Worklist.push_back(&CopyUse);
+ }
continue;
}
}
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index b977821bcaa6..76420783b2d1 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -417,12 +417,10 @@ const Value *stripAndAccumulateMinimalOffsets(
AttributorAnalysis);
}
-static const Value *getMinimalBaseOfAccessPointerOperand(
- Attributor &A, const AbstractAttribute &QueryingAA, const Instruction *I,
- int64_t &BytesOffset, const DataLayout &DL, bool AllowNonInbounds = false) {
- const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false);
- if (!Ptr)
- return nullptr;
+static const Value *
+getMinimalBaseOfPointer(Attributor &A, const AbstractAttribute &QueryingAA,
+ const Value *Ptr, int64_t &BytesOffset,
+ const DataLayout &DL, bool AllowNonInbounds = false) {
APInt OffsetAPInt(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
const Value *Base = stripAndAccumulateMinimalOffsets(
A, QueryingAA, Ptr, DL, OffsetAPInt, AllowNonInbounds);
@@ -431,18 +429,6 @@ static const Value *getMinimalBaseOfAccessPointerOperand(
return Base;
}
-static const Value *
-getBasePointerOfAccessPointerOperand(const Instruction *I, int64_t &BytesOffset,
- const DataLayout &DL,
- bool AllowNonInbounds = false) {
- const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false);
- if (!Ptr)
- return nullptr;
-
- return GetPointerBaseWithConstantOffset(Ptr, BytesOffset, DL,
- AllowNonInbounds);
-}
-
/// Clamp the information known for all returned values of a function
/// (identified by \p QueryingAA) into \p S.
template <typename AAType, typename StateType = typename AAType::StateType>
@@ -810,14 +796,17 @@ struct AA::PointerInfo::OffsetAndSize : public std::pair<int64_t, int64_t> {
int64_t getSize() const { return second; }
static OffsetAndSize getUnknown() { return OffsetAndSize(Unknown, Unknown); }
+ /// Return true if offset or size are unknown.
+ bool offsetOrSizeAreUnknown() const {
+ return getOffset() == OffsetAndSize::Unknown ||
+ getSize() == OffsetAndSize::Unknown;
+ }
+
/// Return true if this offset and size pair might describe an address that
/// overlaps with \p OAS.
bool mayOverlap(const OffsetAndSize &OAS) const {
// Any unknown value and we are giving up -> overlap.
- if (OAS.getOffset() == OffsetAndSize::Unknown ||
- OAS.getSize() == OffsetAndSize::Unknown ||
- getOffset() == OffsetAndSize::Unknown ||
- getSize() == OffsetAndSize::Unknown)
+ if (offsetOrSizeAreUnknown() || OAS.offsetOrSizeAreUnknown())
return true;
// Check if one offset point is in the other interval [offset, offset+size].
@@ -1024,8 +1013,9 @@ protected:
OffsetAndSize ItOAS = It.getFirst();
if (!OAS.mayOverlap(ItOAS))
continue;
+ bool IsExact = OAS == ItOAS && !OAS.offsetOrSizeAreUnknown();
for (auto &Access : It.getSecond())
- if (!CB(Access, OAS == ItOAS))
+ if (!CB(Access, IsExact))
return false;
}
return true;
@@ -1161,27 +1151,34 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
return true;
};
+ const auto *TLI = getAnchorScope()
+ ? A.getInfoCache().getTargetLibraryInfoForFunction(
+ *getAnchorScope())
+ : nullptr;
auto UsePred = [&](const Use &U, bool &Follow) -> bool {
Value *CurPtr = U.get();
User *Usr = U.getUser();
LLVM_DEBUG(dbgs() << "[AAPointerInfo] Analyze " << *CurPtr << " in "
<< *Usr << "\n");
-
- OffsetInfo &PtrOI = OffsetInfoMap[CurPtr];
+ assert(OffsetInfoMap.count(CurPtr) &&
+ "The current pointer offset should have been seeded!");
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Usr)) {
if (CE->isCast())
- return HandlePassthroughUser(Usr, PtrOI, Follow);
+ return HandlePassthroughUser(Usr, OffsetInfoMap[CurPtr], Follow);
if (CE->isCompare())
return true;
- if (!CE->isGEPWithNoNotionalOverIndexing()) {
+ if (!isa<GEPOperator>(CE)) {
LLVM_DEBUG(dbgs() << "[AAPointerInfo] Unhandled constant user " << *CE
<< "\n");
return false;
}
}
if (auto *GEP = dyn_cast<GEPOperator>(Usr)) {
+ // Note the order here, the Usr access might change the map, CurPtr is
+ // already in it though.
OffsetInfo &UsrOI = OffsetInfoMap[Usr];
+ OffsetInfo &PtrOI = OffsetInfoMap[CurPtr];
UsrOI = PtrOI;
// TODO: Use range information.
@@ -1205,19 +1202,22 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
}
UsrOI.Offset = PtrOI.Offset +
DL.getIndexedOffsetInType(
- CurPtr->getType()->getPointerElementType(), Indices);
+ GEP->getSourceElementType(), Indices);
Follow = true;
return true;
}
if (isa<CastInst>(Usr) || isa<SelectInst>(Usr))
- return HandlePassthroughUser(Usr, PtrOI, Follow);
+ return HandlePassthroughUser(Usr, OffsetInfoMap[CurPtr], Follow);
// For PHIs we need to take care of the recurrence explicitly as the value
// might change while we iterate through a loop. For now, we give up if
// the PHI is not invariant.
if (isa<PHINode>(Usr)) {
- // Check if the PHI is invariant (so far).
+ // Note the order here, the Usr access might change the map, CurPtr is
+ // already in it though.
OffsetInfo &UsrOI = OffsetInfoMap[Usr];
+ OffsetInfo &PtrOI = OffsetInfoMap[CurPtr];
+ // Check if the PHI is invariant (so far).
if (UsrOI == PtrOI)
return true;
@@ -1257,8 +1257,8 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
if (auto *LoadI = dyn_cast<LoadInst>(Usr))
return handleAccess(A, *LoadI, *CurPtr, /* Content */ nullptr,
- AccessKind::AK_READ, PtrOI.Offset, Changed,
- LoadI->getType());
+ AccessKind::AK_READ, OffsetInfoMap[CurPtr].Offset,
+ Changed, LoadI->getType());
if (auto *StoreI = dyn_cast<StoreInst>(Usr)) {
if (StoreI->getValueOperand() == CurPtr) {
LLVM_DEBUG(dbgs() << "[AAPointerInfo] Escaping use in store "
@@ -1269,18 +1269,21 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
Optional<Value *> Content = A.getAssumedSimplified(
*StoreI->getValueOperand(), *this, UsedAssumedInformation);
return handleAccess(A, *StoreI, *CurPtr, Content, AccessKind::AK_WRITE,
- PtrOI.Offset, Changed,
+ OffsetInfoMap[CurPtr].Offset, Changed,
StoreI->getValueOperand()->getType());
}
if (auto *CB = dyn_cast<CallBase>(Usr)) {
if (CB->isLifetimeStartOrEnd())
return true;
+ if (TLI && isFreeCall(CB, TLI))
+ return true;
if (CB->isArgOperand(&U)) {
unsigned ArgNo = CB->getArgOperandNo(&U);
const auto &CSArgPI = A.getAAFor<AAPointerInfo>(
*this, IRPosition::callsite_argument(*CB, ArgNo),
DepClassTy::REQUIRED);
- Changed = translateAndAddCalleeState(A, CSArgPI, PtrOI.Offset, *CB) |
+ Changed = translateAndAddCalleeState(
+ A, CSArgPI, OffsetInfoMap[CurPtr].Offset, *CB) |
Changed;
return true;
}
@@ -1293,8 +1296,15 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
LLVM_DEBUG(dbgs() << "[AAPointerInfo] User not handled " << *Usr << "\n");
return false;
};
+ auto EquivalentUseCB = [&](const Use &OldU, const Use &NewU) {
+ if (OffsetInfoMap.count(NewU))
+ return OffsetInfoMap[NewU] == OffsetInfoMap[OldU];
+ OffsetInfoMap[NewU] = OffsetInfoMap[OldU];
+ return true;
+ };
if (!A.checkForAllUses(UsePred, *this, AssociatedValue,
- /* CheckBBLivenessOnly */ true))
+ /* CheckBBLivenessOnly */ true, DepClassTy::OPTIONAL,
+ EquivalentUseCB))
return indicatePessimisticFixpoint();
LLVM_DEBUG({
@@ -2127,31 +2137,26 @@ static int64_t getKnownNonNullAndDerefBytesForUse(
return DerefAA.getKnownDereferenceableBytes();
}
+ Optional<MemoryLocation> Loc = MemoryLocation::getOrNone(I);
+ if (!Loc || Loc->Ptr != UseV || !Loc->Size.isPrecise() || I->isVolatile())
+ return 0;
+
int64_t Offset;
const Value *Base =
- getMinimalBaseOfAccessPointerOperand(A, QueryingAA, I, Offset, DL);
- if (Base) {
- if (Base == &AssociatedValue &&
- getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
- int64_t DerefBytes =
- (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()) + Offset;
-
- IsNonNull |= !NullPointerIsDefined;
- return std::max(int64_t(0), DerefBytes);
- }
+ getMinimalBaseOfPointer(A, QueryingAA, Loc->Ptr, Offset, DL);
+ if (Base && Base == &AssociatedValue) {
+ int64_t DerefBytes = Loc->Size.getValue() + Offset;
+ IsNonNull |= !NullPointerIsDefined;
+ return std::max(int64_t(0), DerefBytes);
}
/// Corner case when an offset is 0.
- Base = getBasePointerOfAccessPointerOperand(I, Offset, DL,
- /*AllowNonInbounds*/ true);
- if (Base) {
- if (Offset == 0 && Base == &AssociatedValue &&
- getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
- int64_t DerefBytes =
- (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType());
- IsNonNull |= !NullPointerIsDefined;
- return std::max(int64_t(0), DerefBytes);
- }
+ Base = GetPointerBaseWithConstantOffset(Loc->Ptr, Offset, DL,
+ /*AllowNonInbounds*/ true);
+ if (Base && Base == &AssociatedValue && Offset == 0) {
+ int64_t DerefBytes = Loc->Size.getValue();
+ IsNonNull |= !NullPointerIsDefined;
+ return std::max(int64_t(0), DerefBytes);
}
return 0;
@@ -2325,6 +2330,8 @@ struct AANoRecurseFunction final : AANoRecurseImpl {
/// See AbstractAttribute::initialize(...).
void initialize(Attributor &A) override {
AANoRecurseImpl::initialize(A);
+ // TODO: We should build a call graph ourselves to enable this in the module
+ // pass as well.
if (const Function *F = getAnchorScope())
if (A.getInfoCache().getSccSize(*F) != 1)
indicatePessimisticFixpoint();
@@ -4057,17 +4064,15 @@ struct AADereferenceableImpl : AADereferenceable {
if (!UseV->getType()->isPointerTy())
return;
- Type *PtrTy = UseV->getType();
- const DataLayout &DL = A.getDataLayout();
+ Optional<MemoryLocation> Loc = MemoryLocation::getOrNone(I);
+ if (!Loc || Loc->Ptr != UseV || !Loc->Size.isPrecise() || I->isVolatile())
+ return;
+
int64_t Offset;
- if (const Value *Base = getBasePointerOfAccessPointerOperand(
- I, Offset, DL, /*AllowNonInbounds*/ true)) {
- if (Base == &getAssociatedValue() &&
- getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
- uint64_t Size = DL.getTypeStoreSize(PtrTy->getPointerElementType());
- State.addAccessedBytes(Offset, Size);
- }
- }
+ const Value *Base = GetPointerBaseWithConstantOffset(
+ Loc->Ptr, Offset, A.getDataLayout(), /*AllowNonInbounds*/ true);
+ if (Base && Base == &getAssociatedValue())
+ State.addAccessedBytes(Offset, Loc->Size.getValue());
}
/// See followUsesInMBEC
@@ -5236,6 +5241,8 @@ struct AAValueSimplifyImpl : AAValueSimplify {
if (!AA::getAssumedUnderlyingObjects(A, Ptr, Objects, AA, &L))
return false;
+ const auto *TLI =
+ A.getInfoCache().getTargetLibraryInfoForFunction(*L.getFunction());
for (Value *Obj : Objects) {
LLVM_DEBUG(dbgs() << "Visit underlying object " << *Obj << "\n");
if (isa<UndefValue>(Obj))
@@ -5250,9 +5257,7 @@ struct AAValueSimplifyImpl : AAValueSimplify {
continue;
return false;
}
- if (!isa<AllocaInst>(Obj) && !isa<GlobalVariable>(Obj))
- return false;
- Constant *InitialVal = AA::getInitialValueForObj(*Obj, *L.getType());
+ Constant *InitialVal = AA::getInitialValueForObj(*Obj, *L.getType(), TLI);
if (!InitialVal || !Union(*InitialVal))
return false;
@@ -5745,13 +5750,6 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
/// The call that allocates the memory.
CallBase *const CB;
- /// The kind of allocation.
- const enum class AllocationKind {
- MALLOC,
- CALLOC,
- ALIGNED_ALLOC,
- } Kind;
-
/// The library function id for the allocation.
LibFunc LibraryFunctionId = NotLibFunc;
@@ -5808,20 +5806,17 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
DeallocationInfos[CB] = new (A.Allocator) DeallocationInfo{CB};
return true;
}
- bool IsMalloc = isMallocLikeFn(CB, TLI);
- bool IsAlignedAllocLike = !IsMalloc && isAlignedAllocLikeFn(CB, TLI);
- bool IsCalloc =
- !IsMalloc && !IsAlignedAllocLike && isCallocLikeFn(CB, TLI);
- if (!IsMalloc && !IsAlignedAllocLike && !IsCalloc)
- return true;
- auto Kind =
- IsMalloc ? AllocationInfo::AllocationKind::MALLOC
- : (IsCalloc ? AllocationInfo::AllocationKind::CALLOC
- : AllocationInfo::AllocationKind::ALIGNED_ALLOC);
-
- AllocationInfo *AI = new (A.Allocator) AllocationInfo{CB, Kind};
- AllocationInfos[CB] = AI;
- TLI->getLibFunc(*CB, AI->LibraryFunctionId);
+ // To do heap to stack, we need to know that the allocation itself is
+ // removable once uses are rewritten, and that we can initialize the
+ // alloca to the same pattern as the original allocation result.
+ if (isAllocationFn(CB, TLI) && isAllocRemovable(CB, TLI)) {
+ auto *I8Ty = Type::getInt8Ty(CB->getParent()->getContext());
+ if (nullptr != getInitialValueOfAllocation(CB, TLI, I8Ty)) {
+ AllocationInfo *AI = new (A.Allocator) AllocationInfo{CB};
+ AllocationInfos[CB] = AI;
+ TLI->getLibFunc(*CB, AI->LibraryFunctionId);
+ }
+ }
return true;
};
@@ -5917,21 +5912,22 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
Optional<APInt> SizeAPI = getSize(A, *this, AI);
if (SizeAPI.hasValue()) {
Size = ConstantInt::get(AI.CB->getContext(), *SizeAPI);
- } else if (AI.Kind == AllocationInfo::AllocationKind::CALLOC) {
- auto *Num = AI.CB->getOperand(0);
- auto *SizeT = AI.CB->getOperand(1);
- IRBuilder<> B(AI.CB);
- Size = B.CreateMul(Num, SizeT, "h2s.calloc.size");
- } else if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC) {
- Size = AI.CB->getOperand(1);
} else {
- Size = AI.CB->getOperand(0);
+ LLVMContext &Ctx = AI.CB->getContext();
+ auto &DL = A.getInfoCache().getDL();
+ ObjectSizeOpts Opts;
+ ObjectSizeOffsetEvaluator Eval(DL, TLI, Ctx, Opts);
+ SizeOffsetEvalType SizeOffsetPair = Eval.compute(AI.CB);
+ assert(SizeOffsetPair != ObjectSizeOffsetEvaluator::unknown() &&
+ cast<ConstantInt>(SizeOffsetPair.second)->isZero());
+ Size = SizeOffsetPair.first;
}
Align Alignment(1);
- if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC) {
- Optional<APInt> AlignmentAPI =
- getAPInt(A, *this, *AI.CB->getArgOperand(0));
+ if (MaybeAlign RetAlign = AI.CB->getRetAlign())
+ Alignment = max(Alignment, RetAlign);
+ if (Value *Align = getAllocAlignment(AI.CB, TLI)) {
+ Optional<APInt> AlignmentAPI = getAPInt(A, *this, *Align);
assert(AlignmentAPI.hasValue() &&
"Expected an alignment during manifest!");
Alignment =
@@ -5947,6 +5943,11 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
Alloca = new BitCastInst(Alloca, AI.CB->getType(), "malloc_bc",
Alloca->getNextNode());
+ auto *I8Ty = Type::getInt8Ty(F->getContext());
+ auto *InitVal = getInitialValueOfAllocation(AI.CB, TLI, I8Ty);
+ assert(InitVal &&
+ "Must be able to materialize initial memory state of allocation");
+
A.changeValueAfterManifest(*AI.CB, *Alloca);
if (auto *II = dyn_cast<InvokeInst>(AI.CB)) {
@@ -5957,18 +5958,13 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
A.deleteAfterManifest(*AI.CB);
}
- // Zero out the allocated memory if it was a calloc.
- if (AI.Kind == AllocationInfo::AllocationKind::CALLOC) {
- auto *BI = new BitCastInst(Alloca, AI.CB->getType(), "calloc_bc",
- Alloca->getNextNode());
- Value *Ops[] = {
- BI, ConstantInt::get(F->getContext(), APInt(8, 0, false)), Size,
- ConstantInt::get(Type::getInt1Ty(F->getContext()), false)};
-
- Type *Tys[] = {BI->getType(), AI.CB->getOperand(0)->getType()};
- Module *M = F->getParent();
- Function *Fn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys);
- CallInst::Create(Fn, Ops, "", BI->getNextNode());
+ // Initialize the alloca with the same value as used by the allocation
+ // function. We can skip undef as the initial value of an alloc is
+ // undef, and the memset would simply end up being DSEd.
+ if (!isa<UndefValue>(InitVal)) {
+ IRBuilder<> Builder(Alloca->getNextNode());
+ // TODO: Use alignment above if align!=1
+ Builder.CreateMemSet(Alloca, InitVal, Size, None);
}
HasChanged = ChangeStatus::CHANGED;
}
@@ -5990,25 +5986,18 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
Optional<APInt> getSize(Attributor &A, const AbstractAttribute &AA,
AllocationInfo &AI) {
+ auto Mapper = [&](const Value *V) -> const Value * {
+ bool UsedAssumedInformation = false;
+ if (Optional<Constant *> SimpleV =
+ A.getAssumedConstant(*V, AA, UsedAssumedInformation))
+ if (*SimpleV)
+ return *SimpleV;
+ return V;
+ };
- if (AI.Kind == AllocationInfo::AllocationKind::MALLOC)
- return getAPInt(A, AA, *AI.CB->getArgOperand(0));
-
- if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC)
- // Only if the alignment is also constant we return a size.
- return getAPInt(A, AA, *AI.CB->getArgOperand(0)).hasValue()
- ? getAPInt(A, AA, *AI.CB->getArgOperand(1))
- : llvm::None;
-
- assert(AI.Kind == AllocationInfo::AllocationKind::CALLOC &&
- "Expected only callocs are left");
- Optional<APInt> Num = getAPInt(A, AA, *AI.CB->getArgOperand(0));
- Optional<APInt> Size = getAPInt(A, AA, *AI.CB->getArgOperand(1));
- if (!Num.hasValue() || !Size.hasValue())
- return llvm::None;
- bool Overflow = false;
- Size = Size.getValue().umul_ov(Num.getValue(), Overflow);
- return Overflow ? llvm::None : Size;
+ const Function *F = getAnchorScope();
+ const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
+ return getAllocSize(AI.CB, TLI, Mapper);
}
/// Collection of all malloc-like calls in a function with associated
@@ -6025,6 +6014,7 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) {
ChangeStatus Changed = ChangeStatus::UNCHANGED;
const Function *F = getAnchorScope();
+ const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
const auto &LivenessAA =
A.getAAFor<AAIsDead>(*this, IRPosition::function(*F), DepClassTy::NONE);
@@ -6239,22 +6229,24 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) {
if (AI.Status == AllocationInfo::INVALID)
continue;
- if (MaxHeapToStackSize == -1) {
- if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC)
- if (!getAPInt(A, *this, *AI.CB->getArgOperand(0)).hasValue()) {
- LLVM_DEBUG(dbgs() << "[H2S] Unknown allocation alignment: " << *AI.CB
- << "\n");
- AI.Status = AllocationInfo::INVALID;
- Changed = ChangeStatus::CHANGED;
- continue;
- }
- } else {
+ if (Value *Align = getAllocAlignment(AI.CB, TLI)) {
+ if (!getAPInt(A, *this, *Align)) {
+ // Can't generate an alloca which respects the required alignment
+ // on the allocation.
+ LLVM_DEBUG(dbgs() << "[H2S] Unknown allocation alignment: " << *AI.CB
+ << "\n");
+ AI.Status = AllocationInfo::INVALID;
+ Changed = ChangeStatus::CHANGED;
+ continue;
+ }
+ }
+
+ if (MaxHeapToStackSize != -1) {
Optional<APInt> Size = getSize(A, *this, AI);
if (!Size.hasValue() || Size.getValue().ugt(MaxHeapToStackSize)) {
LLVM_DEBUG({
if (!Size.hasValue())
- dbgs() << "[H2S] Unknown allocation size (or alignment): " << *AI.CB
- << "\n";
+ dbgs() << "[H2S] Unknown allocation size: " << *AI.CB << "\n";
else
dbgs() << "[H2S] Allocation size too large: " << *AI.CB << " vs. "
<< MaxHeapToStackSize << "\n";
@@ -6637,9 +6629,10 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
IRBuilder<NoFolder> IRB(IP);
const DataLayout &DL = IP->getModule()->getDataLayout();
- if (Base->getType()->getPointerElementType() != PrivType)
- Base = BitCastInst::CreateBitOrPointerCast(Base, PrivType->getPointerTo(),
- "", ACS.getInstruction());
+ Type *PrivPtrType = PrivType->getPointerTo();
+ if (Base->getType() != PrivPtrType)
+ Base = BitCastInst::CreateBitOrPointerCast(Base, PrivPtrType, "",
+ ACS.getInstruction());
// Traverse the type, build GEPs and loads.
if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
@@ -6781,7 +6774,7 @@ struct AAPrivatizablePtrFloating : public AAPrivatizablePtrImpl {
if (auto *AI = dyn_cast<AllocaInst>(Obj))
if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize()))
if (CI->isOne())
- return Obj->getType()->getPointerElementType();
+ return AI->getAllocatedType();
if (auto *Arg = dyn_cast<Argument>(Obj)) {
auto &PrivArgAA = A.getAAFor<AAPrivatizablePtr>(
*this, IRPosition::argument(*Arg), DepClassTy::REQUIRED);
@@ -7675,7 +7668,6 @@ void AAMemoryLocationImpl::categorizePtrValue(
for (Value *Obj : Objects) {
// TODO: recognize the TBAA used for constant accesses.
MemoryLocationsKind MLK = NO_LOCATIONS;
- assert(!isa<GEPOperator>(Obj) && "GEPs should have been stripped.");
if (isa<UndefValue>(Obj))
continue;
if (isa<Argument>(Obj)) {
@@ -8485,13 +8477,30 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl {
/* UseValueSimplify */ false))
return indicatePessimisticFixpoint();
- return clampStateAndIndicateChange(getState(), T);
+ // Ensure that long def-use chains can't cause circular reasoning either by
+ // introducing a cutoff below.
+ if (clampStateAndIndicateChange(getState(), T) == ChangeStatus::UNCHANGED)
+ return ChangeStatus::UNCHANGED;
+ if (++NumChanges > MaxNumChanges) {
+ LLVM_DEBUG(dbgs() << "[AAValueConstantRange] performed " << NumChanges
+ << " but only " << MaxNumChanges
+ << " are allowed to avoid cyclic reasoning.");
+ return indicatePessimisticFixpoint();
+ }
+ return ChangeStatus::CHANGED;
}
/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override {
STATS_DECLTRACK_FLOATING_ATTR(value_range)
}
+
+ /// Tracker to bail after too many widening steps of the constant range.
+ int NumChanges = 0;
+
+ /// Upper bound for the number of allowed changes (=widening steps) for the
+ /// constant range before we give up.
+ static constexpr int MaxNumChanges = 5;
};
struct AAValueConstantRangeFunction : AAValueConstantRangeImpl {
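
The heap-to-stack changes in this file drop the hard-coded malloc/calloc/aligned_alloc classification in favor of the MemoryBuiltins queries. A condensed sketch of the resulting candidate test (assuming the same TargetLibraryInfo is available):

#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Sketch only: a call qualifies for heap-to-stack when it is a known,
// removable allocation function whose initial memory contents can be
// materialized (undef for malloc-like, zero for calloc-like calls).
static bool isHeapToStackCandidate(CallBase *CB, const TargetLibraryInfo *TLI) {
  if (!isAllocationFn(CB, TLI) || !isAllocRemovable(CB, TLI))
    return false;
  Type *I8Ty = Type::getInt8Ty(CB->getContext());
  return getInitialValueOfAllocation(CB, TLI, I8Ty) != nullptr;
}
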
diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index fb9ab7954e36..2a6e38b0437f 100644
--- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -287,7 +287,8 @@ bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) {
SmallVector<unsigned, 8> UnusedArgs;
bool Changed = false;
- AttrBuilder UBImplyingAttributes = AttributeFuncs::getUBImplyingAttributes();
+ AttributeMask UBImplyingAttributes =
+ AttributeFuncs::getUBImplyingAttributes();
for (Argument &Arg : Fn.args()) {
if (!Arg.hasSwiftErrorAttr() && Arg.use_empty() &&
!Arg.hasPassPointeeByValueCopyAttr()) {
@@ -838,7 +839,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
assert(NRetTy && "No new return type found?");
// The existing function return attributes.
- AttrBuilder RAttrs(PAL.getRetAttrs());
+ AttrBuilder RAttrs(F->getContext(), PAL.getRetAttrs());
// Remove any incompatible attributes, but only if we removed all return
// values. Otherwise, ensure that we don't have any conflicting attributes
@@ -889,7 +890,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
// Adjust the call return attributes in case the function was changed to
// return void.
- AttrBuilder RAttrs(CallPAL.getRetAttrs());
+ AttrBuilder RAttrs(F->getContext(), CallPAL.getRetAttrs());
RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
@@ -912,7 +913,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
// this is not an expected case anyway
ArgAttrVec.push_back(AttributeSet::get(
F->getContext(),
- AttrBuilder(Attrs).removeAttribute(Attribute::Returned)));
+ AttrBuilder(F->getContext(), Attrs).removeAttribute(Attribute::Returned)));
} else {
// Otherwise, use the original attributes.
ArgAttrVec.push_back(Attrs);
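The hunks above (and the AttributeMask change in FunctionAttrs below) move onto the LLVM 14-era attribute API, where AttrBuilder requires an LLVMContext and attribute removal is expressed through AttributeMask. Two small illustrative helpers assuming that API, not code from the patch:

#include "llvm/IR/Attributes.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

// Rebuild an attribute set without the 'returned' attribute, using the
// context-aware AttrBuilder constructor.
static AttributeSet dropReturned(LLVMContext &Ctx, AttributeSet Attrs) {
  AttrBuilder B(Ctx, Attrs);
  B.removeAttribute(Attribute::Returned);
  return AttributeSet::get(Ctx, B);
}

// Collect the memory-access attributes that addReadAttrs clears before
// re-deriving them, as an AttributeMask rather than an AttrBuilder.
static AttributeMask memoryAccessAttrsToRemove() {
  AttributeMask M;
  M.addAttribute(Attribute::ReadOnly);
  M.addAttribute(Attribute::ReadNone);
  M.addAttribute(Attribute::WriteOnly);
  return M;
}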
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 321d4a19a585..213a998d5bba 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -133,7 +133,7 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
if (AliasAnalysis::onlyReadsMemory(MRB))
return MAK_ReadOnly;
- if (AliasAnalysis::doesNotReadMemory(MRB))
+ if (AliasAnalysis::onlyWritesMemory(MRB))
return MAK_WriteOnly;
// Conservatively assume it reads and writes to memory.
@@ -295,13 +295,13 @@ static void addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter,
// No change.
continue;
- if (F->doesNotReadMemory() && WritesMemory)
+ if (F->onlyWritesMemory() && WritesMemory)
continue;
Changed.insert(F);
// Clear out any existing attributes.
- AttrBuilder AttrsToRemove;
+ AttributeMask AttrsToRemove;
AttrsToRemove.addAttribute(Attribute::ReadOnly);
AttrsToRemove.addAttribute(Attribute::ReadNone);
AttrsToRemove.addAttribute(Attribute::WriteOnly);
@@ -720,10 +720,16 @@ determinePointerAccessAttrs(Argument *A,
// The accessors used on call site here do the right thing for calls and
// invokes with operand bundles.
- if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex))
- return Attribute::None;
- if (!CB.doesNotAccessMemory(UseIndex))
+ if (CB.doesNotAccessMemory(UseIndex)) {
+ /* nop */
+ } else if (CB.onlyReadsMemory() || CB.onlyReadsMemory(UseIndex)) {
IsRead = true;
+ } else if (CB.hasFnAttr(Attribute::WriteOnly) ||
+ CB.dataOperandHasImpliedAttr(UseIndex, Attribute::WriteOnly)) {
+ IsWrite = true;
+ } else {
+ return Attribute::None;
+ }
break;
}
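The reordered checks above classify how a call site may access the pointer argument: not at all, read-only, write-only, or unknown, giving up only in the last case. A toy model of that cascade, with hypothetical flags standing in for the real CallBase queries:

// Purely illustrative: check the most precise fact first and only bail out
// when nothing is known about the operand's memory access.
enum class ArgAccess { None, Read, Write, Unknown };

struct CallFacts {
  bool DoesNotAccess = false; // CB.doesNotAccessMemory(UseIndex)
  bool OnlyReads = false;     // CB.onlyReadsMemory() / (UseIndex)
  bool OnlyWrites = false;    // writeonly on the call or the operand
};

ArgAccess classify(const CallFacts &F) {
  if (F.DoesNotAccess)
    return ArgAccess::None;   // nothing to record
  if (F.OnlyReads)
    return ArgAccess::Read;   // IsRead = true in the real code
  if (F.OnlyWrites)
    return ArgAccess::Write;  // IsWrite = true; the case added by this hunk
  return ArgAccess::Unknown;  // corresponds to returning Attribute::None
}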
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 2425646455bd..6c3cc3914337 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -6,15 +6,24 @@
//
//===----------------------------------------------------------------------===//
//
-// This specialises functions with constant parameters (e.g. functions,
-// globals). Constant parameters like function pointers and constant globals
-// are propagated to the callee by specializing the function.
+// This specialises functions with constant parameters. Constant parameters
+// like function pointers and constant globals are propagated to the callee by
+// specializing the function. The main benefit of this pass at the moment is
+// that indirect calls are transformed into direct calls, which provides inline
+// opportunities that the inliner would not have been able to achieve. That's
+// why function specialisation is run before the inliner in the optimisation
+// pipeline; that is by design. Otherwise, we would only benefit from constant
+// passing, which is a valid use-case too, but hasn't been explored much in
+// terms of performance uplifts, cost-model and compile-time impact.
//
// Current limitations:
-// - It does not yet handle integer ranges.
+// - It does not yet handle integer ranges. We do support "literal constants",
+// but that's off by default under an option.
// - Only 1 argument per function is specialised,
-// - The cost-model could be further looked into,
-// - We are not yet caching analysis results.
+// - The cost-model could be refined further (it mainly focuses on inlining
+// benefits),
+// - We are not yet caching analysis results, but profiling and checking where
+// extra compile time is spent didn't suggest this to be a problem.
//
// Ideas:
// - With a function specialization attribute for arguments, we could have
@@ -30,8 +39,12 @@
// https://reviews.llvm.org/D106426 for details. Perhaps there is a
// compile-time friendlier way to control/limit the number of specialisations
// for recursive functions.
-// - Don't transform the function if there is no function specialization
-// happens.
+// - Don't transform the function if function specialization does not trigger;
+// the SCCPSolver may make IR changes.
+//
+// References:
+// - 2021 LLVM Dev Mtg “Introducing function specialisation, and can we enable
+// it by default?”, https://www.youtube.com/watch?v=zJiCjeXgV5Q
//
//===----------------------------------------------------------------------===//
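As a hedged illustration of the header comment above, this is the kind of C++ source the pass targets; the notion of a specialised clone is sketched in the comments and its name is made up for the example:

// 'compute' is always called with the constant function pointer 'square', so
// a specialised clone can turn the indirect call into a direct one, which the
// inliner can then pick up.
static int square(int X) { return X * X; }

static int compute(int (*Op)(int), int N) {
  int Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += Op(I);            // indirect call through the parameter
  return Sum;
}

int user(int N) {
  // After specialisation this effectively calls a clone of 'compute' in which
  // Op is known to be 'square', making the call above direct (and inlinable).
  return compute(square, N);
}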
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index b1f3ff15c97b..d3cac3efce86 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -303,11 +303,11 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
else if (auto *GEP = dyn_cast<GEPOperator>(U))
append_range(WorkList, GEP->users());
else if (auto *LI = dyn_cast<LoadInst>(U)) {
- // A load from zeroinitializer is always zeroinitializer, regardless of
- // any applied offset.
+ // A load from a uniform value is always the same, regardless of any
+ // applied offset.
Type *Ty = LI->getType();
- if (Init->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy()) {
- LI->replaceAllUsesWith(Constant::getNullValue(Ty));
+ if (Constant *Res = ConstantFoldLoadFromUniformValue(Init, Ty)) {
+ LI->replaceAllUsesWith(Res);
EraseFromParent(LI);
continue;
}
@@ -337,107 +337,68 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
return Changed;
}
-static bool isSafeSROAElementUse(Value *V);
-
-/// Return true if the specified GEP is a safe user of a derived
-/// expression from a global that we want to SROA.
-static bool isSafeSROAGEP(User *U) {
- // Check to see if this ConstantExpr GEP is SRA'able. In particular, we
- // don't like < 3 operand CE's, and we don't like non-constant integer
- // indices. This enforces that all uses are 'gep GV, 0, C, ...' for some
- // value of C.
- if (U->getNumOperands() < 3 || !isa<Constant>(U->getOperand(1)) ||
- !cast<Constant>(U->getOperand(1))->isNullValue())
- return false;
-
- gep_type_iterator GEPI = gep_type_begin(U), E = gep_type_end(U);
- ++GEPI; // Skip over the pointer index.
-
- // For all other level we require that the indices are constant and inrange.
- // In particular, consider: A[0][i]. We cannot know that the user isn't doing
- // invalid things like allowing i to index an out-of-range subscript that
- // accesses A[1]. This can also happen between different members of a struct
- // in llvm IR.
- for (; GEPI != E; ++GEPI) {
- if (GEPI.isStruct())
+/// Look at all uses of the global and determine which (offset, type) pairs it
+/// can be split into.
+static bool collectSRATypes(DenseMap<uint64_t, Type *> &Types, GlobalValue *GV,
+ const DataLayout &DL) {
+ SmallVector<Use *, 16> Worklist;
+ SmallPtrSet<Use *, 16> Visited;
+ auto AppendUses = [&](Value *V) {
+ for (Use &U : V->uses())
+ if (Visited.insert(&U).second)
+ Worklist.push_back(&U);
+ };
+ AppendUses(GV);
+ while (!Worklist.empty()) {
+ Use *U = Worklist.pop_back_val();
+ User *V = U->getUser();
+ if (isa<BitCastOperator>(V) || isa<AddrSpaceCastOperator>(V)) {
+ AppendUses(V);
continue;
+ }
- ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPI.getOperand());
- if (!IdxVal || (GEPI.isBoundedSequential() &&
- IdxVal->getZExtValue() >= GEPI.getSequentialNumElements()))
- return false;
- }
-
- return llvm::all_of(U->users(), isSafeSROAElementUse);
-}
-
-/// Return true if the specified instruction is a safe user of a derived
-/// expression from a global that we want to SROA.
-static bool isSafeSROAElementUse(Value *V) {
- // We might have a dead and dangling constant hanging off of here.
- if (Constant *C = dyn_cast<Constant>(V))
- return isSafeToDestroyConstant(C);
-
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
+ if (auto *GEP = dyn_cast<GEPOperator>(V)) {
+ if (!GEP->hasAllConstantIndices())
+ return false;
+ AppendUses(V);
+ continue;
+ }
- // Loads are ok.
- if (isa<LoadInst>(I)) return true;
+ if (Value *Ptr = getLoadStorePointerOperand(V)) {
+ // This is storing the global address into somewhere, not storing into
+ // the global.
+ if (isa<StoreInst>(V) && U->getOperandNo() == 0)
+ return false;
- // Stores *to* the pointer are ok.
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->getOperand(0) != V;
+ APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+ Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset,
+ /* AllowNonInbounds */ true);
+ if (Ptr != GV || Offset.getActiveBits() >= 64)
+ return false;
- // Otherwise, it must be a GEP. Check it and its users are safe to SRA.
- return isa<GetElementPtrInst>(I) && isSafeSROAGEP(I);
-}
+ // TODO: We currently require that all accesses at a given offset use the
+ // same type. This could be relaxed.
+ Type *Ty = getLoadStoreType(V);
+ auto It = Types.try_emplace(Offset.getZExtValue(), Ty).first;
+ if (Ty != It->second)
+ return false;
+ continue;
+ }
-/// Look at all uses of the global and decide whether it is safe for us to
-/// perform this transformation.
-static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
- for (User *U : GV->users()) {
- // The user of the global must be a GEP Inst or a ConstantExpr GEP.
- if (!isa<GetElementPtrInst>(U) &&
- (!isa<ConstantExpr>(U) ||
- cast<ConstantExpr>(U)->getOpcode() != Instruction::GetElementPtr))
- return false;
+ // Ignore dead constant users.
+ if (auto *C = dyn_cast<Constant>(V)) {
+ if (!isSafeToDestroyConstant(C))
+ return false;
+ continue;
+ }
- // Check the gep and it's users are safe to SRA
- if (!isSafeSROAGEP(U))
- return false;
+ // Unknown user.
+ return false;
}
return true;
}
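collectSRATypes above records at most one access type per constant offset and abandons SRA on any conflict. A plain C++ model of that bookkeeping, with access types reduced to strings purely for illustration:

#include <cstdint>
#include <map>
#include <string>
#include <vector>

struct Access {
  uint64_t Offset;
  std::string Type; // stands in for llvm::Type*
};

// Returns false as soon as two accesses at the same offset disagree on the
// type, mirroring the 'Ty != It->second' bail-out in the hunk above.
bool collectTypes(const std::vector<Access> &Accesses,
                  std::map<uint64_t, std::string> &Types) {
  for (const Access &A : Accesses) {
    auto It = Types.try_emplace(A.Offset, A.Type).first;
    if (It->second != A.Type)
      return false;
  }
  return true;
}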
-static bool IsSRASequential(Type *T) {
- return isa<ArrayType>(T) || isa<VectorType>(T);
-}
-static uint64_t GetSRASequentialNumElements(Type *T) {
- if (ArrayType *AT = dyn_cast<ArrayType>(T))
- return AT->getNumElements();
- return cast<FixedVectorType>(T)->getNumElements();
-}
-static Type *GetSRASequentialElementType(Type *T) {
- if (ArrayType *AT = dyn_cast<ArrayType>(T))
- return AT->getElementType();
- return cast<VectorType>(T)->getElementType();
-}
-static bool CanDoGlobalSRA(GlobalVariable *GV) {
- Constant *Init = GV->getInitializer();
-
- if (isa<StructType>(Init->getType())) {
- // nothing to check
- } else if (IsSRASequential(Init->getType())) {
- if (GetSRASequentialNumElements(Init->getType()) > 16 &&
- GV->hasNUsesOrMore(16))
- return false; // It's not worth it.
- } else
- return false;
-
- return GlobalUsersSafeToSRA(GV);
-}
-
/// Copy over the debug info for a variable to its SRA replacements.
static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV,
uint64_t FragmentOffsetInBits,
@@ -468,161 +429,140 @@ static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV,
/// transformation is safe already. We return the first global variable we
/// insert so that the caller can reprocess it.
static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
- // Make sure this global only has simple uses that we can SRA.
- if (!CanDoGlobalSRA(GV))
+ assert(GV->hasLocalLinkage());
+
+ // Collect types to split into.
+ DenseMap<uint64_t, Type *> Types;
+ if (!collectSRATypes(Types, GV, DL) || Types.empty())
return nullptr;
- assert(GV->hasLocalLinkage());
- Constant *Init = GV->getInitializer();
- Type *Ty = Init->getType();
- uint64_t VarSize = DL.getTypeSizeInBits(Ty);
+ // Make sure we don't SRA back to the same type.
+ if (Types.size() == 1 && Types.begin()->second == GV->getValueType())
+ return nullptr;
- std::map<unsigned, GlobalVariable *> NewGlobals;
+ // Don't perform SRA if we would have to split into many globals.
+ if (Types.size() > 16)
+ return nullptr;
- // Get the alignment of the global, either explicit or target-specific.
- Align StartAlignment =
- DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getType());
-
- // Loop over all users and create replacement variables for used aggregate
- // elements.
- for (User *GEP : GV->users()) {
- assert(((isa<ConstantExpr>(GEP) && cast<ConstantExpr>(GEP)->getOpcode() ==
- Instruction::GetElementPtr) ||
- isa<GetElementPtrInst>(GEP)) &&
- "NonGEP CE's are not SRAable!");
-
- // Ignore the 1th operand, which has to be zero or else the program is quite
- // broken (undefined). Get the 2nd operand, which is the structure or array
- // index.
- unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
- if (NewGlobals.count(ElementIdx) == 1)
- continue; // we`ve already created replacement variable
- assert(NewGlobals.count(ElementIdx) == 0);
-
- Type *ElTy = nullptr;
- if (StructType *STy = dyn_cast<StructType>(Ty))
- ElTy = STy->getElementType(ElementIdx);
- else
- ElTy = GetSRASequentialElementType(Ty);
- assert(ElTy);
+ // Sort by offset.
+ SmallVector<std::pair<uint64_t, Type *>, 16> TypesVector;
+ append_range(TypesVector, Types);
+ sort(TypesVector,
+ [](const auto &A, const auto &B) { return A.first < B.first; });
- Constant *In = Init->getAggregateElement(ElementIdx);
- assert(In && "Couldn't get element of initializer?");
+ // Check that the types are non-overlapping.
+ uint64_t Offset = 0;
+ for (const auto &Pair : TypesVector) {
+ // Overlaps with previous type.
+ if (Pair.first < Offset)
+ return nullptr;
- GlobalVariable *NGV = new GlobalVariable(
- ElTy, false, GlobalVariable::InternalLinkage, In,
- GV->getName() + "." + Twine(ElementIdx), GV->getThreadLocalMode(),
- GV->getType()->getAddressSpace());
- NGV->setExternallyInitialized(GV->isExternallyInitialized());
- NGV->copyAttributesFrom(GV);
- NewGlobals.insert(std::make_pair(ElementIdx, NGV));
-
- if (StructType *STy = dyn_cast<StructType>(Ty)) {
- const StructLayout &Layout = *DL.getStructLayout(STy);
-
- // Calculate the known alignment of the field. If the original aggregate
- // had 256 byte alignment for example, something might depend on that:
- // propagate info to each field.
- uint64_t FieldOffset = Layout.getElementOffset(ElementIdx);
- Align NewAlign = commonAlignment(StartAlignment, FieldOffset);
- if (NewAlign > DL.getABITypeAlign(STy->getElementType(ElementIdx)))
- NGV->setAlignment(NewAlign);
-
- // Copy over the debug info for the variable.
- uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType());
- uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(ElementIdx);
- transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, VarSize);
- } else {
- uint64_t EltSize = DL.getTypeAllocSize(ElTy);
- Align EltAlign = DL.getABITypeAlign(ElTy);
- uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy);
-
- // Calculate the known alignment of the field. If the original aggregate
- // had 256 byte alignment for example, something might depend on that:
- // propagate info to each field.
- Align NewAlign = commonAlignment(StartAlignment, EltSize * ElementIdx);
- if (NewAlign > EltAlign)
- NGV->setAlignment(NewAlign);
- transferSRADebugInfo(GV, NGV, FragmentSizeInBits * ElementIdx,
- FragmentSizeInBits, VarSize);
- }
+ Offset = Pair.first + DL.getTypeAllocSize(Pair.second);
}
- if (NewGlobals.empty())
+ // If some accesses go beyond the end of the global, don't bother.
+ if (Offset > DL.getTypeAllocSize(GV->getValueType()))
return nullptr;
- Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
- for (auto NewGlobalVar : NewGlobals)
- Globals.push_back(NewGlobalVar.second);
+ // Collect initializers for new globals.
+ Constant *OrigInit = GV->getInitializer();
+ DenseMap<uint64_t, Constant *> Initializers;
+ for (const auto &Pair : Types) {
+ Constant *NewInit = ConstantFoldLoadFromConst(OrigInit, Pair.second,
+ APInt(64, Pair.first), DL);
+ if (!NewInit) {
+ LLVM_DEBUG(dbgs() << "Global SRA: Failed to evaluate initializer of "
+ << *GV << " with type " << *Pair.second << " at offset "
+ << Pair.first << "\n");
+ return nullptr;
+ }
+ Initializers.insert({Pair.first, NewInit});
+ }
LLVM_DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n");
- Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext()));
+ // Get the alignment of the global, either explicit or target-specific.
+ Align StartAlignment =
+ DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
+ uint64_t VarSize = DL.getTypeSizeInBits(GV->getValueType());
+
+ // Create replacement globals.
+ DenseMap<uint64_t, GlobalVariable *> NewGlobals;
+ unsigned NameSuffix = 0;
+ for (auto &Pair : TypesVector) {
+ uint64_t Offset = Pair.first;
+ Type *Ty = Pair.second;
+ GlobalVariable *NGV = new GlobalVariable(
+ *GV->getParent(), Ty, false, GlobalVariable::InternalLinkage,
+ Initializers[Offset], GV->getName() + "." + Twine(NameSuffix++), GV,
+ GV->getThreadLocalMode(), GV->getAddressSpace());
+ NGV->copyAttributesFrom(GV);
+ NewGlobals.insert({Offset, NGV});
+
+ // Calculate the known alignment of the field. If the original aggregate
+ // had 256 byte alignment for example, something might depend on that:
+ // propagate info to each field.
+ Align NewAlign = commonAlignment(StartAlignment, Offset);
+ if (NewAlign > DL.getABITypeAlign(Ty))
+ NGV->setAlignment(NewAlign);
+
+ // Copy over the debug info for the variable.
+ transferSRADebugInfo(GV, NGV, Offset * 8, DL.getTypeAllocSizeInBits(Ty),
+ VarSize);
+ }
+
+ // Replace uses of the original global with uses of the new global.
+ SmallVector<Value *, 16> Worklist;
+ SmallPtrSet<Value *, 16> Visited;
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ auto AppendUsers = [&](Value *V) {
+ for (User *U : V->users())
+ if (Visited.insert(U).second)
+ Worklist.push_back(U);
+ };
+ AppendUsers(GV);
+ while (!Worklist.empty()) {
+ Value *V = Worklist.pop_back_val();
+ if (isa<BitCastOperator>(V) || isa<AddrSpaceCastOperator>(V) ||
+ isa<GEPOperator>(V)) {
+ AppendUsers(V);
+ if (isa<Instruction>(V))
+ DeadInsts.push_back(V);
+ continue;
+ }
- // Loop over all of the uses of the global, replacing the constantexpr geps,
- // with smaller constantexpr geps or direct references.
- while (!GV->use_empty()) {
- User *GEP = GV->user_back();
- assert(((isa<ConstantExpr>(GEP) &&
- cast<ConstantExpr>(GEP)->getOpcode()==Instruction::GetElementPtr)||
- isa<GetElementPtrInst>(GEP)) && "NonGEP CE's are not SRAable!");
-
- // Ignore the 1th operand, which has to be zero or else the program is quite
- // broken (undefined). Get the 2nd operand, which is the structure or array
- // index.
- unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
- assert(NewGlobals.count(ElementIdx) == 1);
-
- Value *NewPtr = NewGlobals[ElementIdx];
- Type *NewTy = NewGlobals[ElementIdx]->getValueType();
-
- // Form a shorter GEP if needed.
- if (GEP->getNumOperands() > 3) {
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GEP)) {
- SmallVector<Constant*, 8> Idxs;
- Idxs.push_back(NullInt);
- for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i)
- Idxs.push_back(CE->getOperand(i));
- NewPtr =
- ConstantExpr::getGetElementPtr(NewTy, cast<Constant>(NewPtr), Idxs);
+ if (Value *Ptr = getLoadStorePointerOperand(V)) {
+ APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+ Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset,
+ /* AllowNonInbounds */ true);
+ assert(Ptr == GV && "Load/store must be from/to global");
+ GlobalVariable *NGV = NewGlobals[Offset.getZExtValue()];
+ assert(NGV && "Must have replacement global for this offset");
+
+ // Update the pointer operand and recalculate alignment.
+ Align PrefAlign = DL.getPrefTypeAlign(getLoadStoreType(V));
+ Align NewAlign =
+ getOrEnforceKnownAlignment(NGV, PrefAlign, DL, cast<Instruction>(V));
+
+ if (auto *LI = dyn_cast<LoadInst>(V)) {
+ LI->setOperand(0, NGV);
+ LI->setAlignment(NewAlign);
} else {
- GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP);
- SmallVector<Value*, 8> Idxs;
- Idxs.push_back(NullInt);
- for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i)
- Idxs.push_back(GEPI->getOperand(i));
- NewPtr = GetElementPtrInst::Create(
- NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(ElementIdx),
- GEPI);
- }
- }
- GEP->replaceAllUsesWith(NewPtr);
-
- // We changed the pointer of any memory access user. Recalculate alignments.
- for (User *U : NewPtr->users()) {
- if (auto *Load = dyn_cast<LoadInst>(U)) {
- Align PrefAlign = DL.getPrefTypeAlign(Load->getType());
- Align NewAlign = getOrEnforceKnownAlignment(Load->getPointerOperand(),
- PrefAlign, DL, Load);
- Load->setAlignment(NewAlign);
- }
- if (auto *Store = dyn_cast<StoreInst>(U)) {
- Align PrefAlign =
- DL.getPrefTypeAlign(Store->getValueOperand()->getType());
- Align NewAlign = getOrEnforceKnownAlignment(Store->getPointerOperand(),
- PrefAlign, DL, Store);
- Store->setAlignment(NewAlign);
+ auto *SI = cast<StoreInst>(V);
+ SI->setOperand(1, NGV);
+ SI->setAlignment(NewAlign);
}
+ continue;
}
- if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(GEP))
- GEPI->eraseFromParent();
- else
- cast<ConstantExpr>(GEP)->destroyConstant();
+ assert(isa<Constant>(V) && isSafeToDestroyConstant(cast<Constant>(V)) &&
+ "Other users can only be dead constants");
}
- // Delete the old global, now that it is dead.
- Globals.erase(GV);
+ // Delete old instructions and global.
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
+ GV->removeDeadConstantUsers();
+ GV->eraseFromParent();
++NumSRA;
assert(NewGlobals.size() > 0);
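To make the new SRA path concrete, a rough source-level before/after (illustrative only; the real transform works on IR and names the replacements GV.0, GV.1, and so on):

namespace before {
// A local-linkage aggregate whose fields are only accessed at fixed offsets.
static struct { int Counter; float Scale; } State = {0, 1.0f};
int bump() { return ++State.Counter; }            // access at offset 0
float scale(float X) { return X * State.Scale; }  // access at offset 4
} // namespace before

namespace after {
// One independent global per (offset, type) pair collected above.
static int StateCounter = 0;
static float StateScale = 1.0f;
int bump() { return ++StateCounter; }
float scale(float X) { return X * StateScale; }
} // namespace after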
@@ -677,7 +617,7 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V,
"Should be GlobalVariable");
// This and only this kind of non-signed ICmpInst is to be replaced with
// the comparing of the value of the created global init bool later in
- // optimizeGlobalAddressOfMalloc for the global variable.
+ // optimizeGlobalAddressOfAllocation for the global variable.
} else {
//cerr << "NONTRAPPING USE: " << *U;
return false;
@@ -895,29 +835,36 @@ static void ConstantPropUsersOf(Value *V, const DataLayout &DL,
/// to actually DO the malloc. Instead, turn the malloc into a global, and any
/// loads of GV as uses of the new global.
static GlobalVariable *
-OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
- ConstantInt *NElements, const DataLayout &DL,
- TargetLibraryInfo *TLI) {
+OptimizeGlobalAddressOfAllocation(GlobalVariable *GV, CallInst *CI,
+ uint64_t AllocSize, Constant *InitVal,
+ const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
LLVM_DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI
<< '\n');
- Type *GlobalType;
- if (NElements->getZExtValue() == 1)
- GlobalType = AllocTy;
- else
- // If we have an array allocation, the global variable is of an array.
- GlobalType = ArrayType::get(AllocTy, NElements->getZExtValue());
+ // Create global of type [AllocSize x i8].
+ Type *GlobalType = ArrayType::get(Type::getInt8Ty(GV->getContext()),
+ AllocSize);
- // Create the new global variable. The contents of the malloc'd memory is
- // undefined, so initialize with an undef value.
+ // Create the new global variable. The contents of the allocated memory is
+ // undefined initially, so initialize with an undef value.
GlobalVariable *NewGV = new GlobalVariable(
*GV->getParent(), GlobalType, false, GlobalValue::InternalLinkage,
UndefValue::get(GlobalType), GV->getName() + ".body", nullptr,
GV->getThreadLocalMode());
- // If there are bitcast users of the malloc (which is typical, usually we have
- // a malloc + bitcast) then replace them with uses of the new global. Update
- // other users to use the global as well.
+ // Initialize the global at the point of the original call. Note that this
+ // is a different point from the initialization referred to below for the
+ // nullability handling. Subtlety: we have not proven the original global was
+ // only initialized once. As such, we cannot fold this into the initializer
+ // of the new global, as we may need to re-initialize the storage multiple
+ // times.
+ if (!isa<UndefValue>(InitVal)) {
+ IRBuilder<> Builder(CI->getNextNode());
+ // TODO: Use alignment above if align!=1
+ Builder.CreateMemSet(NewGV, InitVal, AllocSize, None);
+ }
+
+ // Update users of the allocation to use the new global instead.
BitCastInst *TheBC = nullptr;
while (!CI->use_empty()) {
Instruction *User = cast<Instruction>(CI->user_back());
@@ -1009,7 +956,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
} else
GV->getParent()->getGlobalList().insert(GV->getIterator(), InitBool);
- // Now the GV is dead, nuke it and the malloc..
+ // Now the GV is dead, nuke it and the allocation.
GV->eraseFromParent();
CI->eraseFromParent();
@@ -1066,15 +1013,33 @@ valueIsOnlyUsedLocallyOrStoredToOneGlobal(const CallInst *CI,
return true;
}
-/// This function is called when we see a pointer global variable with a single
-/// value stored it that is a malloc or cast of malloc.
-static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
- Type *AllocTy,
- AtomicOrdering Ordering,
- const DataLayout &DL,
- TargetLibraryInfo *TLI) {
- // If this is a malloc of an abstract type, don't touch it.
- if (!AllocTy->isSized())
+/// If we have a global that is only initialized with a fixed size allocation
+/// try to transform the program to use global memory instead of heap
+/// allocated memory. This eliminates dynamic allocation, avoids an indirection
+/// accessing the data, and exposes the resultant global to further GlobalOpt.
+static bool tryToOptimizeStoreOfAllocationToGlobal(GlobalVariable *GV,
+ CallInst *CI,
+ AtomicOrdering Ordering,
+ const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ if (!isAllocRemovable(CI, TLI))
+ // Must be able to remove the call when we are done.
+ return false;
+
+ Type *Int8Ty = Type::getInt8Ty(CI->getFunction()->getContext());
+ Constant *InitVal = getInitialValueOfAllocation(CI, TLI, Int8Ty);
+ if (!InitVal)
+ // Must be able to emit a memset for initialization
+ return false;
+
+ uint64_t AllocSize;
+ if (!getObjectSize(CI, AllocSize, DL, TLI, ObjectSizeOpts()))
+ return false;
+
+ // Restrict this transformation to only working on small allocations
+ // (2048 bytes currently), as we don't want to introduce a 16M global or
+ // something.
+ if (AllocSize >= 2048)
return false;
// We can't optimize this global unless all uses of it are *known* to be
@@ -1093,25 +1058,8 @@ static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
if (!valueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV))
return false;
- // If we have a global that is only initialized with a fixed size malloc,
- // transform the program to use global memory instead of malloc'd memory.
- // This eliminates dynamic allocation, avoids an indirection accessing the
- // data, and exposes the resultant global to further GlobalOpt.
- // We cannot optimize the malloc if we cannot determine malloc array size.
- Value *NElems = getMallocArraySize(CI, DL, TLI, true);
- if (!NElems)
- return false;
-
- if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems))
- // Restrict this transformation to only working on small allocations
- // (2048 bytes currently), as we don't want to introduce a 16M global or
- // something.
- if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) {
- OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI);
- return true;
- }
-
- return false;
+ OptimizeGlobalAddressOfAllocation(GV, CI, AllocSize, InitVal, DL, TLI);
+ return true;
}
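A rough source-level analogue of the allocation-to-global rewrite above (illustrative only; the real transform runs on IR, is limited to allocations under 2048 bytes, and only materialises an init flag when nullability checks need it):

#include <cstdlib>
#include <cstring>

namespace before {
static int *Buf = nullptr;
void init() { Buf = static_cast<int *>(std::calloc(16, sizeof(int))); }
} // namespace before

namespace after {
static int BufBody[16];        // plays the role of the @Buf.body global
static bool BufInit = false;   // init flag used for null-pointer checks
void init() {
  std::memset(BufBody, 0, sizeof(BufBody)); // memset emitted from InitVal
  BufInit = true;
}
} // namespace after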
// Try to optimize globals based on the knowledge that only one value (besides
@@ -1140,12 +1088,12 @@ optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
// Optimize away any trapping uses of the loaded value.
if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, GetTLI))
return true;
- } else if (CallInst *CI = extractMallocCall(StoredOnceVal, GetTLI)) {
- auto *TLI = &GetTLI(*CI->getFunction());
- Type *MallocType = getMallocAllocatedType(CI, TLI);
- if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType,
- Ordering, DL, TLI))
- return true;
+ } else if (isAllocationFn(StoredOnceVal, GetTLI)) {
+ if (auto *CI = dyn_cast<CallInst>(StoredOnceVal)) {
+ auto *TLI = &GetTLI(*CI->getFunction());
+ if (tryToOptimizeStoreOfAllocationToGlobal(GV, CI, Ordering, DL, TLI))
+ return true;
+ }
}
}
@@ -1171,9 +1119,12 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
// Walk the use list of the global seeing if all the uses are load or store.
// If there is anything else, bail out.
- for (User *U : GV->users())
+ for (User *U : GV->users()) {
if (!isa<LoadInst>(U) && !isa<StoreInst>(U))
return false;
+ if (getLoadStoreType(U) != GVElType)
+ return false;
+ }
LLVM_DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n");
@@ -1590,11 +1541,25 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
// This is restricted to address spaces that allow globals to have
// initializers. NVPTX, for example, does not support initializers for
// shared memory (AS 3).
- if (SOVConstant && SOVConstant->getType() == GV->getValueType() &&
- isa<UndefValue>(GV->getInitializer()) &&
+ if (SOVConstant && isa<UndefValue>(GV->getInitializer()) &&
+ DL.getTypeAllocSize(SOVConstant->getType()) ==
+ DL.getTypeAllocSize(GV->getValueType()) &&
CanHaveNonUndefGlobalInitializer) {
- // Change the initial value here.
- GV->setInitializer(SOVConstant);
+ if (SOVConstant->getType() == GV->getValueType()) {
+ // Change the initializer in place.
+ GV->setInitializer(SOVConstant);
+ } else {
+ // Create a new global with adjusted type.
+ auto *NGV = new GlobalVariable(
+ *GV->getParent(), SOVConstant->getType(), GV->isConstant(),
+ GV->getLinkage(), SOVConstant, "", GV, GV->getThreadLocalMode(),
+ GV->getAddressSpace());
+ NGV->takeName(GV);
+ NGV->copyAttributesFrom(GV);
+ GV->replaceAllUsesWith(ConstantExpr::getBitCast(NGV, GV->getType()));
+ GV->eraseFromParent();
+ GV = NGV;
+ }
// Clean up any obviously simplifiable users now.
CleanupConstantGlobalUsers(GV, DL);
@@ -2066,194 +2031,6 @@ OptimizeGlobalVars(Module &M,
return Changed;
}
-/// Evaluate a piece of a constantexpr store into a global initializer. This
-/// returns 'Init' modified to reflect 'Val' stored into it. At this point, the
-/// GEP operands of Addr [0, OpNo) have been stepped into.
-static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
- ConstantExpr *Addr, unsigned OpNo) {
- // Base case of the recursion.
- if (OpNo == Addr->getNumOperands()) {
- assert(Val->getType() == Init->getType() && "Type mismatch!");
- return Val;
- }
-
- SmallVector<Constant*, 32> Elts;
- if (StructType *STy = dyn_cast<StructType>(Init->getType())) {
- // Break up the constant into its elements.
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- Elts.push_back(Init->getAggregateElement(i));
-
- // Replace the element that we are supposed to.
- ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo));
- unsigned Idx = CU->getZExtValue();
- assert(Idx < STy->getNumElements() && "Struct index out of range!");
- Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1);
-
- // Return the modified struct.
- return ConstantStruct::get(STy, Elts);
- }
-
- ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
- uint64_t NumElts;
- if (ArrayType *ATy = dyn_cast<ArrayType>(Init->getType()))
- NumElts = ATy->getNumElements();
- else
- NumElts = cast<FixedVectorType>(Init->getType())->getNumElements();
-
- // Break up the array into elements.
- for (uint64_t i = 0, e = NumElts; i != e; ++i)
- Elts.push_back(Init->getAggregateElement(i));
-
- assert(CI->getZExtValue() < NumElts);
- Elts[CI->getZExtValue()] =
- EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1);
-
- if (Init->getType()->isArrayTy())
- return ConstantArray::get(cast<ArrayType>(Init->getType()), Elts);
- return ConstantVector::get(Elts);
-}
-
-/// We have decided that Addr (which satisfies the predicate
-/// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen.
-static void CommitValueTo(Constant *Val, Constant *Addr) {
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
- assert(GV->hasInitializer());
- GV->setInitializer(Val);
- return;
- }
-
- ConstantExpr *CE = cast<ConstantExpr>(Addr);
- GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
- GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2));
-}
-
-/// Given a map of address -> value, where addresses are expected to be some form
-/// of either a global or a constant GEP, set the initializer for the address to
-/// be the value. This performs mostly the same function as CommitValueTo()
-/// and EvaluateStoreInto() but is optimized to be more efficient for the common
-/// case where the set of addresses are GEPs sharing the same underlying global,
-/// processing the GEPs in batches rather than individually.
-///
-/// To give an example, consider the following C++ code adapted from the clang
-/// regression tests:
-/// struct S {
-/// int n = 10;
-/// int m = 2 * n;
-/// S(int a) : n(a) {}
-/// };
-///
-/// template<typename T>
-/// struct U {
-/// T *r = &q;
-/// T q = 42;
-/// U *p = this;
-/// };
-///
-/// U<S> e;
-///
-/// The global static constructor for 'e' will need to initialize 'r' and 'p' of
-/// the outer struct, while also initializing the inner 'q' structs 'n' and 'm'
-/// members. This batch algorithm will simply use general CommitValueTo() method
-/// to handle the complex nested S struct initialization of 'q', before
-/// processing the outermost members in a single batch. Using CommitValueTo() to
-/// handle member in the outer struct is inefficient when the struct/array is
-/// very large as we end up creating and destroy constant arrays for each
-/// initialization.
-/// For the above case, we expect the following IR to be generated:
-///
-/// %struct.U = type { %struct.S*, %struct.S, %struct.U* }
-/// %struct.S = type { i32, i32 }
-/// @e = global %struct.U { %struct.S* gep inbounds (%struct.U, %struct.U* @e,
-/// i64 0, i32 1),
-/// %struct.S { i32 42, i32 84 }, %struct.U* @e }
-/// The %struct.S { i32 42, i32 84 } inner initializer is treated as a complex
-/// constant expression, while the other two elements of @e are "simple".
-static void BatchCommitValueTo(const DenseMap<Constant*, Constant*> &Mem) {
- SmallVector<std::pair<GlobalVariable*, Constant*>, 32> GVs;
- SmallVector<std::pair<ConstantExpr*, Constant*>, 32> ComplexCEs;
- SmallVector<std::pair<ConstantExpr*, Constant*>, 32> SimpleCEs;
- SimpleCEs.reserve(Mem.size());
-
- for (const auto &I : Mem) {
- if (auto *GV = dyn_cast<GlobalVariable>(I.first)) {
- GVs.push_back(std::make_pair(GV, I.second));
- } else {
- ConstantExpr *GEP = cast<ConstantExpr>(I.first);
- // We don't handle the deeply recursive case using the batch method.
- if (GEP->getNumOperands() > 3)
- ComplexCEs.push_back(std::make_pair(GEP, I.second));
- else
- SimpleCEs.push_back(std::make_pair(GEP, I.second));
- }
- }
-
- // The algorithm below doesn't handle cases like nested structs, so use the
- // slower fully general method if we have to.
- for (auto ComplexCE : ComplexCEs)
- CommitValueTo(ComplexCE.second, ComplexCE.first);
-
- for (auto GVPair : GVs) {
- assert(GVPair.first->hasInitializer());
- GVPair.first->setInitializer(GVPair.second);
- }
-
- if (SimpleCEs.empty())
- return;
-
- // We cache a single global's initializer elements in the case where the
- // subsequent address/val pair uses the same one. This avoids throwing away and
- // rebuilding the constant struct/vector/array just because one element is
- // modified at a time.
- SmallVector<Constant *, 32> Elts;
- Elts.reserve(SimpleCEs.size());
- GlobalVariable *CurrentGV = nullptr;
-
- auto commitAndSetupCache = [&](GlobalVariable *GV, bool Update) {
- Constant *Init = GV->getInitializer();
- Type *Ty = Init->getType();
- if (Update) {
- if (CurrentGV) {
- assert(CurrentGV && "Expected a GV to commit to!");
- Type *CurrentInitTy = CurrentGV->getInitializer()->getType();
- // We have a valid cache that needs to be committed.
- if (StructType *STy = dyn_cast<StructType>(CurrentInitTy))
- CurrentGV->setInitializer(ConstantStruct::get(STy, Elts));
- else if (ArrayType *ArrTy = dyn_cast<ArrayType>(CurrentInitTy))
- CurrentGV->setInitializer(ConstantArray::get(ArrTy, Elts));
- else
- CurrentGV->setInitializer(ConstantVector::get(Elts));
- }
- if (CurrentGV == GV)
- return;
- // Need to clear and set up cache for new initializer.
- CurrentGV = GV;
- Elts.clear();
- unsigned NumElts;
- if (auto *STy = dyn_cast<StructType>(Ty))
- NumElts = STy->getNumElements();
- else if (auto *ATy = dyn_cast<ArrayType>(Ty))
- NumElts = ATy->getNumElements();
- else
- NumElts = cast<FixedVectorType>(Ty)->getNumElements();
- for (unsigned i = 0, e = NumElts; i != e; ++i)
- Elts.push_back(Init->getAggregateElement(i));
- }
- };
-
- for (auto CEPair : SimpleCEs) {
- ConstantExpr *GEP = CEPair.first;
- Constant *Val = CEPair.second;
-
- GlobalVariable *GV = cast<GlobalVariable>(GEP->getOperand(0));
- commitAndSetupCache(GV, GV != CurrentGV);
- ConstantInt *CI = cast<ConstantInt>(GEP->getOperand(2));
- Elts[CI->getZExtValue()] = Val;
- }
- // The last initializer in the list needs to be committed, others
- // will be committed on a new initializer being processed.
- commitAndSetupCache(CurrentGV, true);
-}
-
/// Evaluate static constructors in the function, if we can. Return true if we
/// can, false otherwise.
static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
@@ -2268,10 +2045,12 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
++NumCtorsEvaluated;
// We succeeded at evaluation: commit the result.
+ auto NewInitializers = Eval.getMutatedInitializers();
LLVM_DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
- << F->getName() << "' to "
- << Eval.getMutatedMemory().size() << " stores.\n");
- BatchCommitValueTo(Eval.getMutatedMemory());
+ << F->getName() << "' to " << NewInitializers.size()
+ << " stores.\n");
+ for (const auto &Pair : NewInitializers)
+ Pair.first->setInitializer(Pair.second);
for (GlobalVariable *GV : Eval.getInvariants())
GV->setConstant(true);
}
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index b8a314c54f18..e064fbbef595 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -36,8 +36,14 @@ using namespace IRSimilarity;
// A command flag to be used for debugging to exclude branches from similarity
// matching and outlining.
+namespace llvm {
extern cl::opt<bool> DisableBranches;
+// A command flag to be used for debugging to exclude indirect calls from
+// similarity matching and outlining.
+extern cl::opt<bool> DisableIndirectCalls;
+} // namespace llvm
+
// Set to true if the user wants the ir outliner to run on linkonceodr linkage
// functions. This is false by default because the linker can dedupe linkonceodr
// functions. Since the outliner is confined to a single module (modulo LTO),
@@ -104,6 +110,16 @@ struct OutlinableGroup {
/// of the region.
unsigned BranchesToOutside = 0;
+ /// Tracker counting backwards from the highest unsigned value possible to
+ /// avoid conflicting with the GVNs of assigned values. We start at -3 since
+ /// -1 and -2 are reserved by the DenseMap as special keys.
+ unsigned PHINodeGVNTracker = -3;
+
+ DenseMap<unsigned,
+ std::pair<std::pair<unsigned, unsigned>, SmallVector<unsigned, 2>>>
+ PHINodeGVNToGVNs;
+ DenseMap<hash_code, unsigned> GVNsToPHINodeGVN;
+
/// The number of instructions that will be outlined by extracting \ref
/// Regions.
InstructionCost Benefit = 0;
@@ -169,6 +185,44 @@ Value *OutlinableRegion::findCorrespondingValueIn(const OutlinableRegion &Other,
return FoundValueOpt.getValueOr(nullptr);
}
+/// Rewrite the BranchInsts in the incoming blocks to \p PHIBlock that are found
+/// in \p Included to branch to BasicBlock \p Replace if they currently branch
+/// to the BasicBlock \p Find. This is used to fix up the incoming basic blocks
+/// when PHINodes are included in outlined regions.
+///
+/// \param PHIBlock - The BasicBlock containing the PHINodes that need to be
+/// checked.
+/// \param Find - The successor block to be replaced.
+/// \param Replace - The new successor block to branch to.
+/// \param Included - The set of blocks about to be outlined.
+static void replaceTargetsFromPHINode(BasicBlock *PHIBlock, BasicBlock *Find,
+ BasicBlock *Replace,
+ DenseSet<BasicBlock *> &Included) {
+ for (PHINode &PN : PHIBlock->phis()) {
+ for (unsigned Idx = 0, PNEnd = PN.getNumIncomingValues(); Idx != PNEnd;
+ ++Idx) {
+ // Check if the incoming block is included in the set of blocks being
+ // outlined.
+ BasicBlock *Incoming = PN.getIncomingBlock(Idx);
+ if (!Included.contains(Incoming))
+ continue;
+
+ BranchInst *BI = dyn_cast<BranchInst>(Incoming->getTerminator());
+ assert(BI && "Not a branch instruction?");
+ // Look over the branching instructions into this block to see if we
+ // used to branch to Find in this outlined block.
+ for (unsigned Succ = 0, End = BI->getNumSuccessors(); Succ != End;
+ Succ++) {
+ // If we have found the block to replace, we do so here.
+ if (BI->getSuccessor(Succ) != Find)
+ continue;
+ BI->setSuccessor(Succ, Replace);
+ }
+ }
+ }
+}
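replaceTargetsFromPHINode above retargets the branch terminators of PHI-predecessor blocks that lie inside the outlined region. A plain C++ model of that rewrite (a toy block graph, not the LLVM API):

#include <set>
#include <string>
#include <vector>

struct Block {
  std::string Name;
  std::vector<Block *> Successors;  // models the BranchInst successors
  std::vector<Block *> PHIIncoming; // models the PHINode incoming blocks
};

// For each PHI predecessor inside the region, rewrite the terminator
// successor 'Find' to 'Replace', leaving outside blocks untouched.
void replaceTargets(Block &PHIBlock, Block *Find, Block *Replace,
                    const std::set<Block *> &Included) {
  for (Block *Incoming : PHIBlock.PHIIncoming) {
    if (!Included.count(Incoming))
      continue;
    for (Block *&Succ : Incoming->Successors)
      if (Succ == Find)
        Succ = Replace;
  }
}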
+
void OutlinableRegion::splitCandidate() {
assert(!CandidateSplit && "Candidate already split!");
@@ -199,6 +253,39 @@ void OutlinableRegion::splitCandidate() {
StartBB = StartInst->getParent();
PrevBB = StartBB;
+ DenseSet<BasicBlock *> BBSet;
+ Candidate->getBasicBlocks(BBSet);
+
+ // We iterate over the instructions in the region. If we find a PHINode, we
+ // check whether it has more than one predecessor outside of the region; if
+ // it does, we ignore this region since we are unable to handle the severing
+ // of the phi node right now.
+ BasicBlock::iterator It = StartInst->getIterator();
+ while (PHINode *PN = dyn_cast<PHINode>(&*It)) {
+ unsigned NumPredsOutsideRegion = 0;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (!BBSet.contains(PN->getIncomingBlock(i)))
+ ++NumPredsOutsideRegion;
+
+ if (NumPredsOutsideRegion > 1)
+ return;
+
+ It++;
+ }
+
+ // If the region starts with a PHINode that is not the first instruction of
+ // its BasicBlock, we ignore this region for now.
+ if (isa<PHINode>(StartInst) && StartInst != &*StartBB->begin())
+ return;
+
+ // If the region ends with a PHINode, but does not include all of the phi node
+ // instructions of that block, we ignore it for now.
+ if (isa<PHINode>(BackInst)) {
+ EndBB = BackInst->getParent();
+ if (BackInst != &*std::prev(EndBB->getFirstInsertionPt()))
+ return;
+ }
+
// The basic block gets split like so:
// block: block:
// inst1 inst1
@@ -225,12 +312,20 @@ void OutlinableRegion::splitCandidate() {
FollowBB = EndBB->splitBasicBlock(EndInst, OriginalName + "_after_outline");
EndBB->replaceSuccessorsPhiUsesWith(EndBB, FollowBB);
FollowBB->replaceSuccessorsPhiUsesWith(PrevBB, FollowBB);
- return;
+ } else {
+ EndBB = BackInst->getParent();
+ EndsInBranch = true;
+ FollowBB = nullptr;
}
- EndBB = BackInst->getParent();
- EndsInBranch = true;
- FollowBB = nullptr;
+ // Refind the basic block set.
+ BBSet.clear();
+ Candidate->getBasicBlocks(BBSet);
+ // For the phi nodes in the new starting basic block of the region, we
+ // reassign the targets of the basic blocks branching instructions.
+ replaceTargetsFromPHINode(StartBB, PrevBB, StartBB, BBSet);
+ if (FollowBB)
+ replaceTargetsFromPHINode(FollowBB, EndBB, FollowBB, BBSet);
}
void OutlinableRegion::reattachCandidate() {
@@ -252,15 +347,21 @@ void OutlinableRegion::reattachCandidate() {
// inst4
assert(StartBB != nullptr && "StartBB for Candidate is not defined!");
- // StartBB should only have one predecessor since we put an unconditional
- // branch at the end of PrevBB when we split the BasicBlock.
- PrevBB = StartBB->getSinglePredecessor();
- assert(PrevBB != nullptr &&
- "No Predecessor for the region start basic block!");
-
assert(PrevBB->getTerminator() && "Terminator removed from PrevBB!");
PrevBB->getTerminator()->eraseFromParent();
+ // If we are reattaching after outlining, we iterate over the phi nodes of
+ // the initial block, and reassign the branch instructions of the incoming
+ // blocks to the block we are remerging into.
+ if (!ExtractedFunction) {
+ DenseSet<BasicBlock *> BBSet;
+ Candidate->getBasicBlocks(BBSet);
+
+ replaceTargetsFromPHINode(StartBB, StartBB, PrevBB, BBSet);
+ if (!EndsInBranch)
+ replaceTargetsFromPHINode(FollowBB, FollowBB, EndBB, BBSet);
+ }
+
moveBBContents(*StartBB, *PrevBB);
BasicBlock *PlacementBB = PrevBB;
@@ -354,6 +455,24 @@ InstructionCost OutlinableRegion::getBenefit(TargetTransformInfo &TTI) {
return Benefit;
}
+/// Check the \p OutputMappings structure for value \p Input. If an entry
+/// exists, the value has been used as an output for outlining and has been
+/// renamed, so we return the new value; otherwise, we return the same value.
+///
+/// \param OutputMappings [in] - The mapping of values to their renamed value
+/// after being used as an output for an outlined region.
+/// \param Input [in] - The value to find the remapped value of, if it exists.
+/// \return The remapped value if it has been renamed, and the same value if it
+/// has not.
+static Value *findOutputMapping(const DenseMap<Value *, Value *> OutputMappings,
+ Value *Input) {
+ DenseMap<Value *, Value *>::const_iterator OutputMapping =
+ OutputMappings.find(Input);
+ if (OutputMapping != OutputMappings.end())
+ return OutputMapping->second;
+ return Input;
+}
+
/// Find whether \p Region matches the global value numbering to Constant
/// mapping found so far.
///
@@ -830,6 +949,209 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region,
Region.NumExtractedInputs = OriginalIndex;
}
+/// Check if the \p V has any uses outside of the region other than \p PN.
+///
+/// \param V [in] - The value to check.
+/// \param PHILoc [in] - The location in the PHINode of \p V.
+/// \param PN [in] - The PHINode using \p V.
+/// \param Exits [in] - The potential blocks we exit to from the outlined
+/// region.
+/// \param BlocksInRegion [in] - The basic blocks contained in the region.
+/// \returns true if \p V has any uses outside its region other than \p PN.
+static bool outputHasNonPHI(Value *V, unsigned PHILoc, PHINode &PN,
+ SmallPtrSet<BasicBlock *, 1> &Exits,
+ DenseSet<BasicBlock *> &BlocksInRegion) {
+ // We check to see if the value is used by the PHINode from some other
+ // predecessor not included in the region. If it is, we make sure
+ // to keep it as an output.
+ SmallVector<unsigned, 2> IncomingNumbers(PN.getNumIncomingValues());
+ std::iota(IncomingNumbers.begin(), IncomingNumbers.end(), 0);
+ if (any_of(IncomingNumbers, [PHILoc, &PN, V, &BlocksInRegion](unsigned Idx) {
+ return (Idx != PHILoc && V == PN.getIncomingValue(Idx) &&
+ !BlocksInRegion.contains(PN.getIncomingBlock(Idx)));
+ }))
+ return true;
+
+ // Check if the value is used by any other instructions outside the region.
+ return any_of(V->users(), [&Exits, &BlocksInRegion](User *U) {
+ Instruction *I = dyn_cast<Instruction>(U);
+ if (!I)
+ return false;
+
+ // If the use of the item is inside the region, we skip it. Uses
+ // inside the region give us useful information about how the item could be
+ // used as an output.
+ BasicBlock *Parent = I->getParent();
+ if (BlocksInRegion.contains(Parent))
+ return false;
+
+ // If it's not a PHINode then we definitely know the use matters. This
+ // output value will not be completely combined with another item in a
+ // PHINode as it is directly referenced by another non-phi instruction.
+ if (!isa<PHINode>(I))
+ return true;
+
+ // If we have a PHINode outside one of the exit locations, then it
+ // can be considered an outside use as well. If there is a PHINode
+ // contained in the Exit where this values use matters, it will be
+ // caught when we analyze that PHINode.
+ if (!Exits.contains(Parent))
+ return true;
+
+ return false;
+ });
+}
+
+/// Test whether \p CurrentExitFromRegion contains any PhiNodes that should be
+/// considered outputs. A PHINode is an output when more than one incoming
+/// value has been marked by the CodeExtractor as an output.
+///
+/// \param CurrentExitFromRegion [in] - The block to analyze.
+/// \param PotentialExitsFromRegion [in] - The potential exit blocks from the
+/// region.
+/// \param RegionBlocks [in] - The basic blocks in the region.
+/// \param Outputs [in, out] - The existing outputs for the region; we may add
+/// PHINodes to this as we find that they replace output values.
+/// \param OutputsReplacedByPHINode [out] - A set containing outputs that are
+/// totally replaced by a PHINode.
+/// \param OutputsWithNonPhiUses [out] - A set containing outputs that are used
+/// in PHINodes, but have other uses, and should still be considered outputs.
+static void analyzeExitPHIsForOutputUses(
+ BasicBlock *CurrentExitFromRegion,
+ SmallPtrSet<BasicBlock *, 1> &PotentialExitsFromRegion,
+ DenseSet<BasicBlock *> &RegionBlocks, SetVector<Value *> &Outputs,
+ DenseSet<Value *> &OutputsReplacedByPHINode,
+ DenseSet<Value *> &OutputsWithNonPhiUses) {
+ for (PHINode &PN : CurrentExitFromRegion->phis()) {
+ // Find all incoming values from the outlining region.
+ SmallVector<unsigned, 2> IncomingVals;
+ for (unsigned I = 0, E = PN.getNumIncomingValues(); I < E; ++I)
+ if (RegionBlocks.contains(PN.getIncomingBlock(I)))
+ IncomingVals.push_back(I);
+
+ // Do not process PHI if there are no predecessors from region.
+ unsigned NumIncomingVals = IncomingVals.size();
+ if (NumIncomingVals == 0)
+ continue;
+
+ // If there is one predecessor, we mark it as a value that needs to be kept
+ // as an output.
+ if (NumIncomingVals == 1) {
+ Value *V = PN.getIncomingValue(*IncomingVals.begin());
+ OutputsWithNonPhiUses.insert(V);
+ OutputsReplacedByPHINode.erase(V);
+ continue;
+ }
+
+ // This PHINode will be used as an output value, so we add it to our list.
+ Outputs.insert(&PN);
+
+ // Not all of the incoming values should be ignored, as other inputs and
+ // outputs may have uses in the outlined region. If a value has other uses
+ // outside of the single PHINode, we should not skip over it.
+ for (unsigned Idx : IncomingVals) {
+ Value *V = PN.getIncomingValue(Idx);
+ if (outputHasNonPHI(V, Idx, PN, PotentialExitsFromRegion, RegionBlocks)) {
+ OutputsWithNonPhiUses.insert(V);
+ OutputsReplacedByPHINode.erase(V);
+ continue;
+ }
+ if (!OutputsWithNonPhiUses.contains(V))
+ OutputsReplacedByPHINode.insert(V);
+ }
+ }
+}
+
+// Represents the type for the unsigned number denoting the output number for
+// a phi node, along with the canonical number for the exit block.
+using ArgLocWithBBCanon = std::pair<unsigned, unsigned>;
+// The list of canonical numbers for the incoming values to a PHINode.
+using CanonList = SmallVector<unsigned, 2>;
+// The pair type representing the set of canonical values being combined in the
+// PHINode, along with the location data for the PHINode.
+using PHINodeData = std::pair<ArgLocWithBBCanon, CanonList>;
+
+/// Encode \p PND as an integer for easy lookup based on the argument location,
+/// the parent BasicBlock canonical numbering, and the canonical numbering of
+/// the values stored in the PHINode.
+///
+/// \param PND - The data to hash.
+/// \returns The hash code of \p PND.
+static hash_code encodePHINodeData(PHINodeData &PND) {
+ return llvm::hash_combine(
+ llvm::hash_value(PND.first.first), llvm::hash_value(PND.first.second),
+ llvm::hash_combine_range(PND.second.begin(), PND.second.end()));
+}
+
+/// Create a special GVN for PHINodes that will be used outside of
+/// the region. We create a hash code based on the Canonical number of the
+/// parent BasicBlock, the canonical numbering of the values stored in the
+/// PHINode and the aggregate argument location. This is used to find whether
+/// this PHINode type has been given a canonical numbering already. If not, we
+/// assign it a value and store it for later use. The value is returned to
+/// identify different output schemes for the set of regions.
+///
+/// \param Region - The region that \p PN is an output for.
+/// \param PN - The PHINode we are analyzing.
+/// \param AggArgIdx - The argument \p PN will be stored into.
+/// \returns An optional holding the assigned canonical number, or None if
+/// there is some attribute of the PHINode blocking it from being used.
+static Optional<unsigned> getGVNForPHINode(OutlinableRegion &Region,
+ PHINode *PN, unsigned AggArgIdx) {
+ OutlinableGroup &Group = *Region.Parent;
+ IRSimilarityCandidate &Cand = *Region.Candidate;
+ BasicBlock *PHIBB = PN->getParent();
+ CanonList PHIGVNs;
+ for (Value *Incoming : PN->incoming_values()) {
+ // If we cannot find a GVN, this means that the input to the PHINode is
+ // not included in the region we are trying to analyze, meaning, that if
+ // it was outlined, we would be adding an extra input. We ignore this
+ // case for now, and so ignore the region.
+ Optional<unsigned> OGVN = Cand.getGVN(Incoming);
+ if (!OGVN.hasValue()) {
+ Region.IgnoreRegion = true;
+ return None;
+ }
+
+ // Collect the canonical numbers of the values in the PHINode.
+ unsigned GVN = OGVN.getValue();
+ OGVN = Cand.getCanonicalNum(GVN);
+ assert(OGVN.hasValue() && "No GVN found for incoming value?");
+ PHIGVNs.push_back(*OGVN);
+ }
+
+ // Now that we have the GVNs for the incoming values, we are going to combine
+ // them with the GVN of the incoming block, and the output location of the
+ // PHINode to generate a hash value representing this instance of the PHINode.
+ DenseMap<hash_code, unsigned>::iterator GVNToPHIIt;
+ DenseMap<unsigned, PHINodeData>::iterator PHIToGVNIt;
+ Optional<unsigned> BBGVN = Cand.getGVN(PHIBB);
+ assert(BBGVN.hasValue() && "Could not find GVN for the incoming block!");
+
+ BBGVN = Cand.getCanonicalNum(BBGVN.getValue());
+ assert(BBGVN.hasValue() &&
+ "Could not find canonical number for the incoming block!");
+ // Create a pair of the exit block canonical value, and the aggregate
+ // argument location, connected to the canonical numbers stored in the
+ // PHINode.
+ PHINodeData TemporaryPair =
+ std::make_pair(std::make_pair(BBGVN.getValue(), AggArgIdx), PHIGVNs);
+ hash_code PHINodeDataHash = encodePHINodeData(TemporaryPair);
+
+ // Look up the hash in our mapping between canonical numbers for PHINodes
+ // and the set of objects we just created, and add a new entry if needed.
+ GVNToPHIIt = Group.GVNsToPHINodeGVN.find(PHINodeDataHash);
+ if (GVNToPHIIt == Group.GVNsToPHINodeGVN.end()) {
+ bool Inserted = false;
+ std::tie(PHIToGVNIt, Inserted) = Group.PHINodeGVNToGVNs.insert(
+ std::make_pair(Group.PHINodeGVNTracker, TemporaryPair));
+ std::tie(GVNToPHIIt, Inserted) = Group.GVNsToPHINodeGVN.insert(
+ std::make_pair(PHINodeDataHash, Group.PHINodeGVNTracker--));
+ }
+
+ return GVNToPHIIt->second;
+}
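getGVNForPHINode above keys a synthetic GVN off the exit block's canonical number, the aggregate argument slot, and the canonical numbers of the incoming values, handing out decreasing values starting at -3. A plain C++ model of that assignment (std::hash standing in for llvm::hash_combine, purely illustrative):

#include <cstdint>
#include <functional>
#include <unordered_map>
#include <vector>

struct PHIKey {
  unsigned BlockCanon;                  // canonical number of the exit block
  unsigned ArgIdx;                      // aggregate argument slot
  std::vector<unsigned> IncomingCanons; // canonical numbers of incoming values
};

static uint64_t hashKey(const PHIKey &K) {
  uint64_t H = std::hash<unsigned>{}(K.BlockCanon);
  H = H * 31 + std::hash<unsigned>{}(K.ArgIdx);
  for (unsigned C : K.IncomingCanons)
    H = H * 31 + std::hash<unsigned>{}(C);
  return H;
}

struct PHIGVNAssigner {
  unsigned NextGVN = static_cast<unsigned>(-3); // -1/-2 are DenseMap keys
  std::unordered_map<uint64_t, unsigned> HashToGVN;

  // Identical keys share one synthetic GVN; new keys get the next (smaller)
  // value, mirroring PHINodeGVNTracker--.
  unsigned get(const PHIKey &K) {
    auto [It, Inserted] = HashToGVN.try_emplace(hashKey(K), NextGVN);
    if (Inserted)
      --NextGVN;
    return It->second;
  }
};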
+
/// Create a mapping of the output arguments for the \p Region to the output
/// arguments of the overall outlined function.
///
@@ -842,35 +1164,25 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
IRSimilarityCandidate &C = *Region.Candidate;
SmallVector<BasicBlock *> BE;
- DenseSet<BasicBlock *> BBSet;
- C.getBasicBlocks(BBSet, BE);
+ DenseSet<BasicBlock *> BlocksInRegion;
+ C.getBasicBlocks(BlocksInRegion, BE);
// Find the exits to the region.
SmallPtrSet<BasicBlock *, 1> Exits;
for (BasicBlock *Block : BE)
for (BasicBlock *Succ : successors(Block))
- if (!BBSet.contains(Succ))
+ if (!BlocksInRegion.contains(Succ))
Exits.insert(Succ);
// After determining which blocks exit to PHINodes, we add these PHINodes to
// the set of outputs to be processed. We also check the incoming values of
// the PHINodes for whether they should no longer be considered outputs.
- for (BasicBlock *ExitBB : Exits) {
- for (PHINode &PN : ExitBB->phis()) {
- // Find all incoming values from the outlining region.
- SmallVector<unsigned, 2> IncomingVals;
- for (unsigned Idx = 0; Idx < PN.getNumIncomingValues(); ++Idx)
- if (BBSet.contains(PN.getIncomingBlock(Idx)))
- IncomingVals.push_back(Idx);
-
- // Do not process PHI if there is one (or fewer) predecessor from region.
- if (IncomingVals.size() <= 1)
- continue;
-
- Region.IgnoreRegion = true;
- return;
- }
- }
+ DenseSet<Value *> OutputsReplacedByPHINode;
+ DenseSet<Value *> OutputsWithNonPhiUses;
+ for (BasicBlock *ExitBB : Exits)
+ analyzeExitPHIsForOutputUses(ExitBB, Exits, BlocksInRegion, Outputs,
+ OutputsReplacedByPHINode,
+ OutputsWithNonPhiUses);
// This counts the argument number in the extracted function.
unsigned OriginalIndex = Region.NumExtractedInputs;
@@ -893,9 +1205,13 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
// do not have to be in same order, but are functionally the same, we will
// have to use a different scheme, as one-to-one correspondence is not
// guaranteed.
- unsigned GlobalValue = C.getGVN(Output).getValue();
unsigned ArgumentSize = Group.ArgumentTypes.size();
+ // If the output is combined in a PHINode, we make sure to skip over it.
+ if (OutputsReplacedByPHINode.contains(Output))
+ continue;
+
+ unsigned AggArgIdx = 0;
for (unsigned Jdx = TypeIndex; Jdx < ArgumentSize; Jdx++) {
if (Group.ArgumentTypes[Jdx] != PointerType::getUnqual(Output->getType()))
continue;
@@ -907,7 +1223,7 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
AggArgsUsed.insert(Jdx);
Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, Jdx));
Region.AggArgToExtracted.insert(std::make_pair(Jdx, OriginalIndex));
- Region.GVNStores.push_back(GlobalValue);
+ AggArgIdx = Jdx;
break;
}
@@ -916,18 +1232,54 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
// function to handle this output and create a mapping to it.
if (!TypeFound) {
Group.ArgumentTypes.push_back(PointerType::getUnqual(Output->getType()));
- AggArgsUsed.insert(Group.ArgumentTypes.size() - 1);
+ // Mark the new pointer type as the last value in the aggregate argument
+ // list.
+ unsigned ArgTypeIdx = Group.ArgumentTypes.size() - 1;
+ AggArgsUsed.insert(ArgTypeIdx);
Region.ExtractedArgToAgg.insert(
- std::make_pair(OriginalIndex, Group.ArgumentTypes.size() - 1));
+ std::make_pair(OriginalIndex, ArgTypeIdx));
Region.AggArgToExtracted.insert(
- std::make_pair(Group.ArgumentTypes.size() - 1, OriginalIndex));
- Region.GVNStores.push_back(GlobalValue);
+ std::make_pair(ArgTypeIdx, OriginalIndex));
+ AggArgIdx = ArgTypeIdx;
+ }
+
+ // TODO: Adapt to the extra input from the PHINode.
+ PHINode *PN = dyn_cast<PHINode>(Output);
+
+ Optional<unsigned> GVN;
+ if (PN && !BlocksInRegion.contains(PN->getParent())) {
+    // Values outside the region can be combined into a PHINode when we have
+    // multiple exits. We collect these incoming values into a list to identify
+    // which values are being used in the PHINode. Each list identifies a
+    // different PHINode, and a different output. We store the PHINode as its
+    // own canonical value. These canonical values are also dependent on the
+    // output argument the PHINode is saved to.
+
+    // If two PHINodes have the same incoming canonical values, but different
+    // aggregate argument locations, they will be assigned distinct canonical
+    // numbers.
+ GVN = getGVNForPHINode(Region, PN, AggArgIdx);
+ if (!GVN.hasValue())
+ return;
+ } else {
+ // If we do not have a PHINode we use the global value numbering for the
+ // output value, to find the canonical number to add to the set of stored
+ // values.
+ GVN = C.getGVN(Output);
+ GVN = C.getCanonicalNum(*GVN);
}
- stable_sort(Region.GVNStores);
+ // Each region has a potentially unique set of outputs. We save which
+ // values are output in a list of canonical values so we can differentiate
+ // among the different store schemes.
+ Region.GVNStores.push_back(*GVN);
+
OriginalIndex++;
TypeIndex++;
}
+
+ // We sort the stored values to make sure that we are not affected by analysis
+  // order when determining what combination of items was stored.
+ stable_sort(Region.GVNStores);
}
void IROutliner::findAddInputsOutputs(Module &M, OutlinableRegion &Region,
@@ -1063,6 +1415,214 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
return Call;
}
+/// Find or create a BasicBlock in the outlined function containing PhiBlocks
+/// for \p RetVal.
+///
+/// \param Group - The OutlinableGroup containing the information about the
+/// overall outlined function.
+/// \param RetVal - The return value or exit option that we are currently
+/// evaluating.
+/// \returns The found or newly created BasicBlock to contain the needed
+/// PHINodes to be used as outputs.
+static BasicBlock *findOrCreatePHIBlock(OutlinableGroup &Group, Value *RetVal) {
+ DenseMap<Value *, BasicBlock *>::iterator PhiBlockForRetVal,
+ ReturnBlockForRetVal;
+ PhiBlockForRetVal = Group.PHIBlocks.find(RetVal);
+ ReturnBlockForRetVal = Group.EndBBs.find(RetVal);
+ assert(ReturnBlockForRetVal != Group.EndBBs.end() &&
+ "Could not find output value!");
+ BasicBlock *ReturnBB = ReturnBlockForRetVal->second;
+
+  // Find whether a PHIBlock already exists for this return value. If this is
+  // the first time we are analyzing it, none will exist, so we create and
+  // record one below.
+ PhiBlockForRetVal = Group.PHIBlocks.find(RetVal);
+ if (PhiBlockForRetVal != Group.PHIBlocks.end())
+ return PhiBlockForRetVal->second;
+
+ // If we did not find a block, we create one, and insert it into the
+ // overall function and record it.
+ bool Inserted = false;
+ BasicBlock *PHIBlock = BasicBlock::Create(ReturnBB->getContext(), "phi_block",
+ ReturnBB->getParent());
+ std::tie(PhiBlockForRetVal, Inserted) =
+ Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock));
+
+ // We find the predecessors of the return block in the newly created outlined
+ // function in order to point them to the new PHIBlock rather than the already
+ // existing return block.
+ SmallVector<BranchInst *, 2> BranchesToChange;
+ for (BasicBlock *Pred : predecessors(ReturnBB))
+ BranchesToChange.push_back(cast<BranchInst>(Pred->getTerminator()));
+
+  // Now we walk the branch instructions we found, and retarget every
+  // reference to the return block to the newly created PHIBlock.
+ for (BranchInst *BI : BranchesToChange)
+ for (unsigned Succ = 0, End = BI->getNumSuccessors(); Succ < End; Succ++) {
+ if (BI->getSuccessor(Succ) != ReturnBB)
+ continue;
+ BI->setSuccessor(Succ, PHIBlock);
+ }
+
+ BranchInst::Create(ReturnBB, PHIBlock);
+
+ return PhiBlockForRetVal->second;
+}
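+
+// A small sketch of the CFG rewrite performed above; block names are
+// illustrative only. Before, every predecessor branches directly to the
+// return block recorded for this exit option:
+//   pred0 --> ReturnBB <-- pred1
+// Afterwards, those branches are retargeted to a shared "phi_block" that will
+// hold the merged output PHINodes and unconditionally branches to ReturnBB:
+//   pred0 --> phi_block <-- pred1
+//                |
+//                v
+//             ReturnBB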
+
+/// For the function call now representing the \p Region, find the passed value
+/// to that call that represents Argument \p A at the call location if the
+/// call has already been replaced with a call to the overall, aggregate
+/// function.
+///
+/// \param A - The Argument to get the passed value for.
+/// \param Region - The extracted Region corresponding to the outlined function.
+/// \returns The Value representing \p A at the call site.
+static Value *
+getPassedArgumentInAlreadyOutlinedFunction(const Argument *A,
+ const OutlinableRegion &Region) {
+ // If we don't need to adjust the argument number at all (since the call
+ // has already been replaced by a call to the overall outlined function)
+ // we can just get the specified argument.
+ return Region.Call->getArgOperand(A->getArgNo());
+}
+
+/// For the function call now representing the \p Region, find the passed value
+/// to that call that represents Argument \p A at the call location if the
+/// call has only been replaced by the call to the aggregate function.
+///
+/// \param A - The Argument to get the passed value for.
+/// \param Region - The extracted Region corresponding to the outlined function.
+/// \returns The Value representing \p A at the call site.
+static Value *
+getPassedArgumentAndAdjustArgumentLocation(const Argument *A,
+ const OutlinableRegion &Region) {
+ unsigned ArgNum = A->getArgNo();
+
+ // If it is a constant, we can look at our mapping from when we created
+ // the outputs to figure out what the constant value is.
+ if (Region.AggArgToConstant.count(ArgNum))
+ return Region.AggArgToConstant.find(ArgNum)->second;
+
+ // If it is not a constant, and we are not looking at the overall function, we
+ // need to adjust which argument we are looking at.
+ ArgNum = Region.AggArgToExtracted.find(ArgNum)->second;
+ return Region.Call->getArgOperand(ArgNum);
+}
+
+/// Find the canonical numbering for the incoming Values into the PHINode \p PN.
+///
+/// \param PN [in] - The PHINode that we are finding the canonical numbers for.
+/// \param Region [in] - The OutlinableRegion containing \p PN.
+/// \param OutputMappings [in] - The mapping of output values from outlined
+/// region to their original values.
+/// \param CanonNums [out] - The canonical numbering for the incoming values to
+/// \p PN.
+/// \param ReplacedWithOutlinedCall - A flag to use the extracted function call
+/// of \p Region rather than the overall function's call.
+static void
+findCanonNumsForPHI(PHINode *PN, OutlinableRegion &Region,
+ const DenseMap<Value *, Value *> &OutputMappings,
+ DenseSet<unsigned> &CanonNums,
+ bool ReplacedWithOutlinedCall = true) {
+ // Iterate over the incoming values.
+ for (unsigned Idx = 0, EIdx = PN->getNumIncomingValues(); Idx < EIdx; Idx++) {
+ Value *IVal = PN->getIncomingValue(Idx);
+ // If we have an argument as incoming value, we need to grab the passed
+ // value from the call itself.
+ if (Argument *A = dyn_cast<Argument>(IVal)) {
+ if (ReplacedWithOutlinedCall)
+ IVal = getPassedArgumentInAlreadyOutlinedFunction(A, Region);
+ else
+ IVal = getPassedArgumentAndAdjustArgumentLocation(A, Region);
+ }
+
+ // Get the original value if it has been replaced by an output value.
+ IVal = findOutputMapping(OutputMappings, IVal);
+
+ // Find and add the canonical number for the incoming value.
+ Optional<unsigned> GVN = Region.Candidate->getGVN(IVal);
+ assert(GVN.hasValue() && "No GVN for incoming value");
+ Optional<unsigned> CanonNum = Region.Candidate->getCanonicalNum(*GVN);
+ assert(CanonNum.hasValue() && "No Canonical Number for GVN");
+ CanonNums.insert(*CanonNum);
+ }
+}
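+
+// As an example (canonical numbers invented), a PHINode of the form
+//   %out = phi i32 [ %a, %bb0 ], [ %b, %bb1 ]
+// whose incoming values %a and %b map to canonical numbers 3 and 5 in the
+// region's candidate would populate CanonNums with {3, 5}. Incoming values
+// that are Arguments are first translated to the value actually passed at the
+// region's call site before being numbered.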
+
+/// Find, or add PHINode \p PN to the combined PHINode Block \p OverallPHIBlock
+/// in order to condense the number of instructions added to the outlined
+/// function.
+///
+/// \param PN [in] - The PHINode that we are finding the canonical numbers for.
+/// \param Region [in] - The OutlinableRegion containing \p PN.
+/// \param OverallPhiBlock [in] - The overall PHIBlock we are trying to find
+/// \p PN in.
+/// \param OutputMappings [in] - The mapping of output values from outlined
+/// region to their original values.
+/// \return the newly found or created PHINode in \p OverallPhiBlock.
+static PHINode*
+findOrCreatePHIInBlock(PHINode &PN, OutlinableRegion &Region,
+ BasicBlock *OverallPhiBlock,
+ const DenseMap<Value *, Value *> &OutputMappings) {
+ OutlinableGroup &Group = *Region.Parent;
+
+ DenseSet<unsigned> PNCanonNums;
+  // We have to use the extracted function since we have not merged this region into
+ // the overall function yet. We make sure to reassign the argument numbering
+ // since it is possible that the argument ordering is different between the
+ // functions.
+ findCanonNumsForPHI(&PN, Region, OutputMappings, PNCanonNums,
+ /* ReplacedWithOutlinedCall = */ false);
+
+ OutlinableRegion *FirstRegion = Group.Regions[0];
+ DenseSet<unsigned> CurrentCanonNums;
+  // Find the canonical numbering for each PHINode; if it matches an existing
+  // PHINode, we reuse that PHINode in place of the one we are searching for.
+ for (PHINode &CurrPN : OverallPhiBlock->phis()) {
+ CurrentCanonNums.clear();
+ findCanonNumsForPHI(&CurrPN, *FirstRegion, OutputMappings, CurrentCanonNums,
+ /* ReplacedWithOutlinedCall = */ true);
+
+ if (all_of(PNCanonNums, [&CurrentCanonNums](unsigned CanonNum) {
+ return CurrentCanonNums.contains(CanonNum);
+ }))
+ return &CurrPN;
+ }
+
+ // If we've made it here, it means we weren't able to replace the PHINode, so
+ // we must insert it ourselves.
+ PHINode *NewPN = cast<PHINode>(PN.clone());
+ NewPN->insertBefore(&*OverallPhiBlock->begin());
+ for (unsigned Idx = 0, Edx = NewPN->getNumIncomingValues(); Idx < Edx;
+ Idx++) {
+ Value *IncomingVal = NewPN->getIncomingValue(Idx);
+ BasicBlock *IncomingBlock = NewPN->getIncomingBlock(Idx);
+
+ // Find corresponding basic block in the overall function for the incoming
+ // block.
+ Instruction *FirstNonPHI = IncomingBlock->getFirstNonPHI();
+ assert(FirstNonPHI && "Incoming block is empty?");
+ Value *CorrespondingVal =
+ Region.findCorrespondingValueIn(*FirstRegion, FirstNonPHI);
+ assert(CorrespondingVal && "Value is nullptr?");
+ BasicBlock *BlockToUse = cast<Instruction>(CorrespondingVal)->getParent();
+ NewPN->setIncomingBlock(Idx, BlockToUse);
+
+ // If we have an argument we make sure we replace using the argument from
+ // the correct function.
+ if (Argument *A = dyn_cast<Argument>(IncomingVal)) {
+ Value *Val = Group.OutlinedFunction->getArg(A->getArgNo());
+ NewPN->setIncomingValue(Idx, Val);
+ continue;
+ }
+
+ // Find the corresponding value in the overall function.
+ IncomingVal = findOutputMapping(OutputMappings, IncomingVal);
+ Value *Val = Region.findCorrespondingValueIn(*FirstRegion, IncomingVal);
+ assert(Val && "Value is nullptr?");
+ NewPN->setIncomingValue(Idx, Val);
+ }
+ return NewPN;
+}
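+
+// As an example of the matching rule above (canonical numbers invented): if
+// the region's PHINode yields PNCanonNums = {3, 5} and an existing PHINode in
+// the overall phi_block yields CurrentCanonNums = {5, 3}, the all_of check
+// succeeds and the existing PHINode is reused. If the existing PHINode only
+// yields {3}, there is no match, and the region's PHINode is cloned into the
+// block with its incoming blocks and values remapped into the overall
+// function.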
+
// Within an extracted function, replace the argument uses of the extracted
// region with the arguments of the function for an OutlinableGroup.
//
@@ -1075,6 +1635,7 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
static void
replaceArgumentUses(OutlinableRegion &Region,
DenseMap<Value *, BasicBlock *> &OutputBBs,
+ const DenseMap<Value *, Value *> &OutputMappings,
bool FirstFunction = false) {
OutlinableGroup &Group = *Region.Parent;
assert(Region.ExtractedFunction && "Region has no extracted function?");
@@ -1144,12 +1705,47 @@ replaceArgumentUses(OutlinableRegion &Region,
LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to "
<< *OutputBB << "\n");
- if (FirstFunction)
+ // If this is storing a PHINode, we must make sure it is included in the
+ // overall function.
+ if (!isa<PHINode>(ValueOperand) ||
+ Region.Candidate->getGVN(ValueOperand).hasValue()) {
+ if (FirstFunction)
+ continue;
+ Value *CorrVal =
+ Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand);
+ assert(CorrVal && "Value is nullptr?");
+ NewI->setOperand(0, CorrVal);
+ continue;
+ }
+ PHINode *PN = cast<PHINode>(SI->getValueOperand());
+ // If it has a value, it was not split by the code extractor, which
+ // is what we are looking for.
+ if (Region.Candidate->getGVN(PN).hasValue())
continue;
- Value *CorrVal =
- Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand);
- assert(CorrVal && "Value is nullptr?");
- NewI->setOperand(0, CorrVal);
+
+ // We record the parent block for the PHINode in the Region so that
+ // we can exclude it from checks later on.
+ Region.PHIBlocks.insert(std::make_pair(RetVal, PN->getParent()));
+
+      // If this is the first function, we do not need to worry about merging
+ // this with any other block in the overall outlined function, so we can
+ // just continue.
+ if (FirstFunction) {
+ BasicBlock *PHIBlock = PN->getParent();
+ Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock));
+ continue;
+ }
+
+ // We look for the aggregate block that contains the PHINodes leading into
+ // this exit path. If we can't find one, we create one.
+ BasicBlock *OverallPhiBlock = findOrCreatePHIBlock(Group, RetVal);
+
+ // For our PHINode, we find the combined canonical numbering, and
+ // attempt to find a matching PHINode in the overall PHIBlock. If we
+ // cannot, we copy the PHINode and move it into this new block.
+ PHINode *NewPN =
+ findOrCreatePHIInBlock(*PN, Region, OverallPhiBlock, OutputMappings);
+ NewI->setOperand(0, NewPN);
}
// If we added an edge for basic blocks without a predecessor, we remove it
@@ -1390,7 +1986,12 @@ void createSwitchStatement(
Module &M, OutlinableGroup &OG, DenseMap<Value *, BasicBlock *> &EndBBs,
std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) {
// We only need the switch statement if there is more than one store
- // combination.
+ // combination, or there is more than one set of output blocks. The first
+ // will occur when we store different sets of values for two different
+ // regions. The second will occur when we have two outputs that are combined
+ // in a PHINode outside of the region in one outlined instance, and are used
+  // separately in another. This will create the same set of OutputGVNs, but
+ // will generate two different output schemes.
if (OG.OutputGVNCombinations.size() > 1) {
Function *AggFunc = OG.OutlinedFunction;
// Create a final block for each different return block.
@@ -1433,8 +2034,14 @@ void createSwitchStatement(
return;
}
+ assert(OutputStoreBBs.size() < 2 && "Different store sets not handled!");
+
// If there needs to be stores, move them from the output blocks to their
- // corresponding ending block.
+  // corresponding ending block. We do not check whether OutputGVNCombinations
+  // has exactly one entry here, since that could simply be the case where there
+  // are zero outputs. Instead, we check whether there is more than one set of output
+ // blocks since this is the only case where we would have to move the
+ // stores, and erase the extraneous blocks.
if (OutputStoreBBs.size() == 1) {
LLVM_DEBUG(dbgs() << "Move store instructions to the end block in "
<< *OG.OutlinedFunction << "\n");
@@ -1466,10 +2073,13 @@ void createSwitchStatement(
/// set of stores needed for the different functions.
/// \param [in,out] FuncsToRemove - Extracted functions to erase from module
/// once outlining is complete.
+/// \param [in] OutputMappings - The mapping of output values from the
+/// outlined regions to their original values.
static void fillOverallFunction(
Module &M, OutlinableGroup &CurrentGroup,
std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs,
- std::vector<Function *> &FuncsToRemove) {
+ std::vector<Function *> &FuncsToRemove,
+ const DenseMap<Value *, Value *> &OutputMappings) {
OutlinableRegion *CurrentOS = CurrentGroup.Regions[0];
// Move first extracted function's instructions into new function.
@@ -1489,7 +2099,7 @@ static void fillOverallFunction(
CurrentGroup.OutlinedFunction, "output_block_0");
CurrentOS->OutputBlockNum = 0;
- replaceArgumentUses(*CurrentOS, NewBBs, true);
+ replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings, true);
replaceConstants(*CurrentOS);
// We first identify if any output blocks are empty, if they are we remove
@@ -1523,7 +2133,8 @@ void IROutliner::deduplicateExtractedSections(
OutlinableRegion *CurrentOS;
- fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove);
+ fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove,
+ OutputMappings);
std::vector<Value *> SortedKeys;
for (unsigned Idx = 1; Idx < CurrentGroup.Regions.size(); Idx++) {
@@ -1537,8 +2148,7 @@ void IROutliner::deduplicateExtractedSections(
createAndInsertBasicBlocks(
CurrentGroup.EndBBs, NewBBs, CurrentGroup.OutlinedFunction,
"output_block_" + Twine(static_cast<unsigned>(Idx)));
-
- replaceArgumentUses(*CurrentOS, NewBBs);
+ replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings);
alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBBs,
CurrentGroup.EndBBs, OutputMappings,
OutputStoreBBs);
@@ -1637,7 +2247,7 @@ void IROutliner::pruneIncompatibleRegions(
if (FirstCandidate.getLength() == 2) {
if (isa<CallInst>(FirstCandidate.front()->Inst) &&
isa<BranchInst>(FirstCandidate.back()->Inst))
- return;
+ return;
}
unsigned CurrentEndIdx = 0;
@@ -1706,6 +2316,34 @@ IROutliner::findBenefitFromAllRegions(OutlinableGroup &CurrentGroup) {
return RegionBenefit;
}
+/// For the \p OutputCanon number passed in, find the value represented by this
+/// canonical number. If it is from a PHINode, we pick the first incoming
+/// value and return that Value instead.
+///
+/// \param Region - The OutlinableRegion to get the Value from.
+/// \param OutputCanon - The canonical number to find the Value from.
+/// \returns The Value represented by a canonical number \p OutputCanon in \p
+/// Region.
+static Value *findOutputValueInRegion(OutlinableRegion &Region,
+ unsigned OutputCanon) {
+ OutlinableGroup &CurrentGroup = *Region.Parent;
+ // If the value is greater than the value in the tracker, we have a
+ // PHINode and will instead use one of the incoming values to find the
+ // type.
+ if (OutputCanon > CurrentGroup.PHINodeGVNTracker) {
+ auto It = CurrentGroup.PHINodeGVNToGVNs.find(OutputCanon);
+ assert(It != CurrentGroup.PHINodeGVNToGVNs.end() &&
+ "Could not find GVN set for PHINode number!");
+ assert(It->second.second.size() > 0 && "PHINode does not have any values!");
+ OutputCanon = *It->second.second.begin();
+ }
+ Optional<unsigned> OGVN = Region.Candidate->fromCanonicalNum(OutputCanon);
+ assert(OGVN.hasValue() && "Could not find GVN for Canonical Number?");
+ Optional<Value *> OV = Region.Candidate->fromGVN(*OGVN);
+ assert(OV.hasValue() && "Could not find value for GVN?");
+ return *OV;
+}
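+
+// A brief illustration of the comparison above: PHINodeGVNTracker starts at a
+// large sentinel value and counts down as PHINode canonical numbers are handed
+// out, while ordinary canonical numbers stay small. So, with invented values,
+// an OutputCanon of 12 goes straight to the fromCanonicalNum/fromGVN lookups,
+// whereas an OutputCanon above the current tracker value is first resolved to
+// one of the PHINode's incoming canonical numbers and then looked up the same
+// way.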
+
InstructionCost
IROutliner::findCostOutputReloads(OutlinableGroup &CurrentGroup) {
InstructionCost OverallCost = 0;
@@ -1713,10 +2351,8 @@ IROutliner::findCostOutputReloads(OutlinableGroup &CurrentGroup) {
TargetTransformInfo &TTI = getTTI(*Region->StartBB->getParent());
// Each output incurs a load after the call, so we add that to the cost.
- for (unsigned OutputGVN : Region->GVNStores) {
- Optional<Value *> OV = Region->Candidate->fromGVN(OutputGVN);
- assert(OV.hasValue() && "Could not find value for GVN?");
- Value *V = OV.getValue();
+ for (unsigned OutputCanon : Region->GVNStores) {
+ Value *V = findOutputValueInRegion(*Region, OutputCanon);
InstructionCost LoadCost =
TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0,
TargetTransformInfo::TCK_CodeSize);
@@ -1745,6 +2381,7 @@ static InstructionCost findCostForOutputBlocks(Module &M,
InstructionCost OutputCost = 0;
unsigned NumOutputBranches = 0;
+ OutlinableRegion &FirstRegion = *CurrentGroup.Regions[0];
IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate;
DenseSet<BasicBlock *> CandidateBlocks;
Candidate.getBasicBlocks(CandidateBlocks);
@@ -1770,10 +2407,8 @@ static InstructionCost findCostForOutputBlocks(Module &M,
for (const ArrayRef<unsigned> &OutputUse :
CurrentGroup.OutputGVNCombinations) {
- for (unsigned GVN : OutputUse) {
- Optional<Value *> OV = Candidate.fromGVN(GVN);
- assert(OV.hasValue() && "Could not find value for GVN?");
- Value *V = OV.getValue();
+ for (unsigned OutputCanon : OutputUse) {
+ Value *V = findOutputValueInRegion(FirstRegion, OutputCanon);
InstructionCost StoreCost =
TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0,
TargetTransformInfo::TCK_CodeSize);
@@ -1974,6 +2609,7 @@ bool IROutliner::extractSection(OutlinableRegion &Region) {
unsigned IROutliner::doOutline(Module &M) {
// Find the possible similarity sections.
InstructionClassifier.EnableBranches = !DisableBranches;
+ InstructionClassifier.EnableIndirectCalls = !DisableIndirectCalls;
IRSimilarityIdentifier &Identifier = getIRSI(M);
SimilarityGroupList &SimilarityCandidates = *Identifier.getSimilarity();
@@ -2033,8 +2669,8 @@ unsigned IROutliner::doOutline(Module &M) {
continue;
SmallVector<BasicBlock *> BE;
- DenseSet<BasicBlock *> BBSet;
- OS->Candidate->getBasicBlocks(BBSet, BE);
+ DenseSet<BasicBlock *> BlocksInRegion;
+ OS->Candidate->getBasicBlocks(BlocksInRegion, BE);
OS->CE = new (ExtractorAllocator.Allocate())
CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false,
false, "outlined");
@@ -2144,8 +2780,8 @@ unsigned IROutliner::doOutline(Module &M) {
OutlinedRegions.clear();
for (OutlinableRegion *OS : CurrentGroup.Regions) {
SmallVector<BasicBlock *> BE;
- DenseSet<BasicBlock *> BBSet;
- OS->Candidate->getBasicBlocks(BBSet, BE);
+ DenseSet<BasicBlock *> BlocksInRegion;
+ OS->Candidate->getBasicBlocks(BlocksInRegion, BE);
OS->CE = new (ExtractorAllocator.Allocate())
CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false,
false, "outlined");
diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp
index 4e3689f09536..49babc24cb82 100644
--- a/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -92,6 +92,11 @@ static cl::opt<bool>
DisableInlinedAllocaMerging("disable-inlined-alloca-merging",
cl::init(false), cl::Hidden);
+/// A flag for testing, so we can print the content of the advisor when running
+/// it as part of the default (e.g. -O3) pipeline.
+static cl::opt<bool> KeepAdvisorForPrinting("keep-inline-advisor-for-printing",
+ cl::init(false), cl::Hidden);
+
extern cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats;
static cl::opt<std::string> CGSCCInlineReplayFile(
@@ -660,7 +665,7 @@ bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG,
}
if (!DeadFunctionsInComdats.empty()) {
// Filter out the functions whose comdats remain alive.
- filterDeadComdatFunctions(CG.getModule(), DeadFunctionsInComdats);
+ filterDeadComdatFunctions(DeadFunctionsInComdats);
// Remove the rest.
for (Function *F : DeadFunctionsInComdats)
RemoveCGN(CG[F]);
@@ -741,7 +746,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
InlineAdvisor &Advisor = getAdvisor(MAMProxy, FAM, M);
Advisor.onPassEntry();
- auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); });
+ auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(&InitialC); });
// We use a single common worklist for calls across the entire SCC. We
// process these in-order and append new calls introduced during inlining to
@@ -823,6 +828,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// defer deleting these to make it easier to handle the call graph updates.
SmallVector<Function *, 4> DeadFunctions;
+ // Track potentially dead non-local functions with comdats to see if they can
+ // be deleted as a batch after inlining.
+ SmallVector<Function *, 4> DeadFunctionsInComdats;
+
// Loop forward over all of the calls.
while (!Calls->empty()) {
// We expect the calls to typically be batched with sequences of calls that
@@ -935,16 +944,15 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// Merge the attributes based on the inlining.
AttributeFuncs::mergeAttributesForInlining(F, Callee);
- // For local functions, check whether this makes the callee trivially
- // dead. In that case, we can drop the body of the function eagerly
- // which may reduce the number of callers of other functions to one,
- // changing inline cost thresholds.
+ // For local functions or discardable functions without comdats, check
+ // whether this makes the callee trivially dead. In that case, we can drop
+ // the body of the function eagerly which may reduce the number of callers
+ // of other functions to one, changing inline cost thresholds. Non-local
+ // discardable functions with comdats are checked later on.
bool CalleeWasDeleted = false;
- if (Callee.hasLocalLinkage()) {
- // To check this we also need to nuke any dead constant uses (perhaps
- // made dead by this operation on other functions).
- Callee.removeDeadConstantUsers();
- if (Callee.use_empty() && !CG.isLibFunction(Callee)) {
+ if (Callee.isDiscardableIfUnused() && Callee.hasZeroLiveUses() &&
+ !CG.isLibFunction(Callee)) {
+ if (Callee.hasLocalLinkage() || !Callee.hasComdat()) {
Calls->erase_if([&](const std::pair<CallBase *, int> &Call) {
return Call.first->getCaller() == &Callee;
});
@@ -957,6 +965,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
"Cannot put cause a function to become dead twice!");
DeadFunctions.push_back(&Callee);
CalleeWasDeleted = true;
+ } else {
+ DeadFunctionsInComdats.push_back(&Callee);
}
}
if (CalleeWasDeleted)
@@ -1019,6 +1029,15 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
FAM.invalidate(F, PreservedAnalyses::none());
}
+ // We must ensure that we only delete functions with comdats if every function
+ // in the comdat is going to be deleted.
+ if (!DeadFunctionsInComdats.empty()) {
+ filterDeadComdatFunctions(DeadFunctionsInComdats);
+ for (auto *Callee : DeadFunctionsInComdats)
+ Callee->dropAllReferences();
+ DeadFunctions.append(DeadFunctionsInComdats);
+ }
+
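+  // For instance (function names invented): if callees @foo and @bar share a
+  // comdat and inlining only made @foo trivially dead, the filtering above is
+  // expected to drop @foo from DeadFunctionsInComdats, since its comdat still
+  // has a live member; only when every member of a comdat becomes dead are the
+  // functions stripped of their references and appended to DeadFunctions for
+  // deletion below.
+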
// Now that we've finished inlining all of the calls across this SCC, delete
// all of the trivially dead functions, updating the call graph and the CGSCC
// pass manager in the process.
@@ -1045,14 +1064,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
UR.UpdatedC = nullptr;
// And delete the actual function from the module.
- // The Advisor may use Function pointers to efficiently index various
- // internal maps, e.g. for memoization. Function cleanup passes like
- // argument promotion create new functions. It is possible for a new
- // function to be allocated at the address of a deleted function. We could
- // index using names, but that's inefficient. Alternatively, we let the
- // Advisor free the functions when it sees fit.
- DeadF->getBasicBlockList().clear();
- M.getFunctionList().remove(DeadF);
+ M.getFunctionList().erase(DeadF);
++NumDeleted;
}
@@ -1073,8 +1085,7 @@ ModuleInlinerWrapperPass::ModuleInlinerWrapperPass(InlineParams Params,
bool MandatoryFirst,
InliningAdvisorMode Mode,
unsigned MaxDevirtIterations)
- : Params(Params), Mode(Mode), MaxDevirtIterations(MaxDevirtIterations),
- PM(), MPM() {
+ : Params(Params), Mode(Mode), MaxDevirtIterations(MaxDevirtIterations) {
// Run the inliner first. The theory is that we are walking bottom-up and so
// the callees have already been fully optimized, and we want to inline them
// into the callers so that our optimizations can reflect that.
@@ -1118,7 +1129,8 @@ PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M,
// Discard the InlineAdvisor, a subsequent inlining session should construct
// its own.
auto PA = PreservedAnalyses::all();
- PA.abandon<InlineAdvisorAnalysis>();
+ if (!KeepAdvisorForPrinting)
+ PA.abandon<InlineAdvisorAnalysis>();
return PA;
}
diff --git a/llvm/lib/Transforms/IPO/ModuleInliner.cpp b/llvm/lib/Transforms/IPO/ModuleInliner.cpp
index ebf080e87c3b..d515303e4911 100644
--- a/llvm/lib/Transforms/IPO/ModuleInliner.cpp
+++ b/llvm/lib/Transforms/IPO/ModuleInliner.cpp
@@ -335,14 +335,7 @@ PreservedAnalyses ModuleInlinerPass::run(Module &M,
FAM.clear(*DeadF, DeadF->getName());
// And delete the actual function from the module.
- // The Advisor may use Function pointers to efficiently index various
- // internal maps, e.g. for memoization. Function cleanup passes like
- // argument promotion create new functions. It is possible for a new
- // function to be allocated at the address of a deleted function. We could
- // index using names, but that's inefficient. Alternatively, we let the
- // Advisor free the functions when it sees fit.
- DeadF->getBasicBlockList().clear();
- M.getFunctionList().remove(DeadF);
+ M.getFunctionList().erase(DeadF);
++NumDeleted;
}
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index f289e3ecc979..68f33410c602 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
@@ -153,14 +154,6 @@ static constexpr auto TAG = "[" DEBUG_TYPE "]";
namespace {
-enum class AddressSpace : unsigned {
- Generic = 0,
- Global = 1,
- Shared = 3,
- Constant = 4,
- Local = 5,
-};
-
struct AAHeapToShared;
struct AAICVTracker;
@@ -170,7 +163,7 @@ struct AAICVTracker;
struct OMPInformationCache : public InformationCache {
OMPInformationCache(Module &M, AnalysisGetter &AG,
BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
- SmallPtrSetImpl<Kernel> &Kernels)
+ KernelSet &Kernels)
: InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
Kernels(Kernels) {
@@ -424,6 +417,12 @@ struct OMPInformationCache : public InformationCache {
recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
}
+ // Helper function to inherit the calling convention of the function callee.
+ void setCallingConvention(FunctionCallee Callee, CallInst *CI) {
+ if (Function *Fn = dyn_cast<Function>(Callee.getCallee()))
+ CI->setCallingConv(Fn->getCallingConv());
+ }
+
/// Helper to initialize all runtime function information for those defined
/// in OpenMPKinds.def.
void initializeRuntimeFunctions() {
@@ -485,7 +484,7 @@ struct OMPInformationCache : public InformationCache {
}
/// Collection of known kernels (\see Kernel) in the module.
- SmallPtrSetImpl<Kernel> &Kernels;
+ KernelSet &Kernels;
/// Collection of known OpenMP runtime functions..
DenseSet<const Function *> RTLFunctions;
@@ -1013,7 +1012,8 @@ private:
// into a single parallel region is contained in a single basic block
// without any other instructions. We use the OpenMPIRBuilder to outline
// that block and call the resulting function via __kmpc_fork_call.
- auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) {
+ auto Merge = [&](const SmallVectorImpl<CallInst *> &MergableCIs,
+ BasicBlock *BB) {
// TODO: Change the interface to allow single CIs expanded, e.g, to
// include an outer loop.
assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");
@@ -1075,8 +1075,7 @@ private:
BranchInst::Create(AfterBB, AfterIP.getBlock());
// Perform the actual outlining.
- OMPInfoCache.OMPBuilder.finalize(OriginalFn,
- /* AllowExtractorSinking */ true);
+ OMPInfoCache.OMPBuilder.finalize(OriginalFn);
Function *OutlinedFn = MergableCIs.front()->getCaller();
@@ -1538,6 +1537,7 @@ private:
CallInst *IssueCallsite =
CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall);
+ OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);
RuntimeCall.eraseFromParent();
// Add "wait" runtime call declaration:
@@ -1550,7 +1550,9 @@ private:
OffloadArray::DeviceIDArgNum), // device_id.
Handle // handle to wait on.
};
- CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);
+ CallInst *WaitCallsite = CallInst::Create(
+ WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);
+ OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);
return true;
}
@@ -1597,8 +1599,10 @@ private:
&F.getEntryBlock(), F.getEntryBlock().begin()));
// Create a fallback location if non was found.
// TODO: Use the debug locations of the calls instead.
- Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
- Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
+ uint32_t SrcLocStrSize;
+ Constant *Loc =
+ OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
+ Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);
}
return Ident;
}
@@ -2171,7 +2175,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
};
auto CallCheck = [&](Instruction &I) {
- Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
+ Optional<Value *> ReplVal = getValueForCall(A, I, ICV);
if (ReplVal.hasValue() &&
ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
HasChanged = ChangeStatus::CHANGED;
@@ -2197,12 +2201,12 @@ struct AAICVTrackerFunction : public AAICVTracker {
return HasChanged;
}
- /// Hepler to check if \p I is a call and get the value for it if it is
+ /// Helper to check if \p I is a call and get the value for it if it is
/// unique.
- Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
+ Optional<Value *> getValueForCall(Attributor &A, const Instruction &I,
InternalControlVar &ICV) const {
- const auto *CB = dyn_cast<CallBase>(I);
+ const auto *CB = dyn_cast<CallBase>(&I);
if (!CB || CB->hasFnAttr("no_openmp") ||
CB->hasFnAttr("no_openmp_routines"))
return None;
@@ -2218,8 +2222,8 @@ struct AAICVTrackerFunction : public AAICVTracker {
if (CalledFunction == GetterRFI.Declaration)
return None;
if (CalledFunction == SetterRFI.Declaration) {
- if (ICVReplacementValuesMap[ICV].count(I))
- return ICVReplacementValuesMap[ICV].lookup(I);
+ if (ICVReplacementValuesMap[ICV].count(&I))
+ return ICVReplacementValuesMap[ICV].lookup(&I);
return nullptr;
}
@@ -2231,8 +2235,11 @@ struct AAICVTrackerFunction : public AAICVTracker {
const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
*this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED);
- if (ICVTrackingAA.isAssumedTracked())
- return ICVTrackingAA.getUniqueReplacementValue(ICV);
+ if (ICVTrackingAA.isAssumedTracked()) {
+ Optional<Value *> URV = ICVTrackingAA.getUniqueReplacementValue(ICV);
+ if (!URV || (*URV && AA::isValidAtPosition(**URV, I, OMPInfoCache)))
+ return URV;
+ }
// If we don't know, assume it changes.
return nullptr;
@@ -2284,7 +2291,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
break;
}
- Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
+ Optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV);
if (!NewReplVal.hasValue())
continue;
@@ -2548,7 +2555,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
}
/// Set of basic blocks that are executed by a single thread.
- DenseSet<const BasicBlock *> SingleThreadedBBs;
+ SmallSetVector<const BasicBlock *, 16> SingleThreadedBBs;
/// Total number of basic blocks in this function.
long unsigned NumBBs;
@@ -2572,7 +2579,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
if (!A.checkForAllCallSites(PredForCallSite, *this,
/* RequiresAllCallSites */ true,
AllCallSitesKnown))
- SingleThreadedBBs.erase(&F->getEntryBlock());
+ SingleThreadedBBs.remove(&F->getEntryBlock());
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
@@ -2637,7 +2644,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
for (auto *BB : RPOT) {
if (!MergePredecessorStates(BB))
- SingleThreadedBBs.erase(BB);
+ SingleThreadedBBs.remove(BB);
}
return (NumSingleThreadedBBs == SingleThreadedBBs.size())
@@ -2759,7 +2766,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
if (FreeCalls.size() != 1)
continue;
- ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0));
+ auto *AllocSize = cast<ConstantInt>(CB->getArgOperand(0));
LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB
<< " with " << AllocSize->getZExtValue()
@@ -2772,7 +2779,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
auto *SharedMem = new GlobalVariable(
*M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage,
- UndefValue::get(Int8ArrTy), CB->getName(), nullptr,
+ UndefValue::get(Int8ArrTy), CB->getName() + "_shared", nullptr,
GlobalValue::NotThreadLocal,
static_cast<unsigned>(AddressSpace::Shared));
auto *NewBuffer =
@@ -2786,7 +2793,10 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
};
A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark);
- SharedMem->setAlignment(MaybeAlign(32));
+ MaybeAlign Alignment = CB->getRetAlign();
+ assert(Alignment &&
+ "HeapToShared on allocation without alignment attribute");
+ SharedMem->setAlignment(MaybeAlign(Alignment));
A.changeValueAfterManifest(*CB, *NewBuffer);
A.deleteAfterManifest(*CB);
@@ -2813,7 +2823,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
if (CallBase *CB = dyn_cast<CallBase>(U))
if (!isa<ConstantInt>(CB->getArgOperand(0)) ||
!ED.isExecutedByInitialThreadOnly(*CB))
- MallocCalls.erase(CB);
+ MallocCalls.remove(CB);
}
findPotentialRemovedFreeCalls(A);
@@ -2825,7 +2835,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
}
/// Collection of all malloc calls in a function.
- SmallPtrSet<CallBase *, 4> MallocCalls;
+ SmallSetVector<CallBase *, 4> MallocCalls;
/// Collection of potentially removed free calls in a function.
SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
};
@@ -2962,7 +2972,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
UsedAssumedInformation = !isAtFixpoint();
auto *FalseVal =
- ConstantInt::getBool(IRP.getAnchorValue().getContext(), 0);
+ ConstantInt::getBool(IRP.getAnchorValue().getContext(), false);
return FalseVal;
};
@@ -3225,8 +3235,11 @@ struct AAKernelInfoFunction : AAKernelInfo {
OpenMPIRBuilder::LocationDescription Loc(
InsertPointTy(ParentBB, ParentBB->end()), DL);
OMPInfoCache.OMPBuilder.updateToLocation(Loc);
- auto *SrcLocStr = OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc);
- Value *Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ auto *SrcLocStr =
+ OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident =
+ OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL);
// Add check for Tid in RegionCheckTidBB
@@ -3237,8 +3250,10 @@ struct AAKernelInfoFunction : AAKernelInfo {
FunctionCallee HardwareTidFn =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
- Value *Tid =
+ CallInst *Tid =
OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
+ Tid->setDebugLoc(DL);
+ OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
OMPInfoCache.OMPBuilder.Builder
.CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
@@ -3251,14 +3266,18 @@ struct AAKernelInfoFunction : AAKernelInfo {
M, OMPRTL___kmpc_barrier_simple_spmd);
OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt()));
- OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid})
- ->setDebugLoc(DL);
+ CallInst *Barrier =
+ OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid});
+ Barrier->setDebugLoc(DL);
+ OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
// Second barrier ensures workers have read broadcast values.
- if (HasBroadcastValues)
- CallInst::Create(BarrierFn, {Ident, Tid}, "",
- RegionBarrierBB->getTerminator())
- ->setDebugLoc(DL);
+ if (HasBroadcastValues) {
+ CallInst *Barrier = CallInst::Create(BarrierFn, {Ident, Tid}, "",
+ RegionBarrierBB->getTerminator());
+ Barrier->setDebugLoc(DL);
+ OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
+ }
};
auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
@@ -3352,17 +3371,17 @@ struct AAKernelInfoFunction : AAKernelInfo {
OMP_TGT_EXEC_MODE_SPMD));
A.changeUseAfterManifest(
KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo),
- *ConstantInt::getBool(Ctx, 0));
+ *ConstantInt::getBool(Ctx, false));
A.changeUseAfterManifest(
KernelDeinitCB->getArgOperandUse(DeinitModeArgNo),
*ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
OMP_TGT_EXEC_MODE_SPMD));
A.changeUseAfterManifest(
KernelInitCB->getArgOperandUse(InitRequiresFullRuntimeArgNo),
- *ConstantInt::getBool(Ctx, 0));
+ *ConstantInt::getBool(Ctx, false));
A.changeUseAfterManifest(
KernelDeinitCB->getArgOperandUse(DeinitRequiresFullRuntimeArgNo),
- *ConstantInt::getBool(Ctx, 0));
+ *ConstantInt::getBool(Ctx, false));
++NumOpenMPTargetRegionKernelsSPMD;
@@ -3403,7 +3422,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
// If not SPMD mode, indicate we use a custom state machine now.
auto &Ctx = getAnchorValue().getContext();
- auto *FalseVal = ConstantInt::getBool(Ctx, 0);
+ auto *FalseVal = ConstantInt::getBool(Ctx, false);
A.changeUseAfterManifest(
KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal);
@@ -3528,10 +3547,12 @@ struct AAKernelInfoFunction : AAKernelInfo {
FunctionCallee WarpSizeFn =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_get_warp_size);
- Instruction *BlockHwSize =
+ CallInst *BlockHwSize =
CallInst::Create(BlockHwSizeFn, "block.hw_size", InitBB);
+ OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);
BlockHwSize->setDebugLoc(DLoc);
- Instruction *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB);
+ CallInst *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB);
+ OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);
WarpSize->setDebugLoc(DLoc);
Instruction *BlockSize =
BinaryOperator::CreateSub(BlockHwSize, WarpSize, "block.size", InitBB);
@@ -3571,8 +3592,10 @@ struct AAKernelInfoFunction : AAKernelInfo {
FunctionCallee BarrierFn =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_barrier_simple_generic);
- CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB)
- ->setDebugLoc(DLoc);
+ CallInst *Barrier =
+ CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB);
+ OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
+ Barrier->setDebugLoc(DLoc);
if (WorkFnAI->getType()->getPointerAddressSpace() !=
(unsigned int)AddressSpace::Generic) {
@@ -3588,8 +3611,9 @@ struct AAKernelInfoFunction : AAKernelInfo {
FunctionCallee KernelParallelFn =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_kernel_parallel);
- Instruction *IsActiveWorker = CallInst::Create(
+ CallInst *IsActiveWorker = CallInst::Create(
KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);
+ OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);
IsActiveWorker->setDebugLoc(DLoc);
Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn",
StateMachineBeginBB);
@@ -3669,10 +3693,13 @@ struct AAKernelInfoFunction : AAKernelInfo {
StateMachineIfCascadeCurrentBB)
->setDebugLoc(DLoc);
- CallInst::Create(OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
- M, OMPRTL___kmpc_kernel_end_parallel),
- {}, "", StateMachineEndParallelBB)
- ->setDebugLoc(DLoc);
+ FunctionCallee EndParallelFn =
+ OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+ M, OMPRTL___kmpc_kernel_end_parallel);
+ CallInst *EndParallel =
+ CallInst::Create(EndParallelFn, {}, "", StateMachineEndParallelBB);
+ OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);
+ EndParallel->setDebugLoc(DLoc);
BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB)
->setDebugLoc(DLoc);
@@ -4508,6 +4535,8 @@ void OpenMPOpt::registerAAs(bool IsModulePass) {
bool UsedAssumedInformation = false;
A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,
UsedAssumedInformation);
+ } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));
}
}
}
diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp
index 2d717475ce7f..5f2223e4047e 100644
--- a/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -169,8 +169,7 @@ struct FunctionOutliningInfo {
};
struct FunctionOutliningMultiRegionInfo {
- FunctionOutliningMultiRegionInfo()
- : ORI() {}
+ FunctionOutliningMultiRegionInfo() {}
// Container for outline regions
struct OutlineRegionInfo {
@@ -971,6 +970,9 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap(
};
for (User *User : Users) {
+ // Don't bother with BlockAddress used by CallBr for asm goto.
+ if (isa<BlockAddress>(User))
+ continue;
CallBase *CB = getSupportedCallBase(User);
Function *Caller = CB->getCaller();
if (CurrentCaller != Caller) {
@@ -1414,6 +1416,10 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
bool AnyInline = false;
for (User *User : Users) {
+ // Don't bother with BlockAddress used by CallBr for asm goto.
+ if (isa<BlockAddress>(User))
+ continue;
+
CallBase *CB = getSupportedCallBase(User);
if (isLimitReached())
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index eb1b8a29cfc5..0598f751febe 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -519,13 +519,6 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
unsigned NextTmpIdx = 0;
FAddend TmpResult[3];
- // Points to the constant addend of the resulting simplified expression.
- // If the resulting expr has constant-addend, this constant-addend is
- // desirable to reside at the top of the resulting expression tree. Placing
- // constant close to supper-expr(s) will potentially reveal some optimization
- // opportunities in super-expr(s).
- const FAddend *ConstAdd = nullptr;
-
// Simplified addends are placed <SimpVect>.
AddendVect SimpVect;
@@ -541,6 +534,14 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
}
Value *Val = ThisAddend->getSymVal();
+
+    // If the resulting expression has a constant addend, it is desirable for
+    // that constant to reside at the top of the resulting expression tree, as
+    // placing the constant close to the super-expression(s) can reveal further
+    // optimization opportunities in the super-expression(s). We intentionally
+    // do not implement that placement here and instead rely on the later
+    // SimplifyAssociativeOrCommutative call.
+
unsigned StartIdx = SimpVect.size();
SimpVect.push_back(ThisAddend);
@@ -569,14 +570,8 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
// Pop all addends being folded and push the resulting folded addend.
SimpVect.resize(StartIdx);
- if (Val) {
- if (!R.isZero()) {
- SimpVect.push_back(&R);
- }
- } else {
- // Don't push constant addend at this time. It will be the last element
- // of <SimpVect>.
- ConstAdd = &R;
+ if (!R.isZero()) {
+ SimpVect.push_back(&R);
}
}
}
@@ -584,9 +579,6 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
assert((NextTmpIdx <= array_lengthof(TmpResult) + 1) &&
"out-of-bound access");
- if (ConstAdd)
- SimpVect.push_back(ConstAdd);
-
Value *Result;
if (!SimpVect.empty())
Result = createNaryFAdd(SimpVect, InstrQuota);
@@ -1296,6 +1288,9 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
// (A*B)+(A*C) -> A*(B+C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
@@ -1498,15 +1493,18 @@ static Instruction *factorizeFAddFSub(BinaryOperator &I,
return Lerp;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (!Op0->hasOneUse() || !Op1->hasOneUse())
+ return nullptr;
+
Value *X, *Y, *Z;
bool IsFMul;
- if ((match(Op0, m_OneUse(m_FMul(m_Value(X), m_Value(Z)))) &&
- match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))) ||
- (match(Op0, m_OneUse(m_FMul(m_Value(Z), m_Value(X)))) &&
- match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))))
+ if ((match(Op0, m_FMul(m_Value(X), m_Value(Z))) &&
+ match(Op1, m_c_FMul(m_Value(Y), m_Specific(Z)))) ||
+ (match(Op0, m_FMul(m_Value(Z), m_Value(X))) &&
+ match(Op1, m_c_FMul(m_Value(Y), m_Specific(Z)))))
IsFMul = true;
- else if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Z)))) &&
- match(Op1, m_OneUse(m_FDiv(m_Value(Y), m_Specific(Z)))))
+ else if (match(Op0, m_FDiv(m_Value(X), m_Value(Z))) &&
+ match(Op1, m_FDiv(m_Value(Y), m_Specific(Z))))
IsFMul = false;
else
return nullptr;
@@ -1541,6 +1539,9 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I))
return FoldedFAdd;
@@ -1654,6 +1655,14 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) {
{X->getType()}, {NewStartC, X}, &I));
}
+ // (X * MulC) + X --> X * (MulC + 1.0)
+ Constant *MulC;
+ if (match(&I, m_c_FAdd(m_FMul(m_Value(X), m_ImmConstant(MulC)),
+ m_Deferred(X)))) {
+ MulC = ConstantExpr::getFAdd(MulC, ConstantFP::get(I.getType(), 1.0));
+ return BinaryOperator::CreateFMulFMF(X, MulC, &I);
+ }
+
if (Value *V = FAddCombine(Builder).simplify(&I))
return replaceInstUsesWith(I, V);
}
@@ -1748,6 +1757,9 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
// If this is a 'B = x-(-A)', change to B = x+A.
@@ -2310,6 +2322,9 @@ Instruction *InstCombinerImpl::visitFSub(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
// Subtraction from -0.0 is the canonical form of fneg.
// fsub -0.0, X ==> fneg X
// fsub nsz 0.0, X ==> fneg nsz X
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index de1034c910d5..6bbb0251f2bc 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1727,25 +1727,37 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I,
(Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Value *A, *B, *C, *X, *Y;
+ Value *A, *B, *C, *X, *Y, *Dummy;
+
+  // Match the following expressions:
+ // (~(A | B) & C)
+ // (~(A & B) | C)
+ // Captures X = ~(A | B) or ~(A & B)
+ const auto matchNotOrAnd =
+ [Opcode, FlippedOpcode](Value *Op, auto m_A, auto m_B, auto m_C,
+ Value *&X, bool CountUses = false) -> bool {
+ if (CountUses && !Op->hasOneUse())
+ return false;
+
+ if (match(Op, m_c_BinOp(FlippedOpcode,
+ m_CombineAnd(m_Value(X),
+ m_Not(m_c_BinOp(Opcode, m_A, m_B))),
+ m_C)))
+ return !CountUses || X->hasOneUse();
+
+ return false;
+ };
// (~(A | B) & C) | ... --> ...
// (~(A & B) | C) & ... --> ...
// TODO: One use checks are conservative. We just need to check that a total
// number of multiple used values does not exceed reduction
// in operations.
- if (match(Op0,
- m_c_BinOp(FlippedOpcode,
- m_CombineAnd(m_Value(X), m_Not(m_BinOp(Opcode, m_Value(A),
- m_Value(B)))),
- m_Value(C)))) {
+ if (matchNotOrAnd(Op0, m_Value(A), m_Value(B), m_Value(C), X)) {
// (~(A | B) & C) | (~(A | C) & B) --> (B ^ C) & ~A
// (~(A & B) | C) & (~(A & C) | B) --> ~((B ^ C) & A)
- if (match(Op1,
- m_OneUse(m_c_BinOp(FlippedOpcode,
- m_OneUse(m_Not(m_c_BinOp(Opcode, m_Specific(A),
- m_Specific(C)))),
- m_Specific(B))))) {
+ if (matchNotOrAnd(Op1, m_Specific(A), m_Specific(C), m_Specific(B), Dummy,
+ true)) {
Value *Xor = Builder.CreateXor(B, C);
return (Opcode == Instruction::Or)
? BinaryOperator::CreateAnd(Xor, Builder.CreateNot(A))
@@ -1754,11 +1766,8 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I,
// (~(A | B) & C) | (~(B | C) & A) --> (A ^ C) & ~B
// (~(A & B) | C) & (~(B & C) | A) --> ~((A ^ C) & B)
- if (match(Op1,
- m_OneUse(m_c_BinOp(FlippedOpcode,
- m_OneUse(m_Not(m_c_BinOp(Opcode, m_Specific(B),
- m_Specific(C)))),
- m_Specific(A))))) {
+ if (matchNotOrAnd(Op1, m_Specific(B), m_Specific(C), m_Specific(A), Dummy,
+ true)) {
Value *Xor = Builder.CreateXor(A, C);
return (Opcode == Instruction::Or)
? BinaryOperator::CreateAnd(Xor, Builder.CreateNot(B))
@@ -1863,6 +1872,9 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
if (SimplifyDemandedInstructionBits(I))
@@ -2072,21 +2084,37 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
if (Op0->hasOneUse() || isFreeToInvert(C, C->hasOneUse()))
return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(C));
- // (A | B) & ((~A) ^ B) -> (A & B)
- // (A | B) & (B ^ (~A)) -> (A & B)
- // (B | A) & ((~A) ^ B) -> (A & B)
- // (B | A) & (B ^ (~A)) -> (A & B)
+ // (A | B) & (~A ^ B) -> A & B
+ // (A | B) & (B ^ ~A) -> A & B
+ // (B | A) & (~A ^ B) -> A & B
+ // (B | A) & (B ^ ~A) -> A & B
if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateAnd(A, B);
- // ((~A) ^ B) & (A | B) -> (A & B)
- // ((~A) ^ B) & (B | A) -> (A & B)
- // (B ^ (~A)) & (A | B) -> (A & B)
- // (B ^ (~A)) & (B | A) -> (A & B)
+ // (~A ^ B) & (A | B) -> A & B
+ // (~A ^ B) & (B | A) -> A & B
+ // (B ^ ~A) & (A | B) -> A & B
+ // (B ^ ~A) & (B | A) -> A & B
if (match(Op0, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
match(Op1, m_c_Or(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateAnd(A, B);
+
+ // (~A | B) & (A ^ B) -> ~A & B
+ // (~A | B) & (B ^ A) -> ~A & B
+ // (B | ~A) & (A ^ B) -> ~A & B
+ // (B | ~A) & (B ^ A) -> ~A & B
+ if (match(Op0, m_c_Or(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op1, m_c_Xor(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateAnd(Builder.CreateNot(A), B);
+
+ // (A ^ B) & (~A | B) -> ~A & B
+ // (B ^ A) & (~A | B) -> ~A & B
+ // (A ^ B) & (B | ~A) -> ~A & B
+ // (B ^ A) & (B | ~A) -> ~A & B
+ if (match(Op1, m_c_Or(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op0, m_c_Xor(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateAnd(Builder.CreateNot(A), B);
}
{
@@ -2640,6 +2668,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
if (SimplifyDemandedInstructionBits(I))
@@ -3528,6 +3559,9 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
if (Instruction *NewXor = foldXorToXor(I, Builder))
return NewXor;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 14427bd1f2f4..1fb46af46bee 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -352,9 +352,27 @@ Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) {
// * Dereferenceable address & few lanes -> scalarize speculative load/selects
// * Adjacent vector addresses -> masked.load
// * Narrow width by halfs excluding zero/undef lanes
-// * Vector splat address w/known mask -> scalar load
// * Vector incrementing address -> vector masked load
Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) {
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2));
+ if (!ConstMask)
+ return nullptr;
+
+ // Vector splat address w/known mask -> scalar load
+  // Fold the gather to a scalar load through the splatted pointer, since
+  // every lane reloads the same value.
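+  // e.g. (sketch): gather(splat(%p), align, all-true mask)
+  //        --> %load.scalar = load %p ; %broadcast = splat %load.scalar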
+ if (ConstMask->isAllOnesValue())
+ if (auto *SplatPtr = getSplatValue(II.getArgOperand(0))) {
+ auto *VecTy = cast<VectorType>(II.getType());
+ const Align Alignment =
+ cast<ConstantInt>(II.getArgOperand(1))->getAlignValue();
+ LoadInst *L = Builder.CreateAlignedLoad(VecTy->getElementType(), SplatPtr,
+ Alignment, "load.scalar");
+ Value *Shuf =
+ Builder.CreateVectorSplat(VecTy->getElementCount(), L, "broadcast");
+ return replaceInstUsesWith(II, cast<Instruction>(Shuf));
+ }
+
return nullptr;
}
@@ -362,7 +380,6 @@ Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) {
// * Single constant active lane -> store
// * Adjacent vector addresses -> masked.store
// * Narrow store width by halfs excluding zero/undef lanes
-// * Vector splat address w/known mask -> scalar store
// * Vector incrementing address -> vector masked store
Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) {
auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
@@ -373,6 +390,34 @@ Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) {
if (ConstMask->isNullValue())
return eraseInstFromFunction(II);
+ // Vector splat address -> scalar store
+ if (auto *SplatPtr = getSplatValue(II.getArgOperand(1))) {
+ // scatter(splat(value), splat(ptr), non-zero-mask) -> store value, ptr
+ if (auto *SplatValue = getSplatValue(II.getArgOperand(0))) {
+ Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
+ StoreInst *S =
+ new StoreInst(SplatValue, SplatPtr, /*IsVolatile=*/false, Alignment);
+ S->copyMetadata(II);
+ return S;
+ }
+ // scatter(vector, splat(ptr), splat(true)) -> store extract(vector,
+ // lastlane), ptr
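+    // (All enabled lanes store to the same address, so the value that stays
+    // visible is the one written by the last lane.)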
+ if (ConstMask->isAllOnesValue()) {
+ Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
+ VectorType *WideLoadTy = cast<VectorType>(II.getArgOperand(1)->getType());
+ ElementCount VF = WideLoadTy->getElementCount();
+ Constant *EC =
+ ConstantInt::get(Builder.getInt32Ty(), VF.getKnownMinValue());
+ Value *RunTimeVF = VF.isScalable() ? Builder.CreateVScale(EC) : EC;
+ Value *LastLane = Builder.CreateSub(RunTimeVF, Builder.getInt32(1));
+ Value *Extract =
+ Builder.CreateExtractElement(II.getArgOperand(0), LastLane);
+ StoreInst *S =
+ new StoreInst(Extract, SplatPtr, /*IsVolatile=*/false, Alignment);
+ S->copyMetadata(II);
+ return S;
+ }
+ }
if (isa<ScalableVectorType>(ConstMask->getType()))
return nullptr;
@@ -449,7 +494,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
// ctlz/cttz i1 Op0 --> not Op0
if (match(Op1, m_Zero()))
return BinaryOperator::CreateNot(Op0);
- // If zero is undef, then the input can be assumed to be "true", so the
+ // If zero is poison, then the input can be assumed to be "true", so the
// instruction simplifies to "false".
assert(match(Op1, m_One()) && "Expected ctlz/cttz operand to be 0 or 1");
return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(II.getType()));
@@ -474,7 +519,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
}
// Zext doesn't change the number of trailing zeros, so narrow:
- // cttz(zext(x)) -> zext(cttz(x)) if the 'ZeroIsUndef' parameter is 'true'.
+ // cttz(zext(x)) -> zext(cttz(x)) if the 'ZeroIsPoison' parameter is 'true'.
if (match(Op0, m_OneUse(m_ZExt(m_Value(X)))) && match(Op1, m_One())) {
auto *Cttz = IC.Builder.CreateBinaryIntrinsic(Intrinsic::cttz, X,
IC.Builder.getTrue());
@@ -511,7 +556,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
}
// If the input to cttz/ctlz is known to be non-zero,
- // then change the 'ZeroIsUndef' parameter to 'true'
+ // then change the 'ZeroIsPoison' parameter to 'true'
// because we know the zero behavior can't affect the result.
if (!Known.One.isZero() ||
isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
@@ -1188,6 +1233,21 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
Value *IIOperand = II->getArgOperand(0);
Value *X = nullptr;
+ KnownBits Known = computeKnownBits(IIOperand, 0, II);
+ uint64_t LZ = alignDown(Known.countMinLeadingZeros(), 8);
+ uint64_t TZ = alignDown(Known.countMinTrailingZeros(), 8);
+
+ // bswap(x) -> shift(x) if x has exactly one "active byte"
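+  // e.g. for i32 with Known = 0x0000??00: LZ = 16, TZ = 8, so the active
+  // byte moves up by LZ - TZ = 8 bits and bswap(x) == x << 8.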
+ if (Known.getBitWidth() - LZ - TZ == 8) {
+ assert(LZ != TZ && "active byte cannot be in the middle");
+ if (LZ > TZ) // -> shl(x) if the "active byte" is in the low part of x
+ return BinaryOperator::CreateNUWShl(
+ IIOperand, ConstantInt::get(IIOperand->getType(), LZ - TZ));
+ // -> lshr(x) if the "active byte" is in the high part of x
+ return BinaryOperator::CreateExactLShr(
+ IIOperand, ConstantInt::get(IIOperand->getType(), TZ - LZ));
+ }
+
// bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
unsigned C = X->getType()->getScalarSizeInBits() -
@@ -2460,7 +2520,7 @@ static bool isSafeToEliminateVarargsCast(const CallBase &Call,
if (!Call.isByValArgument(ix))
return false;
- Type *SrcElemTy = SrcTy->getElementType();
+ Type *SrcElemTy = SrcTy->getNonOpaquePointerElementType();
Type *DstElemTy = Call.getParamByValType(ix);
if (!SrcElemTy->isSized() || !DstElemTy->isSized())
return false;
@@ -2571,57 +2631,36 @@ static IntrinsicInst *findInitTrampoline(Value *Callee) {
}
void InstCombinerImpl::annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) {
- unsigned NumArgs = Call.arg_size();
- ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0));
- ConstantInt *Op1C =
- (NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1));
- // Bail out if the allocation size is zero (or an invalid alignment of zero
- // with aligned_alloc).
- if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue()))
- return;
-
- if (isMallocLikeFn(&Call, TLI) && Op0C) {
- if (isOpNewLikeFn(&Call, TLI))
+ // Note: We only handle cases which can't be driven from generic attributes
+ // here. So, for example, nonnull and noalias (which are common properties
+ // of some allocation functions) are expected to be handled via annotation
+ // of the respective allocator declaration with generic attributes.
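+  // For example (assuming TLI recognizes the allocator): a call to malloc(42)
+  // would get dereferenceable_or_null(42) on its return value, or
+  // dereferenceable(42) if the return is already known to be nonnull.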
+
+ uint64_t Size;
+ ObjectSizeOpts Opts;
+ if (getObjectSize(&Call, Size, DL, TLI, Opts) && Size > 0) {
+ // TODO: We really should just emit deref_or_null here and then
+ // let the generic inference code combine that with nonnull.
+ if (Call.hasRetAttr(Attribute::NonNull))
Call.addRetAttr(Attribute::getWithDereferenceableBytes(
- Call.getContext(), Op0C->getZExtValue()));
+ Call.getContext(), Size));
else
Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Op0C->getZExtValue()));
- } else if (isAlignedAllocLikeFn(&Call, TLI)) {
- if (Op1C)
- Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Op1C->getZExtValue()));
- // Add alignment attribute if alignment is a power of two constant.
- if (Op0C && Op0C->getValue().ult(llvm::Value::MaximumAlignment) &&
- isKnownNonZero(Call.getOperand(1), DL, 0, &AC, &Call, &DT)) {
- uint64_t AlignmentVal = Op0C->getZExtValue();
- if (llvm::isPowerOf2_64(AlignmentVal)) {
- Call.removeRetAttr(Attribute::Alignment);
- Call.addRetAttr(Attribute::getWithAlignment(Call.getContext(),
- Align(AlignmentVal)));
- }
- }
- } else if (isReallocLikeFn(&Call, TLI) && Op1C) {
- Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Op1C->getZExtValue()));
- } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) {
- bool Overflow;
- const APInt &N = Op0C->getValue();
- APInt Size = N.umul_ov(Op1C->getValue(), Overflow);
- if (!Overflow)
- Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Size.getZExtValue()));
- } else if (isStrdupLikeFn(&Call, TLI)) {
- uint64_t Len = GetStringLength(Call.getOperand(0));
- if (Len) {
- // strdup
- if (NumArgs == 1)
- Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Len));
- // strndup
- else if (NumArgs == 2 && Op1C)
- Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1)));
+ Call.getContext(), Size));
+ }
+
+ // Add alignment attribute if alignment is a power of two constant.
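+  // e.g. aligned_alloc(32, n) would get "align 32" on its return value,
+  // assuming TLI recognizes the function.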
+ Value *Alignment = getAllocAlignment(&Call, TLI);
+ if (!Alignment)
+ return;
+
+ ConstantInt *AlignOpC = dyn_cast<ConstantInt>(Alignment);
+ if (AlignOpC && AlignOpC->getValue().ult(llvm::Value::MaximumAlignment)) {
+ uint64_t AlignmentVal = AlignOpC->getZExtValue();
+ if (llvm::isPowerOf2_64(AlignmentVal)) {
+ Call.removeRetAttr(Attribute::Alignment);
+ Call.addRetAttr(Attribute::getWithAlignment(Call.getContext(),
+ Align(AlignmentVal)));
}
}
}
@@ -2744,9 +2783,9 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
PointerType *NewTy = cast<PointerType>(CI->getOperand(0)->getType());
if (!NewTy->isOpaque() && Call.isByValArgument(ix)) {
Call.removeParamAttr(ix, Attribute::ByVal);
- Call.addParamAttr(
- ix, Attribute::getWithByValType(
- Call.getContext(), NewTy->getElementType()));
+ Call.addParamAttr(ix, Attribute::getWithByValType(
+ Call.getContext(),
+ NewTy->getNonOpaquePointerElementType()));
}
Changed = true;
}
@@ -2782,7 +2821,8 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
Call, Builder.CreateBitOrPointerCast(ReturnedArg, CallTy));
}
- if (isAllocLikeFn(&Call, &TLI))
+ if (isAllocationFn(&Call, &TLI) &&
+ isAllocRemovable(&cast<CallBase>(Call), &TLI))
return visitAllocSite(Call);
// Handle intrinsics which can be used in both call and invoke context.
@@ -2934,7 +2974,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
}
if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
- AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
+ AttrBuilder RAttrs(FT->getContext(), CallerPAL.getRetAttrs());
if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy)))
return false; // Attribute not compatible with transformed value.
}
@@ -2980,7 +3020,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
return false; // Cannot transform this parameter value.
- if (AttrBuilder(CallerPAL.getParamAttrs(i))
+ if (AttrBuilder(FT->getContext(), CallerPAL.getParamAttrs(i))
.overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
return false; // Attribute not compatible with transformed value.
@@ -2994,12 +3034,12 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
// sized type and the sized type has to have the same size as the old type.
if (ParamTy != ActTy && CallerPAL.hasParamAttr(i, Attribute::ByVal)) {
PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
- if (!ParamPTy || !ParamPTy->getElementType()->isSized())
+ if (!ParamPTy || !ParamPTy->getPointerElementType()->isSized())
return false;
Type *CurElTy = Call.getParamByValType(i);
if (DL.getTypeAllocSize(CurElTy) !=
- DL.getTypeAllocSize(ParamPTy->getElementType()))
+ DL.getTypeAllocSize(ParamPTy->getPointerElementType()))
return false;
}
}
@@ -3012,17 +3052,14 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
// If the callee is just a declaration, don't change the varargsness of the
// call. We don't want to introduce a varargs call where one doesn't
// already exist.
- PointerType *APTy = cast<PointerType>(Call.getCalledOperand()->getType());
- if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
+ if (FT->isVarArg() != Call.getFunctionType()->isVarArg())
return false;
// If both the callee and the cast type are varargs, we still have to make
// sure the number of fixed parameters are the same or we have the same
// ABI issues as if we introduce a varargs call.
- if (FT->isVarArg() &&
- cast<FunctionType>(APTy->getElementType())->isVarArg() &&
- FT->getNumParams() !=
- cast<FunctionType>(APTy->getElementType())->getNumParams())
+ if (FT->isVarArg() && Call.getFunctionType()->isVarArg() &&
+ FT->getNumParams() != Call.getFunctionType()->getNumParams())
return false;
}
@@ -3045,7 +3082,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
ArgAttrs.reserve(NumActualArgs);
// Get any return attributes.
- AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
+ AttrBuilder RAttrs(FT->getContext(), CallerPAL.getRetAttrs());
// If the return value is not being used, the type may not be compatible
// with the existing attributes. Wipe out any problematic attributes.
@@ -3063,7 +3100,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
// Add any parameter attributes.
if (CallerPAL.hasParamAttr(i, Attribute::ByVal)) {
- AttrBuilder AB(CallerPAL.getParamAttrs(i));
+ AttrBuilder AB(FT->getContext(), CallerPAL.getParamAttrs(i));
AB.addByValAttr(NewArg->getType()->getPointerElementType());
ArgAttrs.push_back(AttributeSet::get(Ctx, AB));
} else
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 8df4a4529f47..f11ba8772f3c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -85,13 +85,16 @@ static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI,
AllocaInst &AI) {
PointerType *PTy = cast<PointerType>(CI.getType());
+ // Opaque pointers don't have an element type we could replace with.
+ if (PTy->isOpaque())
+ return nullptr;
IRBuilderBase::InsertPointGuard Guard(Builder);
Builder.SetInsertPoint(&AI);
// Get the type really allocated and the type casted to.
Type *AllocElTy = AI.getAllocatedType();
- Type *CastElTy = PTy->getElementType();
+ Type *CastElTy = PTy->getNonOpaquePointerElementType();
if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr;
// This optimisation does not work for cases where the cast type
@@ -2649,8 +2652,8 @@ static Instruction *convertBitCastToGEP(BitCastInst &CI, IRBuilderBase &Builder,
if (SrcPTy->isOpaque() || DstPTy->isOpaque())
return nullptr;
- Type *DstElTy = DstPTy->getElementType();
- Type *SrcElTy = SrcPTy->getElementType();
+ Type *DstElTy = DstPTy->getNonOpaquePointerElementType();
+ Type *SrcElTy = SrcPTy->getNonOpaquePointerElementType();
// When the type pointed to is not sized the cast cannot be
// turned into a gep.
@@ -2669,8 +2672,8 @@ static Instruction *convertBitCastToGEP(BitCastInst &CI, IRBuilderBase &Builder,
// If we found a path from the src to dest, create the getelementptr now.
if (SrcElTy == DstElTy) {
SmallVector<Value *, 8> Idxs(NumZeros + 1, Builder.getInt32(0));
- GetElementPtrInst *GEP =
- GetElementPtrInst::Create(SrcPTy->getElementType(), Src, Idxs);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ SrcPTy->getNonOpaquePointerElementType(), Src, Idxs);
// If the source pointer is dereferenceable, then assume it points to an
// allocated object and apply "inbounds" to the GEP.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index ed53b88aed61..fd58a44504b3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -503,7 +503,7 @@ static Value *evaluateGEPOffsetExpression(User *GEP, InstCombinerImpl &IC,
/// Returns true if we can rewrite Start as a GEP with pointer Base
/// and some integer offset. The nodes that need to be re-written
/// for this transformation will be added to Explored.
-static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
+static bool canRewriteGEPAsOffset(Type *ElemTy, Value *Start, Value *Base,
const DataLayout &DL,
SetVector<Value *> &Explored) {
SmallVector<Value *, 16> WorkList(1, Start);
@@ -551,7 +551,7 @@ static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
// the original pointer type. We could handle more cases in the
// future.
if (GEP->getNumIndices() != 1 || !GEP->isInBounds() ||
- GEP->getType() != Start->getType())
+ GEP->getSourceElementType() != ElemTy)
return false;
if (!Explored.contains(GEP->getOperand(0)))
@@ -627,7 +627,7 @@ static void setInsertionPoint(IRBuilder<> &Builder, Value *V,
/// Returns a re-written value of Start as an indexed GEP using Base as a
/// pointer.
-static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
+static Value *rewriteGEPAsOffset(Type *ElemTy, Value *Start, Value *Base,
const DataLayout &DL,
SetVector<Value *> &Explored) {
// Perform all the substitutions. This is a bit tricky because we can
@@ -714,6 +714,8 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
}
}
+ PointerType *PtrTy =
+ ElemTy->getPointerTo(Start->getType()->getPointerAddressSpace());
for (Value *Val : Explored) {
if (Val == Base)
continue;
@@ -722,22 +724,14 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
// a GEP or a GEP + ptrtoint.
setInsertionPoint(Builder, Val, false);
- // If required, create an inttoptr instruction for Base.
- Value *NewBase = Base;
- if (!Base->getType()->isPointerTy())
- NewBase = Builder.CreateBitOrPointerCast(Base, Start->getType(),
- Start->getName() + "to.ptr");
-
- Value *GEP = Builder.CreateInBoundsGEP(
- Start->getType()->getPointerElementType(), NewBase,
- makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr");
-
- if (!Val->getType()->isPointerTy()) {
- Value *Cast = Builder.CreatePointerCast(GEP, Val->getType(),
- Val->getName() + ".conv");
- GEP = Cast;
- }
- Val->replaceAllUsesWith(GEP);
+ // Cast base to the expected type.
+ Value *NewVal = Builder.CreateBitOrPointerCast(
+ Base, PtrTy, Start->getName() + "to.ptr");
+ NewVal = Builder.CreateInBoundsGEP(
+ ElemTy, NewVal, makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr");
+ NewVal = Builder.CreateBitOrPointerCast(
+ NewVal, Val->getType(), Val->getName() + ".conv");
+ Val->replaceAllUsesWith(NewVal);
}
return NewInsts[Start];
@@ -747,7 +741,7 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
/// the input Value as a constant indexed GEP. Returns a pair containing
/// the GEPs Pointer and Index.
static std::pair<Value *, Value *>
-getAsConstantIndexedAddress(Value *V, const DataLayout &DL) {
+getAsConstantIndexedAddress(Type *ElemTy, Value *V, const DataLayout &DL) {
Type *IndexType = IntegerType::get(V->getContext(),
DL.getIndexTypeSizeInBits(V->getType()));
@@ -759,7 +753,7 @@ getAsConstantIndexedAddress(Value *V, const DataLayout &DL) {
if (!GEP->isInBounds())
break;
if (GEP->hasAllConstantIndices() && GEP->getNumIndices() == 1 &&
- GEP->getType() == V->getType()) {
+ GEP->getSourceElementType() == ElemTy) {
V = GEP->getOperand(0);
Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1));
Index = ConstantExpr::getAdd(
@@ -798,17 +792,14 @@ static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
if (!GEPLHS->hasAllConstantIndices())
return nullptr;
- // Make sure the pointers have the same type.
- if (GEPLHS->getType() != RHS->getType())
- return nullptr;
-
+ Type *ElemTy = GEPLHS->getSourceElementType();
Value *PtrBase, *Index;
- std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL);
+ std::tie(PtrBase, Index) = getAsConstantIndexedAddress(ElemTy, GEPLHS, DL);
// The set of nodes that will take part in this transformation.
SetVector<Value *> Nodes;
- if (!canRewriteGEPAsOffset(RHS, PtrBase, DL, Nodes))
+ if (!canRewriteGEPAsOffset(ElemTy, RHS, PtrBase, DL, Nodes))
return nullptr;
// We know we can re-write this as
@@ -817,7 +808,7 @@ static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
// can't have overflow on either side. We can therefore re-write
// this as:
// OFFSET1 cmp OFFSET2
- Value *NewRHS = rewriteGEPAsOffset(RHS, PtrBase, DL, Nodes);
+ Value *NewRHS = rewriteGEPAsOffset(ElemTy, RHS, PtrBase, DL, Nodes);
// RewriteGEPAsOffset has replaced RHS and all of its uses with a re-written
// GEP having PtrBase as the pointer base, and has returned in NewRHS the
@@ -894,9 +885,10 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
// If the base pointers are different, but the indices are the same, just
// compare the base pointer.
if (PtrBase != GEPRHS->getOperand(0)) {
- bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands();
- IndicesTheSame &= GEPLHS->getOperand(0)->getType() ==
- GEPRHS->getOperand(0)->getType();
+ bool IndicesTheSame =
+ GEPLHS->getNumOperands() == GEPRHS->getNumOperands() &&
+ GEPLHS->getType() == GEPRHS->getType() &&
+ GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType();
if (IndicesTheSame)
for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
@@ -1271,8 +1263,8 @@ static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
// This is only really a signed overflow check if the inputs have been
// sign-extended; check for that condition. For example, if CI2 is 2^31 and
// the operands of the add are 64 bits wide, we need at least 33 sign bits.
- if (IC.ComputeMinSignedBits(A, 0, &I) > NewWidth ||
- IC.ComputeMinSignedBits(B, 0, &I) > NewWidth)
+ if (IC.ComputeMaxSignificantBits(A, 0, &I) > NewWidth ||
+ IC.ComputeMaxSignificantBits(B, 0, &I) > NewWidth)
return nullptr;
// In order to replace the original add with a narrower
@@ -2221,7 +2213,7 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
// icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0
Value *X = Shr->getOperand(0);
CmpInst::Predicate Pred = Cmp.getPredicate();
- if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() && C.isZero())
+ if (Cmp.isEquality() && Shr->isExact() && C.isZero())
return new ICmpInst(Pred, X, Cmp.getOperand(1));
const APInt *ShiftVal;
@@ -2247,9 +2239,10 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
// those conditions rather than checking them. This is difficult because of
// undef/poison (PR34838).
if (IsAShr) {
- if (Pred == CmpInst::ICMP_SLT || (Pred == CmpInst::ICMP_SGT && IsExact)) {
- // icmp slt (ashr X, ShAmtC), C --> icmp slt X, (C << ShAmtC)
- // icmp sgt (ashr exact X, ShAmtC), C --> icmp sgt X, (C << ShAmtC)
+ if (IsExact || Pred == CmpInst::ICMP_SLT || Pred == CmpInst::ICMP_ULT) {
+      // When C can be shifted left by ShAmtC without losing bits:
+ // icmp PRED (ashr exact X, ShAmtC), C --> icmp PRED X, (C << ShAmtC)
+ // icmp slt/ult (ashr X, ShAmtC), C --> icmp slt/ult X, (C << ShAmtC)
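+      // e.g. with i8: icmp slt (ashr X, 2), -5 --> icmp slt X, -20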
APInt ShiftedC = C.shl(ShAmtVal);
if (ShiftedC.ashr(ShAmtVal) == C)
return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
@@ -2261,6 +2254,12 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
(ShiftedC + 1).ashr(ShAmtVal) == (C + 1))
return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
}
+ if (Pred == CmpInst::ICMP_UGT) {
+ // icmp ugt (ashr X, ShAmtC), C --> icmp ugt X, ((C + 1) << ShAmtC) - 1
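+      // e.g. with i8: icmp ugt (ashr X, 3), 2 --> icmp ugt X, 23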
+ APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1;
+ if ((ShiftedC + 1).ashr(ShAmtVal) == (C + 1))
+ return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
+ }
// If the compare constant has significant bits above the lowest sign-bit,
// then convert an unsigned cmp to a test of the sign-bit:
@@ -3957,6 +3956,33 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
(Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE))
return new ICmpInst(Pred, X, Builder.CreateNot(Op0));
+ {
+ // Similar to above: an unsigned overflow comparison may use offset + mask:
+ // ((Op1 + C) & C) u< Op1 --> Op1 != 0
+ // ((Op1 + C) & C) u>= Op1 --> Op1 == 0
+ // Op0 u> ((Op0 + C) & C) --> Op0 != 0
+ // Op0 u<= ((Op0 + C) & C) --> Op0 == 0
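+    // (This holds because C is a low-bit mask, so (Op1 + C) & C is
+    // (Op1 - 1) & C, which is u< Op1 exactly when Op1 != 0.)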
+ BinaryOperator *BO;
+ const APInt *C;
+ if ((Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE) &&
+ match(Op0, m_And(m_BinOp(BO), m_LowBitMask(C))) &&
+ match(BO, m_Add(m_Specific(Op1), m_SpecificIntAllowUndef(*C)))) {
+ CmpInst::Predicate NewPred =
+ Pred == ICmpInst::ICMP_ULT ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
+ Constant *Zero = ConstantInt::getNullValue(Op1->getType());
+ return new ICmpInst(NewPred, Op1, Zero);
+ }
+
+ if ((Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE) &&
+ match(Op1, m_And(m_BinOp(BO), m_LowBitMask(C))) &&
+ match(BO, m_Add(m_Specific(Op0), m_SpecificIntAllowUndef(*C)))) {
+ CmpInst::Predicate NewPred =
+ Pred == ICmpInst::ICMP_UGT ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
+ Constant *Zero = ConstantInt::getNullValue(Op1->getType());
+ return new ICmpInst(NewPred, Op0, Zero);
+ }
+ }
+
bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
if (BO0 && isa<OverflowingBinaryOperator>(BO0))
NoOp0WrapProblem =
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 39b55b028110..7743b4c41555 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -148,6 +148,8 @@ public:
Instruction *SliceUpIllegalIntegerPHI(PHINode &PN);
Instruction *visitPHINode(PHINode &PN);
Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
+ Instruction *visitGEPOfGEP(GetElementPtrInst &GEP, GEPOperator *Src);
+ Instruction *visitGEPOfBitcast(BitCastInst *BCI, GetElementPtrInst &GEP);
Instruction *visitAllocaInst(AllocaInst &AI);
Instruction *visitAllocSite(Instruction &FI);
Instruction *visitFree(CallInst &FI);
@@ -195,8 +197,6 @@ private:
bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
bool shouldChangeType(Type *From, Type *To) const;
Value *dyn_castNegVal(Value *V) const;
- Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
- SmallVectorImpl<Value *> &NewIndices);
/// Classify whether a cast is worth optimizing.
///
@@ -607,6 +607,16 @@ public:
/// only possible if all operands to the PHI are constants).
Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN);
+ /// For a binary operator with 2 phi operands, try to hoist the binary
+ /// operation before the phi. This can result in fewer instructions in
+ /// patterns where at least one set of phi operands simplifies.
+ /// Example:
+ /// BB3: binop (phi [X, BB1], [C1, BB2]), (phi [Y, BB1], [C2, BB2])
+ /// -->
+ /// BB1: BO = binop X, Y
+ /// BB3: phi [BO, BB1], [(binop C1, C2), BB2]
+ Instruction *foldBinopWithPhiOperands(BinaryOperator &BO);
+
/// Given an instruction with a select as one operand and a constant as the
/// other operand, try to fold the binary operator into the select arguments.
/// This also works for Cast instructions, which obviously do not have a
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 0dbfdba353c4..756792918dba 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -301,16 +301,17 @@ void PointerReplacer::replace(Instruction *I) {
assert(V && "Operand not replaced");
SmallVector<Value *, 8> Indices;
Indices.append(GEP->idx_begin(), GEP->idx_end());
- auto *NewI = GetElementPtrInst::Create(
- V->getType()->getPointerElementType(), V, Indices);
+ auto *NewI =
+ GetElementPtrInst::Create(GEP->getSourceElementType(), V, Indices);
IC.InsertNewInstWith(NewI, *GEP);
NewI->takeName(GEP);
WorkMap[GEP] = NewI;
} else if (auto *BC = dyn_cast<BitCastInst>(I)) {
auto *V = getReplacement(BC->getOperand(0));
assert(V && "Operand not replaced");
- auto *NewT = PointerType::get(BC->getType()->getPointerElementType(),
- V->getType()->getPointerAddressSpace());
+ auto *NewT = PointerType::getWithSamePointeeType(
+ cast<PointerType>(BC->getType()),
+ V->getType()->getPointerAddressSpace());
auto *NewI = new BitCastInst(V, NewT);
IC.InsertNewInstWith(NewI, *BC);
NewI->takeName(BC);
@@ -345,8 +346,7 @@ void PointerReplacer::replacePointer(Instruction &I, Value *V) {
#ifndef NDEBUG
auto *PT = cast<PointerType>(I.getType());
auto *NT = cast<PointerType>(V->getType());
- assert(PT != NT && PT->getElementType() == NT->getElementType() &&
- "Invalid usage");
+ assert(PT != NT && PT->hasSameElementTypeAs(NT) && "Invalid usage");
#endif
WorkMap[&I] = V;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index aca7ec8d7325..1aa10b550fc4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -155,6 +155,9 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
@@ -348,13 +351,21 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
return CastInst::Create(Instruction::SExt, And, I.getType());
}
- // (bool X) * Y --> X ? Y : 0
- // Y * (bool X) --> X ? Y : 0
+ // (zext bool X) * Y --> X ? Y : 0
+ // Y * (zext bool X) --> X ? Y : 0
if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
return SelectInst::Create(X, Op1, ConstantInt::get(I.getType(), 0));
if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
return SelectInst::Create(X, Op0, ConstantInt::get(I.getType(), 0));
+ // (sext bool X) * C --> X ? -C : 0
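+  // e.g. mul (sext i1 X), 7 --> select X, -7, 0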
+ Constant *ImmC;
+ if (match(Op0, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1) &&
+ match(Op1, m_ImmConstant(ImmC))) {
+ Constant *NegC = ConstantExpr::getNeg(ImmC);
+ return SelectInst::Create(X, NegC, ConstantInt::getNullValue(I.getType()));
+ }
+
// (lshr X, 31) * Y --> (ashr X, 31) & Y
// Y * (lshr X, 31) --> (ashr X, 31) & Y
// TODO: We are not checking one-use because the elimination of the multiply
@@ -442,6 +453,9 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
return FoldedMul;
@@ -742,6 +756,9 @@ static bool isMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
/// division instructions.
/// Common integer divide transforms
Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) {
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
bool IsSigned = I.getOpcode() == Instruction::SDiv;
Type *Ty = I.getType();
@@ -1359,6 +1376,9 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
if (Instruction *R = foldFDivConstantDivisor(I))
return R;
@@ -1460,6 +1480,9 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
/// remainder instructions.
/// Common integer remainder transforms
Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) {
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
// The RHS is known non-zero.
@@ -1638,5 +1661,8 @@ Instruction *InstCombinerImpl::visitFRem(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
return nullptr;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index a6d6b5199105..65e60498ff95 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -943,7 +943,7 @@ static Instruction *foldSelectCtlzToCttz(ICmpInst *ICI, Value *TrueVal,
}
/// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single
-/// call to cttz/ctlz with flag 'is_zero_undef' cleared.
+/// call to cttz/ctlz with flag 'is_zero_poison' cleared.
///
/// For example, we can fold the following code sequence:
/// \code
@@ -987,7 +987,7 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
// sizeof in bits of 'Count'.
unsigned SizeOfInBits = Count->getType()->getScalarSizeInBits();
if (match(ValueOnZero, m_SpecificInt(SizeOfInBits))) {
- // Explicitly clear the 'undef_on_zero' flag. It's always valid to go from
+ // Explicitly clear the 'is_zero_poison' flag. It's always valid to go from
// true to false on this flag, so we can replace it for all users.
II->setArgOperand(1, ConstantInt::getFalse(II->getContext()));
return SelectArg;
@@ -995,7 +995,7 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
// The ValueOnZero is not the bitwidth. But if the cttz/ctlz (and optional
// zext/trunc) have one use (ending at the select), the cttz/ctlz result will
- // not be used if the input is zero. Relax to 'undef_on_zero' for that case.
+ // not be used if the input is zero. Relax to 'zero is poison' for that case.
if (II->hasOneUse() && SelectArg->hasOneUse() &&
!match(II->getArgOperand(1), m_One()))
II->setArgOperand(1, ConstantInt::getTrue(II->getContext()));
@@ -2325,8 +2325,9 @@ Instruction *InstCombinerImpl::matchSAddSubSat(Instruction &MinMax1) {
// The two operands of the add/sub must be nsw-truncatable to the NewTy. This
// is usually achieved via a sext from a smaller type.
- if (ComputeMinSignedBits(AddSub->getOperand(0), 0, AddSub) > NewBitWidth ||
- ComputeMinSignedBits(AddSub->getOperand(1), 0, AddSub) > NewBitWidth)
+ if (ComputeMaxSignificantBits(AddSub->getOperand(0), 0, AddSub) >
+ NewBitWidth ||
+ ComputeMaxSignificantBits(AddSub->getOperand(1), 0, AddSub) > NewBitWidth)
return nullptr;
// Finally create and return the sat intrinsic, truncated to the new type
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 06421d553915..17f0c5c4cff0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -369,6 +369,9 @@ static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I,
}
Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) {
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
assert(Op0->getType() == Op1->getType());
@@ -1032,12 +1035,13 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
NewLShr->setIsExact(I.isExact());
return NewLShr;
}
- // (X << C1) >>u C --> (X >>u (C - C1)) & (-1 >> C)
- Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact());
- APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC));
- return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask));
- }
- if (C1->ugt(ShAmtC)) {
+ if (Op0->hasOneUse()) {
+ // (X << C1) >>u C --> (X >>u (C - C1)) & (-1 >> C)
+ Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact());
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC));
+ return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask));
+ }
+ } else if (C1->ugt(ShAmtC)) {
unsigned ShlAmtC = C1->getZExtValue();
Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmtC - ShAmtC);
if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
@@ -1046,15 +1050,33 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
NewShl->setHasNoUnsignedWrap(true);
return NewShl;
}
- // (X << C1) >>u C --> X << (C1 - C) & (-1 >> C)
- Value *NewShl = Builder.CreateShl(X, ShiftDiff);
+ if (Op0->hasOneUse()) {
+ // (X << C1) >>u C --> X << (C1 - C) & (-1 >> C)
+ Value *NewShl = Builder.CreateShl(X, ShiftDiff);
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC));
+ return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask));
+ }
+ } else {
+ assert(*C1 == ShAmtC);
+ // (X << C) >>u C --> X & (-1 >>u C)
APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC));
- return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask));
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
}
- assert(*C1 == ShAmtC);
- // (X << C) >>u C --> X & (-1 >>u C)
- APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC));
- return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
+ }
+
+ // ((X << C) + Y) >>u C --> (X + (Y >>u C)) & (-1 >>u C)
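+  // e.g. with i8 and C == 3: ((X << 3) + Y) >>u 3 --> (X + (Y >>u 3)) & 31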
+ // TODO: Consolidate with the more general transform that starts from shl
+ // (the shifts are in the opposite order).
+ Value *Y;
+ if (match(Op0,
+ m_OneUse(m_c_Add(m_OneUse(m_Shl(m_Value(X), m_Specific(Op1))),
+ m_Value(Y))))) {
+ Value *NewLshr = Builder.CreateLShr(Y, Op1);
+ Value *NewAdd = Builder.CreateAdd(NewLshr, X);
+ unsigned Op1Val = C->getLimitedValue(BitWidth);
+ APInt Bits = APInt::getLowBitsSet(BitWidth, BitWidth - Op1Val);
+ Constant *Mask = ConstantInt::get(Ty, Bits);
+ return BinaryOperator::CreateAnd(NewAdd, Mask);
}
if (match(Op0, m_OneUse(m_ZExt(m_Value(X)))) &&
@@ -1094,7 +1116,6 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
}
}
- Value *Y;
if (ShAmtC == BitWidth - 1) {
// lshr i32 or(X,-X), 31 --> zext (X != 0)
if (match(Op0, m_OneUse(m_c_Or(m_Neg(m_Value(X)), m_Deferred(X)))))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 4dc712f32536..71a5ae24eead 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -800,22 +800,21 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// Round NTZ down to the next byte. If we have 11 trailing zeros, then
// we need all the bits down to bit 8. Likewise, round NLZ. If we
// have 14 leading zeros, round to 8.
- NLZ &= ~7;
- NTZ &= ~7;
+ NLZ = alignDown(NLZ, 8);
+ NTZ = alignDown(NTZ, 8);
// If we need exactly one byte, we can do this transformation.
- if (BitWidth-NLZ-NTZ == 8) {
- unsigned ResultBit = NTZ;
- unsigned InputBit = BitWidth-NTZ-8;
-
+ if (BitWidth - NLZ - NTZ == 8) {
// Replace this with either a left or right shift to get the byte into
// the right place.
Instruction *NewVal;
- if (InputBit > ResultBit)
- NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0),
- ConstantInt::get(I->getType(), InputBit-ResultBit));
+ if (NLZ > NTZ)
+ NewVal = BinaryOperator::CreateLShr(
+ II->getArgOperand(0),
+ ConstantInt::get(I->getType(), NLZ - NTZ));
else
- NewVal = BinaryOperator::CreateShl(II->getArgOperand(0),
- ConstantInt::get(I->getType(), ResultBit-InputBit));
+ NewVal = BinaryOperator::CreateShl(
+ II->getArgOperand(0),
+ ConstantInt::get(I->getType(), NTZ - NLZ));
NewVal->takeName(I);
return InsertNewInstWith(NewVal, *I);
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index c6a4602e59e3..736cf9c825d5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -495,8 +495,7 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
}
GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
- cast<PointerType>(NewPtr->getType())->getElementType(), NewPtr,
- NewOps);
+ GEP->getSourceElementType(), NewPtr, NewOps);
NewGEP->setIsInBounds(GEP->isInBounds());
return NewGEP;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index eb5eadba194d..029be5257694 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1027,13 +1027,11 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
if (!ConstIsRHS)
std::swap(Op0, Op1);
- auto *BO = cast<BinaryOperator>(&I);
- Value *RI = Builder.CreateBinOp(BO->getOpcode(), Op0, Op1,
- SO->getName() + ".op");
- auto *FPInst = dyn_cast<Instruction>(RI);
- if (FPInst && isa<FPMathOperator>(FPInst))
- FPInst->copyFastMathFlags(BO);
- return RI;
+ Value *NewBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), Op0,
+ Op1, SO->getName() + ".op");
+ if (auto *NewBOI = dyn_cast<Instruction>(NewBO))
+ NewBOI->copyIRFlags(&I);
+ return NewBO;
}
Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op,
@@ -1289,6 +1287,70 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
return replaceInstUsesWith(I, NewPN);
}
+Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) {
+ // TODO: This should be similar to the incoming values check in foldOpIntoPhi:
+ // we are guarding against replicating the binop in >1 predecessor.
+ // This could miss matching a phi with 2 constant incoming values.
+ auto *Phi0 = dyn_cast<PHINode>(BO.getOperand(0));
+ auto *Phi1 = dyn_cast<PHINode>(BO.getOperand(1));
+ if (!Phi0 || !Phi1 || !Phi0->hasOneUse() || !Phi1->hasOneUse() ||
+ Phi0->getNumOperands() != 2 || Phi1->getNumOperands() != 2)
+ return nullptr;
+
+  // TODO: Remove the restriction that the binop and both phis must be in the
+  // same block.
+ if (BO.getParent() != Phi0->getParent() ||
+ BO.getParent() != Phi1->getParent())
+ return nullptr;
+
+ // Match a pair of incoming constants for one of the predecessor blocks.
+ BasicBlock *ConstBB, *OtherBB;
+ Constant *C0, *C1;
+ if (match(Phi0->getIncomingValue(0), m_ImmConstant(C0))) {
+ ConstBB = Phi0->getIncomingBlock(0);
+ OtherBB = Phi0->getIncomingBlock(1);
+ } else if (match(Phi0->getIncomingValue(1), m_ImmConstant(C0))) {
+ ConstBB = Phi0->getIncomingBlock(1);
+ OtherBB = Phi0->getIncomingBlock(0);
+ } else {
+ return nullptr;
+ }
+ if (!match(Phi1->getIncomingValueForBlock(ConstBB), m_ImmConstant(C1)))
+ return nullptr;
+
+ // The block that we are hoisting to must reach here unconditionally.
+ // Otherwise, we could be speculatively executing an expensive or
+ // non-speculative op.
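+  // (For example, hoisting a udiv into OtherBB would be unsafe if OtherBB
+  // could branch around the block that contains the binop.)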
+ auto *PredBlockBranch = dyn_cast<BranchInst>(OtherBB->getTerminator());
+ if (!PredBlockBranch || PredBlockBranch->isConditional() ||
+ !DT.isReachableFromEntry(OtherBB))
+ return nullptr;
+
+ // TODO: This check could be tightened to only apply to binops (div/rem) that
+ // are not safe to speculatively execute. But that could allow hoisting
+ // potentially expensive instructions (fdiv for example).
+ for (auto BBIter = BO.getParent()->begin(); &*BBIter != &BO; ++BBIter)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*BBIter))
+ return nullptr;
+
+ // Make a new binop in the predecessor block with the non-constant incoming
+ // values.
+ Builder.SetInsertPoint(PredBlockBranch);
+ Value *NewBO = Builder.CreateBinOp(BO.getOpcode(),
+ Phi0->getIncomingValueForBlock(OtherBB),
+ Phi1->getIncomingValueForBlock(OtherBB));
+ if (auto *NotFoldedNewBO = dyn_cast<BinaryOperator>(NewBO))
+ NotFoldedNewBO->copyIRFlags(&BO);
+
+ // Fold constants for the predecessor block with constant incoming values.
+ Constant *NewC = ConstantExpr::get(BO.getOpcode(), C0, C1);
+
+ // Replace the binop with a phi of the new values. The old phis are dead.
+ PHINode *NewPhi = PHINode::Create(BO.getType(), 2);
+ NewPhi->addIncoming(NewBO, OtherBB);
+ NewPhi->addIncoming(NewC, ConstBB);
+ return NewPhi;
+}
+
Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
if (!isa<Constant>(I.getOperand(1)))
return nullptr;
@@ -1307,10 +1369,11 @@ Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
/// is a sequence of GEP indices into the pointed type that will land us at the
/// specified offset. If so, fill them into NewIndices and return the resultant
/// element type, otherwise return null.
-Type *
-InstCombinerImpl::FindElementAtOffset(PointerType *PtrTy, int64_t IntOffset,
- SmallVectorImpl<Value *> &NewIndices) {
- Type *Ty = PtrTy->getElementType();
+static Type *findElementAtOffset(PointerType *PtrTy, int64_t IntOffset,
+ SmallVectorImpl<Value *> &NewIndices,
+ const DataLayout &DL) {
+ // Only used by visitGEPOfBitcast(), which is skipped for opaque pointers.
+ Type *Ty = PtrTy->getNonOpaquePointerElementType();
if (!Ty->isSized())
return nullptr;
@@ -1320,7 +1383,7 @@ InstCombinerImpl::FindElementAtOffset(PointerType *PtrTy, int64_t IntOffset,
return nullptr;
for (const APInt &Index : Indices)
- NewIndices.push_back(Builder.getInt(Index));
+ NewIndices.push_back(ConstantInt::get(PtrTy->getContext(), Index));
return Ty;
}
@@ -1884,12 +1947,254 @@ static Instruction *foldSelectGEP(GetElementPtrInst &GEP,
return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel);
}
+Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP,
+ GEPOperator *Src) {
+ // Combine Indices - If the source pointer to this getelementptr instruction
+ // is a getelementptr instruction with matching element type, combine the
+ // indices of the two getelementptr instructions into a single instruction.
+ if (Src->getResultElementType() != GEP.getSourceElementType())
+ return nullptr;
+
+ if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
+ return nullptr;
+
+ if (Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 &&
+ Src->hasOneUse()) {
+ Value *GO1 = GEP.getOperand(1);
+ Value *SO1 = Src->getOperand(1);
+
+ if (LI) {
+ // Try to reassociate loop invariant GEP chains to enable LICM.
+ if (Loop *L = LI->getLoopFor(GEP.getParent())) {
+ // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is
+ // invariant: this breaks the dependence between GEPs and allows LICM
+ // to hoist the invariant part out of the loop.
+ if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) {
+ // We have to be careful here.
+ // We have something like:
+ // %src = getelementptr <ty>, <ty>* %base, <ty> %idx
+ // %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2
+          // If we just swap idx & idx2 then we could inadvertently
+ // change %src from a vector to a scalar, or vice versa.
+ // Cases:
+ // 1) %base a scalar & idx a scalar & idx2 a vector
+ // => Swapping idx & idx2 turns %src into a vector type.
+ // 2) %base a scalar & idx a vector & idx2 a scalar
+          //    => Swapping idx & idx2 turns %src into a scalar type
+ // 3) %base, %idx, and %idx2 are scalars
+ // => %src & %gep are scalars
+ // => swapping idx & idx2 is safe
+ // 4) %base a vector
+ // => %src is a vector
+ // => swapping idx & idx2 is safe.
+ auto *SO0 = Src->getOperand(0);
+ auto *SO0Ty = SO0->getType();
+ if (!isa<VectorType>(GEP.getType()) || // case 3
+ isa<VectorType>(SO0Ty)) { // case 4
+ Src->setOperand(1, GO1);
+ GEP.setOperand(1, SO1);
+ return &GEP;
+ } else {
+ // Case 1 or 2
+ // -- have to recreate %src & %gep
+ // put NewSrc at same location as %src
+ Builder.SetInsertPoint(cast<Instruction>(Src));
+ Value *NewSrc = Builder.CreateGEP(
+ GEP.getSourceElementType(), SO0, GO1, Src->getName());
+ // Propagate 'inbounds' if the new source was not constant-folded.
+ if (auto *NewSrcGEPI = dyn_cast<GetElementPtrInst>(NewSrc))
+ NewSrcGEPI->setIsInBounds(Src->isInBounds());
+ GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+ GEP.getSourceElementType(), NewSrc, {SO1});
+ NewGEP->setIsInBounds(GEP.isInBounds());
+ return NewGEP;
+ }
+ }
+ }
+ }
+ }
+
+ // Note that if our source is a gep chain itself then we wait for that
+ // chain to be resolved before we perform this transformation. This
+ // avoids us creating a TON of code in some cases.
+ if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0)))
+ if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
+ return nullptr; // Wait until our source is folded to completion.
+
+ SmallVector<Value*, 8> Indices;
+
+ // Find out whether the last index in the source GEP is a sequential idx.
+ bool EndsWithSequential = false;
+ for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
+ I != E; ++I)
+ EndsWithSequential = I.isSequential();
+
+ // Can we combine the two pointer arithmetics offsets?
+ if (EndsWithSequential) {
+ // Replace: gep (gep %P, long B), long A, ...
+ // With: T = long A+B; gep %P, T, ...
+ Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
+ Value *GO1 = GEP.getOperand(1);
+
+ // If they aren't the same type, then the input hasn't been processed
+ // by the loop above yet (which canonicalizes sequential index types to
+ // intptr_t). Just avoid transforming this until the input has been
+ // normalized.
+ if (SO1->getType() != GO1->getType())
+ return nullptr;
+
+ Value *Sum =
+ SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
+ // Only do the combine when we are sure the cost after the
+ // merge is never more than that before the merge.
+ if (Sum == nullptr)
+ return nullptr;
+
+ // Update the GEP in place if possible.
+ if (Src->getNumOperands() == 2) {
+ GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)));
+ replaceOperand(GEP, 0, Src->getOperand(0));
+ replaceOperand(GEP, 1, Sum);
+ return &GEP;
+ }
+ Indices.append(Src->op_begin()+1, Src->op_end()-1);
+ Indices.push_back(Sum);
+ Indices.append(GEP.op_begin()+2, GEP.op_end());
+ } else if (isa<Constant>(*GEP.idx_begin()) &&
+ cast<Constant>(*GEP.idx_begin())->isNullValue() &&
+ Src->getNumOperands() != 1) {
+ // Otherwise we can do the fold if the first index of the GEP is a zero
+ Indices.append(Src->op_begin()+1, Src->op_end());
+ Indices.append(GEP.idx_begin()+1, GEP.idx_end());
+ }
+
+ if (!Indices.empty())
+ return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))
+ ? GetElementPtrInst::CreateInBounds(
+ Src->getSourceElementType(), Src->getOperand(0), Indices,
+ GEP.getName())
+ : GetElementPtrInst::Create(Src->getSourceElementType(),
+ Src->getOperand(0), Indices,
+ GEP.getName());
+
+ return nullptr;
+}
+
+// Note that we may have also stripped an address space cast in between.
+Instruction *InstCombinerImpl::visitGEPOfBitcast(BitCastInst *BCI,
+ GetElementPtrInst &GEP) {
+ // With opaque pointers, there is no pointer element type we can use to
+ // adjust the GEP type.
+ PointerType *SrcType = cast<PointerType>(BCI->getSrcTy());
+ if (SrcType->isOpaque())
+ return nullptr;
+
+ Type *GEPEltType = GEP.getSourceElementType();
+ Type *SrcEltType = SrcType->getNonOpaquePointerElementType();
+ Value *SrcOp = BCI->getOperand(0);
+
+ // GEP directly using the source operand if this GEP is accessing an element
+ // of a bitcasted pointer to vector or array of the same dimensions:
+ // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z
+ // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z
+ auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy,
+ const DataLayout &DL) {
+ auto *VecVTy = cast<FixedVectorType>(VecTy);
+ return ArrTy->getArrayElementType() == VecVTy->getElementType() &&
+ ArrTy->getArrayNumElements() == VecVTy->getNumElements() &&
+ DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy);
+ };
+ if (GEP.getNumOperands() == 3 &&
+ ((GEPEltType->isArrayTy() && isa<FixedVectorType>(SrcEltType) &&
+ areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) ||
+ (isa<FixedVectorType>(GEPEltType) && SrcEltType->isArrayTy() &&
+ areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) {
+
+ // Create a new GEP here, as using `setOperand()` followed by
+ // `setSourceElementType()` won't actually update the type of the
+    // existing GEP Value, causing issues if this Value is accessed when
+    // constructing an AddrSpaceCastInst.
+ SmallVector<Value *, 8> Indices(GEP.indices());
+ Value *NGEP = GEP.isInBounds()
+ ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, Indices)
+ : Builder.CreateGEP(SrcEltType, SrcOp, Indices);
+ NGEP->takeName(&GEP);
+
+ // Preserve GEP address space to satisfy users
+ if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(NGEP, GEP.getType());
+
+ return replaceInstUsesWith(GEP, NGEP);
+ }
+
+ // See if we can simplify:
+ // X = bitcast A* to B*
+ // Y = gep X, <...constant indices...>
+ // into a gep of the original struct. This is important for SROA and alias
+ // analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
+ unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEP.getType());
+ APInt Offset(OffsetBits, 0);
+
+  // If the bitcast argument is an allocation, the bitcast is a conversion
+  // to the actual type of the allocation. Removing such bitcasts results in
+  // GEPs with an i8* base and pure byte offsets. That means the GEP is not
+  // aware of the struct or array hierarchy.
+ // By avoiding such GEPs, phi translation and MemoryDependencyAnalysis have
+ // a better chance to succeed.
+ if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset) &&
+ !isAllocationFn(SrcOp, &TLI)) {
+ // If this GEP instruction doesn't move the pointer, just replace the GEP
+ // with a bitcast of the real input to the dest type.
+ if (!Offset) {
+ // If the bitcast is of an allocation, and the allocation will be
+ // converted to match the type of the cast, don't touch this.
+ if (isa<AllocaInst>(SrcOp)) {
+ // See if the bitcast simplifies, if so, don't nuke this GEP yet.
+ if (Instruction *I = visitBitCast(*BCI)) {
+ if (I != BCI) {
+ I->takeName(BCI);
+ BCI->getParent()->getInstList().insert(BCI->getIterator(), I);
+ replaceInstUsesWith(*BCI, I);
+ }
+ return &GEP;
+ }
+ }
+
+ if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(SrcOp, GEP.getType());
+ return new BitCastInst(SrcOp, GEP.getType());
+ }
+
+ // Otherwise, if the offset is non-zero, we need to find out if there is a
+ // field at Offset in 'A's type. If so, we can pull the cast through the
+ // GEP.
+ SmallVector<Value*, 8> NewIndices;
+ if (findElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices, DL)) {
+ Value *NGEP =
+ GEP.isInBounds()
+ ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices)
+ : Builder.CreateGEP(SrcEltType, SrcOp, NewIndices);
+
+ if (NGEP->getType() == GEP.getType())
+ return replaceInstUsesWith(GEP, NGEP);
+ NGEP->takeName(&GEP);
+
+ if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(NGEP, GEP.getType());
+ return new BitCastInst(NGEP, GEP.getType());
+ }
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
- SmallVector<Value *, 8> Ops(GEP.operands());
+ Value *PtrOp = GEP.getOperand(0);
+ SmallVector<Value *, 8> Indices(GEP.indices());
Type *GEPType = GEP.getType();
Type *GEPEltType = GEP.getSourceElementType();
bool IsGEPSrcEleScalable = isa<ScalableVectorType>(GEPEltType);
- if (Value *V = SimplifyGEPInst(GEPEltType, Ops, GEP.isInBounds(),
+ if (Value *V = SimplifyGEPInst(GEPEltType, PtrOp, Indices, GEP.isInBounds(),
SQ.getWithInstruction(&GEP)))
return replaceInstUsesWith(GEP, V);
@@ -1912,8 +2217,6 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// undef elements to decrease demanded bits
}
- Value *PtrOp = GEP.getOperand(0);
-
// Eliminate unneeded casts for indices, and replace indices which displace
// by multiples of a zero size type with zero.
bool MadeChange = false;
@@ -2063,132 +2366,9 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
PtrOp = NewGEP;
}
- // Combine Indices - If the source pointer to this getelementptr instruction
- // is a getelementptr instruction, combine the indices of the two
- // getelementptr instructions into a single instruction.
- if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) {
- if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
- return nullptr;
-
- if (Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 &&
- Src->hasOneUse()) {
- Value *GO1 = GEP.getOperand(1);
- Value *SO1 = Src->getOperand(1);
-
- if (LI) {
- // Try to reassociate loop invariant GEP chains to enable LICM.
- if (Loop *L = LI->getLoopFor(GEP.getParent())) {
- // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is
- // invariant: this breaks the dependence between GEPs and allows LICM
- // to hoist the invariant part out of the loop.
- if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) {
- // We have to be careful here.
- // We have something like:
- // %src = getelementptr <ty>, <ty>* %base, <ty> %idx
- // %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2
- // If we just swap idx & idx2 then we could inadvertantly
- // change %src from a vector to a scalar, or vice versa.
- // Cases:
- // 1) %base a scalar & idx a scalar & idx2 a vector
- // => Swapping idx & idx2 turns %src into a vector type.
- // 2) %base a scalar & idx a vector & idx2 a scalar
- // => Swapping idx & idx2 turns %src in a scalar type
- // 3) %base, %idx, and %idx2 are scalars
- // => %src & %gep are scalars
- // => swapping idx & idx2 is safe
- // 4) %base a vector
- // => %src is a vector
- // => swapping idx & idx2 is safe.
- auto *SO0 = Src->getOperand(0);
- auto *SO0Ty = SO0->getType();
- if (!isa<VectorType>(GEPType) || // case 3
- isa<VectorType>(SO0Ty)) { // case 4
- Src->setOperand(1, GO1);
- GEP.setOperand(1, SO1);
- return &GEP;
- } else {
- // Case 1 or 2
- // -- have to recreate %src & %gep
- // put NewSrc at same location as %src
- Builder.SetInsertPoint(cast<Instruction>(PtrOp));
- Value *NewSrc =
- Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName());
- // Propagate 'inbounds' if the new source was not constant-folded.
- if (auto *NewSrcGEPI = dyn_cast<GetElementPtrInst>(NewSrc))
- NewSrcGEPI->setIsInBounds(Src->isInBounds());
- GetElementPtrInst *NewGEP =
- GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1});
- NewGEP->setIsInBounds(GEP.isInBounds());
- return NewGEP;
- }
- }
- }
- }
- }
-
- // Note that if our source is a gep chain itself then we wait for that
- // chain to be resolved before we perform this transformation. This
- // avoids us creating a TON of code in some cases.
- if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0)))
- if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
- return nullptr; // Wait until our source is folded to completion.
-
- SmallVector<Value*, 8> Indices;
-
- // Find out whether the last index in the source GEP is a sequential idx.
- bool EndsWithSequential = false;
- for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
- I != E; ++I)
- EndsWithSequential = I.isSequential();
-
- // Can we combine the two pointer arithmetics offsets?
- if (EndsWithSequential) {
- // Replace: gep (gep %P, long B), long A, ...
- // With: T = long A+B; gep %P, T, ...
- Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
- Value *GO1 = GEP.getOperand(1);
-
- // If they aren't the same type, then the input hasn't been processed
- // by the loop above yet (which canonicalizes sequential index types to
- // intptr_t). Just avoid transforming this until the input has been
- // normalized.
- if (SO1->getType() != GO1->getType())
- return nullptr;
-
- Value *Sum =
- SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
- // Only do the combine when we are sure the cost after the
- // merge is never more than that before the merge.
- if (Sum == nullptr)
- return nullptr;
-
- // Update the GEP in place if possible.
- if (Src->getNumOperands() == 2) {
- GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)));
- replaceOperand(GEP, 0, Src->getOperand(0));
- replaceOperand(GEP, 1, Sum);
- return &GEP;
- }
- Indices.append(Src->op_begin()+1, Src->op_end()-1);
- Indices.push_back(Sum);
- Indices.append(GEP.op_begin()+2, GEP.op_end());
- } else if (isa<Constant>(*GEP.idx_begin()) &&
- cast<Constant>(*GEP.idx_begin())->isNullValue() &&
- Src->getNumOperands() != 1) {
- // Otherwise we can do the fold if the first index of the GEP is a zero
- Indices.append(Src->op_begin()+1, Src->op_end());
- Indices.append(GEP.idx_begin()+1, GEP.idx_end());
- }
-
- if (!Indices.empty())
- return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))
- ? GetElementPtrInst::CreateInBounds(
- Src->getSourceElementType(), Src->getOperand(0), Indices,
- GEP.getName())
- : GetElementPtrInst::Create(Src->getSourceElementType(),
- Src->getOperand(0), Indices,
- GEP.getName());
- }
+ if (auto *Src = dyn_cast<GEPOperator>(PtrOp))
+ if (Instruction *I = visitGEPOfGEP(GEP, Src))
+ return I;
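The gep-of-gep merge that used to live inline here (including the "gep (gep %P, long B), long A" rewrite) is now factored into the visitGEPOfGEP helper. Its net effect, sketched at the source level with plain pointer arithmetic (illustrative only):

    #include <cassert>

    int main() {
      int buf[16] = {};
      int *inner = buf + 3;      // gep %buf, 3
      int *outer = inner + 5;    // gep (gep %buf, 3), 5
      assert(outer == buf + 8);  // merged form: a single gep %buf, 3 + 5
      return 0;
    }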
// Skip if GEP source element type is scalable. The type alloc size is unknown
// at compile-time.
@@ -2234,9 +2414,13 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Value *StrippedPtr = PtrOp->stripPointerCasts();
PointerType *StrippedPtrTy = cast<PointerType>(StrippedPtr->getType());
- if (StrippedPtr != PtrOp) {
+ // TODO: The basic approach of these folds is not compatible with opaque
+ // pointers, because we can't use bitcasts as a hint for a desirable GEP
+ // type. Instead, we should perform canonicalization directly on the GEP
+ // type. For now, skip these.
+ if (StrippedPtr != PtrOp && !StrippedPtrTy->isOpaque()) {
bool HasZeroPointerIndex = false;
- Type *StrippedPtrEltTy = StrippedPtrTy->getElementType();
+ Type *StrippedPtrEltTy = StrippedPtrTy->getNonOpaquePointerElementType();
if (auto *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
HasZeroPointerIndex = C->isZero();
@@ -2420,103 +2604,9 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
ASCStrippedPtrOp = BC;
}
- if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) {
- Value *SrcOp = BCI->getOperand(0);
- PointerType *SrcType = cast<PointerType>(BCI->getSrcTy());
- Type *SrcEltType = SrcType->getElementType();
-
- // GEP directly using the source operand if this GEP is accessing an element
- // of a bitcasted pointer to vector or array of the same dimensions:
- // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z
- // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z
- auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy,
- const DataLayout &DL) {
- auto *VecVTy = cast<FixedVectorType>(VecTy);
- return ArrTy->getArrayElementType() == VecVTy->getElementType() &&
- ArrTy->getArrayNumElements() == VecVTy->getNumElements() &&
- DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy);
- };
- if (GEP.getNumOperands() == 3 &&
- ((GEPEltType->isArrayTy() && isa<FixedVectorType>(SrcEltType) &&
- areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) ||
- (isa<FixedVectorType>(GEPEltType) && SrcEltType->isArrayTy() &&
- areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) {
-
- // Create a new GEP here, as using `setOperand()` followed by
- // `setSourceElementType()` won't actually update the type of the
- // existing GEP Value. Causing issues if this Value is accessed when
- // constructing an AddrSpaceCastInst
- Value *NGEP =
- GEP.isInBounds()
- ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]})
- : Builder.CreateGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]});
- NGEP->takeName(&GEP);
-
- // Preserve GEP address space to satisfy users
- if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(NGEP, GEPType);
-
- return replaceInstUsesWith(GEP, NGEP);
- }
-
- // See if we can simplify:
- // X = bitcast A* to B*
- // Y = gep X, <...constant indices...>
- // into a gep of the original struct. This is important for SROA and alias
- // analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
- unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEPType);
- APInt Offset(OffsetBits, 0);
-
- // If the bitcast argument is an allocation, The bitcast is for convertion
- // to actual type of allocation. Removing such bitcasts, results in having
- // GEPs with i8* base and pure byte offsets. That means GEP is not aware of
- // struct or array hierarchy.
- // By avoiding such GEPs, phi translation and MemoryDependencyAnalysis have
- // a better chance to succeed.
- if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset) &&
- !isAllocationFn(SrcOp, &TLI)) {
- // If this GEP instruction doesn't move the pointer, just replace the GEP
- // with a bitcast of the real input to the dest type.
- if (!Offset) {
- // If the bitcast is of an allocation, and the allocation will be
- // converted to match the type of the cast, don't touch this.
- if (isa<AllocaInst>(SrcOp)) {
- // See if the bitcast simplifies, if so, don't nuke this GEP yet.
- if (Instruction *I = visitBitCast(*BCI)) {
- if (I != BCI) {
- I->takeName(BCI);
- BCI->getParent()->getInstList().insert(BCI->getIterator(), I);
- replaceInstUsesWith(*BCI, I);
- }
- return &GEP;
- }
- }
-
- if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(SrcOp, GEPType);
- return new BitCastInst(SrcOp, GEPType);
- }
-
- // Otherwise, if the offset is non-zero, we need to find out if there is a
- // field at Offset in 'A's type. If so, we can pull the cast through the
- // GEP.
- SmallVector<Value*, 8> NewIndices;
- if (FindElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices)) {
- Value *NGEP =
- GEP.isInBounds()
- ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices)
- : Builder.CreateGEP(SrcEltType, SrcOp, NewIndices);
-
- if (NGEP->getType() == GEPType)
- return replaceInstUsesWith(GEP, NGEP);
- NGEP->takeName(&GEP);
-
- if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(NGEP, GEPType);
- return new BitCastInst(NGEP, GEPType);
- }
- }
- }
+ if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp))
+ if (Instruction *I = visitGEPOfBitcast(BCI, GEP))
+ return I;
if (!GEP.isInBounds()) {
unsigned IdxWidth =
@@ -2533,8 +2623,7 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
DL.getTypeAllocSize(AI->getAllocatedType()).getKnownMinSize());
if (BasePtrOffset.ule(AllocSize)) {
return GetElementPtrInst::CreateInBounds(
- GEP.getSourceElementType(), PtrOp, makeArrayRef(Ops).slice(1),
- GEP.getName());
+ GEP.getSourceElementType(), PtrOp, Indices, GEP.getName());
}
}
}
@@ -2553,10 +2642,6 @@ static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo &TLI,
if (auto *LI = dyn_cast<LoadInst>(V))
return isa<GlobalVariable>(LI->getPointerOperand());
// Two distinct allocations will never be equal.
- // We rely on LookThroughBitCast in isAllocLikeFn being false, since looking
- // through bitcasts of V can cause
- // the result statement below to be true, even when AI and V (ex:
- // i8* ->i32* ->i8* of AI) are the same allocations.
return isAllocLikeFn(V, &TLI) && V != AI;
}
@@ -2659,7 +2744,7 @@ static bool isAllocSiteRemovable(Instruction *AI,
continue;
}
- if (isReallocLikeFn(I, &TLI, true)) {
+ if (isReallocLikeFn(I, &TLI)) {
Users.emplace_back(I);
Worklist.push_back(I);
continue;
@@ -2682,6 +2767,8 @@ static bool isAllocSiteRemovable(Instruction *AI,
}
Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
+ assert(isa<AllocaInst>(MI) || isAllocRemovable(&cast<CallBase>(MI), &TLI));
+
// If we have a malloc call which is only used in any amount of comparisons to
// null and free calls, delete the calls and replace the comparisons with true
// or false as appropriate.
@@ -2900,7 +2987,7 @@ Instruction *InstCombinerImpl::visitFree(CallInst &FI) {
// If we had free(realloc(...)) with no intervening uses, then eliminate the
// realloc() entirely.
if (CallInst *CI = dyn_cast<CallInst>(Op)) {
- if (CI->hasOneUse() && isReallocLikeFn(CI, &TLI, true)) {
+ if (CI->hasOneUse() && isReallocLikeFn(CI, &TLI)) {
return eraseInstFromFunction(
*replaceInstUsesWith(*CI, CI->getOperand(0)));
}
@@ -3709,16 +3796,61 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
return nullptr;
}
+/// Check for the case where the call writes to an otherwise dead alloca. This
+/// shows up for unused out-params in idiomatic C/C++ code. Note that this
+/// helper *only* analyzes the write; it doesn't check any other legality
+/// aspect.
+static bool SoleWriteToDeadLocal(Instruction *I, TargetLibraryInfo &TLI) {
+ auto *CB = dyn_cast<CallBase>(I);
+ if (!CB)
+ // TODO: handle e.g. store to alloca here - only worth doing if we extend
+ // to allow reload along used path as described below. Otherwise, this
+ // is simply a store to a dead allocation which will be removed.
+ return false;
+ Optional<MemoryLocation> Dest = MemoryLocation::getForDest(CB, TLI);
+ if (!Dest)
+ return false;
+ auto *AI = dyn_cast<AllocaInst>(getUnderlyingObject(Dest->Ptr));
+ if (!AI)
+ // TODO: allow malloc?
+ return false;
+ // TODO: allow memory access dominated by move point? Note that since AI
+ // could have a reference to itself captured by the call, we would need to
+ // account for cycles in doing so.
+ SmallVector<const User *> AllocaUsers;
+ SmallPtrSet<const User *, 4> Visited;
+ auto pushUsers = [&](const Instruction &I) {
+ for (const User *U : I.users()) {
+ if (Visited.insert(U).second)
+ AllocaUsers.push_back(U);
+ }
+ };
+ pushUsers(*AI);
+ while (!AllocaUsers.empty()) {
+ auto *UserI = cast<Instruction>(AllocaUsers.pop_back_val());
+ if (isa<BitCastInst>(UserI) || isa<GetElementPtrInst>(UserI) ||
+ isa<AddrSpaceCastInst>(UserI)) {
+ pushUsers(*UserI);
+ continue;
+ }
+ if (UserI == CB)
+ continue;
+ // TODO: support lifetime.start/end here
+ return false;
+ }
+ return true;
+}
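A minimal C++ sketch of the "unused out-param" idiom SoleWriteToDeadLocal is aimed at (illustrative; whether a given libcall is recognized by MemoryLocation::getForDest is a separate question): the call's only memory effect is a store into a local that is never read back, while its return value is still used.

    #include <cstdio>

    int main() {
      char scratch[32];  // dead out-param: written by the call, never read
      // Only the returned length is used; the write into 'scratch' is the
      // call's sole memory effect.
      int len = std::snprintf(scratch, sizeof scratch, "%d", 42);
      return len > 0 ? 0 : 1;
    }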
+
/// Try to move the specified instruction from its current block into the
/// beginning of DestBlock, which can only happen if it's safe to move the
/// instruction past all of the instructions between it and the end of its
/// block.
-static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
+static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock,
+ TargetLibraryInfo &TLI) {
assert(I->getUniqueUndroppableUser() && "Invariants didn't hold!");
BasicBlock *SrcBlock = I->getParent();
// Cannot move control-flow-involving, volatile loads, vaarg, etc.
- if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() ||
+ if (isa<PHINode>(I) || I->isEHPad() || I->mayThrow() || !I->willReturn() ||
I->isTerminator())
return false;
@@ -3738,6 +3870,14 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
if (CI->isConvergent())
return false;
}
+
+ // Unless we can prove that the memory write isn't visible except on the
+ // path we're sinking to, we must bail.
+ if (I->mayWriteToMemory()) {
+ if (!SoleWriteToDeadLocal(I, TLI))
+ return false;
+ }
+
// We can only sink load instructions if there is nothing between the load and
// the end of block that could change the value.
if (I->mayReadFromMemory()) {
@@ -3746,7 +3886,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
// successor block.
if (DestBlock->getUniquePredecessor() != I->getParent())
return false;
- for (BasicBlock::iterator Scan = I->getIterator(),
+ for (BasicBlock::iterator Scan = std::next(I->getIterator()),
E = I->getParent()->end();
Scan != E; ++Scan)
if (Scan->mayWriteToMemory())
@@ -3906,12 +4046,11 @@ bool InstCombinerImpl::run() {
// predecessor, so that we don't have to split the critical edge.
// Another option where we can sink is a block that ends with a
// terminator that does not pass control to other block (such as
- // return or unreachable). In this case:
+ // return or unreachable or resume). In this case:
// - I dominates the User (by SSA form);
// - the User will be executed at most once.
// So sinking I down to User is always profitable or neutral.
- if (UserParent->getUniquePredecessor() == BB ||
- (isa<ReturnInst>(Term) || isa<UnreachableInst>(Term))) {
+ if (UserParent->getUniquePredecessor() == BB || succ_empty(Term)) {
assert(DT.dominates(BB, UserParent) && "Dominance relation broken?");
return UserParent;
}
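A small source-level sketch of the sink-target heuristic above (hypothetical function, not from the test suite): the sole user of the candidate value sits in a block whose terminator has no successors, so sinking the computation there runs it at most once and never more often than before.

    int sink_example(int x, bool flag) {
      int tmp = x * 41 + 1;  // computed in the entry block
      if (flag)
        return tmp;          // only use; this block ends in a successor-less 'ret'
      return x;
    }

    int main() { return sink_example(1, false); }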
@@ -3922,7 +4061,7 @@ bool InstCombinerImpl::run() {
if (OptBB) {
auto *UserParent = *OptBB;
// Okay, the CFG is simple enough, try to sink this instruction.
- if (TryToSinkInstruction(I, UserParent)) {
+ if (TryToSinkInstruction(I, UserParent, TLI)) {
LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
MadeIRChange = true;
// We'll add uses of the sunk instruction below, but since
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index bd2dc8d639fc..6e72255e51ae 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1547,10 +1547,9 @@ void AddressSanitizer::getInterestingMemoryOperands(
Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
XCHG->getCompareOperand()->getType(), None);
} else if (auto CI = dyn_cast<CallInst>(I)) {
- auto *F = CI->getCalledFunction();
- if (F && (F->getName().startswith("llvm.masked.load.") ||
- F->getName().startswith("llvm.masked.store."))) {
- bool IsWrite = F->getName().startswith("llvm.masked.store.");
+ if (CI->getIntrinsicID() == Intrinsic::masked_load ||
+ CI->getIntrinsicID() == Intrinsic::masked_store) {
+ bool IsWrite = CI->getIntrinsicID() == Intrinsic::masked_store;
// Masked store has an initial operand for the value.
unsigned OpOffset = IsWrite ? 1 : 0;
if (IsWrite ? !ClInstrumentWrites : !ClInstrumentReads)
@@ -1559,7 +1558,7 @@ void AddressSanitizer::getInterestingMemoryOperands(
auto BasePtr = CI->getOperand(OpOffset);
if (ignoreAccess(LI, BasePtr))
return;
- auto Ty = cast<PointerType>(BasePtr->getType())->getElementType();
+ Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType();
MaybeAlign Alignment = Align(1);
// Otherwise no alignment guarantees. We probably got Undef.
if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
@@ -1653,11 +1652,10 @@ static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass,
const DataLayout &DL, Type *IntptrTy,
Value *Mask, Instruction *I,
Value *Addr, MaybeAlign Alignment,
- unsigned Granularity, uint32_t TypeSize,
+ unsigned Granularity, Type *OpType,
bool IsWrite, Value *SizeArgument,
bool UseCalls, uint32_t Exp) {
- auto *VTy = cast<FixedVectorType>(
- cast<PointerType>(Addr->getType())->getElementType());
+ auto *VTy = cast<FixedVectorType>(OpType);
uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType());
unsigned Num = VTy->getNumElements();
auto Zero = ConstantInt::get(IntptrTy, 0);
@@ -1735,7 +1733,7 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
unsigned Granularity = 1 << Mapping.Scale;
if (O.MaybeMask) {
instrumentMaskedLoadOrStore(this, DL, IntptrTy, O.MaybeMask, O.getInsn(),
- Addr, O.Alignment, Granularity, O.TypeSize,
+ Addr, O.Alignment, Granularity, O.OpType,
O.IsWrite, nullptr, UseCalls, Exp);
} else {
doInstrumentAddress(this, O.getInsn(), O.getInsn(), Addr, O.Alignment,
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 9f26b37bbc79..ff3aa14a2a83 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -208,6 +208,14 @@ static cl::opt<bool> ClEventCallbacks(
cl::desc("Insert calls to __dfsan_*_callback functions on data events."),
cl::Hidden, cl::init(false));
+// Experimental feature that inserts callbacks for conditionals, including:
+// conditional branch, switch, select.
+// This must be true for dfsan_set_conditional_callback() to have effect.
+static cl::opt<bool> ClConditionalCallbacks(
+ "dfsan-conditional-callbacks",
+ cl::desc("Insert calls to callback functions on conditionals."), cl::Hidden,
+ cl::init(false));
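For orientation, a hedged C-level sketch of the hooks this flag enables (the hook names match the getOrInsertFunction calls added later in this patch; the concrete 8-bit shadow and 32-bit origin widths are assumptions, not taken from this diff):

    #include <cstdint>

    // Declarations only; the real definitions live in the DFSan runtime.
    extern "C" void __dfsan_conditional_callback(uint8_t cond_shadow);
    extern "C" void __dfsan_conditional_callback_origin(uint8_t cond_shadow,
                                                        uint32_t cond_origin);

    // Conceptually, with -dfsan-conditional-callbacks the pass emits, right
    // before every conditional branch, switch and select:
    //   __dfsan_conditional_callback(shadow_of(condition));
    // or the *_origin variant when origin tracking is enabled.
    int main() { return 0; }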
+
// Controls whether the pass tracks the control flow of select instructions.
static cl::opt<bool> ClTrackSelectControlFlow(
"dfsan-track-select-control-flow",
@@ -428,6 +436,8 @@ class DataFlowSanitizer {
FunctionType *DFSanSetLabelFnTy;
FunctionType *DFSanNonzeroLabelFnTy;
FunctionType *DFSanVarargWrapperFnTy;
+ FunctionType *DFSanConditionalCallbackFnTy;
+ FunctionType *DFSanConditionalCallbackOriginFnTy;
FunctionType *DFSanCmpCallbackFnTy;
FunctionType *DFSanLoadStoreCallbackFnTy;
FunctionType *DFSanMemTransferCallbackFnTy;
@@ -444,6 +454,8 @@ class DataFlowSanitizer {
FunctionCallee DFSanLoadCallbackFn;
FunctionCallee DFSanStoreCallbackFn;
FunctionCallee DFSanMemTransferCallbackFn;
+ FunctionCallee DFSanConditionalCallbackFn;
+ FunctionCallee DFSanConditionalCallbackOriginFn;
FunctionCallee DFSanCmpCallbackFn;
FunctionCallee DFSanChainOriginFn;
FunctionCallee DFSanChainOriginIfTaintedFn;
@@ -454,7 +466,7 @@ class DataFlowSanitizer {
MDNode *OriginStoreWeights;
DFSanABIList ABIList;
DenseMap<Value *, Function *> UnwrappedFnMap;
- AttrBuilder ReadOnlyNoneAttrs;
+ AttributeMask ReadOnlyNoneAttrs;
/// Memory map parameters used in calculation mapping application addresses
/// to shadow addresses and origin addresses.
@@ -642,6 +654,10 @@ struct DFSanFunction {
Align getShadowAlign(Align InstAlignment);
+ // If ClConditionalCallbacks is enabled, insert a callback after a given
+ // branch instruction using the given conditional expression.
+ void addConditionalCallbacksIfEnabled(Instruction &I, Value *Condition);
+
private:
/// Collapses the shadow with aggregate type into a single primitive shadow
/// value.
@@ -748,6 +764,8 @@ public:
void visitSelectInst(SelectInst &I);
void visitMemSetInst(MemSetInst &I);
void visitMemTransferInst(MemTransferInst &I);
+ void visitBranchInst(BranchInst &BR);
+ void visitSwitchInst(SwitchInst &SW);
private:
void visitCASOrRMW(Align InstAlignment, Instruction &I);
@@ -971,6 +989,22 @@ Value *DFSanFunction::collapseToPrimitiveShadow(Value *Shadow,
return PrimitiveShadow;
}
+void DFSanFunction::addConditionalCallbacksIfEnabled(Instruction &I,
+ Value *Condition) {
+ if (!ClConditionalCallbacks) {
+ return;
+ }
+ IRBuilder<> IRB(&I);
+ Value *CondShadow = getShadow(Condition);
+ if (DFS.shouldTrackOrigins()) {
+ Value *CondOrigin = getOrigin(Condition);
+ IRB.CreateCall(DFS.DFSanConditionalCallbackOriginFn,
+ {CondShadow, CondOrigin});
+ } else {
+ IRB.CreateCall(DFS.DFSanConditionalCallbackFn, {CondShadow});
+ }
+}
+
Type *DataFlowSanitizer::getShadowTy(Type *OrigTy) {
if (!OrigTy->isSized())
return PrimitiveShadowTy;
@@ -1032,6 +1066,13 @@ bool DataFlowSanitizer::initializeModule(Module &M) {
FunctionType::get(Type::getVoidTy(*Ctx), None, /*isVarArg=*/false);
DFSanVarargWrapperFnTy = FunctionType::get(
Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
+ DFSanConditionalCallbackFnTy =
+ FunctionType::get(Type::getVoidTy(*Ctx), PrimitiveShadowTy,
+ /*isVarArg=*/false);
+ Type *DFSanConditionalCallbackOriginArgs[2] = {PrimitiveShadowTy, OriginTy};
+ DFSanConditionalCallbackOriginFnTy = FunctionType::get(
+ Type::getVoidTy(*Ctx), DFSanConditionalCallbackOriginArgs,
+ /*isVarArg=*/false);
DFSanCmpCallbackFnTy =
FunctionType::get(Type::getVoidTy(*Ctx), PrimitiveShadowTy,
/*isVarArg=*/false);
@@ -1160,7 +1201,7 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
// F is called by a wrapped custom function with primitive shadows. So
// its arguments and return value need conversion.
DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true,
- /*ForceZeroLabels=*/false);
+ /*IsForceZeroLabels=*/false);
Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI;
++ValAI;
for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) {
@@ -1271,6 +1312,10 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) {
DFSanRuntimeFunctions.insert(
DFSanMemTransferCallbackFn.getCallee()->stripPointerCasts());
DFSanRuntimeFunctions.insert(
+ DFSanConditionalCallbackFn.getCallee()->stripPointerCasts());
+ DFSanRuntimeFunctions.insert(
+ DFSanConditionalCallbackOriginFn.getCallee()->stripPointerCasts());
+ DFSanRuntimeFunctions.insert(
DFSanCmpCallbackFn.getCallee()->stripPointerCasts());
DFSanRuntimeFunctions.insert(
DFSanChainOriginFn.getCallee()->stripPointerCasts());
@@ -1292,6 +1337,12 @@ void DataFlowSanitizer::initializeCallbackFunctions(Module &M) {
"__dfsan_mem_transfer_callback", DFSanMemTransferCallbackFnTy);
DFSanCmpCallbackFn =
Mod->getOrInsertFunction("__dfsan_cmp_callback", DFSanCmpCallbackFnTy);
+
+ DFSanConditionalCallbackFn = Mod->getOrInsertFunction(
+ "__dfsan_conditional_callback", DFSanConditionalCallbackFnTy);
+ DFSanConditionalCallbackOriginFn =
+ Mod->getOrInsertFunction("__dfsan_conditional_callback_origin",
+ DFSanConditionalCallbackOriginFnTy);
}
void DataFlowSanitizer::injectMetadataGlobals(Module &M) {
@@ -2593,6 +2644,8 @@ void DFSanVisitor::visitSelectInst(SelectInst &I) {
Value *FalseOrigin =
ShouldTrackOrigins ? DFSF.getOrigin(I.getFalseValue()) : nullptr;
+ DFSF.addConditionalCallbacksIfEnabled(I, I.getCondition());
+
if (isa<VectorType>(I.getCondition()->getType())) {
ShadowSel = DFSF.combineShadowsThenConvert(I.getType(), TrueShadow,
FalseShadow, &I);
@@ -2683,6 +2736,17 @@ void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) {
}
}
+void DFSanVisitor::visitBranchInst(BranchInst &BR) {
+ if (!BR.isConditional())
+ return;
+
+ DFSF.addConditionalCallbacksIfEnabled(BR, BR.getCondition());
+}
+
+void DFSanVisitor::visitSwitchInst(SwitchInst &SW) {
+ DFSF.addConditionalCallbacksIfEnabled(SW, SW.getCondition());
+}
+
static bool isAMustTailRetVal(Value *RetVal) {
// Tail call may have a bitcast between return.
if (auto *I = dyn_cast<BitCastInst>(RetVal)) {
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 8d3bc1383e96..fb10a99d1338 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1403,16 +1403,16 @@ bool HWAddressSanitizer::instrumentStack(
size_t Size = getAllocaSizeInBytes(*AI);
size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
+ auto TagEnd = [&](Instruction *Node) {
+ IRB.SetInsertPoint(Node);
+ Value *UARTag = getUARTag(IRB, StackTag);
+ tagAlloca(IRB, AI, UARTag, AlignedSize);
+ };
bool StandardLifetime =
UnrecognizedLifetimes.empty() && isStandardLifetime(Info, GetDT());
if (DetectUseAfterScope && StandardLifetime) {
IntrinsicInst *Start = Info.LifetimeStart[0];
IRB.SetInsertPoint(Start->getNextNode());
- auto TagEnd = [&](Instruction *Node) {
- IRB.SetInsertPoint(Node);
- Value *UARTag = getUARTag(IRB, StackTag);
- tagAlloca(IRB, AI, UARTag, AlignedSize);
- };
tagAlloca(IRB, AI, Tag, Size);
if (!forAllReachableExits(GetDT(), GetPDT(), Start, Info.LifetimeEnd,
RetVec, TagEnd)) {
@@ -1421,11 +1421,8 @@ bool HWAddressSanitizer::instrumentStack(
}
} else {
tagAlloca(IRB, AI, Tag, Size);
- for (auto *RI : RetVec) {
- IRB.SetInsertPoint(RI);
- Value *UARTag = getUARTag(IRB, StackTag);
- tagAlloca(IRB, AI, UARTag, AlignedSize);
- }
+ for (auto *RI : RetVec)
+ TagEnd(RI);
if (!StandardLifetime) {
for (auto &II : Info.LifetimeStart)
II->eraseFromParent();
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index de34348606ef..ab179b03dd29 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -248,8 +248,7 @@ public:
PGOCounterPromoter(
DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
Loop &CurLoop, LoopInfo &LI, BlockFrequencyInfo *BFI)
- : LoopToCandidates(LoopToCands), ExitBlocks(), InsertPts(), L(CurLoop),
- LI(LI), BFI(BFI) {
+ : LoopToCandidates(LoopToCands), L(CurLoop), LI(LI), BFI(BFI) {
// Skip collection of ExitBlocks and InsertPts for loops that will not be
// able to have counters promoted.
@@ -446,24 +445,19 @@ llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options,
return new InstrProfilingLegacyPass(Options, IsCS);
}
-static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) {
- InstrProfIncrementInst *Inc = dyn_cast<InstrProfIncrementInstStep>(Instr);
- if (Inc)
- return Inc;
- return dyn_cast<InstrProfIncrementInst>(Instr);
-}
-
bool InstrProfiling::lowerIntrinsics(Function *F) {
bool MadeChange = false;
PromotionCandidates.clear();
for (BasicBlock &BB : *F) {
for (Instruction &Instr : llvm::make_early_inc_range(BB)) {
- InstrProfIncrementInst *Inc = castToIncrementInst(&Instr);
- if (Inc) {
- lowerIncrement(Inc);
+ if (auto *IPIS = dyn_cast<InstrProfIncrementInstStep>(&Instr)) {
+ lowerIncrement(IPIS);
+ MadeChange = true;
+ } else if (auto *IPI = dyn_cast<InstrProfIncrementInst>(&Instr)) {
+ lowerIncrement(IPI);
MadeChange = true;
- } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(&Instr)) {
- lowerValueProfileInst(Ind);
+ } else if (auto *IPVP = dyn_cast<InstrProfValueProfileInst>(&Instr)) {
+ lowerValueProfileInst(IPVP);
MadeChange = true;
}
}
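The reshuffled dispatch keeps the InstrProfIncrementInstStep check ahead of its base class InstrProfIncrementInst, which is what the removed castToIncrementInst helper achieved. A generic C++ sketch of that rule (hypothetical types):

    #include <iostream>

    struct Increment { virtual ~Increment() = default; };
    struct IncrementStep : Increment {};  // derived: must be matched first

    static void lower(const Increment &I) {
      // Test the more-derived type before its base, otherwise every
      // IncrementStep would be handled by the plain-increment case.
      if (dynamic_cast<const IncrementStep *>(&I))
        std::cout << "lower step increment\n";
      else
        std::cout << "lower plain increment\n";
    }

    int main() {
      IncrementStep S;
      Increment P;
      lower(S);  // prints "lower step increment"
      lower(P);  // prints "lower plain increment"
      return 0;
    }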
@@ -540,19 +534,14 @@ static bool needsRuntimeHookUnconditionally(const Triple &TT) {
/// Check if the module contains uses of any profiling intrinsics.
static bool containsProfilingIntrinsics(Module &M) {
- if (auto *F = M.getFunction(
- Intrinsic::getName(llvm::Intrinsic::instrprof_increment)))
- if (!F->use_empty())
- return true;
- if (auto *F = M.getFunction(
- Intrinsic::getName(llvm::Intrinsic::instrprof_increment_step)))
- if (!F->use_empty())
- return true;
- if (auto *F = M.getFunction(
- Intrinsic::getName(llvm::Intrinsic::instrprof_value_profile)))
- if (!F->use_empty())
- return true;
- return false;
+ auto containsIntrinsic = [&](int ID) {
+ if (auto *F = M.getFunction(Intrinsic::getName(ID)))
+ return !F->use_empty();
+ return false;
+ };
+ return containsIntrinsic(llvm::Intrinsic::instrprof_increment) ||
+ containsIntrinsic(llvm::Intrinsic::instrprof_increment_step) ||
+ containsIntrinsic(llvm::Intrinsic::instrprof_value_profile);
}
bool InstrProfiling::run(
@@ -771,7 +760,7 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
}
/// Get the name of a profiling variable for a particular function.
-static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix,
+static std::string getVarName(InstrProfInstBase *Inc, StringRef Prefix,
bool &Renamed) {
StringRef NamePrefix = getInstrProfNameVarPrefix();
StringRef Name = Inc->getName()->getName().substr(NamePrefix.size());
@@ -860,7 +849,7 @@ static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) {
}
GlobalVariable *
-InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
+InstrProfiling::getOrCreateRegionCounters(InstrProfInstBase *Inc) {
GlobalVariable *NamePtr = Inc->getName();
auto &PD = ProfileDataMap[NamePtr];
if (PD.RegionCounters)
@@ -997,8 +986,11 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx));
}
- if (DebugInfoCorrelate)
+ if (DebugInfoCorrelate) {
+ // Mark the counter variable as used so that it isn't optimized out.
+ CompilerUsedVars.push_back(PD.RegionCounters);
return PD.RegionCounters;
+ }
// Create data variable.
auto *IntPtrTy = M->getDataLayout().getIntPtrType(M->getContext());
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index 727672fa0605..8fedefccf0e1 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -156,6 +156,7 @@ struct InterestingMemoryAccess {
Value *Addr = nullptr;
bool IsWrite;
unsigned Alignment;
+ Type *AccessTy;
uint64_t TypeSize;
Value *MaybeMask = nullptr;
};
@@ -181,7 +182,7 @@ public:
Value *Addr, uint32_t TypeSize, bool IsWrite);
void instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask,
Instruction *I, Value *Addr,
- unsigned Alignment, uint32_t TypeSize,
+ unsigned Alignment, Type *AccessTy,
bool IsWrite);
void instrumentMemIntrinsic(MemIntrinsic *MI);
Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
@@ -334,36 +335,32 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const {
InterestingMemoryAccess Access;
- const DataLayout &DL = I->getModule()->getDataLayout();
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
if (!ClInstrumentReads)
return None;
Access.IsWrite = false;
- Access.TypeSize = DL.getTypeStoreSizeInBits(LI->getType());
+ Access.AccessTy = LI->getType();
Access.Alignment = LI->getAlignment();
Access.Addr = LI->getPointerOperand();
} else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
if (!ClInstrumentWrites)
return None;
Access.IsWrite = true;
- Access.TypeSize =
- DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType());
+ Access.AccessTy = SI->getValueOperand()->getType();
Access.Alignment = SI->getAlignment();
Access.Addr = SI->getPointerOperand();
} else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
if (!ClInstrumentAtomics)
return None;
Access.IsWrite = true;
- Access.TypeSize =
- DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType());
+ Access.AccessTy = RMW->getValOperand()->getType();
Access.Alignment = 0;
Access.Addr = RMW->getPointerOperand();
} else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
if (!ClInstrumentAtomics)
return None;
Access.IsWrite = true;
- Access.TypeSize =
- DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType());
+ Access.AccessTy = XCHG->getCompareOperand()->getType();
Access.Alignment = 0;
Access.Addr = XCHG->getPointerOperand();
} else if (auto *CI = dyn_cast<CallInst>(I)) {
@@ -376,16 +373,16 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const {
return None;
// Masked store has an initial operand for the value.
OpOffset = 1;
+ Access.AccessTy = CI->getArgOperand(0)->getType();
Access.IsWrite = true;
} else {
if (!ClInstrumentReads)
return None;
+ Access.AccessTy = CI->getType();
Access.IsWrite = false;
}
auto *BasePtr = CI->getOperand(0 + OpOffset);
- auto *Ty = cast<PointerType>(BasePtr->getType())->getElementType();
- Access.TypeSize = DL.getTypeStoreSizeInBits(Ty);
if (auto *AlignmentConstant =
dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
Access.Alignment = (unsigned)AlignmentConstant->getZExtValue();
@@ -412,15 +409,16 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const {
if (Access.Addr->isSwiftError())
return None;
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ Access.TypeSize = DL.getTypeStoreSizeInBits(Access.AccessTy);
return Access;
}
void MemProfiler::instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask,
Instruction *I, Value *Addr,
unsigned Alignment,
- uint32_t TypeSize, bool IsWrite) {
- auto *VTy = cast<FixedVectorType>(
- cast<PointerType>(Addr->getType())->getElementType());
+ Type *AccessTy, bool IsWrite) {
+ auto *VTy = cast<FixedVectorType>(AccessTy);
uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType());
unsigned Num = VTy->getNumElements();
auto *Zero = ConstantInt::get(IntptrTy, 0);
@@ -469,7 +467,7 @@ void MemProfiler::instrumentMop(Instruction *I, const DataLayout &DL,
if (Access.MaybeMask) {
instrumentMaskedLoadOrStore(DL, Access.MaybeMask, I, Access.Addr,
- Access.Alignment, Access.TypeSize,
+ Access.Alignment, Access.AccessTy,
Access.IsWrite);
} else {
// Since the access counts will be accumulated across the entire allocation,
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 446e601cd4d7..cfe993dedbc2 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -492,7 +492,7 @@ class MemorySanitizer {
public:
MemorySanitizer(Module &M, MemorySanitizerOptions Options)
: CompileKernel(Options.Kernel), TrackOrigins(Options.TrackOrigins),
- Recover(Options.Recover) {
+ Recover(Options.Recover), EagerChecks(Options.EagerChecks) {
initializeModule(M);
}
@@ -522,6 +522,7 @@ private:
/// Track origins (allocation points) of uninitialized values.
int TrackOrigins;
bool Recover;
+ bool EagerChecks;
LLVMContext *C;
Type *IntptrTy;
@@ -665,10 +666,12 @@ template <class T> T getOptOrDefault(const cl::opt<T> &Opt, T Default) {
} // end anonymous namespace
-MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K)
+MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K,
+ bool EagerChecks)
: Kernel(getOptOrDefault(ClEnableKmsan, K)),
TrackOrigins(getOptOrDefault(ClTrackOrigins, Kernel ? 2 : TO)),
- Recover(getOptOrDefault(ClKeepGoing, Kernel || R)) {}
+ Recover(getOptOrDefault(ClKeepGoing, Kernel || R)),
+ EagerChecks(getOptOrDefault(ClEagerChecks, EagerChecks)) {}
PreservedAnalyses MemorySanitizerPass::run(Function &F,
FunctionAnalysisManager &FAM) {
@@ -695,6 +698,8 @@ void MemorySanitizerPass::printPipeline(
OS << "recover;";
if (Options.Kernel)
OS << "kernel;";
+ if (Options.EagerChecks)
+ OS << "eager-checks;";
OS << "track-origins=" << Options.TrackOrigins;
OS << ">";
}
@@ -1667,9 +1672,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// This function either returns the value set earlier with setShadow,
/// or extracts if from ParamTLS (for function arguments).
Value *getShadow(Value *V) {
- if (!PropagateShadow) return getCleanShadow(V);
if (Instruction *I = dyn_cast<Instruction>(V)) {
- if (I->getMetadata("nosanitize"))
+ if (!PropagateShadow || I->getMetadata("nosanitize"))
return getCleanShadow(V);
// For instructions the shadow is already stored in the map.
Value *Shadow = ShadowMap[V];
@@ -1681,7 +1685,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return Shadow;
}
if (UndefValue *U = dyn_cast<UndefValue>(V)) {
- Value *AllOnes = PoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V);
+ Value *AllOnes = (PropagateShadow && PoisonUndef) ? getPoisonedShadow(V)
+ : getCleanShadow(V);
LLVM_DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
(void)U;
return AllOnes;
@@ -1701,22 +1706,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
continue;
}
- bool FArgByVal = FArg.hasByValAttr();
- bool FArgNoUndef = FArg.hasAttribute(Attribute::NoUndef);
- bool FArgEagerCheck = ClEagerChecks && !FArgByVal && FArgNoUndef;
- unsigned Size =
- FArg.hasByValAttr()
- ? DL.getTypeAllocSize(FArg.getParamByValType())
- : DL.getTypeAllocSize(FArg.getType());
+ unsigned Size = FArg.hasByValAttr()
+ ? DL.getTypeAllocSize(FArg.getParamByValType())
+ : DL.getTypeAllocSize(FArg.getType());
if (A == &FArg) {
bool Overflow = ArgOffset + Size > kParamTLSSize;
- if (FArgEagerCheck) {
- *ShadowPtr = getCleanShadow(V);
- setOrigin(A, getCleanOrigin());
- break;
- } else if (FArgByVal) {
- Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
+ if (FArg.hasByValAttr()) {
// ByVal pointer itself has clean shadow. We copy the actual
// argument shadow to the underlying memory.
// Figure out maximal valid memcpy alignment.
@@ -1727,40 +1723,38 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/*isStore*/ true)
.first;
// TODO(glider): need to copy origins.
- if (Overflow) {
+ if (!PropagateShadow || Overflow) {
// ParamTLS overflow.
EntryIRB.CreateMemSet(
CpShadowPtr, Constant::getNullValue(EntryIRB.getInt8Ty()),
Size, ArgAlign);
} else {
+ Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
const Align CopyAlign = std::min(ArgAlign, kShadowTLSAlignment);
Value *Cpy = EntryIRB.CreateMemCpy(CpShadowPtr, CopyAlign, Base,
CopyAlign, Size);
LLVM_DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n");
(void)Cpy;
}
+ }
+
+ if (!PropagateShadow || Overflow || FArg.hasByValAttr() ||
+ (MS.EagerChecks && FArg.hasAttribute(Attribute::NoUndef))) {
*ShadowPtr = getCleanShadow(V);
+ setOrigin(A, getCleanOrigin());
} else {
// Shadow over TLS
Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
- if (Overflow) {
- // ParamTLS overflow.
- *ShadowPtr = getCleanShadow(V);
- } else {
- *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base,
- kShadowTLSAlignment);
+ *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base,
+ kShadowTLSAlignment);
+ if (MS.TrackOrigins) {
+ Value *OriginPtr =
+ getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset);
+ setOrigin(A, EntryIRB.CreateLoad(MS.OriginTy, OriginPtr));
}
}
LLVM_DEBUG(dbgs()
<< " ARG: " << FArg << " ==> " << **ShadowPtr << "\n");
- if (MS.TrackOrigins && !Overflow) {
- Value *OriginPtr =
- getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset);
- setOrigin(A, EntryIRB.CreateLoad(MS.OriginTy, OriginPtr));
- } else {
- setOrigin(A, getCleanOrigin());
- }
-
break;
}
@@ -3664,7 +3658,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// will become a non-readonly function after it is instrumented by us. To
// prevent this code from being optimized out, mark that function
// non-readonly in advance.
- AttrBuilder B;
+ AttributeMask B;
B.addAttribute(Attribute::ReadOnly)
.addAttribute(Attribute::ReadNone)
.addAttribute(Attribute::WriteOnly)
@@ -3679,7 +3673,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI);
}
IRBuilder<> IRB(&CB);
- bool MayCheckCall = ClEagerChecks;
+ bool MayCheckCall = MS.EagerChecks;
if (Function *Func = CB.getCalledFunction()) {
// __sanitizer_unaligned_{load,store} functions may be called by users
// and always expects shadows in the TLS. So don't check them.
@@ -3697,15 +3691,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
continue;
}
unsigned Size = 0;
- Value *Store = nullptr;
- // Compute the Shadow for arg even if it is ByVal, because
- // in that case getShadow() will copy the actual arg shadow to
- // __msan_param_tls.
- Value *ArgShadow = getShadow(A);
- Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
- LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A
- << " Shadow: " << *ArgShadow << "\n");
- bool ArgIsInitialized = false;
const DataLayout &DL = F.getParent()->getDataLayout();
bool ByVal = CB.paramHasAttr(i, Attribute::ByVal);
@@ -3716,6 +3701,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
insertShadowCheck(A, &CB);
Size = DL.getTypeAllocSize(A->getType());
} else {
+ bool ArgIsInitialized = false;
+ Value *Store = nullptr;
+ // Compute the Shadow for arg even if it is ByVal, because
+ // in that case getShadow() will copy the actual arg shadow to
+ // __msan_param_tls.
+ Value *ArgShadow = getShadow(A);
+ Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
+ LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A
+ << " Shadow: " << *ArgShadow << "\n");
if (ByVal) {
// ByVal requires some special handling as it's too big for a single
// load
@@ -3732,10 +3726,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment,
/*isStore*/ false)
.first;
-
- Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr,
- Alignment, Size);
- // TODO(glider): need to copy origins.
+ if (!PropagateShadow) {
+ Store = IRB.CreateMemSet(ArgShadowBase,
+ Constant::getNullValue(IRB.getInt8Ty()),
+ Size, Alignment);
+ } else {
+ Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr,
+ Alignment, Size);
+ }
} else {
// Any other parameters mean we need bit-grained tracking of uninit
// data
@@ -3832,10 +3830,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
bool HasNoUndef =
F.hasRetAttribute(Attribute::NoUndef);
- bool StoreShadow = !(ClEagerChecks && HasNoUndef);
+ bool StoreShadow = !(MS.EagerChecks && HasNoUndef);
// FIXME: Consider using SpecialCaseList to specify a list of functions that
// must always return fully initialized values. For now, we hardcode "main".
- bool EagerCheck = (ClEagerChecks && HasNoUndef) || (F.getName() == "main");
+ bool EagerCheck = (MS.EagerChecks && HasNoUndef) || (F.getName() == "main");
Value *Shadow = getShadow(RetVal);
bool StoreOrigin = true;
@@ -5359,7 +5357,7 @@ bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) {
MemorySanitizerVisitor Visitor(F, *this, TLI);
// Clear out readonly/readnone attributes.
- AttrBuilder B;
+ AttributeMask B;
B.addAttribute(Attribute::ReadOnly)
.addAttribute(Attribute::ReadNone)
.addAttribute(Attribute::WriteOnly)
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index b6ba1fc2132c..c46415e5b1f4 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -877,7 +877,10 @@ populateEHOperandBundle(VPCandidateInfo &Cand,
DenseMap<BasicBlock *, ColorVector> &BlockColors,
SmallVectorImpl<OperandBundleDef> &OpBundles) {
auto *OrigCall = dyn_cast<CallBase>(Cand.AnnotatedInst);
- if (OrigCall && !isa<IntrinsicInst>(OrigCall)) {
+ if (!OrigCall)
+ return;
+
+ if (!isa<IntrinsicInst>(OrigCall)) {
// The instrumentation call should belong to the same funclet as a
// non-intrinsic call, so just copy the operand bundle, if any exists.
Optional<OperandBundleUse> ParentFunclet =
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index da8ee1f15bf8..d3b60c7add34 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -917,8 +917,7 @@ void ModuleSanitizerCoverage::InjectTraceForGep(
void ModuleSanitizerCoverage::InjectTraceForLoadsAndStores(
Function &, ArrayRef<LoadInst *> Loads, ArrayRef<StoreInst *> Stores) {
- auto CallbackIdx = [&](const Value *Ptr) -> int {
- auto ElementTy = cast<PointerType>(Ptr->getType())->getElementType();
+ auto CallbackIdx = [&](Type *ElementTy) -> int {
uint64_t TypeSize = DL->getTypeStoreSizeInBits(ElementTy);
return TypeSize == 8 ? 0
: TypeSize == 16 ? 1
@@ -932,7 +931,7 @@ void ModuleSanitizerCoverage::InjectTraceForLoadsAndStores(
for (auto LI : Loads) {
IRBuilder<> IRB(LI);
auto Ptr = LI->getPointerOperand();
- int Idx = CallbackIdx(Ptr);
+ int Idx = CallbackIdx(LI->getType());
if (Idx < 0)
continue;
IRB.CreateCall(SanCovLoadFunction[Idx],
@@ -941,7 +940,7 @@ void ModuleSanitizerCoverage::InjectTraceForLoadsAndStores(
for (auto SI : Stores) {
IRBuilder<> IRB(SI);
auto Ptr = SI->getPointerOperand();
- int Idx = CallbackIdx(Ptr);
+ int Idx = CallbackIdx(SI->getValueOperand()->getType());
if (Idx < 0)
continue;
IRB.CreateCall(SanCovStoreFunction[Idx],
diff --git a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
index 764dc5f92707..c11691c613ac 100644
--- a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
+++ b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -42,7 +42,7 @@ enum class ARCRuntimeEntryPointKind {
Autorelease,
StoreStrong,
RetainRV,
- ClaimRV,
+ UnsafeClaimRV,
RetainAutorelease,
RetainAutoreleaseRV,
};
@@ -62,7 +62,7 @@ public:
Autorelease = nullptr;
StoreStrong = nullptr;
RetainRV = nullptr;
- ClaimRV = nullptr;
+ UnsafeClaimRV = nullptr;
RetainAutorelease = nullptr;
RetainAutoreleaseRV = nullptr;
}
@@ -87,9 +87,9 @@ public:
case ARCRuntimeEntryPointKind::RetainRV:
return getIntrinsicEntryPoint(RetainRV,
Intrinsic::objc_retainAutoreleasedReturnValue);
- case ARCRuntimeEntryPointKind::ClaimRV:
+ case ARCRuntimeEntryPointKind::UnsafeClaimRV:
return getIntrinsicEntryPoint(
- ClaimRV, Intrinsic::objc_unsafeClaimAutoreleasedReturnValue);
+ UnsafeClaimRV, Intrinsic::objc_unsafeClaimAutoreleasedReturnValue);
case ARCRuntimeEntryPointKind::RetainAutorelease:
return getIntrinsicEntryPoint(RetainAutorelease,
Intrinsic::objc_retainAutorelease);
@@ -127,7 +127,7 @@ private:
Function *RetainRV = nullptr;
/// Declaration for objc_unsafeClaimAutoreleasedReturnValue().
- Function *ClaimRV = nullptr;
+ Function *UnsafeClaimRV = nullptr;
/// Declaration for objc_retainAutorelease().
Function *RetainAutorelease = nullptr;
diff --git a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
index 4921209f041b..de0f5803b4c7 100644
--- a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
+++ b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
@@ -194,9 +194,6 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst,
return CanInterruptRV(Class);
}
}
-
- case RetainRVDep:
- return CanInterruptRV(GetBasicARCInstKind(Inst));
}
llvm_unreachable("Invalid dependence flavor");
diff --git a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h
index cf4c05ebe91c..dd6a1c3f9795 100644
--- a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h
+++ b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h
@@ -46,8 +46,7 @@ enum DependenceKind {
AutoreleasePoolBoundary,
CanChangeRetainCount,
RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease.
- RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue.
- RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue.
+ RetainAutoreleaseRVDep ///< Blocks objc_retainAutoreleaseReturnValue.
};
/// Find dependent instructions. If there is exactly one dependent instruction,
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index c2ed94e8e1f6..9e2832827686 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -433,7 +433,7 @@ bool ObjCARCContract::tryToPeepholeInstruction(
// If we succeed in our optimization, fall through.
LLVM_FALLTHROUGH;
case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV: {
+ case ARCInstKind::UnsafeClaimRV: {
bool IsInstContainedInBundle = BundledInsts->contains(Inst);
// Return now if the target doesn't need a special inline-asm marker. Return
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 0fa4904456cd..b6dc97f1e43f 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -515,7 +515,7 @@ class ObjCARCOpt {
Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors,
Instruction *Inst, ARCInstKind Class, const Value *Arg);
- /// Try to optimize an AutoreleaseRV with a RetainRV or ClaimRV. If the
+ /// Try to optimize an AutoreleaseRV with a RetainRV or UnsafeClaimRV. If the
/// optimization occurs, returns true to indicate that the caller should
/// assume the instructions are dead.
bool OptimizeInlinedAutoreleaseRVCall(
@@ -705,14 +705,14 @@ bool ObjCARCOpt::OptimizeInlinedAutoreleaseRVCall(
return true;
}
- // ClaimRV is a frontend peephole for RetainRV + Release. Since the
- // AutoreleaseRV and RetainRV cancel out, replace the ClaimRV with a Release.
- assert(Class == ARCInstKind::ClaimRV);
+ // UnsafeClaimRV is a frontend peephole for RetainRV + Release. Since the
+ // AutoreleaseRV and RetainRV cancel out, replace UnsafeClaimRV with Release.
+ assert(Class == ARCInstKind::UnsafeClaimRV);
Value *CallArg = cast<CallInst>(Inst)->getArgOperand(0);
CallInst *Release = CallInst::Create(
EP.get(ARCRuntimeEntryPointKind::Release), CallArg, "", Inst);
- assert(IsAlwaysTail(ARCInstKind::ClaimRV) &&
- "Expected ClaimRV to be safe to tail call");
+ assert(IsAlwaysTail(ARCInstKind::UnsafeClaimRV) &&
+ "Expected UnsafeClaimRV to be safe to tail call");
Release->setTailCall();
Inst->replaceAllUsesWith(CallArg);
EraseInstruction(Inst);
@@ -810,7 +810,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
BlockColors = colorEHFunclets(F);
// Store any delayed AutoreleaseRV intrinsics, so they can be easily paired
- // with RetainRV and ClaimRV.
+ // with RetainRV and UnsafeClaimRV.
Instruction *DelayedAutoreleaseRV = nullptr;
const Value *DelayedAutoreleaseRVArg = nullptr;
auto setDelayedAutoreleaseRV = [&](Instruction *AutoreleaseRV) {
@@ -837,7 +837,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
return false;
// Given the frontend rules for emitting AutoreleaseRV, RetainRV, and
- // ClaimRV, it's probably safe to skip over even opaque function calls
+ // UnsafeClaimRV, it's probably safe to skip over even opaque function calls
// here since OptimizeInlinedAutoreleaseRVCall will confirm that they
// have the same RCIdentityRoot. However, what really matters is
// skipping instructions or intrinsics that the inliner could leave behind;
@@ -881,7 +881,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
setDelayedAutoreleaseRV(Inst);
continue;
case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV:
+ case ARCInstKind::UnsafeClaimRV:
if (DelayedAutoreleaseRV) {
// We have a potential RV pair. Check if they cancel out.
if (OptimizeInlinedAutoreleaseRVCall(F, BlockColors, Inst, Arg, Class,
@@ -979,9 +979,8 @@ void ObjCARCOpt::OptimizeIndividualCallImpl(
CallInst *CI = cast<CallInst>(Inst);
if (IsNullOrUndef(CI->getArgOperand(0))) {
Changed = true;
- Type *Ty = CI->getArgOperand(0)->getType();
- new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
- Constant::getNullValue(Ty), CI);
+ new StoreInst(ConstantInt::getTrue(CI->getContext()),
+ UndefValue::get(Type::getInt1PtrTy(CI->getContext())), CI);
Value *NewValue = UndefValue::get(CI->getType());
LLVM_DEBUG(
dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
@@ -999,9 +998,8 @@ void ObjCARCOpt::OptimizeIndividualCallImpl(
if (IsNullOrUndef(CI->getArgOperand(0)) ||
IsNullOrUndef(CI->getArgOperand(1))) {
Changed = true;
- Type *Ty = CI->getArgOperand(0)->getType();
- new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
- Constant::getNullValue(Ty), CI);
+ new StoreInst(ConstantInt::getTrue(CI->getContext()),
+ UndefValue::get(Type::getInt1PtrTy(CI->getContext())), CI);
Value *NewValue = UndefValue::get(CI->getType());
LLVM_DEBUG(
@@ -1165,7 +1163,7 @@ void ObjCARCOpt::OptimizeIndividualCallImpl(
DepInst = findSingleDependency(AutoreleasePoolBoundary, Arg,
Inst->getParent(), Inst, PA);
break;
- case ARCInstKind::ClaimRV:
+ case ARCInstKind::UnsafeClaimRV:
case ARCInstKind::RetainRV:
case ARCInstKind::AutoreleaseRV:
// Don't move these; the RV optimization depends on the autoreleaseRV
diff --git a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
index 6d0a67c91cfa..1624cf26094a 100644
--- a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
+++ b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
@@ -32,7 +32,6 @@
namespace llvm {
class AAResults;
-class DataLayout;
class PHINode;
class SelectInst;
class Value;
diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp
index b693acceb3f6..1cda206a7e14 100644
--- a/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -579,6 +579,7 @@ bool AggressiveDeadCodeElimination::updateDeadRegions() {
// Don't compute the post ordering unless we needed it.
bool HavePostOrder = false;
bool Changed = false;
+ SmallVector<DominatorTree::UpdateType, 10> DeletedEdges;
for (auto *BB : BlocksWithDeadTerminators) {
auto &Info = BlockInfo[BB];
@@ -617,7 +618,6 @@ bool AggressiveDeadCodeElimination::updateDeadRegions() {
makeUnconditional(BB, PreferredSucc->BB);
// Inform the dominators about the deleted CFG edges.
- SmallVector<DominatorTree::UpdateType, 4> DeletedEdges;
for (auto *Succ : RemovedSuccessors) {
// It might have happened that the same successor appeared multiple times
// and the CFG edge wasn't really removed.
@@ -629,13 +629,14 @@ bool AggressiveDeadCodeElimination::updateDeadRegions() {
}
}
- DomTreeUpdater(DT, &PDT, DomTreeUpdater::UpdateStrategy::Eager)
- .applyUpdates(DeletedEdges);
-
NumBranchesRemoved += 1;
Changed = true;
}
+ if (!DeletedEdges.empty())
+ DomTreeUpdater(DT, &PDT, DomTreeUpdater::UpdateStrategy::Eager)
+ .applyUpdates(DeletedEdges);
+
return Changed;
}
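The change above records every deleted CFG edge and hands the whole batch to a single DomTreeUpdater at the end, instead of building an eager updater per dead region. A generic sketch of that batching pattern (hypothetical Updater type, not the LLVM API):

    #include <utility>
    #include <vector>

    struct Updater {
      // Stands in for an applyUpdates-style interface: one call per batch.
      void applyUpdates(const std::vector<std::pair<int, int>> &Edges) {
        (void)Edges;  // a real updater would recompute from the whole batch
      }
    };

    int main() {
      std::vector<std::pair<int, int>> DeletedEdges;
      for (int BB = 0; BB < 4; ++BB)
        DeletedEdges.emplace_back(BB, BB + 1);  // record edges as they die
      if (!DeletedEdges.empty())
        Updater().applyUpdates(DeletedEdges);   // single batched update
      return 0;
    }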
diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 37a7053d778e..25e8c3ef3b48 100644
--- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -414,6 +414,14 @@ void ConstantHoistingPass::collectConstantCandidates(
IntegerType *PtrIntTy = DL->getIntPtrType(*Ctx, GVPtrTy->getAddressSpace());
APInt Offset(DL->getTypeSizeInBits(PtrIntTy), /*val*/0, /*isSigned*/true);
auto *GEPO = cast<GEPOperator>(ConstExpr);
+
+ // TODO: If we have a mix of inbounds and non-inbounds GEPs, then basing a
+ // non-inbounds GEP on an inbounds GEP is potentially incorrect. Restrict to
+ // inbounds GEP for now -- alternatively, we could drop inbounds from the
+ // constant expression.
+ if (!GEPO->isInBounds())
+ return;
+
if (!GEPO->accumulateConstantOffset(*DL, Offset))
return;
@@ -470,7 +478,7 @@ void ConstantHoistingPass::collectConstantCandidates(
// Visit constant expressions that have constant integers.
if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
// Handle constant gep expressions.
- if (ConstHoistGEP && ConstExpr->isGEPWithNoNotionalOverIndexing())
+ if (ConstHoistGEP && isa<GEPOperator>(ConstExpr))
collectConstantCandidates(ConstCandMap, Inst, Idx, ConstExpr);
// Only visit constant cast expressions.
@@ -810,7 +818,7 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
// Visit constant expression.
if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
- if (ConstExpr->isGEPWithNoNotionalOverIndexing()) {
+ if (isa<GEPOperator>(ConstExpr)) {
// Operand is a ConstantGEP, replace it.
updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat);
return;
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 7f2d5d7d9987..13963657d183 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -43,6 +43,51 @@ DEBUG_COUNTER(EliminatedCounter, "conds-eliminated",
static int64_t MaxConstraintValue = std::numeric_limits<int64_t>::max();
+namespace {
+struct ConstraintTy {
+ SmallVector<int64_t, 8> Coefficients;
+
+ ConstraintTy(SmallVector<int64_t, 8> Coefficients)
+ : Coefficients(Coefficients) {}
+
+ unsigned size() const { return Coefficients.size(); }
+};
+
+/// Struct to manage a list of constraints.
+struct ConstraintListTy {
+ SmallVector<ConstraintTy, 4> Constraints;
+
+ ConstraintListTy() {}
+
+ ConstraintListTy(const SmallVector<ConstraintTy, 4> &Constraints)
+ : Constraints(Constraints) {}
+
+ void mergeIn(const ConstraintListTy &Other) {
+ append_range(Constraints, Other.Constraints);
+ }
+
+ unsigned size() const { return Constraints.size(); }
+
+ unsigned empty() const { return Constraints.empty(); }
+
+ /// Returns true if any constraint has a non-zero coefficient for any of the
+ /// newly added indices. Zero coefficients for new indices are removed. If it
+  /// returns false, no new variables need to be added to the system.
+ bool needsNewIndices(const DenseMap<Value *, unsigned> &NewIndices) {
+ assert(size() == 1);
+ for (unsigned I = 0; I < NewIndices.size(); ++I) {
+ int64_t Last = get(0).Coefficients.pop_back_val();
+ if (Last != 0)
+ return true;
+ }
+ return false;
+ }
+
+ ConstraintTy &get(unsigned I) { return Constraints[I]; }
+};
+
+} // namespace
+
// Decomposes \p V into a vector of pairs of the form { c, X } where c * X. The
// sum of the pairs equals \p V. The first pair is the constant-factor and X
// must be nullptr. If the expression cannot be decomposed, returns an empty
@@ -108,24 +153,15 @@ static SmallVector<std::pair<int64_t, Value *>, 4> decompose(Value *V) {
if (match(V, m_NUWSub(m_Value(Op0), m_ConstantInt(CI))))
return {{-1 * CI->getSExtValue(), nullptr}, {1, Op0}};
if (match(V, m_NUWSub(m_Value(Op0), m_Value(Op1))))
- return {{0, nullptr}, {1, Op0}, {1, Op1}};
+ return {{0, nullptr}, {1, Op0}, {-1, Op1}};
return {{0, nullptr}, {1, V}};
}
-struct ConstraintTy {
- SmallVector<int64_t, 8> Coefficients;
-
- ConstraintTy(SmallVector<int64_t, 8> Coefficients)
- : Coefficients(Coefficients) {}
-
- unsigned size() const { return Coefficients.size(); }
-};
-
/// Turn a condition \p CmpI into a vector of constraints, using indices from \p
/// Value2Index. Additional indices for newly discovered values are added to \p
/// NewIndices.
-static SmallVector<ConstraintTy, 4>
+static ConstraintListTy
getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
const DenseMap<Value *, unsigned> &Value2Index,
DenseMap<Value *, unsigned> &NewIndices) {
@@ -151,11 +187,15 @@ getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
Value2Index, NewIndices);
if (Pred == CmpInst::ICMP_EQ) {
+ if (match(Op1, m_Zero()))
+ return getConstraint(CmpInst::ICMP_ULE, Op0, Op1, Value2Index,
+ NewIndices);
+
auto A =
getConstraint(CmpInst::ICMP_UGE, Op0, Op1, Value2Index, NewIndices);
auto B =
getConstraint(CmpInst::ICMP_ULE, Op0, Op1, Value2Index, NewIndices);
- append_range(A, B);
+ A.mergeIn(B);
return A;
}
@@ -200,10 +240,10 @@ getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
R[GetOrAddIndex(KV.second)] -= KV.first;
R[0] = Offset1 + Offset2 + (Pred == CmpInst::ICMP_ULT ? -1 : 0);
- return {R};
+ return {{R}};
}
-static SmallVector<ConstraintTy, 4>
+static ConstraintListTy
getConstraint(CmpInst *Cmp, const DenseMap<Value *, unsigned> &Value2Index,
DenseMap<Value *, unsigned> &NewIndices) {
return getConstraint(Cmp->getPredicate(), Cmp->getOperand(0),
@@ -397,21 +437,10 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) {
if (R.size() != 1)
continue;
- // Check if all coefficients of new indices are 0 after building the
- // constraint. Skip if any of the new indices has a non-null
- // coefficient.
- bool HasNewIndex = false;
- for (unsigned I = 0; I < NewIndices.size(); ++I) {
- int64_t Last = R[0].Coefficients.pop_back_val();
- if (Last != 0) {
- HasNewIndex = true;
- break;
- }
- }
- if (HasNewIndex || R[0].size() == 1)
+ if (R.needsNewIndices(NewIndices))
continue;
- if (CS.isConditionImplied(R[0].Coefficients)) {
+ if (CS.isConditionImplied(R.get(0).Coefficients)) {
if (!DebugCounter::shouldExecute(EliminatedCounter))
continue;
@@ -432,7 +461,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) {
Changed = true;
}
if (CS.isConditionImplied(
- ConstraintSystem::negate(R[0].Coefficients))) {
+ ConstraintSystem::negate(R.get(0).Coefficients))) {
if (!DebugCounter::shouldExecute(EliminatedCounter))
continue;
@@ -479,7 +508,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) {
LLVM_DEBUG(dbgs() << "Adding " << *CB.Condition << " " << CB.Not << "\n");
bool Added = false;
- for (auto &C : R) {
+ for (auto &C : R.Constraints) {
auto Coeffs = C.Coefficients;
LLVM_DEBUG({
dbgs() << " constraint: ";
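
Among the ConstraintElimination changes, the decompose() fix is the semantic one: for Op0 -nuw Op1 the second operand must carry coefficient -1, not +1, so that the weighted sum of the returned pairs still equals the decomposed value. A self-contained sketch of that invariant, with integer indices standing in for Values (decomposeSub and evaluate are illustrative helpers, not the pass's API):

  #include <cassert>
  #include <cstdint>
  #include <utility>
  #include <vector>

  // Each pair is {coefficient, variable index}; index 0 is the constant term.
  using Decomposition = std::vector<std::pair<int64_t, unsigned>>;

  // V = Op0 - Op1  ==>  0 (constant) + 1 * Op0 + (-1) * Op1
  Decomposition decomposeSub(unsigned Op0, unsigned Op1) {
    return {{0, 0}, {1, Op0}, {-1, Op1}};
  }

  int64_t evaluate(const Decomposition &D, const std::vector<int64_t> &Vals) {
    int64_t Sum = 0;
    for (const auto &[Coeff, Idx] : D)
      Sum += Coeff * (Idx == 0 ? 1 : Vals[Idx]);
    return Sum;
  }

  int main() {
    std::vector<int64_t> Vals = {0, /*Op0=*/10, /*Op1=*/3};
    // Holds only with the -1 coefficient that the hunk above introduces.
    assert(evaluate(decomposeSub(1, 2), Vals) == 10 - 3);
  }
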
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index eadbb4293539..ae636e7b61f7 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -699,17 +699,14 @@ bool isNoopIntrinsic(Instruction *I) {
}
// Check if we can ignore \p D for DSE.
-bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller,
- const TargetLibraryInfo &TLI) {
+bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
Instruction *DI = D->getMemoryInst();
// Calls that only access inaccessible memory cannot read or write any memory
// locations we consider for elimination.
if (auto *CB = dyn_cast<CallBase>(DI))
- if (CB->onlyAccessesInaccessibleMemory()) {
- if (isAllocLikeFn(DI, &TLI))
- return false;
+ if (CB->onlyAccessesInaccessibleMemory())
return true;
- }
+
// We can eliminate stores to locations not visible to the caller across
// throwing instructions.
if (DI->mayThrow() && !DefVisibleToCaller)
@@ -759,10 +756,8 @@ struct DSEState {
SmallVector<MemoryDef *, 64> MemDefs;
// Any that should be skipped as they are already deleted
SmallPtrSet<MemoryAccess *, 4> SkipStores;
- // Keep track of all of the objects that are invisible to the caller before
- // the function returns.
- // SmallPtrSet<const Value *, 16> InvisibleToCallerBeforeRet;
- DenseMap<const Value *, bool> InvisibleToCallerBeforeRet;
+  // Keep track of whether a given object is captured before return or not.
+ DenseMap<const Value *, bool> CapturedBeforeReturn;
// Keep track of all of the objects that are invisible to the caller after
// the function returns.
DenseMap<const Value *, bool> InvisibleToCallerAfterRet;
@@ -805,12 +800,8 @@ struct DSEState {
// Treat byval or inalloca arguments the same as Allocas, stores to them are
// dead at the end of the function.
for (Argument &AI : F.args())
- if (AI.hasPassPointeeByValueCopyAttr()) {
- // For byval, the caller doesn't know the address of the allocation.
- if (AI.hasByValAttr())
- InvisibleToCallerBeforeRet.insert({&AI, true});
+ if (AI.hasPassPointeeByValueCopyAttr())
InvisibleToCallerAfterRet.insert({&AI, true});
- }
// Collect whether there is any irreducible control flow in the function.
ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI);
@@ -835,6 +826,20 @@ struct DSEState {
if (!isGuaranteedLoopIndependent(DeadI, KillingI, DeadLoc))
return OW_Unknown;
+ const Value *DeadPtr = DeadLoc.Ptr->stripPointerCasts();
+ const Value *KillingPtr = KillingLoc.Ptr->stripPointerCasts();
+ const Value *DeadUndObj = getUnderlyingObject(DeadPtr);
+ const Value *KillingUndObj = getUnderlyingObject(KillingPtr);
+
+ // Check whether the killing store overwrites the whole object, in which
+ // case the size/offset of the dead store does not matter.
+ if (DeadUndObj == KillingUndObj && KillingLoc.Size.isPrecise()) {
+ uint64_t KillingUndObjSize = getPointerSize(KillingUndObj, DL, TLI, &F);
+ if (KillingUndObjSize != MemoryLocation::UnknownSize &&
+ KillingUndObjSize == KillingLoc.Size.getValue())
+ return OW_Complete;
+ }
+
// FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll
// get imprecise values here, though (except for unknown sizes).
if (!KillingLoc.Size.isPrecise() || !DeadLoc.Size.isPrecise()) {
@@ -875,14 +880,6 @@ struct DSEState {
return OW_Complete;
}
- // Check to see if the killing store is to the entire object (either a
- // global, an alloca, or a byval/inalloca argument). If so, then it clearly
- // overwrites any other store to the same object.
- const Value *DeadPtr = DeadLoc.Ptr->stripPointerCasts();
- const Value *KillingPtr = KillingLoc.Ptr->stripPointerCasts();
- const Value *DeadUndObj = getUnderlyingObject(DeadPtr);
- const Value *KillingUndObj = getUnderlyingObject(KillingPtr);
-
// If we can't resolve the same pointers to the same object, then we can't
// analyze them at all.
if (DeadUndObj != KillingUndObj) {
@@ -896,12 +893,6 @@ struct DSEState {
return OW_Unknown;
}
- // If the KillingI store is to a recognizable object, get its size.
- uint64_t KillingUndObjSize = getPointerSize(KillingUndObj, DL, TLI, &F);
- if (KillingUndObjSize != MemoryLocation::UnknownSize)
- if (KillingUndObjSize == KillingSize && KillingUndObjSize >= DeadSize)
- return OW_Complete;
-
// Okay, we have stores to two completely different pointers. Try to
// decompose the pointer into a "base + constant_offset" form. If the base
// pointers are equal, then we can reason about the two stores.
@@ -957,31 +948,30 @@ struct DSEState {
return true;
auto I = InvisibleToCallerAfterRet.insert({V, false});
if (I.second) {
- if (!isInvisibleToCallerBeforeRet(V)) {
+ if (!isInvisibleToCallerOnUnwind(V)) {
I.first->second = false;
- } else {
- auto *Inst = dyn_cast<Instruction>(V);
- if (Inst && isAllocLikeFn(Inst, &TLI))
- I.first->second = !PointerMayBeCaptured(V, true, false);
+ } else if (isNoAliasCall(V)) {
+ I.first->second = !PointerMayBeCaptured(V, true, false);
}
}
return I.first->second;
}
- bool isInvisibleToCallerBeforeRet(const Value *V) {
- if (isa<AllocaInst>(V))
+ bool isInvisibleToCallerOnUnwind(const Value *V) {
+ bool RequiresNoCaptureBeforeUnwind;
+ if (!isNotVisibleOnUnwind(V, RequiresNoCaptureBeforeUnwind))
+ return false;
+ if (!RequiresNoCaptureBeforeUnwind)
return true;
- auto I = InvisibleToCallerBeforeRet.insert({V, false});
- if (I.second) {
- auto *Inst = dyn_cast<Instruction>(V);
- if (Inst && isAllocLikeFn(Inst, &TLI))
- // NOTE: This could be made more precise by PointerMayBeCapturedBefore
- // with the killing MemoryDef. But we refrain from doing so for now to
- // limit compile-time and this does not cause any changes to the number
- // of stores removed on a large test set in practice.
- I.first->second = !PointerMayBeCaptured(V, false, true);
- }
- return I.first->second;
+
+ auto I = CapturedBeforeReturn.insert({V, true});
+ if (I.second)
+ // NOTE: This could be made more precise by PointerMayBeCapturedBefore
+ // with the killing MemoryDef. But we refrain from doing so for now to
+      // limit compile-time, and this does not cause any changes to the number
+ // of stores removed on a large test set in practice.
+ I.first->second = PointerMayBeCaptured(V, false, true);
+ return !I.first->second;
}
Optional<MemoryLocation> getLocForWrite(Instruction *I) const {
@@ -1269,8 +1259,7 @@ struct DSEState {
MemoryDef *CurrentDef = cast<MemoryDef>(Current);
Instruction *CurrentI = CurrentDef->getMemoryInst();
- if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(KillingUndObj),
- TLI)) {
+ if (canSkipDef(CurrentDef, !isInvisibleToCallerOnUnwind(KillingUndObj))) {
CanOptimize = false;
continue;
}
@@ -1442,7 +1431,7 @@ struct DSEState {
continue;
}
- if (UseInst->mayThrow() && !isInvisibleToCallerBeforeRet(KillingUndObj)) {
+ if (UseInst->mayThrow() && !isInvisibleToCallerOnUnwind(KillingUndObj)) {
LLVM_DEBUG(dbgs() << " ... found throwing instruction\n");
return None;
}
@@ -1623,7 +1612,7 @@ struct DSEState {
// First see if we can ignore it by using the fact that KillingI is an
// alloca/alloca like object that is not visible to the caller during
// execution of the function.
- if (KillingUndObj && isInvisibleToCallerBeforeRet(KillingUndObj))
+ if (KillingUndObj && isInvisibleToCallerOnUnwind(KillingUndObj))
return false;
if (KillingI->getParent() == DeadI->getParent())
@@ -1639,7 +1628,7 @@ struct DSEState {
bool isDSEBarrier(const Value *KillingUndObj, Instruction *DeadI) {
// If DeadI may throw it acts as a barrier, unless we are to an
// alloca/alloca like object that does not escape.
- if (DeadI->mayThrow() && !isInvisibleToCallerBeforeRet(KillingUndObj))
+ if (DeadI->mayThrow() && !isInvisibleToCallerOnUnwind(KillingUndObj))
return true;
// If DeadI is an atomic load/store stronger than monotonic, do not try to
@@ -1696,6 +1685,84 @@ struct DSEState {
return MadeChange;
}
+  /// If we have a zero-initializing memset following a call to malloc,
+ /// try folding it into a call to calloc.
+ bool tryFoldIntoCalloc(MemoryDef *Def, const Value *DefUO) {
+ Instruction *DefI = Def->getMemoryInst();
+ MemSetInst *MemSet = dyn_cast<MemSetInst>(DefI);
+ if (!MemSet)
+ // TODO: Could handle zero store to small allocation as well.
+ return false;
+ Constant *StoredConstant = dyn_cast<Constant>(MemSet->getValue());
+ if (!StoredConstant || !StoredConstant->isNullValue())
+ return false;
+
+ if (!isRemovable(DefI))
+      // The memset might be volatile.
+ return false;
+
+ if (F.hasFnAttribute(Attribute::SanitizeMemory) ||
+ F.hasFnAttribute(Attribute::SanitizeAddress) ||
+ F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
+ F.getName() == "calloc")
+ return false;
+ auto *Malloc = const_cast<CallInst *>(dyn_cast<CallInst>(DefUO));
+ if (!Malloc)
+ return false;
+ auto *InnerCallee = Malloc->getCalledFunction();
+ if (!InnerCallee)
+ return false;
+ LibFunc Func;
+ if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) ||
+ Func != LibFunc_malloc)
+ return false;
+
+ auto shouldCreateCalloc = [](CallInst *Malloc, CallInst *Memset) {
+      // Check for the br(icmp(ptr, null), truebb, falsebb) pattern at the
+      // end of the malloc block.
+ auto *MallocBB = Malloc->getParent(),
+ *MemsetBB = Memset->getParent();
+ if (MallocBB == MemsetBB)
+ return true;
+ auto *Ptr = Memset->getArgOperand(0);
+ auto *TI = MallocBB->getTerminator();
+ ICmpInst::Predicate Pred;
+ BasicBlock *TrueBB, *FalseBB;
+ if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB,
+ FalseBB)))
+ return false;
+ if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB)
+ return false;
+ return true;
+ };
+
+ if (Malloc->getOperand(0) != MemSet->getLength())
+ return false;
+ if (!shouldCreateCalloc(Malloc, MemSet) ||
+ !DT.dominates(Malloc, MemSet) ||
+ !memoryIsNotModifiedBetween(Malloc, MemSet, BatchAA, DL, &DT))
+ return false;
+ IRBuilder<> IRB(Malloc);
+ const auto &DL = Malloc->getModule()->getDataLayout();
+ auto *Calloc =
+ emitCalloc(ConstantInt::get(IRB.getIntPtrTy(DL), 1),
+ Malloc->getArgOperand(0), IRB, TLI);
+ if (!Calloc)
+ return false;
+ MemorySSAUpdater Updater(&MSSA);
+ auto *LastDef =
+ cast<MemoryDef>(Updater.getMemorySSA()->getMemoryAccess(Malloc));
+ auto *NewAccess =
+ Updater.createMemoryAccessAfter(cast<Instruction>(Calloc), LastDef,
+ LastDef);
+ auto *NewAccessMD = cast<MemoryDef>(NewAccess);
+ Updater.insertDef(NewAccessMD, /*RenameUses=*/true);
+ Updater.removeMemoryAccess(Malloc);
+ Malloc->replaceAllUsesWith(Calloc);
+ Malloc->eraseFromParent();
+ return true;
+ }
+
/// \returns true if \p Def is a no-op store, either because it
/// directly stores back a loaded value or stores zero to a calloced object.
bool storeIsNoop(MemoryDef *Def, const Value *DefUO) {
@@ -1713,81 +1780,15 @@ struct DSEState {
if (!isRemovable(DefI))
return false;
- if (StoredConstant && StoredConstant->isNullValue()) {
- auto *DefUOInst = dyn_cast<Instruction>(DefUO);
- if (DefUOInst) {
- if (isCallocLikeFn(DefUOInst, &TLI)) {
- auto *UnderlyingDef =
- cast<MemoryDef>(MSSA.getMemoryAccess(DefUOInst));
- // If UnderlyingDef is the clobbering access of Def, no instructions
- // between them can modify the memory location.
- auto *ClobberDef =
- MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def);
- return UnderlyingDef == ClobberDef;
- }
-
- if (MemSet) {
- if (F.hasFnAttribute(Attribute::SanitizeMemory) ||
- F.hasFnAttribute(Attribute::SanitizeAddress) ||
- F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
- F.getName() == "calloc")
- return false;
- auto *Malloc = const_cast<CallInst *>(dyn_cast<CallInst>(DefUOInst));
- if (!Malloc)
- return false;
- auto *InnerCallee = Malloc->getCalledFunction();
- if (!InnerCallee)
- return false;
- LibFunc Func;
- if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) ||
- Func != LibFunc_malloc)
- return false;
-
- auto shouldCreateCalloc = [](CallInst *Malloc, CallInst *Memset) {
- // Check for br(icmp ptr, null), truebb, falsebb) pattern at the end
- // of malloc block
- auto *MallocBB = Malloc->getParent(),
- *MemsetBB = Memset->getParent();
- if (MallocBB == MemsetBB)
- return true;
- auto *Ptr = Memset->getArgOperand(0);
- auto *TI = MallocBB->getTerminator();
- ICmpInst::Predicate Pred;
- BasicBlock *TrueBB, *FalseBB;
- if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB,
- FalseBB)))
- return false;
- if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB)
- return false;
- return true;
- };
-
- if (Malloc->getOperand(0) == MemSet->getLength()) {
- if (shouldCreateCalloc(Malloc, MemSet) &&
- DT.dominates(Malloc, MemSet) &&
- memoryIsNotModifiedBetween(Malloc, MemSet, BatchAA, DL, &DT)) {
- IRBuilder<> IRB(Malloc);
- const auto &DL = Malloc->getModule()->getDataLayout();
- if (auto *Calloc =
- emitCalloc(ConstantInt::get(IRB.getIntPtrTy(DL), 1),
- Malloc->getArgOperand(0), IRB, TLI)) {
- MemorySSAUpdater Updater(&MSSA);
- auto *LastDef = cast<MemoryDef>(
- Updater.getMemorySSA()->getMemoryAccess(Malloc));
- auto *NewAccess = Updater.createMemoryAccessAfter(
- cast<Instruction>(Calloc), LastDef, LastDef);
- auto *NewAccessMD = cast<MemoryDef>(NewAccess);
- Updater.insertDef(NewAccessMD, /*RenameUses=*/true);
- Updater.removeMemoryAccess(Malloc);
- Malloc->replaceAllUsesWith(Calloc);
- Malloc->eraseFromParent();
- return true;
- }
- return false;
- }
- }
- }
- }
+ if (StoredConstant && isAllocationFn(DefUO, &TLI)) {
+ auto *CB = cast<CallBase>(DefUO);
+ auto *InitC = getInitialValueOfAllocation(CB, &TLI,
+ StoredConstant->getType());
+ // If the clobbering access is LiveOnEntry, no instructions between them
+ // can modify the memory location.
+ if (InitC && InitC == StoredConstant)
+ return MSSA.isLiveOnEntryDef(
+ MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def));
}
if (!Store)
@@ -2074,6 +2075,15 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
MadeChange = true;
continue;
}
+
+ // Can we form a calloc from a memset/malloc pair?
+ if (!Shortend && State.tryFoldIntoCalloc(KillingDef, KillingUndObj)) {
+ LLVM_DEBUG(dbgs() << "DSE: Remove memset after forming calloc:\n"
+ << " DEAD: " << *KillingI << '\n');
+ State.deleteDeadInstruction(KillingI);
+ MadeChange = true;
+ continue;
+ }
}
if (EnablePartialOverwriteTracking)
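
At the source level, the new tryFoldIntoCalloc path targets roughly the shape below. This is an assumed illustration, not a test from the patch; the guard accepted by shouldCreateCalloc is specifically the br(icmp eq ptr, null) form with the memset on the false edge, and the memset length must be the same value as the malloc argument.

  #include <cstdlib>
  #include <cstring>

  // After DSE the malloc/memset pair may be rewritten to calloc(1, n) and the
  // memset deleted, provided nothing writes the buffer in between.
  unsigned char *make_zeroed_buffer(size_t n) {
    unsigned char *p = (unsigned char *)malloc(n);
    if (p != nullptr)
      memset(p, 0, n); // zeroes the entire allocation with the same length n
    return p;
  }
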
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index a24997dd3fd4..59b934c16c8a 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -827,10 +827,13 @@ private:
const ParseMemoryInst &Later);
Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
+ // TODO: We could insert relevant casts on type mismatch here.
if (auto *LI = dyn_cast<LoadInst>(Inst))
- return LI;
- if (auto *SI = dyn_cast<StoreInst>(Inst))
- return SI->getValueOperand();
+ return LI->getType() == ExpectedType ? LI : nullptr;
+ else if (auto *SI = dyn_cast<StoreInst>(Inst)) {
+ Value *V = SI->getValueOperand();
+ return V->getType() == ExpectedType ? V : nullptr;
+ }
assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
auto *II = cast<IntrinsicInst>(Inst);
if (isHandledNonTargetIntrinsic(II->getIntrinsicID()))
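
The EarlyCSE hunk stops forwarding a previously seen load or store value when its type differs from the type the current access expects. A trivial sketch of that guard with plain stand-in types (Value and getReusableResult are illustrative, not the pass's classes):

  #include <cassert>

  struct Value { int TypeId; };

  // Reuse the earlier value only on an exact type match; otherwise give up.
  // The TODO in the hunk notes a cast could be inserted instead of bailing.
  const Value *getReusableResult(const Value *Prior, int ExpectedTypeId) {
    return Prior && Prior->TypeId == ExpectedTypeId ? Prior : nullptr;
  }

  int main() {
    Value V{1};
    assert(getReusableResult(&V, 1) == &V);      // same type: value forwarded
    assert(getReusableResult(&V, 2) == nullptr); // mismatch: no CSE
  }
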
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 00506fb86006..398c93e8758c 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -1104,20 +1104,19 @@ bool GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
}
assert(DepInfo.isDef() && "follows from above");
- // Loading the allocation -> undef.
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
- isAlignedAllocLikeFn(DepInst, TLI) ||
- // Loading immediately after lifetime begin -> undef.
- isLifetimeStart(DepInst)) {
+ // Loading the alloca -> undef.
+ // Loading immediately after lifetime begin -> undef.
+ if (isa<AllocaInst>(DepInst) || isLifetimeStart(DepInst)) {
Res = AvailableValue::get(UndefValue::get(Load->getType()));
return true;
}
- // Loading from calloc (which zero initializes memory) -> zero
- if (isCallocLikeFn(DepInst, TLI)) {
- Res = AvailableValue::get(Constant::getNullValue(Load->getType()));
- return true;
- }
+ if (isAllocationFn(DepInst, TLI))
+ if (auto *InitVal = getInitialValueOfAllocation(cast<CallBase>(DepInst),
+ TLI, Load->getType())) {
+ Res = AvailableValue::get(InitVal);
+ return true;
+ }
if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
// Reject loads and stores that are to the same address but are of
@@ -1769,7 +1768,7 @@ bool GVNPass::processAssumeIntrinsic(AssumeInst *IntrinsicI) {
// Insert a new store to null instruction before the load to indicate that
// this code is not reachable. FIXME: We could insert unreachable
// instruction directly because we can modify the CFG.
- auto *NewS = new StoreInst(UndefValue::get(Int8Ty),
+ auto *NewS = new StoreInst(PoisonValue::get(Int8Ty),
Constant::getNullValue(Int8Ty->getPointerTo()),
IntrinsicI);
if (MSSAU) {
@@ -2991,12 +2990,12 @@ void GVNPass::addDeadBlock(BasicBlock *BB) {
}
}
- // Now undef the incoming values from the dead predecessors.
+ // Now poison the incoming values from the dead predecessors.
for (BasicBlock *P : predecessors(B)) {
if (!DeadBlocks.count(P))
continue;
for (PHINode &Phi : B->phis()) {
- Phi.setIncomingValueForBlock(P, UndefValue::get(Phi.getType()));
+ Phi.setIncomingValueForBlock(P, PoisonValue::get(Phi.getType()));
if (MD)
MD->invalidateCachedPointerInfo(&Phi);
}
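
The GVN change replaces the malloc/calloc special cases with a single query for the allocation's initial value, undef for malloc-like calls and zero for calloc, and also switches the dead-block and unreachable-load placeholders from undef to poison. A source-level illustration of the load-forwarding part (assumed example, not from the patch):

  #include <cstdlib>

  int first_element(size_t n) {
    int *p = (int *)calloc(n, sizeof(int));
    if (!p)
      return -1;
    int v = p[0]; // no intervening store: GVN can forward calloc's zero init
    free(p);
    return v;     // may fold to 'return 0;'
  }
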
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 7001d330fce0..ceb03eb17f6d 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -138,8 +138,6 @@ AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true),
namespace {
-struct RewritePhi;
-
class IndVarSimplify {
LoopInfo *LI;
ScalarEvolution *SE;
@@ -982,6 +980,7 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB,
assert(isLoopCounter(IndVar, L, SE));
const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
const SCEV *IVInit = AR->getStart();
+ assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");
// IVInit may be a pointer while ExitCount is an integer when FindLoopCounter
// finds a valid pointer IV. Sign extend ExitCount in order to materialize a
@@ -1004,13 +1003,6 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB,
assert(SE->isLoopInvariant(IVOffset, L) &&
"Computed iteration count is not loop invariant!");
- // We could handle pointer IVs other than i8*, but we need to compensate for
- // gep index scaling.
- assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()),
- cast<PointerType>(IndVar->getType())
- ->getElementType())->isOne() &&
- "unit stride pointer IV must be i8*");
-
const SCEV *IVLimit = SE->getAddExpr(IVInit, IVOffset);
BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
return Rewriter.expandCodeFor(IVLimit, IndVar->getType(), BI);
@@ -1026,7 +1018,6 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB,
// IVInit integer and ExitCount pointer would only occur if a canonical IV
// were generated on top of case #2, which is not expected.
- assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");
// For unit stride, IVCount = Start + ExitCount with 2's complement
// overflow.
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 883d4afff3bd..8f5933b7bd71 100644
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -250,12 +250,6 @@ public:
char InferAddressSpaces::ID = 0;
-namespace llvm {
-
-void initializeInferAddressSpacesPass(PassRegistry &);
-
-} // end namespace llvm
-
INITIALIZE_PASS_BEGIN(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index fe9a7211967c..a3efad104ca6 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -728,8 +728,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
// Handle some boolean conditions.
if (I->getType()->getPrimitiveSizeInBits() == 1) {
using namespace PatternMatch;
-
- assert(Preference == WantInteger && "One-bit non-integer type?");
+ if (Preference != WantInteger)
+ return false;
// X | true -> true
// X & false -> false
Value *Op0, *Op1;
@@ -789,8 +789,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
// Try to simplify some other binary operator values.
} else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
- assert(Preference != WantBlockAddress
- && "A binary operator creating a block address?");
+ if (Preference != WantInteger)
+ return false;
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
PredValueInfoTy LHSVals;
computeValueKnownInPredecessorsImpl(BO->getOperand(0), BB, LHSVals,
@@ -811,7 +811,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
// Handle compare with phi operand, where the PHI is defined in this block.
if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
- assert(Preference == WantInteger && "Compares only produce integers");
+ if (Preference != WantInteger)
+ return false;
Type *CmpType = Cmp->getType();
Value *CmpLHS = Cmp->getOperand(0);
Value *CmpRHS = Cmp->getOperand(1);
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index bc792ca3d8da..7fb1a25bdf13 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -1355,7 +1355,7 @@ static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
TargetTransformInfo::TCC_Free)
return false;
// For a GEP, we cannot simply use getUserCost because currently it
- // optimistically assume that a GEP will fold into addressing mode
+ // optimistically assumes that a GEP will fold into addressing mode
// regardless of its users.
const BasicBlock *BB = GEP->getParent();
for (const User *U : GEP->users()) {
@@ -1923,26 +1923,15 @@ bool isNotCapturedBeforeOrInLoop(const Value *V, const Loop *L,
L->getHeader()->getTerminator(), DT);
}
-/// Return true iff we can prove that a caller of this function can not inspect
-/// the contents of the provided object in a well defined program.
-bool isKnownNonEscaping(Value *Object, const Loop *L,
- const TargetLibraryInfo *TLI, DominatorTree *DT) {
- if (isa<AllocaInst>(Object))
- // Since the alloca goes out of scope, we know the caller can't retain a
- // reference to it and be well defined. Thus, we don't need to check for
- // capture.
- return true;
+/// Return true if we can prove that a caller cannot inspect the object if an
+/// unwind occurs inside the loop.
+bool isNotVisibleOnUnwindInLoop(const Value *Object, const Loop *L,
+ DominatorTree *DT) {
+ bool RequiresNoCaptureBeforeUnwind;
+ if (!isNotVisibleOnUnwind(Object, RequiresNoCaptureBeforeUnwind))
+ return false;
- // For all other objects we need to know that the caller can't possibly
- // have gotten a reference to the object. There are two components of
- // that:
- // 1) Object can't be escaped by this function. This is what
- // PointerMayBeCaptured checks.
- // 2) Object can't have been captured at definition site. For this, we
- // need to know the return value is noalias. At the moment, we use a
- // weaker condition and handle only AllocLikeFunctions (which are
- // known to be noalias). TODO
- return isAllocLikeFn(Object, TLI) &&
+ return !RequiresNoCaptureBeforeUnwind ||
isNotCapturedBeforeOrInLoop(Object, L, DT);
}
@@ -2030,7 +2019,7 @@ bool llvm::promoteLoopAccessesToScalars(
// this by proving that the caller can't have a reference to the object
// after return and thus can't possibly load from the object.
Value *Object = getUnderlyingObject(SomePtr);
- if (!isKnownNonEscaping(Object, CurLoop, TLI, DT))
+ if (!isNotVisibleOnUnwindInLoop(Object, CurLoop, DT))
return false;
// Subtlety: Alloca's aren't visible to callers, but *are* potentially
// visible to other threads if captured and used during their lifetimes.
@@ -2163,7 +2152,7 @@ bool llvm::promoteLoopAccessesToScalars(
else {
Value *Object = getUnderlyingObject(SomePtr);
SafeToInsertStore =
- (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
+ (isNoAliasCall(Object) || isa<AllocaInst>(Object)) &&
isNotCapturedBeforeOrInLoop(Object, CurLoop, DT);
}
}
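
The LICM side mirrors the DSE change: an object's loop stores can be promoted to a register across potentially throwing calls as long as the caller could not observe the object on unwind, which now covers any uncaptured noalias call result rather than only recognized allocation functions. A rough source-level picture of the shape this enables (assumed illustration; whether a given call is treated as may-throw depends on the frontend):

  #include <cstdio>
  #include <cstdlib>

  int *tally(const int *a, int n) {
    int *acc = (int *)malloc(sizeof(int)); // noalias result, never captured here
    if (!acc)
      return nullptr;
    *acc = 0;
    for (int i = 0; i < n; ++i) {
      *acc += a[i];         // store promotable to a register for the whole loop
      printf("%d\n", *acc); // a call in the loop does not block promotion,
    }                       // since an unwinding caller cannot inspect *acc
    return acc;
  }
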
diff --git a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
index 5814e2f043d5..361d6c0d9381 100644
--- a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -407,25 +407,19 @@ breakBackedgeIfNotTaken(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
if (!L->getLoopLatch())
return LoopDeletionResult::Unmodified;
- auto *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
- if (BTC->isZero()) {
- // SCEV knows this backedge isn't taken!
- breakLoopBackedge(L, DT, SE, LI, MSSA);
- ++NumBackedgesBroken;
- return LoopDeletionResult::Deleted;
- }
-
- // If SCEV leaves open the possibility of a zero trip count, see if
- // symbolically evaluating the first iteration lets us prove the backedge
- // unreachable.
- if (isa<SCEVCouldNotCompute>(BTC) || !SE.isKnownNonZero(BTC))
- if (canProveExitOnFirstIteration(L, DT, LI)) {
- breakLoopBackedge(L, DT, SE, LI, MSSA);
- ++NumBackedgesBroken;
- return LoopDeletionResult::Deleted;
+ auto *BTCMax = SE.getConstantMaxBackedgeTakenCount(L);
+ if (!BTCMax->isZero()) {
+ auto *BTC = SE.getBackedgeTakenCount(L);
+ if (!BTC->isZero()) {
+ if (!isa<SCEVCouldNotCompute>(BTC) && SE.isKnownNonZero(BTC))
+ return LoopDeletionResult::Unmodified;
+ if (!canProveExitOnFirstIteration(L, DT, LI))
+ return LoopDeletionResult::Unmodified;
}
-
- return LoopDeletionResult::Unmodified;
+ }
+ ++NumBackedgesBroken;
+ breakLoopBackedge(L, DT, SE, LI, MSSA);
+ return LoopDeletionResult::Deleted;
}
/// Remove a loop if it is dead.
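
The restructured breakBackedgeIfNotTaken asks the cheap question first (is the constant maximum backedge-taken count zero?) and only then consults the exact count and the symbolic first-iteration evaluation. A tiny example of the kind of loop it targets (assumed illustration):

  int sum_at_most_one(const int *a, int n) {
    int s = 0;
    // With the 'i < 1' bound the backedge-taken count is 0: the body runs at
    // most once, so the branch back to the loop header can be removed.
    for (int i = 0; i < n && i < 1; ++i)
      s += a[i];
    return s;
  }
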
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index 965d1575518e..c46db4e63bfe 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -10,10 +10,13 @@
//
// The intention is to optimise loop nests like this, which together access an
// array linearly:
+//
// for (int i = 0; i < N; ++i)
// for (int j = 0; j < M; ++j)
// f(A[i*M+j]);
+//
// into one loop:
+//
// for (int i = 0; i < (N*M); ++i)
// f(A[i]);
//
@@ -22,7 +25,27 @@
// expression like i*M+j. If they had any other uses, we would have to insert a
// div/mod to reconstruct the original values, so this wouldn't be profitable.
//
-// We also need to prove that N*M will not overflow.
+// We also need to prove that N*M will not overflow. The preferred solution is
+// to widen the IV, which avoids overflow checks, so that is tried first. If
+// the IV cannot be widened, then we try to determine that this new tripcount
+// expression won't overflow.
+//
+// Q: Does LoopFlatten use SCEV?
+// Short answer: Yes and no.
+//
+// Long answer:
+// For this transformation to be valid, we require all uses of the induction
+// variables to be linear expressions of the form i*M+j. The different Loop
+// APIs are used to get some loop components like the induction variable,
+// compare statement, etc. In addition, we do some pattern matching to find the
+// linear expressions and other loop components like the loop increment. The
+// latter are examples of expressions that do use the induction variable, but
+// are safe to ignore when we check that all uses are of the form i*M+j. We
+// keep track of all of this in the bookkeeping struct FlattenInfo.
+// We assume the loops to be canonical, i.e. starting at 0 and incrementing by
+// 1. This makes the RHS of the compare the loop tripcount (with the right
+// predicate). We then use SCEV to sanity-check that this pattern-matched
+// tripcount agrees with the tripcount computed by SCEV.
//
//===----------------------------------------------------------------------===//
@@ -31,6 +54,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -70,37 +94,54 @@ static cl::opt<bool>
"trip counts will never overflow"));
static cl::opt<bool>
- WidenIV("loop-flatten-widen-iv", cl::Hidden,
- cl::init(true),
+ WidenIV("loop-flatten-widen-iv", cl::Hidden, cl::init(true),
cl::desc("Widen the loop induction variables, if possible, so "
"overflow checks won't reject flattening"));
+// We require all uses of both induction variables to match this pattern:
+//
+// (OuterPHI * InnerTripCount) + InnerPHI
+//
+// I.e., it needs to be a linear expression of the induction variables and the
+// inner loop trip count. We keep track of all different expressions on which
+// checks will be performed in this bookkeeping struct.
+//
struct FlattenInfo {
- Loop *OuterLoop = nullptr;
+ Loop *OuterLoop = nullptr; // The loop pair to be flattened.
Loop *InnerLoop = nullptr;
- // These PHINodes correspond to loop induction variables, which are expected
- // to start at zero and increment by one on each loop.
- PHINode *InnerInductionPHI = nullptr;
- PHINode *OuterInductionPHI = nullptr;
- Value *InnerTripCount = nullptr;
- Value *OuterTripCount = nullptr;
- BinaryOperator *InnerIncrement = nullptr;
- BinaryOperator *OuterIncrement = nullptr;
- BranchInst *InnerBranch = nullptr;
- BranchInst *OuterBranch = nullptr;
- SmallPtrSet<Value *, 4> LinearIVUses;
+
+ PHINode *InnerInductionPHI = nullptr; // These PHINodes correspond to loop
+ PHINode *OuterInductionPHI = nullptr; // induction variables, which are
+ // expected to start at zero and
+ // increment by one on each loop.
+
+ Value *InnerTripCount = nullptr; // The product of these two tripcounts
+ Value *OuterTripCount = nullptr; // will be the new flattened loop
+ // tripcount. Also used to recognise a
+ // linear expression that will be replaced.
+
+ SmallPtrSet<Value *, 4> LinearIVUses; // Contains the linear expressions
+ // of the form i*M+j that will be
+ // replaced.
+
+ BinaryOperator *InnerIncrement = nullptr; // Uses of induction variables in
+ BinaryOperator *OuterIncrement = nullptr; // loop control statements that
+ BranchInst *InnerBranch = nullptr; // are safe to ignore.
+
+ BranchInst *OuterBranch = nullptr; // The instruction that needs to be
+                                     // updated with the new tripcount.
+
SmallPtrSet<PHINode *, 4> InnerPHIsToTransform;
- // Whether this holds the flatten info before or after widening.
- bool Widened = false;
+ bool Widened = false; // Whether this holds the flatten info before or after
+ // widening.
- // Holds the old/narrow induction phis, i.e. the Phis before IV widening has
- // been applied. This bookkeeping is used so we can skip some checks on these
- // phi nodes.
- PHINode *NarrowInnerInductionPHI = nullptr;
- PHINode *NarrowOuterInductionPHI = nullptr;
+ PHINode *NarrowInnerInductionPHI = nullptr; // Holds the old/narrow induction
+ PHINode *NarrowOuterInductionPHI = nullptr; // phis, i.e. the Phis before IV
+                                              // widening has been applied.
+                                              // Used to skip checks on phis.
- FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL) {};
+ FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL){};
bool isNarrowInductionPhi(PHINode *Phi) {
// This can't be the narrow phi if we haven't widened the IV first.
@@ -108,6 +149,118 @@ struct FlattenInfo {
return false;
return NarrowInnerInductionPHI == Phi || NarrowOuterInductionPHI == Phi;
}
+ bool isInnerLoopIncrement(User *U) {
+ return InnerIncrement == U;
+ }
+ bool isOuterLoopIncrement(User *U) {
+ return OuterIncrement == U;
+ }
+ bool isInnerLoopTest(User *U) {
+ return InnerBranch->getCondition() == U;
+ }
+
+ bool checkOuterInductionPhiUsers(SmallPtrSet<Value *, 4> &ValidOuterPHIUses) {
+ for (User *U : OuterInductionPHI->users()) {
+ if (isOuterLoopIncrement(U))
+ continue;
+
+ auto IsValidOuterPHIUses = [&] (User *U) -> bool {
+ LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump());
+ if (!ValidOuterPHIUses.count(U)) {
+ LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "Use is optimisable\n");
+ return true;
+ };
+
+ if (auto *V = dyn_cast<TruncInst>(U)) {
+ for (auto *K : V->users()) {
+ if (!IsValidOuterPHIUses(K))
+ return false;
+ }
+ continue;
+ }
+
+ if (!IsValidOuterPHIUses(U))
+ return false;
+ }
+ return true;
+ }
+
+ bool matchLinearIVUser(User *U, Value *InnerTripCount,
+ SmallPtrSet<Value *, 4> &ValidOuterPHIUses) {
+ LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump());
+ Value *MatchedMul = nullptr;
+ Value *MatchedItCount = nullptr;
+
+ bool IsAdd = match(U, m_c_Add(m_Specific(InnerInductionPHI),
+ m_Value(MatchedMul))) &&
+ match(MatchedMul, m_c_Mul(m_Specific(OuterInductionPHI),
+ m_Value(MatchedItCount)));
+
+ // Matches the same pattern as above, except it also looks for truncs
+ // on the phi, which can be the result of widening the induction variables.
+ bool IsAddTrunc =
+ match(U, m_c_Add(m_Trunc(m_Specific(InnerInductionPHI)),
+ m_Value(MatchedMul))) &&
+ match(MatchedMul, m_c_Mul(m_Trunc(m_Specific(OuterInductionPHI)),
+ m_Value(MatchedItCount)));
+
+ if (!MatchedItCount)
+ return false;
+
+ // Look through extends if the IV has been widened.
+ if (Widened &&
+ (isa<SExtInst>(MatchedItCount) || isa<ZExtInst>(MatchedItCount))) {
+ assert(MatchedItCount->getType() == InnerInductionPHI->getType() &&
+ "Unexpected type mismatch in types after widening");
+ MatchedItCount = isa<SExtInst>(MatchedItCount)
+ ? dyn_cast<SExtInst>(MatchedItCount)->getOperand(0)
+ : dyn_cast<ZExtInst>(MatchedItCount)->getOperand(0);
+ }
+
+ if ((IsAdd || IsAddTrunc) && MatchedItCount == InnerTripCount) {
+ LLVM_DEBUG(dbgs() << "Use is optimisable\n");
+ ValidOuterPHIUses.insert(MatchedMul);
+ LinearIVUses.insert(U);
+ return true;
+ }
+
+ LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
+ return false;
+ }
+
+ bool checkInnerInductionPhiUsers(SmallPtrSet<Value *, 4> &ValidOuterPHIUses) {
+ Value *SExtInnerTripCount = InnerTripCount;
+ if (Widened &&
+ (isa<SExtInst>(InnerTripCount) || isa<ZExtInst>(InnerTripCount)))
+ SExtInnerTripCount = cast<Instruction>(InnerTripCount)->getOperand(0);
+
+ for (User *U : InnerInductionPHI->users()) {
+ if (isInnerLoopIncrement(U))
+ continue;
+
+ // After widening the IVs, a trunc instruction might have been introduced,
+ // so look through truncs.
+ if (isa<TruncInst>(U)) {
+ if (!U->hasOneUse())
+ return false;
+ U = *U->user_begin();
+ }
+
+ // If the use is in the compare (which is also the condition of the inner
+      // branch) then the compare has been altered by another transformation, e.g.
+ // icmp ult %inc, tripcount -> icmp ult %j, tripcount-1, where tripcount is
+ // a constant. Ignore this use as the compare gets removed later anyway.
+ if (isInnerLoopTest(U))
+ continue;
+
+ if (!matchLinearIVUser(U, SExtInnerTripCount, ValidOuterPHIUses))
+ return false;
+ }
+ return true;
+ }
};
static bool
@@ -121,6 +274,77 @@ setLoopComponents(Value *&TC, Value *&TripCount, BinaryOperator *&Increment,
return true;
}
+// Given the RHS of the loop latch compare instruction, verify with SCEV
+// that this is indeed the loop tripcount.
+// TODO: This used to be a straightforward check but has grown to be quite
+// complicated. It is therefore worth revisiting what the additional benefits
+// of this are (compared to relying on canonical loops and pattern
+// matching).
+static bool verifyTripCount(Value *RHS, Loop *L,
+ SmallPtrSetImpl<Instruction *> &IterationInstructions,
+ PHINode *&InductionPHI, Value *&TripCount, BinaryOperator *&Increment,
+ BranchInst *&BackBranch, ScalarEvolution *SE, bool IsWidened) {
+ const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+ LLVM_DEBUG(dbgs() << "Backedge-taken count is not predictable\n");
+ return false;
+ }
+
+ // The Extend=false flag is used for getTripCountFromExitCount as we want
+  // to verify and match it with the pattern-matched tripcount. Note that
+  // overflow checks are performed in checkOverflow, but we first try to avoid
+  // them by widening the IV.
+ const SCEV *SCEVTripCount =
+ SE->getTripCountFromExitCount(BackedgeTakenCount, /*Extend=*/false);
+
+ const SCEV *SCEVRHS = SE->getSCEV(RHS);
+ if (SCEVRHS == SCEVTripCount)
+ return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
+ ConstantInt *ConstantRHS = dyn_cast<ConstantInt>(RHS);
+ if (ConstantRHS) {
+ const SCEV *BackedgeTCExt = nullptr;
+ if (IsWidened) {
+ const SCEV *SCEVTripCountExt;
+ // Find the extended backedge taken count and extended trip count using
+ // SCEV. One of these should now match the RHS of the compare.
+ BackedgeTCExt = SE->getZeroExtendExpr(BackedgeTakenCount, RHS->getType());
+ SCEVTripCountExt = SE->getTripCountFromExitCount(BackedgeTCExt, false);
+ if (SCEVRHS != BackedgeTCExt && SCEVRHS != SCEVTripCountExt) {
+ LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
+ return false;
+ }
+ }
+ // If the RHS of the compare is equal to the backedge taken count we need
+ // to add one to get the trip count.
+ if (SCEVRHS == BackedgeTCExt || SCEVRHS == BackedgeTakenCount) {
+ ConstantInt *One = ConstantInt::get(ConstantRHS->getType(), 1);
+ Value *NewRHS = ConstantInt::get(
+ ConstantRHS->getContext(), ConstantRHS->getValue() + One->getValue());
+ return setLoopComponents(NewRHS, TripCount, Increment,
+ IterationInstructions);
+ }
+ return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
+ }
+ // If the RHS isn't a constant then check that the reason it doesn't match
+ // the SCEV trip count is because the RHS is a ZExt or SExt instruction
+ // (and take the trip count to be the RHS).
+ if (!IsWidened) {
+ LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
+ return false;
+ }
+ auto *TripCountInst = dyn_cast<Instruction>(RHS);
+ if (!TripCountInst) {
+ LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
+ return false;
+ }
+ if ((!isa<ZExtInst>(TripCountInst) && !isa<SExtInst>(TripCountInst)) ||
+ SE->getSCEV(TripCountInst->getOperand(0)) != SCEVTripCount) {
+ LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n");
+ return false;
+ }
+ return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
+}
+
// Finds the induction variable, increment and trip count for a simple loop that
// we can flatten.
static bool findLoopComponents(
@@ -197,63 +421,9 @@ static bool findLoopComponents(
// another transformation has changed the compare (e.g. icmp ult %inc,
// tripcount -> icmp ult %j, tripcount-1), or both.
Value *RHS = Compare->getOperand(1);
- const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
- LLVM_DEBUG(dbgs() << "Backedge-taken count is not predictable\n");
- return false;
- }
- // The use of the Extend=false flag on getTripCountFromExitCount was added
- // during a refactoring to preserve existing behavior. However, there's
- // nothing obvious in the surrounding code when handles the overflow case.
- // FIXME: audit code to establish whether there's a latent bug here.
- const SCEV *SCEVTripCount =
- SE->getTripCountFromExitCount(BackedgeTakenCount, false);
- const SCEV *SCEVRHS = SE->getSCEV(RHS);
- if (SCEVRHS == SCEVTripCount)
- return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
- ConstantInt *ConstantRHS = dyn_cast<ConstantInt>(RHS);
- if (ConstantRHS) {
- const SCEV *BackedgeTCExt = nullptr;
- if (IsWidened) {
- const SCEV *SCEVTripCountExt;
- // Find the extended backedge taken count and extended trip count using
- // SCEV. One of these should now match the RHS of the compare.
- BackedgeTCExt = SE->getZeroExtendExpr(BackedgeTakenCount, RHS->getType());
- SCEVTripCountExt = SE->getTripCountFromExitCount(BackedgeTCExt, false);
- if (SCEVRHS != BackedgeTCExt && SCEVRHS != SCEVTripCountExt) {
- LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
- return false;
- }
- }
- // If the RHS of the compare is equal to the backedge taken count we need
- // to add one to get the trip count.
- if (SCEVRHS == BackedgeTCExt || SCEVRHS == BackedgeTakenCount) {
- ConstantInt *One = ConstantInt::get(ConstantRHS->getType(), 1);
- Value *NewRHS = ConstantInt::get(
- ConstantRHS->getContext(), ConstantRHS->getValue() + One->getValue());
- return setLoopComponents(NewRHS, TripCount, Increment,
- IterationInstructions);
- }
- return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
- }
- // If the RHS isn't a constant then check that the reason it doesn't match
- // the SCEV trip count is because the RHS is a ZExt or SExt instruction
- // (and take the trip count to be the RHS).
- if (!IsWidened) {
- LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
- return false;
- }
- auto *TripCountInst = dyn_cast<Instruction>(RHS);
- if (!TripCountInst) {
- LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
- return false;
- }
- if ((!isa<ZExtInst>(TripCountInst) && !isa<SExtInst>(TripCountInst)) ||
- SE->getSCEV(TripCountInst->getOperand(0)) != SCEVTripCount) {
- LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n");
- return false;
- }
- return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
+
+ return verifyTripCount(RHS, L, IterationInstructions, InductionPHI, TripCount,
+ Increment, BackBranch, SE, IsWidened);
}
static bool checkPHIs(FlattenInfo &FI, const TargetTransformInfo *TTI) {
@@ -399,108 +569,26 @@ checkOuterLoopInsts(FlattenInfo &FI,
return true;
}
-static bool checkIVUsers(FlattenInfo &FI) {
- // We require all uses of both induction variables to match this pattern:
- //
- // (OuterPHI * InnerTripCount) + InnerPHI
- //
- // Any uses of the induction variables not matching that pattern would
- // require a div/mod to reconstruct in the flattened loop, so the
- // transformation wouldn't be profitable.
-
- Value *InnerTripCount = FI.InnerTripCount;
- if (FI.Widened &&
- (isa<SExtInst>(InnerTripCount) || isa<ZExtInst>(InnerTripCount)))
- InnerTripCount = cast<Instruction>(InnerTripCount)->getOperand(0);
+
+// We require all uses of both induction variables to match this pattern:
+//
+// (OuterPHI * InnerTripCount) + InnerPHI
+//
+// Any uses of the induction variables not matching that pattern would
+// require a div/mod to reconstruct in the flattened loop, so the
+// transformation wouldn't be profitable.
+static bool checkIVUsers(FlattenInfo &FI) {
// Check that all uses of the inner loop's induction variable match the
// expected pattern, recording the uses of the outer IV.
SmallPtrSet<Value *, 4> ValidOuterPHIUses;
- for (User *U : FI.InnerInductionPHI->users()) {
- if (U == FI.InnerIncrement)
- continue;
-
- // After widening the IVs, a trunc instruction might have been introduced,
- // so look through truncs.
- if (isa<TruncInst>(U)) {
- if (!U->hasOneUse())
- return false;
- U = *U->user_begin();
- }
-
- // If the use is in the compare (which is also the condition of the inner
- // branch) then the compare has been altered by another transformation e.g
- // icmp ult %inc, tripcount -> icmp ult %j, tripcount-1, where tripcount is
- // a constant. Ignore this use as the compare gets removed later anyway.
- if (U == FI.InnerBranch->getCondition())
- continue;
-
- LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump());
-
- Value *MatchedMul = nullptr;
- Value *MatchedItCount = nullptr;
- bool IsAdd = match(U, m_c_Add(m_Specific(FI.InnerInductionPHI),
- m_Value(MatchedMul))) &&
- match(MatchedMul, m_c_Mul(m_Specific(FI.OuterInductionPHI),
- m_Value(MatchedItCount)));
-
- // Matches the same pattern as above, except it also looks for truncs
- // on the phi, which can be the result of widening the induction variables.
- bool IsAddTrunc =
- match(U, m_c_Add(m_Trunc(m_Specific(FI.InnerInductionPHI)),
- m_Value(MatchedMul))) &&
- match(MatchedMul, m_c_Mul(m_Trunc(m_Specific(FI.OuterInductionPHI)),
- m_Value(MatchedItCount)));
-
- if (!MatchedItCount)
- return false;
- // Look through extends if the IV has been widened.
- if (FI.Widened &&
- (isa<SExtInst>(MatchedItCount) || isa<ZExtInst>(MatchedItCount))) {
- assert(MatchedItCount->getType() == FI.InnerInductionPHI->getType() &&
- "Unexpected type mismatch in types after widening");
- MatchedItCount = isa<SExtInst>(MatchedItCount)
- ? dyn_cast<SExtInst>(MatchedItCount)->getOperand(0)
- : dyn_cast<ZExtInst>(MatchedItCount)->getOperand(0);
- }
-
- if ((IsAdd || IsAddTrunc) && MatchedItCount == InnerTripCount) {
- LLVM_DEBUG(dbgs() << "Use is optimisable\n");
- ValidOuterPHIUses.insert(MatchedMul);
- FI.LinearIVUses.insert(U);
- } else {
- LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
- return false;
- }
- }
+ if (!FI.checkInnerInductionPhiUsers(ValidOuterPHIUses))
+ return false;
// Check that there are no uses of the outer IV other than the ones found
// as part of the pattern above.
- for (User *U : FI.OuterInductionPHI->users()) {
- if (U == FI.OuterIncrement)
- continue;
-
- auto IsValidOuterPHIUses = [&] (User *U) -> bool {
- LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump());
- if (!ValidOuterPHIUses.count(U)) {
- LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
- return false;
- }
- LLVM_DEBUG(dbgs() << "Use is optimisable\n");
- return true;
- };
-
- if (auto *V = dyn_cast<TruncInst>(U)) {
- for (auto *K : V->users()) {
- if (!IsValidOuterPHIUses(K))
- return false;
- }
- continue;
- }
-
- if (!IsValidOuterPHIUses(U))
- return false;
- }
+ if (!FI.checkOuterInductionPhiUsers(ValidOuterPHIUses))
+ return false;
LLVM_DEBUG(dbgs() << "checkIVUsers: OK\n";
dbgs() << "Found " << FI.LinearIVUses.size()
@@ -535,7 +623,7 @@ static OverflowResult checkOverflow(FlattenInfo &FI, DominatorTree *DT,
for (Value *U : V->users()) {
if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
for (Value *GEPUser : U->users()) {
- Instruction *GEPUserInst = dyn_cast<Instruction>(GEPUser);
+ auto *GEPUserInst = cast<Instruction>(GEPUser);
if (!isa<LoadInst>(GEPUserInst) &&
!(isa<StoreInst>(GEPUserInst) &&
GEP == GEPUserInst->getOperand(1)))
@@ -611,7 +699,8 @@ static bool CanFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
ScalarEvolution *SE, AssumptionCache *AC,
- const TargetTransformInfo *TTI, LPMUpdater *U) {
+ const TargetTransformInfo *TTI, LPMUpdater *U,
+ MemorySSAUpdater *MSSAU) {
Function *F = FI.OuterLoop->getHeader()->getParent();
LLVM_DEBUG(dbgs() << "Checks all passed, doing the transformation\n");
{
@@ -647,7 +736,11 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
BasicBlock *InnerExitingBlock = FI.InnerLoop->getExitingBlock();
InnerExitingBlock->getTerminator()->eraseFromParent();
BranchInst::Create(InnerExitBlock, InnerExitingBlock);
+
+ // Update the DomTree and MemorySSA.
DT->deleteEdge(InnerExitingBlock, FI.InnerLoop->getHeader());
+ if (MSSAU)
+ MSSAU->removeEdge(InnerExitingBlock, FI.InnerLoop->getHeader());
// Replace all uses of the polynomial calculated from the two induction
// variables with the one new one.
@@ -658,8 +751,8 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
OuterValue = Builder.CreateTrunc(FI.OuterInductionPHI, V->getType(),
"flatten.trunciv");
- LLVM_DEBUG(dbgs() << "Replacing: "; V->dump();
- dbgs() << "with: "; OuterValue->dump());
+ LLVM_DEBUG(dbgs() << "Replacing: "; V->dump(); dbgs() << "with: ";
+ OuterValue->dump());
V->replaceAllUsesWith(OuterValue);
}
@@ -698,7 +791,8 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
// (OuterTripCount * InnerTripCount) as the new trip count is safe.
if (InnerType != OuterType ||
InnerType->getScalarSizeInBits() >= MaxLegalSize ||
- MaxLegalType->getScalarSizeInBits() < InnerType->getScalarSizeInBits() * 2) {
+ MaxLegalType->getScalarSizeInBits() <
+ InnerType->getScalarSizeInBits() * 2) {
LLVM_DEBUG(dbgs() << "Can't widen the IV\n");
return false;
}
@@ -708,10 +802,10 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
unsigned ElimExt = 0;
unsigned Widened = 0;
- auto CreateWideIV = [&] (WideIVInfo WideIV, bool &Deleted) -> bool {
- PHINode *WidePhi = createWideIV(WideIV, LI, SE, Rewriter, DT, DeadInsts,
- ElimExt, Widened, true /* HasGuards */,
- true /* UsePostIncrementRanges */);
+ auto CreateWideIV = [&](WideIVInfo WideIV, bool &Deleted) -> bool {
+ PHINode *WidePhi =
+ createWideIV(WideIV, LI, SE, Rewriter, DT, DeadInsts, ElimExt, Widened,
+ true /* HasGuards */, true /* UsePostIncrementRanges */);
if (!WidePhi)
return false;
LLVM_DEBUG(dbgs() << "Created wide phi: "; WidePhi->dump());
@@ -721,14 +815,14 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
};
bool Deleted;
- if (!CreateWideIV({FI.InnerInductionPHI, MaxLegalType, false }, Deleted))
+ if (!CreateWideIV({FI.InnerInductionPHI, MaxLegalType, false}, Deleted))
return false;
// Add the narrow phi to list, so that it will be adjusted later when the
// the transformation is performed.
if (!Deleted)
FI.InnerPHIsToTransform.insert(FI.InnerInductionPHI);
- if (!CreateWideIV({FI.OuterInductionPHI, MaxLegalType, false }, Deleted))
+ if (!CreateWideIV({FI.OuterInductionPHI, MaxLegalType, false}, Deleted))
return false;
assert(Widened && "Widened IV expected");
@@ -744,7 +838,8 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
ScalarEvolution *SE, AssumptionCache *AC,
- const TargetTransformInfo *TTI, LPMUpdater *U) {
+ const TargetTransformInfo *TTI, LPMUpdater *U,
+ MemorySSAUpdater *MSSAU) {
LLVM_DEBUG(
dbgs() << "Loop flattening running on outer loop "
<< FI.OuterLoop->getHeader()->getName() << " and inner loop "
@@ -773,7 +868,7 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
// If we have widened and can perform the transformation, do that here.
if (CanFlatten)
- return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U);
+ return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U, MSSAU);
// Otherwise, if we haven't widened the IV, check if the new iteration
// variable might overflow. In this case, we need to version the loop, and
@@ -791,18 +886,19 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
}
LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
- return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U);
+ return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U, MSSAU);
}
bool Flatten(LoopNest &LN, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE,
- AssumptionCache *AC, TargetTransformInfo *TTI, LPMUpdater *U) {
+ AssumptionCache *AC, TargetTransformInfo *TTI, LPMUpdater *U,
+ MemorySSAUpdater *MSSAU) {
bool Changed = false;
for (Loop *InnerLoop : LN.getLoops()) {
auto *OuterLoop = InnerLoop->getParentLoop();
if (!OuterLoop)
continue;
FlattenInfo FI(OuterLoop, InnerLoop);
- Changed |= FlattenLoopPair(FI, DT, LI, SE, AC, TTI, U);
+ Changed |= FlattenLoopPair(FI, DT, LI, SE, AC, TTI, U, MSSAU);
}
return Changed;
}
@@ -813,16 +909,30 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM,
bool Changed = false;
+ Optional<MemorySSAUpdater> MSSAU;
+ if (AR.MSSA) {
+ MSSAU = MemorySSAUpdater(AR.MSSA);
+ if (VerifyMemorySSA)
+ AR.MSSA->verifyMemorySSA();
+ }
+
// The loop flattening pass requires loops to be
// in simplified form, and also needs LCSSA. Running
// this pass will simplify all loops that contain inner loops,
// regardless of whether anything ends up being flattened.
- Changed |= Flatten(LN, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U);
+ Changed |= Flatten(LN, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U,
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
if (!Changed)
return PreservedAnalyses::all();
- return getLoopPassPreservedAnalyses();
+ if (AR.MSSA && VerifyMemorySSA)
+ AR.MSSA->verifyMemorySSA();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
}
namespace {
@@ -842,6 +952,7 @@ public:
AU.addPreserved<TargetTransformInfoWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addPreserved<AssumptionCacheTracker>();
+ AU.addPreserved<MemorySSAWrapperPass>();
}
};
} // namespace
@@ -854,7 +965,9 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops",
false, false)
-FunctionPass *llvm::createLoopFlattenPass() { return new LoopFlattenLegacyPass(); }
+FunctionPass *llvm::createLoopFlattenPass() {
+ return new LoopFlattenLegacyPass();
+}
bool LoopFlattenLegacyPass::runOnFunction(Function &F) {
ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
@@ -864,10 +977,17 @@ bool LoopFlattenLegacyPass::runOnFunction(Function &F) {
auto &TTIP = getAnalysis<TargetTransformInfoWrapperPass>();
auto *TTI = &TTIP.getTTI(F);
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *MSSA = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+
+ Optional<MemorySSAUpdater> MSSAU;
+ if (MSSA)
+ MSSAU = MemorySSAUpdater(&MSSA->getMSSA());
+
bool Changed = false;
for (Loop *L : *LI) {
auto LN = LoopNest::getLoopNest(*L, *SE);
- Changed |= Flatten(*LN, DT, LI, SE, AC, TTI, nullptr);
+ Changed |= Flatten(*LN, DT, LI, SE, AC, TTI, nullptr,
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
}
return Changed;
}
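
For context on the LoopFlatten hunks above: the pass collapses a perfectly nested two-deep loop into a single loop over the combined trip count, which is why it must widen the induction variable or prove that the product of the trip counts cannot overflow. A minimal source-level sketch of the idea, not taken from this patch (N, M and the array are placeholders):

  #include <cstddef>

  // Before flattening: two induction variables, address linear in I*M+J.
  void before(int *A, std::size_t N, std::size_t M) {
    for (std::size_t I = 0; I < N; ++I)
      for (std::size_t J = 0; J < M; ++J)
        A[I * M + J] += 1;
  }

  // Roughly what the pass aims for: one induction variable over [0, N*M),
  // valid only if N*M cannot overflow the (possibly widened) IV type.
  void after(int *A, std::size_t N, std::size_t M) {
    for (std::size_t K = 0; K < N * M; ++K)
      A[K] += 1;
  }

The MemorySSAUpdater threading in these hunks exists so that, when the pass runs in a MemorySSA-preserving loop pipeline, the analysis can be kept up to date and reported as preserved instead of being invalidated.
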
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 5d00fa56e888..35ba4e2b4032 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1117,7 +1117,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
BasicBlock *Preheader = CurLoop->getLoopPreheader();
IRBuilder<> Builder(Preheader->getTerminator());
SCEVExpander Expander(*SE, *DL, "loop-idiom");
- SCEVExpanderCleaner ExpCleaner(Expander, *DT);
+ SCEVExpanderCleaner ExpCleaner(Expander);
Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
@@ -1328,7 +1328,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
IRBuilder<> Builder(Preheader->getTerminator());
SCEVExpander Expander(*SE, *DL, "loop-idiom");
- SCEVExpanderCleaner ExpCleaner(Expander, *DT);
+ SCEVExpanderCleaner ExpCleaner(Expander);
bool Changed = false;
const SCEV *StrStart = StoreEv->getStart();
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 9f605b4ac4ad..c2b065c4eb31 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -292,33 +292,6 @@ static LoopVector populateWorklist(Loop &L) {
return LoopList;
}
-static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
- PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
- if (InnerIndexVar)
- return InnerIndexVar;
- if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
- return nullptr;
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
- PHINode *PhiVar = cast<PHINode>(I);
- Type *PhiTy = PhiVar->getType();
- if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
- !PhiTy->isPointerTy())
- return nullptr;
- const SCEVAddRecExpr *AddRec =
- dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
- if (!AddRec || !AddRec->isAffine())
- continue;
- const SCEV *Step = AddRec->getStepRecurrence(*SE);
- if (!isa<SCEVConstant>(Step))
- continue;
- // Found the induction variable.
- // FIXME: Handle loops with more than one induction variable. Note that,
- // currently, legality makes sure we have only one induction variable.
- return PhiVar;
- }
- return nullptr;
-}
-
namespace {
/// LoopInterchangeLegality checks if it is legal to interchange the loop.
@@ -332,9 +305,13 @@ public:
bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
CharMatrix &DepMatrix);
+ /// Discover induction PHIs in the header of \p L. Induction
+ /// PHIs are added to \p Inductions.
+ bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
+
/// Check if the loop structure is understood. We do not handle triangular
/// loops for now.
- bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
+ bool isLoopStructureUnderstood();
bool currentLimitations();
@@ -342,6 +319,10 @@ public:
return OuterInnerReductions;
}
+ const SmallVectorImpl<PHINode *> &getInnerLoopInductions() const {
+ return InnerLoopInductions;
+ }
+
private:
bool tightlyNested(Loop *Outer, Loop *Inner);
bool containsUnsafeInstructions(BasicBlock *BB);
@@ -365,6 +346,9 @@ private:
/// Set of reduction PHIs taking part of a reduction across the inner and
/// outer loop.
SmallPtrSet<PHINode *, 4> OuterInnerReductions;
+
+ /// Set of inner loop induction PHIs
+ SmallVector<PHINode *, 8> InnerLoopInductions;
};
/// LoopInterchangeProfitability checks if it is profitable to interchange the
@@ -635,25 +619,26 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
return true;
}
-bool LoopInterchangeLegality::isLoopStructureUnderstood(
- PHINode *InnerInduction) {
- unsigned Num = InnerInduction->getNumOperands();
+bool LoopInterchangeLegality::isLoopStructureUnderstood() {
BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
- for (unsigned i = 0; i < Num; ++i) {
- Value *Val = InnerInduction->getOperand(i);
- if (isa<Constant>(Val))
- continue;
- Instruction *I = dyn_cast<Instruction>(Val);
- if (!I)
- return false;
- // TODO: Handle triangular loops.
- // e.g. for(int i=0;i<N;i++)
- // for(int j=i;j<N;j++)
- unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
- if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
- InnerLoopPreheader &&
- !OuterLoop->isLoopInvariant(I)) {
- return false;
+ for (PHINode *InnerInduction : InnerLoopInductions) {
+ unsigned Num = InnerInduction->getNumOperands();
+ for (unsigned i = 0; i < Num; ++i) {
+ Value *Val = InnerInduction->getOperand(i);
+ if (isa<Constant>(Val))
+ continue;
+ Instruction *I = dyn_cast<Instruction>(Val);
+ if (!I)
+ return false;
+ // TODO: Handle triangular loops.
+ // e.g. for(int i=0;i<N;i++)
+ // for(int j=i;j<N;j++)
+ unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
+ if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
+ InnerLoopPreheader &&
+ !OuterLoop->isLoopInvariant(I)) {
+ return false;
+ }
}
}
@@ -682,27 +667,34 @@ bool LoopInterchangeLegality::isLoopStructureUnderstood(
// Return true if V is InnerInduction, or a cast from
// InnerInduction, or a binary operator that involves
// InnerInduction and a constant.
- std::function<bool(Value *)> IsPathToIndVar;
- IsPathToIndVar = [&InnerInduction, &IsPathToIndVar](Value *V) -> bool {
- if (V == InnerInduction)
+ std::function<bool(Value *)> IsPathToInnerIndVar;
+ IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
+ if (llvm::is_contained(InnerLoopInductions, V))
return true;
if (isa<Constant>(V))
return true;
- Instruction *I = dyn_cast<Instruction>(V);
+ const Instruction *I = dyn_cast<Instruction>(V);
if (!I)
return false;
if (isa<CastInst>(I))
- return IsPathToIndVar(I->getOperand(0));
+ return IsPathToInnerIndVar(I->getOperand(0));
if (isa<BinaryOperator>(I))
- return IsPathToIndVar(I->getOperand(0)) &&
- IsPathToIndVar(I->getOperand(1));
+ return IsPathToInnerIndVar(I->getOperand(0)) &&
+ IsPathToInnerIndVar(I->getOperand(1));
return false;
};
- if (IsPathToIndVar(Op0) && !isa<Constant>(Op0)) {
+ // In case of multiple inner loop indvars, it is okay if LHS and RHS
+ // are both inner indvar related variables.
+ if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
+ return true;
+
+ // Otherwise we check if the cmp instruction compares an inner indvar
+  // related variable (Left) with an outer loop invariant (Right).
+ if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
Left = Op0;
Right = Op1;
- } else if (IsPathToIndVar(Op1) && !isa<Constant>(Op1)) {
+ } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
Left = Op1;
Right = Op0;
}
@@ -793,7 +785,6 @@ bool LoopInterchangeLegality::findInductionAndReductions(
// This function indicates the current limitations in the transform as a result
// of which we do not proceed.
bool LoopInterchangeLegality::currentLimitations() {
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
// transform currently expects the loop latches to also be the exiting
@@ -815,7 +806,6 @@ bool LoopInterchangeLegality::currentLimitations() {
return true;
}
- PHINode *InnerInductionVar;
SmallVector<PHINode *, 8> Inductions;
if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
LLVM_DEBUG(
@@ -831,20 +821,6 @@ bool LoopInterchangeLegality::currentLimitations() {
return true;
}
- // TODO: Currently we handle only loops with 1 induction variable.
- if (Inductions.size() != 1) {
- LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
- << "supported currently.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Only outer loops with 1 induction variable can be "
- "interchanged currently.";
- });
- return true;
- }
-
Inductions.clear();
if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) {
LLVM_DEBUG(
@@ -860,24 +836,8 @@ bool LoopInterchangeLegality::currentLimitations() {
return true;
}
- // TODO: Currently we handle only loops with 1 induction variable.
- if (Inductions.size() != 1) {
- LLVM_DEBUG(
- dbgs() << "We currently only support loops with 1 induction variable."
- << "Failed to interchange due to current limitation\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Only inner loops with 1 induction variable can be "
- "interchanged currently.";
- });
- return true;
- }
- InnerInductionVar = Inductions.pop_back_val();
-
// TODO: Triangular loops are not handled for now.
- if (!isLoopStructureUnderstood(InnerInductionVar)) {
+ if (!isLoopStructureUnderstood()) {
LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
@@ -888,79 +848,17 @@ bool LoopInterchangeLegality::currentLimitations() {
return true;
}
- // TODO: Current limitation: Since we split the inner loop latch at the point
- // were induction variable is incremented (induction.next); We cannot have
- // more than 1 user of induction.next since it would result in broken code
- // after split.
- // e.g.
- // for(i=0;i<N;i++) {
- // for(j = 0;j<M;j++) {
- // A[j+1][i+2] = A[j][i]+k;
- // }
- // }
- Instruction *InnerIndexVarInc = nullptr;
- if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
- InnerIndexVarInc =
- dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
- else
- InnerIndexVarInc =
- dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
-
- if (!InnerIndexVarInc) {
- LLVM_DEBUG(
- dbgs() << "Did not find an instruction to increment the induction "
- << "variable.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "The inner loop does not increment the induction variable.";
- });
- return true;
- }
-
- // Since we split the inner loop latch on this induction variable. Make sure
- // we do not have any instruction between the induction variable and branch
- // instruction.
-
- bool FoundInduction = false;
- for (const Instruction &I :
- llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) {
- if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
- isa<ZExtInst>(I))
- continue;
-
- // We found an instruction. If this is not induction variable then it is not
- // safe to split this loop latch.
- if (!I.isIdenticalTo(InnerIndexVarInc)) {
- LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction "
- << "variable increment and branch.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(
- DEBUG_TYPE, "UnsupportedInsBetweenInduction",
- InnerLoop->getStartLoc(), InnerLoop->getHeader())
- << "Found unsupported instruction between induction variable "
- "increment and branch.";
- });
- return true;
- }
+ return false;
+}
- FoundInduction = true;
- break;
- }
- // The loop latch ended and we didn't find the induction variable return as
- // current limitation.
- if (!FoundInduction) {
- LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Did not find the induction variable.";
- });
- return true;
+bool LoopInterchangeLegality::findInductions(
+ Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
+ for (PHINode &PHI : L->getHeader()->phis()) {
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
+ Inductions.push_back(&PHI);
}
- return false;
+ return !Inductions.empty();
}
// We currently only support LCSSA PHI nodes in the inner loop exit, if their
@@ -1076,7 +974,7 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
for (Instruction &I : BB->instructionsWithoutDebug())
if (CallInst *CI = dyn_cast<CallInst>(&I)) {
        // Calls that do not read memory do not prevent interchanging.
- if (CI->doesNotReadMemory())
+ if (CI->onlyWritesMemory())
continue;
LLVM_DEBUG(
dbgs() << "Loops with call instructions cannot be interchanged "
@@ -1091,6 +989,11 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
return false;
}
+ if (!findInductions(InnerLoop, InnerLoopInductions)) {
+ LLVM_DEBUG(dbgs() << "Cound not find inner loop induction variables.\n");
+ return false;
+ }
+
if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
ORE->emit([&]() {
@@ -1347,25 +1250,25 @@ void LoopInterchangeTransform::restructureLoops(
bool LoopInterchangeTransform::transform() {
bool Transformed = false;
- Instruction *InnerIndexVar;
if (InnerLoop->getSubLoops().empty()) {
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
- PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
- if (!InductionPHI) {
+ auto &InductionPHIs = LIL.getInnerLoopInductions();
+ if (InductionPHIs.empty()) {
LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
return false;
}
- if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
- InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
- else
- InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
-
- // Ensure that InductionPHI is the first Phi node.
- if (&InductionPHI->getParent()->front() != InductionPHI)
- InductionPHI->moveBefore(&InductionPHI->getParent()->front());
+ SmallVector<Instruction *, 8> InnerIndexVarList;
+ for (PHINode *CurInductionPHI : InductionPHIs) {
+ if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
+ InnerIndexVarList.push_back(
+ dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
+ else
+ InnerIndexVarList.push_back(
+ dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
+ }
// Create a new latch block for the inner loop. We split at the
// current latch's terminator and then move the condition and all
@@ -1377,7 +1280,7 @@ bool LoopInterchangeTransform::transform() {
SmallSetVector<Instruction *, 4> WorkList;
unsigned i = 0;
- auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() {
+ auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
for (; i < WorkList.size(); i++) {
// Duplicate instruction and move it the new latch. Update uses that
// have been moved.
@@ -1389,7 +1292,8 @@ bool LoopInterchangeTransform::transform() {
for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
Instruction *UserI = cast<Instruction>(U.getUser());
if (!InnerLoop->contains(UserI->getParent()) ||
- UserI->getParent() == NewLatch || UserI == InductionPHI)
+ UserI->getParent() == NewLatch ||
+ llvm::is_contained(InductionPHIs, UserI))
U.set(NewI);
}
// Add operands of moved instruction to the worklist, except if they are
@@ -1398,7 +1302,7 @@ bool LoopInterchangeTransform::transform() {
Instruction *OpI = dyn_cast<Instruction>(Op);
if (!OpI ||
this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
- OpI == InductionPHI)
+ llvm::is_contained(InductionPHIs, OpI))
continue;
WorkList.insert(OpI);
}
@@ -1412,7 +1316,8 @@ bool LoopInterchangeTransform::transform() {
if (CondI)
WorkList.insert(CondI);
MoveInstructions();
- WorkList.insert(cast<Instruction>(InnerIndexVar));
+ for (Instruction *InnerIndexVar : InnerIndexVarList)
+ WorkList.insert(cast<Instruction>(InnerIndexVar));
MoveInstructions();
// Splits the inner loops phi nodes out into a separate basic block.
@@ -1685,7 +1590,6 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
InnerLoopLatchSuccessor, DTUpdates);
-
if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
else
@@ -1712,19 +1616,22 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
for (PHINode &PHI : InnerLoopHeader->phis())
if (OuterInnerReductions.contains(&PHI))
- InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
+ InnerLoopPHIs.push_back(&PHI);
+
for (PHINode &PHI : OuterLoopHeader->phis())
if (OuterInnerReductions.contains(&PHI))
- OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
+ OuterLoopPHIs.push_back(&PHI);
// Now move the remaining reduction PHIs from outer to inner loop header and
// vice versa. The PHI nodes must be part of a reduction across the inner and
  // outer loop, and all that remains is to update the incoming blocks.
for (PHINode *PHI : OuterLoopPHIs) {
+ LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
}
for (PHINode *PHI : InnerLoopPHIs) {
+ LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
}
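
The LoopInterchange changes above replace the single-induction-variable restriction with findInductions(), which collects every induction PHI in the inner loop header. A hedged sketch of the kind of loop nest this now admits (array shapes and names are invented for illustration; the inner loop carries two induction variables):

  enum { N = 64, M = 64 };

  void twoInnerIndVars(int A[N][M], const int B[M][N]) {
    for (int I = 0; I < N; ++I)
      // Two inner induction PHIs: J counts up, K counts down. The latch
      // compare (J < M) still tests an induction-derived value against an
      // outer-loop-invariant bound, as isLoopStructureUnderstood() expects.
      for (int J = 0, K = M - 1; J < M; ++J, --K)
        A[I][J] = B[K][I];
  }
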
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 798af48c2337..654f0d2a03a8 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -3486,6 +3486,31 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
// Don't bother if the instruction is in a BB which ends in an EHPad.
if (UseBB->getTerminator()->isEHPad())
continue;
+
+ // Ignore cases in which the currently-examined value could come from
+ // a basic block terminated with an EHPad. This checks all incoming
+ // blocks of the phi node since it is possible that the same incoming
+ // value comes from multiple basic blocks, only some of which may end
+ // in an EHPad. If any of them do, a subsequent rewrite attempt by this
+ // pass would try to insert instructions into an EHPad, hitting an
+ // assertion.
+ if (isa<PHINode>(UserInst)) {
+ const auto *PhiNode = cast<PHINode>(UserInst);
+ bool HasIncompatibleEHPTerminatedBlock = false;
+ llvm::Value *ExpectedValue = U;
+ for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
+ if (PhiNode->getIncomingValue(I) == ExpectedValue) {
+ if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
+ HasIncompatibleEHPTerminatedBlock = true;
+ break;
+ }
+ }
+ }
+ if (HasIncompatibleEHPTerminatedBlock) {
+ continue;
+ }
+ }
+
// Don't bother rewriting PHIs in catchswitch blocks.
if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
continue;
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 893928fb0560..022d9c7abc8c 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1142,7 +1142,7 @@ static LoopUnrollResult tryToUnrollLoop(
// automatic unrolling from interfering with the user requested
// transformation.
Loop *ParentL = L->getParentLoop();
- if (ParentL != NULL &&
+ if (ParentL != nullptr &&
hasUnrollAndJamTransformation(ParentL) == TM_ForcedByUser &&
hasUnrollTransformation(L) != TM_ForcedByUser) {
LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has"
diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 1c186e9a0488..a7eb60b5e032 100644
--- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -64,7 +64,7 @@ getBranchWeight(Intrinsic::ID IntrinsicID, CallInst *CI, int BranchCount) {
// __builtin_expect_with_probability
assert(CI->getNumOperands() >= 3 &&
"expect with probability must have 3 arguments");
- ConstantFP *Confidence = dyn_cast<ConstantFP>(CI->getArgOperand(2));
+ auto *Confidence = cast<ConstantFP>(CI->getArgOperand(2));
double TrueProb = Confidence->getValueAPF().convertToDouble();
assert((TrueProb >= 0.0 && TrueProb <= 1.0) &&
"probability value must be in the range [0.0, 1.0]");
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 4e4097e13271..8f1d0181ee5b 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -220,9 +220,7 @@ class LowerMatrixIntrinsics {
bool IsColumnMajor = true;
public:
- MatrixTy()
- : Vectors(),
- IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
+ MatrixTy() : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
MatrixTy(ArrayRef<Value *> Vectors)
: Vectors(Vectors.begin(), Vectors.end()),
IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
@@ -1393,7 +1391,8 @@ public:
// reloads necessary.
unsigned Op0Regs = (R + VF - 1) / VF * M;
unsigned Op1Regs = (M + VF - 1) / VF * C;
- return Op0Regs + Op1Regs > TTI.getNumberOfRegisters(true);
+ return Op0Regs + Op1Regs >
+ TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true));
}
MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) {
@@ -1832,7 +1831,7 @@ public:
const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
const SmallSetVector<Value *, 32> &ExprsInSubprogram,
Value *Leaf)
- : Str(), Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared),
+ : Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared),
ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}
void indent(unsigned N) {
@@ -1895,7 +1894,7 @@ public:
write(Name);
return;
}
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+ auto *II = cast<IntrinsicInst>(CI);
write(Intrinsic::getBaseName(II->getIntrinsicID())
.drop_front(StringRef("llvm.matrix.").size()));
write(".");
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 67335a45fb58..6698db26626b 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryLocation.h"
@@ -171,7 +172,7 @@ public:
bool empty() const { return Ranges.empty(); }
void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
addStore(OffsetFromFirst, SI);
else
addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
@@ -312,15 +313,21 @@ INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start,
Instruction *End) {
assert(Start->getParent() == End->getParent() && "Must be in same block");
- if (!Start->getFunction()->doesNotThrow() &&
- !isa<AllocaInst>(getUnderlyingObject(V))) {
- for (const Instruction &I :
- make_range(Start->getIterator(), End->getIterator())) {
- if (I.mayThrow())
- return true;
- }
- }
- return false;
+ // Function can't unwind, so it also can't be visible through unwinding.
+ if (Start->getFunction()->doesNotThrow())
+ return false;
+
+ // Object is not visible on unwind.
+ // TODO: Support RequiresNoCaptureBeforeUnwind case.
+ bool RequiresNoCaptureBeforeUnwind;
+ if (isNotVisibleOnUnwind(getUnderlyingObject(V),
+ RequiresNoCaptureBeforeUnwind) &&
+ !RequiresNoCaptureBeforeUnwind)
+ return false;
+
+ // Check whether there are any unwinding instructions in the range.
+ return any_of(make_range(Start->getIterator(), End->getIterator()),
+ [](const Instruction &I) { return I.mayThrow(); });
}
void MemCpyOptPass::eraseInstruction(Instruction *I) {
@@ -364,7 +371,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
const DataLayout &DL = StartInst->getModule()->getDataLayout();
// We can't track scalable types
- if (StoreInst *SI = dyn_cast<StoreInst>(StartInst))
+ if (auto *SI = dyn_cast<StoreInst>(StartInst))
if (DL.getTypeStoreSize(SI->getOperand(0)->getType()).isScalable())
return nullptr;
@@ -410,7 +417,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
continue;
}
- if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
+ if (auto *NextStore = dyn_cast<StoreInst>(BI)) {
// If this is a store, see if we can merge it in.
if (!NextStore->isSimple()) break;
@@ -440,7 +447,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
Ranges.addStore(*Offset, NextStore);
} else {
- MemSetInst *MSI = cast<MemSetInst>(BI);
+ auto *MSI = cast<MemSetInst>(BI);
if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
!isa<ConstantInt>(MSI->getLength()))
@@ -661,7 +668,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
return false;
// Load to store forwarding can be interpreted as memcpy.
- if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
+ if (auto *LI = dyn_cast<LoadInst>(StoredVal)) {
if (LI->isSimple() && LI->hasOneUse() &&
LI->getParent() == SI->getParent()) {
@@ -871,7 +878,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
return false;
// Require that src be an alloca. This simplifies the reasoning considerably.
- AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
+ auto *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
if (!srcAlloca)
return false;
@@ -890,8 +897,10 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
// trap. Otherwise the transform is invalid since it might cause a trap
// to occur earlier than it otherwise would.
if (!isDereferenceableAndAlignedPointer(cpyDest, Align(1), APInt(64, cpySize),
- DL, C, DT))
+ DL, C, DT)) {
+ LLVM_DEBUG(dbgs() << "Call Slot: Dest pointer not dereferenceable\n");
return false;
+ }
// Make sure that nothing can observe cpyDest being written early. There are
// a number of cases to consider:
@@ -907,8 +916,10 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
// guaranteed to be executed if C is. As it is a non-atomic access, it
// renders accesses from other threads undefined.
// TODO: This is currently not checked.
- if (mayBeVisibleThroughUnwinding(cpyDest, C, cpyStore))
+ if (mayBeVisibleThroughUnwinding(cpyDest, C, cpyStore)) {
+ LLVM_DEBUG(dbgs() << "Call Slot: Dest may be visible through unwinding");
return false;
+ }
// Check that dest points to memory that is at least as aligned as src.
Align srcAlign = srcAlloca->getAlign();
@@ -930,14 +941,14 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
append_range(srcUseList, U->users());
continue;
}
- if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
+ if (const auto *G = dyn_cast<GetElementPtrInst>(U)) {
if (!G->hasAllZeroIndices())
return false;
append_range(srcUseList, U->users());
continue;
}
- if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
+ if (const auto *IT = dyn_cast<IntrinsicInst>(U))
if (IT->isLifetimeStartOrEnd())
continue;
@@ -945,12 +956,57 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
return false;
}
- // Check that src isn't captured by the called function since the
- // transformation can cause aliasing issues in that case.
- for (unsigned ArgI = 0, E = C->arg_size(); ArgI != E; ++ArgI)
- if (C->getArgOperand(ArgI) == cpySrc && !C->doesNotCapture(ArgI))
+ // Check whether src is captured by the called function, in which case there
+ // may be further indirect uses of src.
+ bool SrcIsCaptured = any_of(C->args(), [&](Use &U) {
+ return U->stripPointerCasts() == cpySrc &&
+ !C->doesNotCapture(C->getArgOperandNo(&U));
+ });
+
+ // If src is captured, then check whether there are any potential uses of
+ // src through the captured pointer before the lifetime of src ends, either
+ // due to a lifetime.end or a return from the function.
+ if (SrcIsCaptured) {
+ // Check that dest is not captured before/at the call. We have already
+ // checked that src is not captured before it. If either had been captured,
+ // then the call might be comparing the argument against the captured dest
+ // or src pointer.
+ Value *DestObj = getUnderlyingObject(cpyDest);
+ if (!isIdentifiedFunctionLocal(DestObj) ||
+ PointerMayBeCapturedBefore(DestObj, /* ReturnCaptures */ true,
+ /* StoreCaptures */ true, C, DT,
+ /* IncludeI */ true))
return false;
+ MemoryLocation SrcLoc =
+ MemoryLocation(srcAlloca, LocationSize::precise(srcSize));
+ for (Instruction &I :
+ make_range(++C->getIterator(), C->getParent()->end())) {
+ // Lifetime of srcAlloca ends at lifetime.end.
+ if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
+ II->getArgOperand(1)->stripPointerCasts() == srcAlloca &&
+ cast<ConstantInt>(II->getArgOperand(0))->uge(srcSize))
+ break;
+ }
+
+ // Lifetime of srcAlloca ends at return.
+ if (isa<ReturnInst>(&I))
+ break;
+
+ // Ignore the direct read of src in the load.
+ if (&I == cpyLoad)
+ continue;
+
+ // Check whether this instruction may mod/ref src through the captured
+      // pointer (direct mod/refs were already checked in the loop above).
+ // Also bail if we hit a terminator, as we don't want to scan into other
+ // blocks.
+ if (isModOrRefSet(AA->getModRefInfo(&I, SrcLoc)) || I.isTerminator())
+ return false;
+ }
+ }
+
// Since we're changing the parameter to the callsite, we need to make sure
// that what would be the new parameter dominates the callsite.
if (!DT->dominates(cpyDest, C)) {
@@ -1018,6 +1074,8 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
LLVMContext::MD_invariant_group,
LLVMContext::MD_access_group};
combineMetadata(C, cpyLoad, KnownIDs, true);
+ if (cpyLoad != cpyStore)
+ combineMetadata(C, cpyStore, KnownIDs, true);
++NumCallSlot;
return true;
@@ -1043,8 +1101,8 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
// Second, the length of the memcpy's must be the same, or the preceding one
// must be larger than the following one.
if (MDep->getLength() != M->getLength()) {
- ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
- ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
+ auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
+ auto *MLen = dyn_cast<ConstantInt>(M->getLength());
if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
return false;
}
@@ -1163,7 +1221,7 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
const unsigned DestAlign =
std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment());
if (DestAlign > 1)
- if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
+ if (auto *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
IRBuilder<> Builder(MemCpy);
@@ -1211,12 +1269,11 @@ static bool hasUndefContents(MemorySSA *MSSA, AliasAnalysis *AA, Value *V,
if (MSSA->isLiveOnEntryDef(Def))
return isa<AllocaInst>(getUnderlyingObject(V));
- if (IntrinsicInst *II =
- dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
+ if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
- ConstantInt *LTSize = cast<ConstantInt>(II->getArgOperand(0));
+ auto *LTSize = cast<ConstantInt>(II->getArgOperand(0));
- if (ConstantInt *CSize = dyn_cast<ConstantInt>(Size)) {
+ if (auto *CSize = dyn_cast<ConstantInt>(Size)) {
if (AA->isMustAlias(V, II->getArgOperand(1)) &&
LTSize->getZExtValue() >= CSize->getZExtValue())
return true;
@@ -1226,12 +1283,14 @@ static bool hasUndefContents(MemorySSA *MSSA, AliasAnalysis *AA, Value *V,
// does) and we're querying a pointer based on that alloca, then we know
// the memory is definitely undef, regardless of how exactly we alias.
// The size also doesn't matter, as an out-of-bounds access would be UB.
- AllocaInst *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V));
- if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) {
- const DataLayout &DL = Alloca->getModule()->getDataLayout();
- if (Optional<TypeSize> AllocaSize = Alloca->getAllocationSizeInBits(DL))
- if (*AllocaSize == LTSize->getValue() * 8)
- return true;
+ if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) {
+ if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) {
+ const DataLayout &DL = Alloca->getModule()->getDataLayout();
+ if (Optional<TypeSize> AllocaSize =
+ Alloca->getAllocationSizeInBits(DL))
+ if (*AllocaSize == LTSize->getValue() * 8)
+ return true;
+ }
}
}
}
@@ -1266,12 +1325,12 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
// Don't worry about sizes larger than i64.
// A known memset size is required.
- ConstantInt *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize);
+ auto *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize);
if (!CMemSetSize)
return false;
// A known memcpy size is also required.
- ConstantInt *CCopySize = dyn_cast<ConstantInt>(CopySize);
+ auto *CCopySize = dyn_cast<ConstantInt>(CopySize);
if (!CCopySize)
return false;
if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) {
@@ -1323,7 +1382,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
}
// If copying from a constant, try to turn the memcpy into a memset.
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
+ if (auto *GV = dyn_cast<GlobalVariable>(M->getSource()))
if (GV->isConstant() && GV->hasDefinitiveInitializer())
if (Value *ByteVal = isBytewiseValue(GV->getInitializer(),
M->getModule()->getDataLayout())) {
@@ -1370,7 +1429,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
// d) memcpy from a just-memset'd source can be turned into memset.
if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
if (Instruction *MI = MD->getMemoryInst()) {
- if (ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
+ if (auto *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
if (auto *C = dyn_cast<CallInst>(MI)) {
// The memcpy must post-dom the call. Limit to the same block for
// now. Additionally, we need to ensure that there are no accesses
@@ -1469,7 +1528,7 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
return false;
// The length of the memcpy must be larger or equal to the size of the byval.
- ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
+ auto *C1 = dyn_cast<ConstantInt>(MDep->getLength());
if (!C1 || !TypeSize::isKnownGE(
TypeSize::getFixed(C1->getValue().getZExtValue()), ByValSize))
return false;
@@ -1540,13 +1599,13 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
bool RepeatInstruction = false;
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ if (auto *SI = dyn_cast<StoreInst>(I))
MadeChange |= processStore(SI, BI);
- else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
+ else if (auto *M = dyn_cast<MemSetInst>(I))
RepeatInstruction = processMemSet(M, BI);
- else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
+ else if (auto *M = dyn_cast<MemCpyInst>(I))
RepeatInstruction = processMemCpy(M, BI);
- else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
+ else if (auto *M = dyn_cast<MemMoveInst>(I))
RepeatInstruction = processMemMove(M);
else if (auto *CB = dyn_cast<CallBase>(I)) {
for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
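
Most of the MemCpyOpt changes above harden performCallSlotOptzn, the transform that lets a call write directly into the memcpy destination instead of a temporary. A source-level sketch of the pattern it targets (the struct and the init function are placeholders):

  #include <cstring>

  struct S { int Data[16]; };

  void init(S *Out); // fills *Out; assumed not to capture the pointer

  void copyInto(S *Dest) {
    S Tmp;
    init(&Tmp);                         // the call writes the temporary
    std::memcpy(Dest, &Tmp, sizeof(S)); // call slot optimization: have init()
                                        // write straight into *Dest and drop
                                        // Tmp and the memcpy -- but only if
                                        // Dest is dereferenceable, cannot be
                                        // observed early (e.g. through
                                        // unwinding), and Tmp is not reachable
                                        // via a captured pointer before its
                                        // lifetime ends, which is what the new
                                        // checks above guard against.
  }
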
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 10a8742940b1..2476e6c408b1 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -1198,9 +1198,10 @@ NewGVN::ExprResult NewGVN::createExpression(Instruction *I) const {
if (auto Simplified = checkExprResults(E, I, V))
return Simplified;
} else if (auto *GEPI = dyn_cast<GetElementPtrInst>(I)) {
- Value *V = SimplifyGEPInst(GEPI->getSourceElementType(),
- ArrayRef<Value *>(E->op_begin(), E->op_end()),
- GEPI->isInBounds(), SQ);
+ Value *V =
+ SimplifyGEPInst(GEPI->getSourceElementType(), *E->op_begin(),
+ makeArrayRef(std::next(E->op_begin()), E->op_end()),
+ GEPI->isInBounds(), SQ);
if (auto Simplified = checkExprResults(E, I, V))
return Simplified;
} else if (AllConstant) {
@@ -1322,11 +1323,11 @@ bool NewGVN::someEquivalentDominates(const Instruction *Inst,
Value *NewGVN::lookupOperandLeader(Value *V) const {
CongruenceClass *CC = ValueToClass.lookup(V);
if (CC) {
- // Everything in TOP is represented by undef, as it can be any value.
+ // Everything in TOP is represented by poison, as it can be any value.
// We do have to make sure we get the type right though, so we can't set the
- // RepLeader to undef.
+ // RepLeader to poison.
if (CC == TOPClass)
- return UndefValue::get(V->getType());
+ return PoisonValue::get(V->getType());
return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
}
@@ -1493,8 +1494,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
// undef value. This can happen when loading for a fresh allocation with no
// intervening stores, for example. Note that this is only true in the case
// that the result of the allocation is pointer equal to the load ptr.
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
- isAlignedAllocLikeFn(DepInst, TLI)) {
+ if (isa<AllocaInst>(DepInst)) {
return createConstantExpression(UndefValue::get(LoadType));
}
// If this load occurs either right after a lifetime begin,
@@ -1502,12 +1502,10 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) {
if (II->getIntrinsicID() == Intrinsic::lifetime_start)
return createConstantExpression(UndefValue::get(LoadType));
- }
- // If this load follows a calloc (which zero initializes memory),
- // then the loaded value is zero
- else if (isCallocLikeFn(DepInst, TLI)) {
- return createConstantExpression(Constant::getNullValue(LoadType));
- }
+ } else if (isAllocationFn(DepInst, TLI))
+ if (auto *InitVal = getInitialValueOfAllocation(cast<CallBase>(DepInst),
+ TLI, LoadType))
+ return createConstantExpression(InitVal);
return nullptr;
}
@@ -1521,9 +1519,9 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
return nullptr;
Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand());
- // Load of undef is undef.
+ // Load of undef is UB.
if (isa<UndefValue>(LoadAddressLeader))
- return createConstantExpression(UndefValue::get(LI->getType()));
+ return createConstantExpression(PoisonValue::get(LI->getType()));
MemoryAccess *OriginalAccess = getMemoryAccess(I);
MemoryAccess *DefiningAccess =
MSSAWalker->getClobberingMemoryAccess(OriginalAccess);
@@ -1531,9 +1529,9 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
if (!MSSA->isLiveOnEntryDef(DefiningAccess)) {
if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) {
Instruction *DefiningInst = MD->getMemoryInst();
- // If the defining instruction is not reachable, replace with undef.
+ // If the defining instruction is not reachable, replace with poison.
if (!ReachableBlocks.count(DefiningInst->getParent()))
- return createConstantExpression(UndefValue::get(LI->getType()));
+ return createConstantExpression(PoisonValue::get(LI->getType()));
// This will handle stores and memory insts. We only do if it the
// defining access has a different type, or it is a pointer produced by
// certain memory operations that cause the memory to have a fixed value
@@ -1722,8 +1720,12 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
// We match the semantics of SimplifyPhiNode from InstructionSimplify here.
// See if all arguments are the same.
// We track if any were undef because they need special handling.
- bool HasUndef = false;
+ bool HasUndef = false, HasPoison = false;
auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) {
+ if (isa<PoisonValue>(Arg)) {
+ HasPoison = true;
+ return false;
+ }
if (isa<UndefValue>(Arg)) {
HasUndef = true;
return false;
@@ -1732,8 +1734,14 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
});
// If we are left with no operands, it's dead.
if (Filtered.empty()) {
- // If it has undef at this point, it means there are no-non-undef arguments,
- // and thus, the value of the phi node must be undef.
+  // If it has undef or poison at this point, it means there are no non-undef,
+  // non-poison arguments, and thus the value of the phi node must be undef or
+  // poison.
+ if (HasPoison && !HasUndef) {
+ LLVM_DEBUG(
+ dbgs() << "PHI Node " << *I
+ << " has no non-poison arguments, valuing it as poison\n");
+ return createConstantExpression(PoisonValue::get(I->getType()));
+ }
if (HasUndef) {
LLVM_DEBUG(
dbgs() << "PHI Node " << *I
@@ -1758,7 +1766,7 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
// expression to say if one is equivalent to the other.
// We also special case undef, so that if we have an undef, we can't use the
// common value unless it dominates the phi block.
- if (HasUndef) {
+ if (HasPoison || HasUndef) {
// If we have undef and at least one other value, this is really a
// multivalued phi, and we need to know if it's cycle free in order to
// evaluate whether we can ignore the undef. The other parts of this are
@@ -2579,6 +2587,15 @@ bool NewGVN::OpIsSafeForPHIOfOpsHelper(
}
auto *OrigI = cast<Instruction>(V);
+  // When we hit an instruction that reads memory (a load, call, etc.), we must
+  // consider any store that may happen in the loop. For now, we assume the
+  // worst: there is a store in the loop that aliases with this read.
+ // The case where the load is outside the loop is already covered by the
+ // dominator check above.
+ // TODO: relax this condition
+ if (OrigI->mayReadFromMemory())
+ return false;
+
for (auto *Op : OrigI->operand_values()) {
if (!isa<Instruction>(Op))
continue;
@@ -2780,7 +2797,7 @@ NewGVN::makePossiblePHIOfOps(Instruction *I,
LLVM_DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
<< getBlockName(PredBB)
<< " because the block is unreachable\n");
- FoundVal = UndefValue::get(I->getType());
+ FoundVal = PoisonValue::get(I->getType());
RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
}
@@ -3459,7 +3476,7 @@ bool NewGVN::runGVN() {
// Delete all instructions marked for deletion.
for (Instruction *ToErase : InstructionsToErase) {
if (!ToErase->use_empty())
- ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType()));
+ ToErase->replaceAllUsesWith(PoisonValue::get(ToErase->getType()));
assert(ToErase->getParent() &&
"BB containing ToErase deleted unexpectedly!");
@@ -3677,7 +3694,7 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
for (BasicBlock::reverse_iterator I(StartPoint); I != BB->rend();) {
Instruction &Inst = *I++;
if (!Inst.use_empty())
- Inst.replaceAllUsesWith(UndefValue::get(Inst.getType()));
+ Inst.replaceAllUsesWith(PoisonValue::get(Inst.getType()));
if (isa<LandingPadInst>(Inst))
continue;
salvageKnowledge(&Inst, AC);
@@ -3687,7 +3704,7 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
}
// Now insert something that simplifycfg will turn into an unreachable.
Type *Int8Ty = Type::getInt8Ty(BB->getContext());
- new StoreInst(UndefValue::get(Int8Ty),
+ new StoreInst(PoisonValue::get(Int8Ty),
Constant::getNullValue(Int8Ty->getPointerTo()),
BB->getTerminator());
}
@@ -3827,8 +3844,8 @@ bool NewGVN::eliminateInstructions(Function &F) {
LLVM_DEBUG(dbgs() << "Replacing incoming value of " << PHI
<< " for block "
<< getBlockName(PHI->getIncomingBlock(Operand))
- << " with undef due to it being unreachable\n");
- Operand.set(UndefValue::get(PHI->getType()));
+ << " with poison due to it being unreachable\n");
+ Operand.set(PoisonValue::get(PHI->getType()));
}
};
// Replace unreachable phi arguments.
@@ -4128,21 +4145,25 @@ bool NewGVN::eliminateInstructions(Function &F) {
unsigned int NewGVN::getRank(const Value *V) const {
// Prefer constants to undef to anything else
// Undef is a constant, have to check it first.
+ // Prefer poison to undef as it's less defined.
// Prefer smaller constants to constantexprs
+  // Note that the order of checks matters: PoisonValue derives from UndefValue.
if (isa<ConstantExpr>(V))
- return 2;
- if (isa<UndefValue>(V))
+ return 3;
+ if (isa<PoisonValue>(V))
return 1;
+ if (isa<UndefValue>(V))
+ return 2;
if (isa<Constant>(V))
return 0;
- else if (auto *A = dyn_cast<Argument>(V))
- return 3 + A->getArgNo();
+ if (auto *A = dyn_cast<Argument>(V))
+ return 4 + A->getArgNo();
- // Need to shift the instruction DFS by number of arguments + 3 to account for
+ // Need to shift the instruction DFS by number of arguments + 5 to account for
// the constant and argument ranking above.
unsigned Result = InstrToDFSNum(V);
if (Result > 0)
- return 4 + NumFuncArgs + Result;
+ return 5 + NumFuncArgs + Result;
// Unreachable or something else, just return a really large number.
return ~0;
}
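
One NewGVN change above that is easy to miss: performSymbolicLoadCoercion now asks getInitialValueOfAllocation() for the known initial contents of any recognized allocation function, instead of special-casing calloc. A minimal source-level illustration of what that enables (the function name is a placeholder):

  #include <cstdlib>

  int loadAfterCalloc() {
    int *P = static_cast<int *>(std::calloc(4, sizeof(int)));
    if (!P)
      return -1;
    int V = P[2]; // no intervening store, so the loaded value is the
                  // allocation's initial contents: zero for calloc-style
                  // allocators, undef for plain malloc-style ones.
    std::free(P);
    return V;
  }
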
diff --git a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 44027ccd92ca..e0d0301c1ef6 100644
--- a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -82,6 +82,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
// Add attribute "readnone" so that backend can use a native sqrt instruction
// for this call.
+ Call->removeFnAttr(Attribute::WriteOnly);
Call->addFnAttr(Attribute::ReadNone);
// Insert a FP compare instruction and use it as the CurrBB branch condition.
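
For context on the one-line PartiallyInlineLibCalls change above: ReadNone and WriteOnly are mutually exclusive function attributes, so the pass has to drop the WriteOnly that sqrt may carry (for errno) before marking the fast-path call ReadNone. Roughly the shape the partial inlining produces, as a hedged source-level sketch (__builtin_sqrt stands in for the attribute-annotated fast call, and the NaN self-compare stands in for the inserted FP compare):

  #include <cmath>

  double partiallyInlinedSqrt(double X) {
    double R = __builtin_sqrt(X); // fast call, now marked readnone
    if (R == R)                   // FP compare: result is not NaN
      return R;                   // common case: keep the native result
    return std::sqrt(X);          // slow path keeps libm/errno semantics
  }
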
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index e12eca0ed287..3da367341d2a 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1367,13 +1367,13 @@ static AttributeList legalizeCallAttributes(LLVMContext &Ctx,
return AL;
// Remove the readonly, readnone, and statepoint function attributes.
- AttrBuilder FnAttrs = AL.getFnAttrs();
+ AttrBuilder FnAttrs(Ctx, AL.getFnAttrs());
for (auto Attr : FnAttrsToStrip)
FnAttrs.removeAttribute(Attr);
for (Attribute A : AL.getFnAttrs()) {
if (isStatepointDirectiveAttr(A))
- FnAttrs.remove(A);
+ FnAttrs.removeAttribute(A);
}
// Just skip parameter and return attributes for now
@@ -2643,10 +2643,10 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
// List of all parameter and return attributes which must be stripped when
// lowering from the abstract machine model. Note that we list attributes
// here which aren't valid as return attributes, that is okay.
-static AttrBuilder getParamAndReturnAttributesToRemove() {
- AttrBuilder R;
- R.addDereferenceableAttr(1);
- R.addDereferenceableOrNullAttr(1);
+static AttributeMask getParamAndReturnAttributesToRemove() {
+ AttributeMask R;
+ R.addAttribute(Attribute::Dereferenceable);
+ R.addAttribute(Attribute::DereferenceableOrNull);
R.addAttribute(Attribute::ReadNone);
R.addAttribute(Attribute::ReadOnly);
R.addAttribute(Attribute::WriteOnly);
@@ -2668,7 +2668,7 @@ static void stripNonValidAttributesFromPrototype(Function &F) {
return;
}
- AttrBuilder R = getParamAndReturnAttributesToRemove();
+ AttributeMask R = getParamAndReturnAttributesToRemove();
for (Argument &A : F.args())
if (isa<PointerType>(A.getType()))
F.removeParamAttrs(A.getArgNo(), R);
@@ -2742,7 +2742,7 @@ static void stripNonValidDataFromBody(Function &F) {
stripInvalidMetadataFromInstruction(I);
- AttrBuilder R = getParamAndReturnAttributesToRemove();
+ AttributeMask R = getParamAndReturnAttributesToRemove();
if (auto *Call = dyn_cast<CallBase>(&I)) {
for (int i = 0, e = Call->arg_size(); i != e; i++)
if (isa<PointerType>(Call->getArgOperand(i)->getType()))
diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp
index ff2f8a25f379..c34da51e6dc1 100644
--- a/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -486,7 +486,7 @@ bool llvm::runIPSCCP(
// inaccessiblemem_or_argmemonly attributes do not hold any longer. Remove
// them from both the function and callsites.
if (ReplacedPointerArg) {
- AttrBuilder AttributesToRemove;
+ AttributeMask AttributesToRemove;
AttributesToRemove.addAttribute(Attribute::ArgMemOnly);
AttributesToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly);
F.removeFnAttrs(AttributesToRemove);
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 31c8999c3724..35497ae5ed9a 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -323,7 +323,7 @@ private:
///
/// Note that these are not separated by slice. This is because we expect an
/// alloca to be completely rewritten or not rewritten at all. If rewritten,
- /// all these instructions can simply be removed and replaced with undef as
+ /// all these instructions can simply be removed and replaced with poison as
/// they come from outside of the allocated space.
SmallVector<Instruction *, 8> DeadUsers;
@@ -333,10 +333,10 @@ private:
/// Operands which will become dead if we rewrite the alloca.
///
/// These are operands that in their particular use can be replaced with
- /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
+ /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
/// to PHI nodes and the like. They aren't entirely dead (there might be
/// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
- /// want to swap this particular input for undef to simplify the use lists of
+ /// want to swap this particular input for poison to simplify the use lists of
/// the alloca.
SmallVector<Use *, 8> DeadOperands;
};
@@ -1008,6 +1008,13 @@ private:
if (I.use_empty())
return markAsDead(I);
+ // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
+ // instructions in this BB, which may be required during rewriting. Bail out
+ // on these cases.
+ if (isa<PHINode>(I) &&
+ I.getParent()->getFirstInsertionPt() == I.getParent()->end())
+ return PI.setAborted(&I);
+
// TODO: We could use SimplifyInstruction here to fold PHINodes and
// SelectInsts. However, doing so requires to change the current
// dead-operand-tracking mechanism. For instance, suppose neither loading
@@ -1023,7 +1030,7 @@ private:
enqueueUsers(I);
else
// Otherwise the operand to the PHI/select is dead, and we can replace
- // it with undef.
+ // it with poison.
AS.DeadOperands.push_back(U);
return;
@@ -1043,7 +1050,7 @@ private:
// For PHI and select operands outside the alloca, we can't nuke the entire
// phi or select -- the other side might still be relevant, so we special
// case them here and use a separate structure to track the operands
- // themselves which should be replaced with undef.
+ // themselves which should be replaced with poison.
// FIXME: This should instead be escaped in the event we're instrumenting
// for address sanitization.
if (Offset.uge(AllocSize)) {
@@ -1264,14 +1271,14 @@ static bool isSafePHIToSpeculate(PHINode &PN) {
return true;
}
-static void speculatePHINodeLoads(PHINode &PN) {
+static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
Type *LoadTy = SomeLoad->getType();
- IRBuilderTy PHIBuilder(&PN);
- PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
- PN.getName() + ".sroa.speculated");
+ IRB.SetInsertPoint(&PN);
+ PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
+ PN.getName() + ".sroa.speculated");
// Get the AA tags and alignment to use from one of the loads. It does not
// matter which one we get and if any differ.
@@ -1301,9 +1308,9 @@ static void speculatePHINodeLoads(PHINode &PN) {
}
Instruction *TI = Pred->getTerminator();
- IRBuilderTy PredBuilder(TI);
+ IRB.SetInsertPoint(TI);
- LoadInst *Load = PredBuilder.CreateAlignedLoad(
+ LoadInst *Load = IRB.CreateAlignedLoad(
LoadTy, InVal, Alignment,
(PN.getName() + ".sroa.speculate.load." + Pred->getName()));
++NumLoadsSpeculated;
@@ -1361,10 +1368,10 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) {
return true;
}
-static void speculateSelectInstLoads(SelectInst &SI) {
+static void speculateSelectInstLoads(IRBuilderTy &IRB, SelectInst &SI) {
LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
- IRBuilderTy IRB(&SI);
+ IRB.SetInsertPoint(&SI);
Value *TV = SI.getTrueValue();
Value *FV = SI.getFalseValue();
// Replace the loads of the select with a select of two loads.
@@ -1430,8 +1437,10 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
return BasePtr;
- return IRB.CreateInBoundsGEP(BasePtr->getType()->getPointerElementType(),
- BasePtr, Indices, NamePrefix + "sroa_idx");
+ // buildGEP() is only called for non-opaque pointers.
+ return IRB.CreateInBoundsGEP(
+ BasePtr->getType()->getNonOpaquePointerElementType(), BasePtr, Indices,
+ NamePrefix + "sroa_idx");
}
/// Get a natural GEP off of the BasePtr walking through Ty toward
@@ -1504,7 +1513,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
return nullptr;
- Type *ElementTy = Ty->getElementType();
+ Type *ElementTy = Ty->getNonOpaquePointerElementType();
if (!ElementTy->isSized())
return nullptr; // We can't GEP through an unsized element.
@@ -1563,7 +1572,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
APInt Int8PtrOffset(Offset.getBitWidth(), 0);
PointerType *TargetPtrTy = cast<PointerType>(PointerTy);
- Type *TargetTy = TargetPtrTy->getElementType();
+ Type *TargetTy = TargetPtrTy->getNonOpaquePointerElementType();
// As `addrspacecast` is , `Ptr` (the storage pointer) may have different
// address space from the expected `PointerTy` (the pointer to be used).
@@ -2558,7 +2567,7 @@ private:
// the computed value, and then replace the placeholder with LI, leaving
// LI only used for this computation.
Value *Placeholder = new LoadInst(
- LI.getType(), UndefValue::get(LI.getType()->getPointerTo(AS)), "",
+ LI.getType(), PoisonValue::get(LI.getType()->getPointerTo(AS)), "",
false, Align(1));
V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
"insert");
@@ -3223,8 +3232,11 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
/// Used to calculate offsets, and hence alignment, of subobjects.
const DataLayout &DL;
+ IRBuilderTy &IRB;
+
public:
- AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {}
+ AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
+ : DL(DL), IRB(IRB) {}
/// Rewrite loads and stores through a pointer and all pointers derived from
/// it.
@@ -3255,7 +3267,7 @@ private:
template <typename Derived> class OpSplitter {
protected:
/// The builder used to form new instructions.
- IRBuilderTy IRB;
+ IRBuilderTy &IRB;
/// The indices which to be used with insert- or extractvalue to select the
/// appropriate value within the aggregate.
@@ -3282,9 +3294,11 @@ private:
/// Initialize the splitter with an insertion point, Ptr and start with a
/// single zero GEP index.
OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
- Align BaseAlign, const DataLayout &DL)
- : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr),
- BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {}
+ Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
+ : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
+ BaseAlign(BaseAlign), DL(DL) {
+ IRB.SetInsertPoint(InsertionPoint);
+ }
public:
/// Generic recursive split emission routine.
@@ -3345,9 +3359,10 @@ private:
AAMDNodes AATags;
LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
- AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
- : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
- DL),
+ AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
+ IRBuilderTy &IRB)
+ : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
+ IRB),
AATags(AATags) {}
/// Emit a leaf load of a single value. This is called at the leaves of the
@@ -3379,8 +3394,8 @@ private:
// We have an aggregate being loaded, split it apart.
LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
- getAdjustedAlignment(&LI, 0), DL);
- Value *V = UndefValue::get(LI.getType());
+ getAdjustedAlignment(&LI, 0), DL, IRB);
+ Value *V = PoisonValue::get(LI.getType());
Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
Visited.erase(&LI);
LI.replaceAllUsesWith(V);
@@ -3390,9 +3405,10 @@ private:
struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
- AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
+ AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
+ IRBuilderTy &IRB)
: OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
- DL),
+ DL, IRB),
AATags(AATags) {}
AAMDNodes AATags;
/// Emit a leaf store of a single value. This is called at the leaves of the
@@ -3430,7 +3446,7 @@ private:
// We have an aggregate being stored, split it apart.
LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(),
- getAdjustedAlignment(&SI, 0), DL);
+ getAdjustedAlignment(&SI, 0), DL, IRB);
Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
Visited.erase(&SI);
SI.eraseFromParent();
@@ -3458,7 +3474,7 @@ private:
<< "\n original: " << *Sel
<< "\n " << GEPI);
- IRBuilderTy Builder(&GEPI);
+ IRB.SetInsertPoint(&GEPI);
SmallVector<Value *, 4> Index(GEPI.indices());
bool IsInBounds = GEPI.isInBounds();
@@ -3466,21 +3482,20 @@ private:
Value *True = Sel->getTrueValue();
Value *NTrue =
IsInBounds
- ? Builder.CreateInBoundsGEP(Ty, True, Index,
- True->getName() + ".sroa.gep")
- : Builder.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep");
+ ? IRB.CreateInBoundsGEP(Ty, True, Index,
+ True->getName() + ".sroa.gep")
+ : IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep");
Value *False = Sel->getFalseValue();
Value *NFalse =
IsInBounds
- ? Builder.CreateInBoundsGEP(Ty, False, Index,
- False->getName() + ".sroa.gep")
- : Builder.CreateGEP(Ty, False, Index,
- False->getName() + ".sroa.gep");
+ ? IRB.CreateInBoundsGEP(Ty, False, Index,
+ False->getName() + ".sroa.gep")
+ : IRB.CreateGEP(Ty, False, Index, False->getName() + ".sroa.gep");
- Value *NSel = Builder.CreateSelect(Sel->getCondition(), NTrue, NFalse,
- Sel->getName() + ".sroa.sel");
+ Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse,
+ Sel->getName() + ".sroa.sel");
Visited.erase(&GEPI);
GEPI.replaceAllUsesWith(NSel);
GEPI.eraseFromParent();
@@ -3517,10 +3532,9 @@ private:
SmallVector<Value *, 4> Index(GEPI.indices());
bool IsInBounds = GEPI.isInBounds();
- IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI());
- PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(),
- PHI->getNumIncomingValues(),
- PHI->getName() + ".sroa.phi");
+ IRB.SetInsertPoint(GEPI.getParent()->getFirstNonPHI());
+ PHINode *NewPN = IRB.CreatePHI(GEPI.getType(), PHI->getNumIncomingValues(),
+ PHI->getName() + ".sroa.phi");
for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
BasicBlock *B = PHI->getIncomingBlock(I);
Value *NewVal = nullptr;
@@ -3530,11 +3544,12 @@ private:
} else {
Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
- IRBuilderTy B(In->getParent(), std::next(In->getIterator()));
+ IRB.SetInsertPoint(In->getParent(), std::next(In->getIterator()));
Type *Ty = GEPI.getSourceElementType();
- NewVal = IsInBounds
- ? B.CreateInBoundsGEP(Ty, In, Index, In->getName() + ".sroa.gep")
- : B.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep");
+ NewVal = IsInBounds ? IRB.CreateInBoundsGEP(Ty, In, Index,
+ In->getName() + ".sroa.gep")
+ : IRB.CreateGEP(Ty, In, Index,
+ In->getName() + ".sroa.gep");
}
NewPN->addIncoming(NewVal, B);
}
@@ -4557,11 +4572,11 @@ bool SROAPass::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
return Changed;
}
-/// Clobber a use with undef, deleting the used value if it becomes dead.
+/// Clobber a use with poison, deleting the used value if it becomes dead.
void SROAPass::clobberUse(Use &U) {
Value *OldV = U;
- // Replace the use with an undef value.
- U = UndefValue::get(OldV->getType());
+  // Replace the use with a poison value.
+ U = PoisonValue::get(OldV->getType());
// Check for this making an instruction dead. We have to garbage collect
// all the dead instructions to ensure the uses of any alloca end up being
@@ -4598,7 +4613,8 @@ bool SROAPass::runOnAlloca(AllocaInst &AI) {
// First, split any FCA loads and stores touching this alloca to promote
// better splitting and promotion opportunities.
- AggLoadStoreRewriter AggRewriter(DL);
+ IRBuilderTy IRB(&AI);
+ AggLoadStoreRewriter AggRewriter(DL, IRB);
Changed |= AggRewriter.rewrite(AI);
// Build the slices using a recursive instruction-visiting builder.
@@ -4614,7 +4630,7 @@ bool SROAPass::runOnAlloca(AllocaInst &AI) {
clobberUse(DeadOp);
// Now replace the uses of this instruction.
- DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
+ DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
// And mark it for deletion.
DeadInsts.push_back(DeadUser);
@@ -4633,11 +4649,11 @@ bool SROAPass::runOnAlloca(AllocaInst &AI) {
LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
while (!SpeculatablePHIs.empty())
- speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
+ speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
LLVM_DEBUG(dbgs() << " Speculating Selects\n");
while (!SpeculatableSelects.empty())
- speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
+ speculateSelectInstLoads(IRB, *SpeculatableSelects.pop_back_val());
return Changed;
}
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index 1284bae820a4..29cea42e4a00 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -959,7 +959,8 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
Type *LoadTy = CI->getType();
Align Alignment = DL.getValueOrABITypeAlignment(MA,
LoadTy->getScalarType());
- if (TTI.isLegalMaskedGather(LoadTy, Alignment))
+ if (TTI.isLegalMaskedGather(LoadTy, Alignment) &&
+ !TTI.forceScalarizeMaskedGather(cast<VectorType>(LoadTy), Alignment))
return false;
scalarizeMaskedGather(DL, CI, DTU, ModifiedDT);
return true;
@@ -970,7 +971,9 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
Type *StoreTy = CI->getArgOperand(0)->getType();
Align Alignment = DL.getValueOrABITypeAlignment(MA,
StoreTy->getScalarType());
- if (TTI.isLegalMaskedScatter(StoreTy, Alignment))
+ if (TTI.isLegalMaskedScatter(StoreTy, Alignment) &&
+ !TTI.forceScalarizeMaskedScatter(cast<VectorType>(StoreTy),
+ Alignment))
return false;
scalarizeMaskedScatter(DL, CI, DTU, ModifiedDT);
return true;
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index 6b7419abe1d1..3606c8a4b073 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -270,7 +270,7 @@ Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
Type *Ty = V->getType();
PtrTy = dyn_cast<PointerType>(Ty);
if (PtrTy)
- Ty = PtrTy->getElementType();
+ Ty = PtrTy->getPointerElementType();
Size = cast<FixedVectorType>(Ty)->getNumElements();
if (!CachePtr)
Tmp.resize(Size, nullptr);
@@ -288,7 +288,8 @@ Value *Scatterer::operator[](unsigned I) {
return CV[I];
IRBuilder<> Builder(BB, BBI);
if (PtrTy) {
- Type *ElTy = cast<VectorType>(PtrTy->getElementType())->getElementType();
+ Type *ElTy =
+ cast<VectorType>(PtrTy->getPointerElementType())->getElementType();
if (!CV[0]) {
Type *NewPtrTy = PointerType::get(ElTy, PtrTy->getAddressSpace());
CV[0] = Builder.CreateBitCast(V, NewPtrTy, V->getName() + ".i0");
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 3799d2dd1cf2..ee17da1875e5 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -78,6 +78,79 @@ static cl::opt<bool> UserSinkCommonInsts(
STATISTIC(NumSimpl, "Number of blocks simplified");
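+
+/// Tail-merge all blocks in \p BBs: give them a single canonical successor
+/// block holding a clone of their common terminator, with one PHI node per
+/// terminator operand, and append the new CFG edges to \p Updates if given.
+/// Returns true if at least two blocks were merged.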
+static bool
+performBlockTailMerging(Function &F, ArrayRef<BasicBlock *> BBs,
+ std::vector<DominatorTree::UpdateType> *Updates) {
+ SmallVector<PHINode *, 1> NewOps;
+
+ // We don't want to change IR just because we can.
+ // Only do that if there are at least two blocks we'll tail-merge.
+ if (BBs.size() < 2)
+ return false;
+
+ if (Updates)
+ Updates->reserve(Updates->size() + BBs.size());
+
+ BasicBlock *CanonicalBB;
+ Instruction *CanonicalTerm;
+ {
+ auto *Term = BBs[0]->getTerminator();
+
+ // Create a canonical block for this function terminator type now,
+ // placing it *before* the first block that will branch to it.
+ CanonicalBB = BasicBlock::Create(
+ F.getContext(), Twine("common.") + Term->getOpcodeName(), &F, BBs[0]);
+    // We'll also need a PHI node for each operand of the terminator.
+ NewOps.resize(Term->getNumOperands());
+ for (auto I : zip(Term->operands(), NewOps)) {
+ std::get<1>(I) = PHINode::Create(std::get<0>(I)->getType(),
+ /*NumReservedValues=*/BBs.size(),
+ CanonicalBB->getName() + ".op");
+ CanonicalBB->getInstList().push_back(std::get<1>(I));
+ }
+ // Make it so that this canonical block actually has the right
+ // terminator.
+ CanonicalTerm = Term->clone();
+ CanonicalBB->getInstList().push_back(CanonicalTerm);
+ // If the canonical terminator has operands, rewrite it to take PHI's.
+ for (auto I : zip(NewOps, CanonicalTerm->operands()))
+ std::get<1>(I) = std::get<0>(I);
+ }
+
+ // Now, go through each block (with the current terminator type)
+ // we've recorded, and rewrite it to branch to the new common block.
+ const DILocation *CommonDebugLoc = nullptr;
+ for (BasicBlock *BB : BBs) {
+ auto *Term = BB->getTerminator();
+ assert(Term->getOpcode() == CanonicalTerm->getOpcode() &&
+ "All blocks to be tail-merged must be the same "
+ "(function-terminating) terminator type.");
+
+ // Aha, found a new non-canonical function terminator. If it has operands,
+ // forward them to the PHI nodes in the canonical block.
+ for (auto I : zip(Term->operands(), NewOps))
+ std::get<1>(I)->addIncoming(std::get<0>(I), BB);
+
+ // Compute the debug location common to all the original terminators.
+ if (!CommonDebugLoc)
+ CommonDebugLoc = Term->getDebugLoc();
+ else
+ CommonDebugLoc =
+ DILocation::getMergedLocation(CommonDebugLoc, Term->getDebugLoc());
+
+ // And turn BB into a block that just unconditionally branches
+ // to the canonical block.
+ Term->eraseFromParent();
+ BranchInst::Create(CanonicalBB, BB);
+ if (Updates)
+ Updates->push_back({DominatorTree::Insert, BB, CanonicalBB});
+ }
+
+ CanonicalTerm->setDebugLoc(CommonDebugLoc);
+
+ return true;
+}
+
static bool tailMergeBlocksWithSimilarFunctionTerminators(Function &F,
DomTreeUpdater *DTU) {
SmallMapVector<unsigned /*TerminatorOpcode*/, SmallVector<BasicBlock *, 2>, 4>
@@ -133,73 +206,8 @@ static bool tailMergeBlocksWithSimilarFunctionTerminators(Function &F,
std::vector<DominatorTree::UpdateType> Updates;
- for (ArrayRef<BasicBlock *> BBs : make_second_range(Structure)) {
- SmallVector<PHINode *, 1> NewOps;
-
- // We don't want to change IR just because we can.
- // Only do that if there are at least two blocks we'll tail-merge.
- if (BBs.size() < 2)
- continue;
-
- Changed = true;
-
- if (DTU)
- Updates.reserve(Updates.size() + BBs.size());
-
- BasicBlock *CanonicalBB;
- Instruction *CanonicalTerm;
- {
- auto *Term = BBs[0]->getTerminator();
-
- // Create a canonical block for this function terminator type now,
- // placing it *before* the first block that will branch to it.
- CanonicalBB = BasicBlock::Create(
- F.getContext(), Twine("common.") + Term->getOpcodeName(), &F, BBs[0]);
- // We'll also need a PHI node per each operand of the terminator.
- NewOps.resize(Term->getNumOperands());
- for (auto I : zip(Term->operands(), NewOps)) {
- std::get<1>(I) = PHINode::Create(std::get<0>(I)->getType(),
- /*NumReservedValues=*/BBs.size(),
- CanonicalBB->getName() + ".op");
- CanonicalBB->getInstList().push_back(std::get<1>(I));
- }
- // Make it so that this canonical block actually has the right
- // terminator.
- CanonicalTerm = Term->clone();
- CanonicalBB->getInstList().push_back(CanonicalTerm);
- // If the canonical terminator has operands, rewrite it to take PHI's.
- for (auto I : zip(NewOps, CanonicalTerm->operands()))
- std::get<1>(I) = std::get<0>(I);
- }
-
- // Now, go through each block (with the current terminator type)
- // we've recorded, and rewrite it to branch to the new common block.
- const DILocation *CommonDebugLoc = nullptr;
- for (BasicBlock *BB : BBs) {
- auto *Term = BB->getTerminator();
-
- // Aha, found a new non-canonical function terminator. If it has operands,
- // forward them to the PHI nodes in the canonical block.
- for (auto I : zip(Term->operands(), NewOps))
- std::get<1>(I)->addIncoming(std::get<0>(I), BB);
-
- // Compute the debug location common to all the original terminators.
- if (!CommonDebugLoc)
- CommonDebugLoc = Term->getDebugLoc();
- else
- CommonDebugLoc =
- DILocation::getMergedLocation(CommonDebugLoc, Term->getDebugLoc());
-
- // And turn BB into a block that just unconditionally branches
- // to the canonical block.
- Term->eraseFromParent();
- BranchInst::Create(CanonicalBB, BB);
- if (DTU)
- Updates.push_back({DominatorTree::Insert, BB, CanonicalBB});
- }
-
- CanonicalTerm->setDebugLoc(CommonDebugLoc);
- }
+ for (ArrayRef<BasicBlock *> BBs : make_second_range(Structure))
+ Changed |= performBlockTailMerging(F, BBs, DTU ? &Updates : nullptr);
if (DTU)
DTU->applyUpdates(Updates);
@@ -313,7 +321,7 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) {
Options.SinkCommonInsts = UserSinkCommonInsts;
}
-SimplifyCFGPass::SimplifyCFGPass() : Options() {
+SimplifyCFGPass::SimplifyCFGPass() {
applyCommandLineOverridesToOptions(Options);
}
diff --git a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
index fdc914a72bfd..c734611836eb 100644
--- a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
+++ b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
@@ -22,19 +22,6 @@ using namespace llvm;
#define DEBUG_TYPE "amdgpu-emit-printf"
-static bool isCString(const Value *Arg) {
- auto Ty = Arg->getType();
- auto PtrTy = dyn_cast<PointerType>(Ty);
- if (!PtrTy)
- return false;
-
- auto IntTy = dyn_cast<IntegerType>(PtrTy->getElementType());
- if (!IntTy)
- return false;
-
- return IntTy->getBitWidth() == 8;
-}
-
static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg) {
auto Int64Ty = Builder.getInt64Ty();
auto Ty = Arg->getType();
@@ -176,13 +163,15 @@ static Value *callAppendStringN(IRBuilder<> &Builder, Value *Desc, Value *Str,
static Value *appendString(IRBuilder<> &Builder, Value *Desc, Value *Arg,
bool IsLast) {
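+  // The argument is only guaranteed to be a pointer; cast it to i8* in its
+  // address space before computing the string length.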
+ Arg = Builder.CreateBitCast(
+ Arg, Builder.getInt8PtrTy(Arg->getType()->getPointerAddressSpace()));
auto Length = getStrlenWithNull(Builder, Arg);
return callAppendStringN(Builder, Desc, Arg, Length, IsLast);
}
static Value *processArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
bool SpecIsCString, bool IsLast) {
- if (SpecIsCString && isCString(Arg)) {
+ if (SpecIsCString && isa<PointerType>(Arg->getType())) {
return appendString(Builder, Desc, Arg, IsLast);
}
// If the format specifies a string but the argument is not, the frontend will
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index 580cfd80141e..97f11ca71726 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -34,6 +34,7 @@ STATISTIC(NumReadNone, "Number of functions inferred as readnone");
STATISTIC(NumInaccessibleMemOnly,
"Number of functions inferred as inaccessiblememonly");
STATISTIC(NumReadOnly, "Number of functions inferred as readonly");
+STATISTIC(NumWriteOnly, "Number of functions inferred as writeonly");
STATISTIC(NumArgMemOnly, "Number of functions inferred as argmemonly");
STATISTIC(NumInaccessibleMemOrArgMemOnly,
"Number of functions inferred as inaccessiblemem_or_argmemonly");
@@ -71,6 +72,19 @@ static bool setOnlyReadsMemory(Function &F) {
return true;
}
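+
+// Mark F as only writing memory; if it was already marked readonly, the
+// combination implies it does not access memory at all.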
+static bool setOnlyWritesMemory(Function &F) {
+ if (F.onlyWritesMemory()) // writeonly or readnone
+ return false;
+ // Turn readonly and writeonly into readnone.
+ if (F.hasFnAttribute(Attribute::ReadOnly)) {
+ F.removeFnAttr(Attribute::ReadOnly);
+ return setDoesNotAccessMemory(F);
+ }
+ ++NumWriteOnly;
+ F.setOnlyWritesMemory();
+ return true;
+}
+
static bool setOnlyAccessesArgMemory(Function &F) {
if (F.onlyAccessesArgMemory())
return false;
@@ -233,6 +247,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
switch (TheLibFunc) {
case LibFunc_strlen:
+ case LibFunc_strnlen:
case LibFunc_wcslen:
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotThrow(F);
@@ -400,6 +415,8 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
return Changed;
+ case LibFunc_aligned_alloc:
+ case LibFunc_valloc:
case LibFunc_malloc:
case LibFunc_vec_malloc:
Changed |= setOnlyAccessesInaccessibleMemory(F);
@@ -484,6 +501,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
return Changed;
case LibFunc_realloc:
case LibFunc_vec_realloc:
+ case LibFunc_reallocf:
Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F);
Changed |= setRetNoUndef(F);
Changed |= setDoesNotThrow(F);
@@ -492,11 +510,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 0);
Changed |= setArgNoUndef(F, 1);
return Changed;
- case LibFunc_reallocf:
- Changed |= setRetNoUndef(F);
- Changed |= setWillReturn(F);
- Changed |= setArgNoUndef(F, 1);
- return Changed;
case LibFunc_read:
// May throw; "read" is a valid pthread cancellation point.
Changed |= setRetAndArgsNoUndef(F);
@@ -536,13 +549,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc_aligned_alloc:
- Changed |= setOnlyAccessesInaccessibleMemory(F);
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setWillReturn(F);
- return Changed;
case LibFunc_bcopy:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
@@ -569,6 +575,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
return Changed;
case LibFunc_calloc:
case LibFunc_vec_calloc:
+ Changed |= setOnlyAccessesInaccessibleMemory(F);
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setRetDoesNotAlias(F);
@@ -851,13 +858,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc_valloc:
- Changed |= setOnlyAccessesInaccessibleMemory(F);
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setWillReturn(F);
- return Changed;
case LibFunc_vprintf:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
@@ -1020,12 +1020,10 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
case LibFunc_memset_pattern4:
case LibFunc_memset_pattern8:
case LibFunc_memset_pattern16:
- Changed |= setOnlyAccessesArgMemory(F);
Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyWritesMemory(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ LLVM_FALLTHROUGH;
case LibFunc_memset:
Changed |= setWillReturn(F);
LLVM_FALLTHROUGH;
@@ -1158,7 +1156,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
case LibFunc_sqrt:
case LibFunc_sqrtf:
case LibFunc_sqrtl:
- case LibFunc_strnlen:
case LibFunc_tan:
case LibFunc_tanf:
case LibFunc_tanh:
@@ -1171,6 +1168,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
case LibFunc_truncl:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotFreeMemory(F);
+ Changed |= setOnlyWritesMemory(F);
Changed |= setWillReturn(F);
return Changed;
default:
diff --git a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp
index b2763900e154..ac3839f2a4ab 100644
--- a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp
+++ b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp
@@ -20,8 +20,7 @@ using namespace llvm;
bool CallGraphUpdater::finalize() {
if (!DeadFunctionsInComdats.empty()) {
- filterDeadComdatFunctions(*DeadFunctionsInComdats.front()->getParent(),
- DeadFunctionsInComdats);
+ filterDeadComdatFunctions(DeadFunctionsInComdats);
DeadFunctions.append(DeadFunctionsInComdats.begin(),
DeadFunctionsInComdats.end());
}
diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
index ebe19f1751e5..56b6e4bc46a5 100644
--- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -500,7 +500,7 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee,
CB.setArgOperand(ArgNo, Cast);
// Remove any incompatible attributes for the argument.
- AttrBuilder ArgAttrs(CallerPAL.getParamAttrs(ArgNo));
+ AttrBuilder ArgAttrs(Ctx, CallerPAL.getParamAttrs(ArgNo));
ArgAttrs.remove(AttributeFuncs::typeIncompatible(FormalTy));
// We may have a different byval/inalloca type.
@@ -518,7 +518,7 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee,
// If the return type of the call site doesn't match that of the callee, cast
// the returned value to the appropriate type.
// Remove any incompatible return value attribute.
- AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
+ AttrBuilder RAttrs(Ctx, CallerPAL.getRetAttrs());
if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) {
createRetBitCast(CB, CallSiteRetTy, RetBitCast);
RAttrs.remove(AttributeFuncs::typeIncompatible(CalleeRetTy));
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 96aff563aa9b..24cd5747c5a4 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -829,39 +829,54 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
default: RetTy = Type::getInt16Ty(header->getContext()); break;
}
- std::vector<Type *> paramTy;
+ std::vector<Type *> ParamTy;
+ std::vector<Type *> AggParamTy;
+ ValueSet StructValues;
// Add the types of the input values to the function's argument list
for (Value *value : inputs) {
LLVM_DEBUG(dbgs() << "value used in func: " << *value << "\n");
- paramTy.push_back(value->getType());
+ if (AggregateArgs && !ExcludeArgsFromAggregate.contains(value)) {
+ AggParamTy.push_back(value->getType());
+ StructValues.insert(value);
+ } else
+ ParamTy.push_back(value->getType());
}
// Add the types of the output values to the function's argument list.
for (Value *output : outputs) {
LLVM_DEBUG(dbgs() << "instr used in func: " << *output << "\n");
- if (AggregateArgs)
- paramTy.push_back(output->getType());
- else
- paramTy.push_back(PointerType::getUnqual(output->getType()));
+ if (AggregateArgs && !ExcludeArgsFromAggregate.contains(output)) {
+ AggParamTy.push_back(output->getType());
+ StructValues.insert(output);
+ } else
+ ParamTy.push_back(PointerType::getUnqual(output->getType()));
+ }
+
+ assert(
+ (ParamTy.size() + AggParamTy.size()) ==
+ (inputs.size() + outputs.size()) &&
+ "Number of scalar and aggregate params does not match inputs, outputs");
+ assert(StructValues.empty() ||
+         AggregateArgs && "Expected StructValues only with AggregateArgs set");
+
+ // Concatenate scalar and aggregate params in ParamTy.
+ size_t NumScalarParams = ParamTy.size();
+ StructType *StructTy = nullptr;
+ if (AggregateArgs && !AggParamTy.empty()) {
+ StructTy = StructType::get(M->getContext(), AggParamTy);
+ ParamTy.push_back(PointerType::getUnqual(StructTy));
}
LLVM_DEBUG({
dbgs() << "Function type: " << *RetTy << " f(";
- for (Type *i : paramTy)
+ for (Type *i : ParamTy)
dbgs() << *i << ", ";
dbgs() << ")\n";
});
- StructType *StructTy = nullptr;
- if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
- StructTy = StructType::get(M->getContext(), paramTy);
- paramTy.clear();
- paramTy.push_back(PointerType::getUnqual(StructTy));
- }
- FunctionType *funcType =
- FunctionType::get(RetTy, paramTy,
- AllowVarArgs && oldFunction->isVarArg());
+ FunctionType *funcType = FunctionType::get(
+ RetTy, ParamTy, AllowVarArgs && oldFunction->isVarArg());
std::string SuffixToUse =
Suffix.empty()
@@ -871,13 +886,6 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
Function *newFunction = Function::Create(
funcType, GlobalValue::InternalLinkage, oldFunction->getAddressSpace(),
oldFunction->getName() + "." + SuffixToUse, M);
- // If the old function is no-throw, so is the new one.
- if (oldFunction->doesNotThrow())
- newFunction->setDoesNotThrow();
-
- // Inherit the uwtable attribute if we need to.
- if (oldFunction->hasUWTable())
- newFunction->setHasUWTable();
// Inherit all of the target dependent attributes and white-listed
// target independent attributes.
@@ -893,53 +901,26 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
} else
switch (Attr.getKindAsEnum()) {
// Those attributes cannot be propagated safely. Explicitly list them
- // here so we get a warning if new attributes are added. This list also
- // includes non-function attributes.
- case Attribute::Alignment:
+ // here so we get a warning if new attributes are added.
case Attribute::AllocSize:
case Attribute::ArgMemOnly:
case Attribute::Builtin:
- case Attribute::ByVal:
case Attribute::Convergent:
- case Attribute::Dereferenceable:
- case Attribute::DereferenceableOrNull:
- case Attribute::ElementType:
- case Attribute::InAlloca:
- case Attribute::InReg:
case Attribute::InaccessibleMemOnly:
case Attribute::InaccessibleMemOrArgMemOnly:
case Attribute::JumpTable:
case Attribute::Naked:
- case Attribute::Nest:
- case Attribute::NoAlias:
case Attribute::NoBuiltin:
- case Attribute::NoCapture:
case Attribute::NoMerge:
case Attribute::NoReturn:
case Attribute::NoSync:
- case Attribute::NoUndef:
- case Attribute::None:
- case Attribute::NonNull:
- case Attribute::Preallocated:
case Attribute::ReadNone:
case Attribute::ReadOnly:
- case Attribute::Returned:
case Attribute::ReturnsTwice:
- case Attribute::SExt:
case Attribute::Speculatable:
case Attribute::StackAlignment:
- case Attribute::StructRet:
- case Attribute::SwiftError:
- case Attribute::SwiftSelf:
- case Attribute::SwiftAsync:
case Attribute::WillReturn:
case Attribute::WriteOnly:
- case Attribute::ZExt:
- case Attribute::ImmArg:
- case Attribute::ByRef:
- case Attribute::EndAttrKinds:
- case Attribute::EmptyKey:
- case Attribute::TombstoneKey:
continue;
// Those attributes should be safe to propagate to the extracted function.
case Attribute::AlwaysInline:
@@ -980,30 +961,62 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
case Attribute::MustProgress:
case Attribute::NoProfile:
break;
+ // These attributes cannot be applied to functions.
+ case Attribute::Alignment:
+ case Attribute::ByVal:
+ case Attribute::Dereferenceable:
+ case Attribute::DereferenceableOrNull:
+ case Attribute::ElementType:
+ case Attribute::InAlloca:
+ case Attribute::InReg:
+ case Attribute::Nest:
+ case Attribute::NoAlias:
+ case Attribute::NoCapture:
+ case Attribute::NoUndef:
+ case Attribute::NonNull:
+ case Attribute::Preallocated:
+ case Attribute::Returned:
+ case Attribute::SExt:
+ case Attribute::StructRet:
+ case Attribute::SwiftError:
+ case Attribute::SwiftSelf:
+ case Attribute::SwiftAsync:
+ case Attribute::ZExt:
+ case Attribute::ImmArg:
+ case Attribute::ByRef:
+ // These are not really attributes.
+ case Attribute::None:
+ case Attribute::EndAttrKinds:
+ case Attribute::EmptyKey:
+ case Attribute::TombstoneKey:
+ llvm_unreachable("Not a function attribute");
}
newFunction->addFnAttr(Attr);
}
newFunction->getBasicBlockList().push_back(newRootNode);
- // Create an iterator to name all of the arguments we inserted.
- Function::arg_iterator AI = newFunction->arg_begin();
+ // Create scalar and aggregate iterators to name all of the arguments we
+ // inserted.
+ Function::arg_iterator ScalarAI = newFunction->arg_begin();
+ Function::arg_iterator AggAI = std::next(ScalarAI, NumScalarParams);
// Rewrite all users of the inputs in the extracted region to use the
// arguments (or appropriate addressing into struct) instead.
- for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+ for (unsigned i = 0, e = inputs.size(), aggIdx = 0; i != e; ++i) {
Value *RewriteVal;
- if (AggregateArgs) {
+ if (AggregateArgs && StructValues.contains(inputs[i])) {
Value *Idx[2];
Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext()));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i);
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), aggIdx);
Instruction *TI = newFunction->begin()->getTerminator();
GetElementPtrInst *GEP = GetElementPtrInst::Create(
- StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI);
- RewriteVal = new LoadInst(StructTy->getElementType(i), GEP,
+ StructTy, &*AggAI, Idx, "gep_" + inputs[i]->getName(), TI);
+ RewriteVal = new LoadInst(StructTy->getElementType(aggIdx), GEP,
"loadgep_" + inputs[i]->getName(), TI);
+ ++aggIdx;
} else
- RewriteVal = &*AI++;
+ RewriteVal = &*ScalarAI++;
std::vector<User *> Users(inputs[i]->user_begin(), inputs[i]->user_end());
for (User *use : Users)
@@ -1013,12 +1026,14 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
}
// Set names for input and output arguments.
- if (!AggregateArgs) {
- AI = newFunction->arg_begin();
- for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++AI)
- AI->setName(inputs[i]->getName());
- for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++AI)
- AI->setName(outputs[i]->getName()+".out");
+ if (NumScalarParams) {
+ ScalarAI = newFunction->arg_begin();
+ for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++ScalarAI)
+ if (!StructValues.contains(inputs[i]))
+ ScalarAI->setName(inputs[i]->getName());
+ for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++ScalarAI)
+ if (!StructValues.contains(outputs[i]))
+ ScalarAI->setName(outputs[i]->getName() + ".out");
}
// Rewrite branches to basic blocks outside of the loop to new dummy blocks
@@ -1126,7 +1141,8 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
ValueSet &outputs) {
// Emit a call to the new function, passing in: *pointer to struct (if
// aggregating parameters), or plain inputs and allocated memory for outputs
- std::vector<Value *> params, StructValues, ReloadOutputs, Reloads;
+ std::vector<Value *> params, ReloadOutputs, Reloads;
+ ValueSet StructValues;
Module *M = newFunction->getParent();
LLVMContext &Context = M->getContext();
@@ -1134,23 +1150,24 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
CallInst *call = nullptr;
// Add inputs as params, or to be filled into the struct
- unsigned ArgNo = 0;
+ unsigned ScalarInputArgNo = 0;
SmallVector<unsigned, 1> SwiftErrorArgs;
for (Value *input : inputs) {
- if (AggregateArgs)
- StructValues.push_back(input);
+ if (AggregateArgs && !ExcludeArgsFromAggregate.contains(input))
+ StructValues.insert(input);
else {
params.push_back(input);
if (input->isSwiftError())
- SwiftErrorArgs.push_back(ArgNo);
+ SwiftErrorArgs.push_back(ScalarInputArgNo);
}
- ++ArgNo;
+ ++ScalarInputArgNo;
}
// Create allocas for the outputs
+ unsigned ScalarOutputArgNo = 0;
for (Value *output : outputs) {
- if (AggregateArgs) {
- StructValues.push_back(output);
+ if (AggregateArgs && !ExcludeArgsFromAggregate.contains(output)) {
+ StructValues.insert(output);
} else {
AllocaInst *alloca =
new AllocaInst(output->getType(), DL.getAllocaAddrSpace(),
@@ -1158,12 +1175,14 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
&codeReplacer->getParent()->front().front());
ReloadOutputs.push_back(alloca);
params.push_back(alloca);
+ ++ScalarOutputArgNo;
}
}
StructType *StructArgTy = nullptr;
AllocaInst *Struct = nullptr;
- if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
+ unsigned NumAggregatedInputs = 0;
+ if (AggregateArgs && !StructValues.empty()) {
std::vector<Type *> ArgTypes;
for (Value *V : StructValues)
ArgTypes.push_back(V->getType());
@@ -1175,14 +1194,18 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
&codeReplacer->getParent()->front().front());
params.push_back(Struct);
- for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
- Value *Idx[2];
- Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i);
- GetElementPtrInst *GEP = GetElementPtrInst::Create(
- StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName());
- codeReplacer->getInstList().push_back(GEP);
- new StoreInst(StructValues[i], GEP, codeReplacer);
+ // Store aggregated inputs in the struct.
+ for (unsigned i = 0, e = StructValues.size(); i != e; ++i) {
+ if (inputs.contains(StructValues[i])) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName());
+ codeReplacer->getInstList().push_back(GEP);
+ new StoreInst(StructValues[i], GEP, codeReplacer);
+ NumAggregatedInputs++;
+ }
}
}
@@ -1205,24 +1228,24 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
newFunction->addParamAttr(SwiftErrArgNo, Attribute::SwiftError);
}
- Function::arg_iterator OutputArgBegin = newFunction->arg_begin();
- unsigned FirstOut = inputs.size();
- if (!AggregateArgs)
- std::advance(OutputArgBegin, inputs.size());
-
- // Reload the outputs passed in by reference.
- for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
+  // Reload the outputs passed in by reference: use the struct if the output
+  // is part of the aggregate, otherwise reload it from the scalar argument.
+ for (unsigned i = 0, e = outputs.size(), scalarIdx = 0,
+ aggIdx = NumAggregatedInputs;
+ i != e; ++i) {
Value *Output = nullptr;
- if (AggregateArgs) {
+ if (AggregateArgs && StructValues.contains(outputs[i])) {
Value *Idx[2];
Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), aggIdx);
GetElementPtrInst *GEP = GetElementPtrInst::Create(
StructArgTy, Struct, Idx, "gep_reload_" + outputs[i]->getName());
codeReplacer->getInstList().push_back(GEP);
Output = GEP;
+ ++aggIdx;
} else {
- Output = ReloadOutputs[i];
+ Output = ReloadOutputs[scalarIdx];
+ ++scalarIdx;
}
LoadInst *load = new LoadInst(outputs[i]->getType(), Output,
outputs[i]->getName() + ".reload",
@@ -1304,8 +1327,13 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
// Store the arguments right after the definition of output value.
// This should be done after creating exit stubs, to ensure that the invoke
// result restore will be placed in the outlined function.
- Function::arg_iterator OAI = OutputArgBegin;
- for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
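+  // Scalar inputs occupy the first arguments, followed by the scalar output
+  // pointers; the aggregate struct pointer, if any, is the last argument.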
+ Function::arg_iterator ScalarOutputArgBegin = newFunction->arg_begin();
+ std::advance(ScalarOutputArgBegin, ScalarInputArgNo);
+ Function::arg_iterator AggOutputArgBegin = newFunction->arg_begin();
+ std::advance(AggOutputArgBegin, ScalarInputArgNo + ScalarOutputArgNo);
+
+ for (unsigned i = 0, e = outputs.size(), aggIdx = NumAggregatedInputs; i != e;
+ ++i) {
auto *OutI = dyn_cast<Instruction>(outputs[i]);
if (!OutI)
continue;
@@ -1325,23 +1353,27 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
assert((InsertBefore->getFunction() == newFunction ||
Blocks.count(InsertBefore->getParent())) &&
"InsertPt should be in new function");
- assert(OAI != newFunction->arg_end() &&
- "Number of output arguments should match "
- "the amount of defined values");
- if (AggregateArgs) {
+ if (AggregateArgs && StructValues.contains(outputs[i])) {
+ assert(AggOutputArgBegin != newFunction->arg_end() &&
+ "Number of aggregate output arguments should match "
+ "the number of defined values");
Value *Idx[2];
Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), aggIdx);
GetElementPtrInst *GEP = GetElementPtrInst::Create(
- StructArgTy, &*OAI, Idx, "gep_" + outputs[i]->getName(),
+ StructArgTy, &*AggOutputArgBegin, Idx, "gep_" + outputs[i]->getName(),
InsertBefore);
new StoreInst(outputs[i], GEP, InsertBefore);
+ ++aggIdx;
// Since there should be only one struct argument aggregating
- // all the output values, we shouldn't increment OAI, which always
- // points to the struct argument, in this case.
+ // all the output values, we shouldn't increment AggOutputArgBegin, which
+ // always points to the struct argument, in this case.
} else {
- new StoreInst(outputs[i], &*OAI, InsertBefore);
- ++OAI;
+ assert(ScalarOutputArgBegin != newFunction->arg_end() &&
+ "Number of scalar output arguments should match "
+ "the number of defined values");
+ new StoreInst(outputs[i], &*ScalarOutputArgBegin, InsertBefore);
+ ++ScalarOutputArgBegin;
}
}
@@ -1840,3 +1872,7 @@ bool CodeExtractor::verifyAssumptionCache(const Function &OldFunc,
}
return false;
}
+
+void CodeExtractor::excludeArgFromAggregate(Value *Arg) {
+ ExcludeArgsFromAggregate.insert(Arg);
+}
diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp
index 91630d876fc8..e73287c060ae 100644
--- a/llvm/lib/Transforms/Utils/Evaluator.cpp
+++ b/llvm/lib/Transforms/Utils/Evaluator.cpp
@@ -122,129 +122,114 @@ isSimpleEnoughValueToCommit(Constant *C,
return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, DL);
}
-/// Return true if this constant is simple enough for us to understand. In
-/// particular, if it is a cast to anything other than from one pointer type to
-/// another pointer type, we punt. We basically just support direct accesses to
-/// globals and GEP's of globals. This should be kept up to date with
-/// CommitValueTo.
-static bool isSimpleEnoughPointerToCommit(Constant *C, const DataLayout &DL) {
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
- // Do not allow weak/*_odr/linkonce linkage or external globals.
- return GV->hasUniqueInitializer();
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
- // Handle a constantexpr gep.
- if (CE->getOpcode() == Instruction::GetElementPtr &&
- isa<GlobalVariable>(CE->getOperand(0)) &&
- cast<GEPOperator>(CE)->isInBounds()) {
- GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
- // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
- // external globals.
- if (!GV->hasUniqueInitializer())
- return false;
+void Evaluator::MutableValue::clear() {
+ if (auto *Agg = Val.dyn_cast<MutableAggregate *>())
+ delete Agg;
+ Val = nullptr;
+}
- // The first index must be zero.
- ConstantInt *CI = dyn_cast<ConstantInt>(*std::next(CE->op_begin()));
- if (!CI || !CI->isZero()) return false;
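+// Walk nested mutable aggregates down to the element covering Offset and
+// constant-fold a load of Ty from it. Returns null if no element covers the
+// offset or the load is wider than the enclosing aggregate.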
+Constant *Evaluator::MutableValue::read(Type *Ty, APInt Offset,
+ const DataLayout &DL) const {
+ TypeSize TySize = DL.getTypeStoreSize(Ty);
+ const MutableValue *V = this;
+ while (const auto *Agg = V->Val.dyn_cast<MutableAggregate *>()) {
+ Type *AggTy = Agg->Ty;
+ Optional<APInt> Index = DL.getGEPIndexForOffset(AggTy, Offset);
+ if (!Index || Index->uge(Agg->Elements.size()) ||
+ !TypeSize::isKnownLE(TySize, DL.getTypeStoreSize(AggTy)))
+ return nullptr;
+
+ V = &Agg->Elements[Index->getZExtValue()];
+ }
- // The remaining indices must be compile-time known integers within the
- // notional bounds of the corresponding static array types.
- if (!CE->isGEPWithNoNotionalOverIndexing())
- return false;
+ return ConstantFoldLoadFromConst(V->Val.get<Constant *>(), Ty, Offset, DL);
+}
- return ConstantFoldLoadThroughGEPConstantExpr(
- GV->getInitializer(), CE,
- cast<GEPOperator>(CE)->getResultElementType(), DL);
- } else if (CE->getOpcode() == Instruction::BitCast &&
- isa<GlobalVariable>(CE->getOperand(0))) {
- // A constantexpr bitcast from a pointer to another pointer is a no-op,
- // and we know how to evaluate it by moving the bitcast from the pointer
- // operand to the value operand.
- // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
- // external globals.
- return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer();
- }
- }
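+// Turn a constant vector, array or struct into a MutableAggregate of its
+// elements so that individual elements can be overwritten independently.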
+bool Evaluator::MutableValue::makeMutable() {
+ Constant *C = Val.get<Constant *>();
+ Type *Ty = C->getType();
+ unsigned NumElements;
+ if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
+ NumElements = VT->getNumElements();
+ } else if (auto *AT = dyn_cast<ArrayType>(Ty))
+ NumElements = AT->getNumElements();
+ else if (auto *ST = dyn_cast<StructType>(Ty))
+ NumElements = ST->getNumElements();
+ else
+ return false;
- return false;
+ MutableAggregate *MA = new MutableAggregate(Ty);
+ MA->Elements.reserve(NumElements);
+ for (unsigned I = 0; I < NumElements; ++I)
+ MA->Elements.push_back(C->getAggregateElement(I));
+ Val = MA;
+ return true;
}
-/// Apply \p TryLoad to Ptr. If this returns \p nullptr, introspect the
-/// pointer's type and walk down through the initial elements to obtain
-/// additional pointers to try. Returns the first non-null return value from
-/// \p TryLoad, or \p nullptr if the type can't be introspected further.
-static Constant *
-evaluateBitcastFromPtr(Constant *Ptr, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- std::function<Constant *(Constant *)> TryLoad) {
- Constant *Val;
- while (!(Val = TryLoad(Ptr))) {
- // If Ty is a non-opaque struct, we can convert the pointer to the struct
- // into a pointer to its first member.
- // FIXME: This could be extended to support arrays as well.
- Type *Ty = cast<PointerType>(Ptr->getType())->getElementType();
- if (!isa<StructType>(Ty) || cast<StructType>(Ty)->isOpaque())
- break;
-
- IntegerType *IdxTy = IntegerType::get(Ty->getContext(), 32);
- Constant *IdxZero = ConstantInt::get(IdxTy, 0, false);
- Constant *const IdxList[] = {IdxZero, IdxZero};
-
- Ptr = ConstantExpr::getGetElementPtr(Ty, Ptr, IdxList);
- Ptr = ConstantFoldConstant(Ptr, DL, TLI);
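+// Store V at Offset, converting enclosing constants into mutable aggregates
+// as needed and casting V when its type differs from the target slot's type.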
+bool Evaluator::MutableValue::write(Constant *V, APInt Offset,
+ const DataLayout &DL) {
+ Type *Ty = V->getType();
+ TypeSize TySize = DL.getTypeStoreSize(Ty);
+ MutableValue *MV = this;
+ while (Offset != 0 ||
+ !CastInst::isBitOrNoopPointerCastable(Ty, MV->getType(), DL)) {
+ if (MV->Val.is<Constant *>() && !MV->makeMutable())
+ return false;
+
+ MutableAggregate *Agg = MV->Val.get<MutableAggregate *>();
+ Type *AggTy = Agg->Ty;
+ Optional<APInt> Index = DL.getGEPIndexForOffset(AggTy, Offset);
+ if (!Index || Index->uge(Agg->Elements.size()) ||
+ !TypeSize::isKnownLE(TySize, DL.getTypeStoreSize(AggTy)))
+ return false;
+
+ MV = &Agg->Elements[Index->getZExtValue()];
}
- return Val;
+
+ Type *MVType = MV->getType();
+ MV->clear();
+ if (Ty->isIntegerTy() && MVType->isPointerTy())
+ MV->Val = ConstantExpr::getIntToPtr(V, MVType);
+ else if (Ty->isPointerTy() && MVType->isIntegerTy())
+ MV->Val = ConstantExpr::getPtrToInt(V, MVType);
+ else if (Ty != MVType)
+ MV->Val = ConstantExpr::getBitCast(V, MVType);
+ else
+ MV->Val = V;
+ return true;
}
-static Constant *getInitializer(Constant *C) {
- auto *GV = dyn_cast<GlobalVariable>(C);
- return GV && GV->hasDefinitiveInitializer() ? GV->getInitializer() : nullptr;
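+// Fold the (possibly mutated) elements back into a single constant of the
+// original aggregate type.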
+Constant *Evaluator::MutableAggregate::toConstant() const {
+ SmallVector<Constant *, 32> Consts;
+ for (const MutableValue &MV : Elements)
+ Consts.push_back(MV.toConstant());
+
+ if (auto *ST = dyn_cast<StructType>(Ty))
+ return ConstantStruct::get(ST, Consts);
+ if (auto *AT = dyn_cast<ArrayType>(Ty))
+ return ConstantArray::get(AT, Consts);
+ assert(isa<FixedVectorType>(Ty) && "Must be vector");
+ return ConstantVector::get(Consts);
}
/// Return the value that would be computed by a load from P after the stores
/// reflected by 'memory' have been performed. If we can't decide, return null.
Constant *Evaluator::ComputeLoadResult(Constant *P, Type *Ty) {
- // If this memory location has been recently stored, use the stored value: it
- // is the most up-to-date.
- auto TryFindMemLoc = [this](Constant *Ptr) {
- return MutatedMemory.lookup(Ptr);
- };
-
- if (Constant *Val = TryFindMemLoc(P))
- return Val;
-
- // Access it.
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
- if (GV->hasDefinitiveInitializer())
- return GV->getInitializer();
+ APInt Offset(DL.getIndexTypeSizeInBits(P->getType()), 0);
+ P = cast<Constant>(P->stripAndAccumulateConstantOffsets(
+ DL, Offset, /* AllowNonInbounds */ true));
+ Offset = Offset.sextOrTrunc(DL.getIndexTypeSizeInBits(P->getType()));
+ auto *GV = dyn_cast<GlobalVariable>(P);
+ if (!GV)
return nullptr;
- }
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P)) {
- switch (CE->getOpcode()) {
- // Handle a constantexpr getelementptr.
- case Instruction::GetElementPtr:
- if (auto *I = getInitializer(CE->getOperand(0)))
- return ConstantFoldLoadThroughGEPConstantExpr(I, CE, Ty, DL);
- break;
- // Handle a constantexpr bitcast.
- case Instruction::BitCast:
- // We're evaluating a load through a pointer that was bitcast to a
- // different type. See if the "from" pointer has recently been stored.
- // If it hasn't, we may still be able to find a stored pointer by
- // introspecting the type.
- Constant *Val =
- evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, TryFindMemLoc);
- if (!Val)
- Val = getInitializer(CE->getOperand(0));
- if (Val)
- return ConstantFoldLoadThroughBitcast(
- Val, P->getType()->getPointerElementType(), DL);
- break;
- }
- }
+ auto It = MutatedMemory.find(GV);
+ if (It != MutatedMemory.end())
+ return It->second.read(Ty, Offset, DL);
- return nullptr; // don't know how to evaluate.
+ if (!GV->hasDefinitiveInitializer())
+ return nullptr;
+ return ConstantFoldLoadFromConst(GV->getInitializer(), Ty, Offset, DL);
}
static Function *getFunction(Constant *C) {
@@ -260,17 +245,10 @@ static Function *getFunction(Constant *C) {
Function *
Evaluator::getCalleeWithFormalArgs(CallBase &CB,
SmallVectorImpl<Constant *> &Formals) {
- auto *V = CB.getCalledOperand();
+ auto *V = CB.getCalledOperand()->stripPointerCasts();
if (auto *Fn = getFunction(getVal(V)))
return getFormalParams(CB, Fn, Formals) ? Fn : nullptr;
-
- auto *CE = dyn_cast<ConstantExpr>(V);
- if (!CE || CE->getOpcode() != Instruction::BitCast ||
- !getFormalParams(CB, getFunction(CE->getOperand(0)), Formals))
- return nullptr;
-
- return dyn_cast<Function>(
- ConstantFoldLoadThroughBitcast(CE, CE->getOperand(0)->getType(), DL));
+ return nullptr;
}
bool Evaluator::getFormalParams(CallBase &CB, Function *F,
@@ -299,17 +277,13 @@ bool Evaluator::getFormalParams(CallBase &CB, Function *F,
/// If the call expression contains a bitcast, we may need to cast the
/// evaluated return value to the type of the call expression.
-Constant *Evaluator::castCallResultIfNeeded(Value *CallExpr, Constant *RV) {
- ConstantExpr *CE = dyn_cast<ConstantExpr>(CallExpr);
- if (!RV || !CE || CE->getOpcode() != Instruction::BitCast)
+Constant *Evaluator::castCallResultIfNeeded(Type *ReturnType, Constant *RV) {
+ if (!RV || RV->getType() == ReturnType)
return RV;
- if (auto *FT =
- dyn_cast<FunctionType>(CE->getType()->getPointerElementType())) {
- RV = ConstantFoldLoadThroughBitcast(RV, FT->getReturnType(), DL);
- if (!RV)
- LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n");
- }
+ RV = ConstantFoldLoadThroughBitcast(RV, ReturnType, DL);
+ if (!RV)
+ LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n");
return RV;
}
@@ -337,68 +311,30 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB,
Ptr = FoldedPtr;
LLVM_DEBUG(dbgs() << "; To: " << *Ptr << "\n");
}
- // Conservatively, avoid aggregate types. This is because we don't
- // want to worry about them partially overlapping other stores.
- if (!SI->getValueOperand()->getType()->isSingleValueType() ||
- !isSimpleEnoughPointerToCommit(Ptr, DL)) {
- // If this is too complex for us to commit, reject it.
- LLVM_DEBUG(
- dbgs() << "Pointer is too complex for us to evaluate store.");
+
+ APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+ Ptr = cast<Constant>(Ptr->stripAndAccumulateConstantOffsets(
+ DL, Offset, /* AllowNonInbounds */ true));
+ Offset = Offset.sextOrTrunc(DL.getIndexTypeSizeInBits(Ptr->getType()));
+ auto *GV = dyn_cast<GlobalVariable>(Ptr);
+ if (!GV || !GV->hasUniqueInitializer()) {
+ LLVM_DEBUG(dbgs() << "Store is not to global with unique initializer: "
+ << *Ptr << "\n");
return false;
}
- Constant *Val = getVal(SI->getOperand(0));
-
// If this might be too difficult for the backend to handle (e.g. the addr
// of one global variable divided by another) then we can't commit it.
+ Constant *Val = getVal(SI->getOperand(0));
if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) {
LLVM_DEBUG(dbgs() << "Store value is too complex to evaluate store. "
<< *Val << "\n");
return false;
}
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
- if (CE->getOpcode() == Instruction::BitCast) {
- LLVM_DEBUG(dbgs()
- << "Attempting to resolve bitcast on constant ptr.\n");
- // If we're evaluating a store through a bitcast, then we need
- // to pull the bitcast off the pointer type and push it onto the
- // stored value. In order to push the bitcast onto the stored value,
- // a bitcast from the pointer's element type to Val's type must be
- // legal. If it's not, we can try introspecting the type to find a
- // legal conversion.
-
- auto TryCastValTy = [&](Constant *P) -> Constant * {
- // The conversion is illegal if the store is wider than the
- // pointee proposed by `evaluateBitcastFromPtr`, since that would
- // drop stores to other struct elements when the caller attempts to
- // look through a struct's 0th element.
- Type *NewTy = cast<PointerType>(P->getType())->getElementType();
- Type *STy = Val->getType();
- if (DL.getTypeSizeInBits(NewTy) < DL.getTypeSizeInBits(STy))
- return nullptr;
-
- if (Constant *FV = ConstantFoldLoadThroughBitcast(Val, NewTy, DL)) {
- Ptr = P;
- return FV;
- }
- return nullptr;
- };
-
- Constant *NewVal =
- evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, TryCastValTy);
- if (!NewVal) {
- LLVM_DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
- "evaluate.\n");
- return false;
- }
-
- Val = NewVal;
- LLVM_DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
- }
- }
-
- MutatedMemory[Ptr] = Val;
+ auto Res = MutatedMemory.try_emplace(GV, GV->getInitializer());
+ if (!Res.first->second.write(Val, Offset, DL))
+ return false;
} else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
InstResult = ConstantExpr::get(BO->getOpcode(),
getVal(BO->getOperand(0)),
@@ -593,7 +529,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB,
if (Callee->isDeclaration()) {
// If this is a function we can constant fold, do it.
if (Constant *C = ConstantFoldCall(&CB, Callee, Formals, TLI)) {
- InstResult = castCallResultIfNeeded(CB.getCalledOperand(), C);
+ InstResult = castCallResultIfNeeded(CB.getType(), C);
if (!InstResult)
return false;
LLVM_DEBUG(dbgs() << "Constant folded function call. Result: "
@@ -617,7 +553,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB,
return false;
}
ValueStack.pop_back();
- InstResult = castCallResultIfNeeded(CB.getCalledOperand(), RetVal);
+ InstResult = castCallResultIfNeeded(CB.getType(), RetVal);
if (RetVal && !InstResult)
return false;
diff --git a/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
index 9bfc73e4ba6c..f8ec8c6ad426 100644
--- a/llvm/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
@@ -66,8 +66,6 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
for (const Use &U : V->uses()) {
const User *UR = U.getUser();
if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(UR)) {
- GS.HasNonInstructionUser = true;
-
// If the result of the constantexpr isn't pointer type, then we won't
// know to expect it in various places. Just reject early.
if (!isa<PointerType>(CE->getType()))
@@ -105,9 +103,7 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
// value, not an aggregate), keep more specific information about
// stores.
if (GS.StoredType != GlobalStatus::Stored) {
- const Value *Ptr = SI->getPointerOperand();
- if (isa<ConstantExpr>(Ptr))
- Ptr = Ptr->stripPointerCasts();
+ const Value *Ptr = SI->getPointerOperand()->stripPointerCasts();
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
Value *StoredVal = SI->getOperand(0);
@@ -174,12 +170,10 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
return true; // Any other non-load instruction might take address!
}
} else if (const Constant *C = dyn_cast<Constant>(UR)) {
- GS.HasNonInstructionUser = true;
// We might have a dead and dangling constant hanging off of here.
if (!isSafeToDestroyConstant(C))
return true;
} else {
- GS.HasNonInstructionUser = true;
// Otherwise must be some other user.
return true;
}
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 997667810580..c9f872f5b7e1 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1185,10 +1185,10 @@ static bool MayContainThrowingOrExitingCall(Instruction *Begin,
static AttrBuilder IdentifyValidAttributes(CallBase &CB) {
- AttrBuilder AB(CB.getAttributes(), AttributeList::ReturnIndex);
- if (AB.empty())
+ AttrBuilder AB(CB.getContext(), CB.getAttributes().getRetAttrs());
+ if (!AB.hasAttributes())
return AB;
- AttrBuilder Valid;
+ AttrBuilder Valid(CB.getContext());
// Only allow these white listed attributes to be propagated back to the
// callee. This is because other attributes may only be valid on the call
// itself, i.e. attributes such as signext and zeroext.
@@ -1208,7 +1208,7 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) {
return;
AttrBuilder Valid = IdentifyValidAttributes(CB);
- if (Valid.empty())
+ if (!Valid.hasAttributes())
return;
auto *CalledFunction = CB.getCalledFunction();
auto &Context = CalledFunction->getContext();
@@ -1667,7 +1667,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind,
Module *Mod = CB.getModule();
assert(objcarc::isRetainOrClaimRV(RVCallKind) && "unexpected ARC function");
bool IsRetainRV = RVCallKind == objcarc::ARCInstKind::RetainRV,
- IsClaimRV = !IsRetainRV;
+ IsUnsafeClaimRV = !IsRetainRV;
for (auto *RI : Returns) {
Value *RetOpnd = objcarc::GetRCIdentityRoot(RI->getOperand(0));
@@ -1694,7 +1694,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind,
// and erase the autoreleaseRV call.
// - If retainRV is attached to the call, just erase the autoreleaseRV
// call.
- if (IsClaimRV) {
+ if (IsUnsafeClaimRV) {
Builder.SetInsertPoint(II);
Function *IFn =
Intrinsic::getDeclaration(Mod, Intrinsic::objc_release);
diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp
index 668626fef933..72b864dc3e48 100644
--- a/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -339,8 +339,10 @@ bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
#ifdef EXPENSIVE_CHECKS
// Verify all sub-loops are in LCSSA form already.
- for (Loop *SubLoop: L)
+ for (Loop *SubLoop: L) {
+ (void)SubLoop; // Silence unused variable warning.
assert(SubLoop->isRecursivelyLCSSAForm(DT, *LI) && "Subloop not in LCSSA!");
+ }
#endif
SmallVector<BasicBlock *, 8> ExitBlocks;
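The (void)SubLoop cast above exists only because assert() compiles to nothing under NDEBUG, leaving the loop variable otherwise unused and triggering -Wunused-variable. A minimal standalone illustration of the idiom (checkAllPositive is a hypothetical helper, not LLVM code):

#include <cassert>
#include <vector>

void checkAllPositive(const std::vector<int> &Values) {
  for (int V : Values) {
    (void)V; // with NDEBUG the assert vanishes and V would otherwise be unused
    assert(V > 0 && "expected a positive value");
  }
}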
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index ecad79b68185..9f33d2f82732 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -492,7 +492,7 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
}
}
- if (isAllocLikeFn(I, TLI))
+ if (isAllocationFn(I, TLI) && isAllocRemovable(cast<CallBase>(I), TLI))
return true;
if (CallInst *CI = isFreeCall(I, TLI))
@@ -2189,8 +2189,8 @@ CallInst *llvm::createCallMatchingInvoke(InvokeInst *II) {
return NewCall;
}
-/// changeToCall - Convert the specified invoke into a normal call.
-void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
+/// changeToCall - Convert the specified invoke into a normal call.
+CallInst *llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
CallInst *NewCall = createCallMatchingInvoke(II);
NewCall->takeName(II);
NewCall->insertBefore(II);
@@ -2207,6 +2207,7 @@ void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
II->eraseFromParent();
if (DTU)
DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}});
+ return NewCall;
}
BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
@@ -3147,11 +3148,6 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
if (!ITy->isIntOrIntVectorTy() || ITy->getScalarSizeInBits() > 128)
return false; // Can't do integer/elements > 128 bits.
- Type *DemandedTy = ITy;
- if (I->hasOneUse())
- if (auto *Trunc = dyn_cast<TruncInst>(I->user_back()))
- DemandedTy = Trunc->getType();
-
// Try to find all the pieces corresponding to the bswap.
bool FoundRoot = false;
std::map<Value *, Optional<BitPart>> BPS;
@@ -3165,6 +3161,7 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
"Illegal bit provenance index");
// If the upper bits are zero, then attempt to perform as a truncated op.
+ Type *DemandedTy = ITy;
if (BitProvenance.back() == BitPart::Unset) {
while (!BitProvenance.empty() && BitProvenance.back() == BitPart::Unset)
BitProvenance = BitProvenance.drop_back();
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index 69fd110dc3c2..92333408aaef 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -359,7 +359,7 @@ static bool violatesLegacyMultiExitLoopCheck(Loop *L) {
// Return the number of iterations we want to peel off.
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
TargetTransformInfo::PeelingPreferences &PP,
- unsigned &TripCount, DominatorTree &DT,
+ unsigned TripCount, DominatorTree &DT,
ScalarEvolution &SE, unsigned Threshold) {
assert(LoopSize > 0 && "Zero loop size is not allowed!");
// Save the PP.PeelCount value set by the target in
@@ -370,7 +370,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
return;
// Only try to peel innermost loops by default.
- // The constraint can be relaxed by the target in TTI.getUnrollingPreferences
+ // The constraint can be relaxed by the target in TTI.getPeelingPreferences
// or by the flag -unroll-allow-loop-nests-peeling.
if (!PP.AllowLoopNestsPeeling && !L->isInnermost())
return;
@@ -407,8 +407,8 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
SmallDenseMap<PHINode *, Optional<unsigned> > IterationsToInvariance;
// Now go through all Phis to calculate their the number of iterations they
// need to become invariants.
- // Start the max computation with the UP.PeelCount value set by the target
- // in TTI.getUnrollingPreferences or by the flag -unroll-peel-count.
+ // Start the max computation with the PP.PeelCount value set by the target
+ // in TTI.getPeelingPreferences or by the flag -unroll-peel-count.
unsigned DesiredPeelCount = TargetPeelCount;
BasicBlock *BackEdge = L->getLoopLatch();
assert(BackEdge && "Loop is not in simplified form?");
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index b0c622b98d5e..9ca1f4f44b97 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -99,6 +99,17 @@ UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
#endif
);
+static cl::opt<bool>
+UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden,
+ cl::desc("Verify loopinfo after unrolling"),
+#ifdef EXPENSIVE_CHECKS
+ cl::init(true)
+#else
+ cl::init(false)
+#endif
+ );
+
+
/// Check if unrolling created a situation where we need to insert phi nodes to
/// preserve LCSSA form.
/// \param Blocks is a vector of basic blocks representing unrolled loop.
@@ -764,6 +775,9 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
// Apply updates to the DomTree.
DT = &DTU.getDomTree();
+ assert(!UnrollVerifyDomtree ||
+ DT->verify(DominatorTree::VerificationLevel::Fast));
+
// At this point, the code is well formed. We now simplify the unrolled loop,
// doing constant propagation and dead code elimination as we go.
simplifyLoopAfterUnroll(L, !CompletelyUnroll && ULO.Count > 1, LI, SE, DT, AC,
@@ -777,6 +791,10 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
if (CompletelyUnroll)
LI->erase(L);
+ // LoopInfo should now be valid, confirm that.
+ if (UnrollVerifyLoopInfo)
+ LI->verify(*DT);
+
// After complete unrolling most of the blocks should be contained in OuterL.
// However, some of them might happen to be out of OuterL (e.g. if they
// precede a loop exit). In this case we might need to insert PHI nodes in
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 93157bd87c34..95db2fe8d310 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -22,6 +22,7 @@
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -1567,7 +1568,9 @@ Value *llvm::addRuntimeChecks(
auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, Exp);
LLVMContext &Ctx = Loc->getContext();
- IRBuilder<> ChkBuilder(Loc);
+ IRBuilder<InstSimplifyFolder> ChkBuilder(Ctx,
+ Loc->getModule()->getDataLayout());
+ ChkBuilder.SetInsertPoint(Loc);
// Our instructions might fold to a constant.
Value *MemoryRuntimeCheck = nullptr;
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index 771b7d25b0f2..f0bf625fa18e 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -15,6 +15,7 @@
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -70,17 +71,14 @@ void LoopVersioning::versionLoop(
"scev.check");
SCEVRuntimeCheck =
Exp.expandCodeForPredicate(&Preds, RuntimeCheckBB->getTerminator());
- auto *CI = dyn_cast<ConstantInt>(SCEVRuntimeCheck);
-
- // Discard the SCEV runtime check if it is always true.
- if (CI && CI->isZero())
- SCEVRuntimeCheck = nullptr;
+ IRBuilder<InstSimplifyFolder> Builder(
+ RuntimeCheckBB->getContext(),
+ InstSimplifyFolder(RuntimeCheckBB->getModule()->getDataLayout()));
if (MemRuntimeCheck && SCEVRuntimeCheck) {
- RuntimeCheck = BinaryOperator::Create(Instruction::Or, MemRuntimeCheck,
- SCEVRuntimeCheck, "lver.safe");
- if (auto *I = dyn_cast<Instruction>(RuntimeCheck))
- I->insertBefore(RuntimeCheckBB->getTerminator());
+ Builder.SetInsertPoint(RuntimeCheckBB->getTerminator());
+ RuntimeCheck =
+ Builder.CreateOr(MemRuntimeCheck, SCEVRuntimeCheck, "lver.safe");
} else
RuntimeCheck = MemRuntimeCheck ? MemRuntimeCheck : SCEVRuntimeCheck;
@@ -109,8 +107,9 @@ void LoopVersioning::versionLoop(
// Insert the conditional branch based on the result of the memchecks.
Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
- BranchInst::Create(NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader(), RuntimeCheck, OrigTerm);
+ Builder.SetInsertPoint(OrigTerm);
+ Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader());
OrigTerm->eraseFromParent();
// The loops merge in the original exit block. This is now dominated by the
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 8dc4702993c3..3d75dd57456d 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -297,7 +297,7 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
Function *F = OrigBB->getParent();
const DataLayout &DL = F->getParent()->getDataLayout();
- Type *EltTy = cast<PointerType>(SrcAddr->getType())->getElementType();
+ Type *EltTy = SrcAddr->getType()->getPointerElementType();
// Create the a comparison of src and dst, based on which we jump to either
// the forward-copy part of the function (if src >= dst) or the backwards-copy
diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
index bb5ff59cba4b..7c9ab7f6ca2c 100644
--- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
@@ -178,66 +178,30 @@ llvm::getOrCreateSanitizerCtorAndInitFunctions(
}
void llvm::filterDeadComdatFunctions(
- Module &M, SmallVectorImpl<Function *> &DeadComdatFunctions) {
- // Build a map from the comdat to the number of entries in that comdat we
- // think are dead. If this fully covers the comdat group, then the entire
- // group is dead. If we find another entry in the comdat group though, we'll
- // have to preserve the whole group.
- SmallDenseMap<Comdat *, int, 16> ComdatEntriesCovered;
+ SmallVectorImpl<Function *> &DeadComdatFunctions) {
+ SmallPtrSet<Function *, 32> MaybeDeadFunctions;
+ SmallPtrSet<Comdat *, 32> MaybeDeadComdats;
for (Function *F : DeadComdatFunctions) {
- Comdat *C = F->getComdat();
- assert(C && "Expected all input GVs to be in a comdat!");
- ComdatEntriesCovered[C] += 1;
+ MaybeDeadFunctions.insert(F);
+ if (Comdat *C = F->getComdat())
+ MaybeDeadComdats.insert(C);
}
- auto CheckComdat = [&](Comdat &C) {
- auto CI = ComdatEntriesCovered.find(&C);
- if (CI == ComdatEntriesCovered.end())
- return;
-
- // If this could have been covered by a dead entry, just subtract one to
- // account for it.
- if (CI->second > 0) {
- CI->second -= 1;
- return;
- }
-
- // If we've already accounted for all the entries that were dead, the
- // entire comdat is alive so remove it from the map.
- ComdatEntriesCovered.erase(CI);
- };
-
- auto CheckAllComdats = [&] {
- for (Function &F : M.functions())
- if (Comdat *C = F.getComdat()) {
- CheckComdat(*C);
- if (ComdatEntriesCovered.empty())
- return;
- }
- for (GlobalVariable &GV : M.globals())
- if (Comdat *C = GV.getComdat()) {
- CheckComdat(*C);
- if (ComdatEntriesCovered.empty())
- return;
- }
- for (GlobalAlias &GA : M.aliases())
- if (Comdat *C = GA.getComdat()) {
- CheckComdat(*C);
- if (ComdatEntriesCovered.empty())
- return;
- }
- };
- CheckAllComdats();
-
- if (ComdatEntriesCovered.empty()) {
- DeadComdatFunctions.clear();
- return;
+ // Find comdats for which all users are dead now.
+ SmallPtrSet<Comdat *, 32> DeadComdats;
+ for (Comdat *C : MaybeDeadComdats) {
+ auto IsUserDead = [&](GlobalObject *GO) {
+ auto *F = dyn_cast<Function>(GO);
+ return F && MaybeDeadFunctions.contains(F);
+ };
+ if (all_of(C->getUsers(), IsUserDead))
+ DeadComdats.insert(C);
}
- // Remove the entries that were not covering.
- erase_if(DeadComdatFunctions, [&](GlobalValue *GV) {
- return ComdatEntriesCovered.find(GV->getComdat()) ==
- ComdatEntriesCovered.end();
+ // Only keep functions which have no comdat or a dead comdat.
+ erase_if(DeadComdatFunctions, [&](Function *F) {
+ Comdat *C = F->getComdat();
+ return C && !DeadComdats.contains(C);
});
}
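The rewritten filterDeadComdatFunctions no longer counts comdat entries against a scan of the whole module; it marks a comdat dead only when every one of its users is itself on the dead list, then keeps a function only if it has no comdat or a dead one. A sketch of that shape with plain STL containers (string ids and the user map stand in for Function/Comdat and are purely illustrative; non-function comdat users simply keep the comdat alive here):

#include <algorithm>
#include <map>
#include <set>
#include <string>
#include <vector>

struct Fn { std::string Name; std::string Comdat; }; // empty Comdat = none

void filterDeadComdatFns(std::vector<Fn> &Dead,
                         const std::map<std::string, std::vector<std::string>>
                             &ComdatUsers) {
  std::set<std::string> MaybeDead;
  for (const Fn &F : Dead)
    MaybeDead.insert(F.Name);

  // A comdat is dead only if every user is one of the maybe-dead functions.
  std::set<std::string> DeadComdats;
  for (const auto &Entry : ComdatUsers)
    if (std::all_of(Entry.second.begin(), Entry.second.end(),
                    [&](const std::string &User) {
                      return MaybeDead.count(User) != 0;
                    }))
      DeadComdats.insert(Entry.first);

  // Keep only functions with no comdat or a dead comdat.
  Dead.erase(std::remove_if(Dead.begin(), Dead.end(),
                            [&](const Fn &F) {
                              return !F.Comdat.empty() &&
                                     DeadComdats.count(F.Comdat) == 0;
                            }),
             Dead.end());
}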
diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
index 2f2dff6b5f0b..961adf2570a7 100644
--- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
+++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/SampleProfileInference.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/Support/Debug.h"
#include <queue>
#include <set>
@@ -144,7 +145,7 @@ public:
/// A cost of decreasing the entry block's count by one.
static constexpr int64_t AuxCostDecEntry = 10;
/// A cost of taking an unlikely jump.
- static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 20;
+ static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 30;
private:
/// Check for existence of an augmenting path with a positive capacity.
@@ -236,7 +237,7 @@ private:
}
}
- /// An node in a flow network.
+ /// A node in a flow network.
struct Node {
/// The cost of the cheapest path from the source to the current node.
int64_t Distance;
@@ -303,13 +304,10 @@ public:
rebalanceUnknownSubgraphs();
}
- /// The probability for the first successor of a unknown subgraph
- static constexpr double UnknownFirstSuccProbability = 0.5;
-
private:
void joinIsolatedComponents() {
// Find blocks that are reachable from the source
- auto Visited = std::vector<bool>(NumBlocks(), false);
+ auto Visited = BitVector(NumBlocks(), false);
findReachable(Func.Entry, Visited);
// Iterate over all non-reachable blocks and adjust their weights
@@ -334,7 +332,7 @@ private:
/// Run BFS from a given block along the jumps with a positive flow and mark
/// all reachable blocks.
- void findReachable(uint64_t Src, std::vector<bool> &Visited) {
+ void findReachable(uint64_t Src, BitVector &Visited) {
if (Visited[Src])
return;
std::queue<uint64_t> Queue;
@@ -452,44 +450,70 @@ private:
uint64_t NumBlocks() const { return Func.Blocks.size(); }
- /// Rebalance unknown subgraphs so as each branch splits with probabilities
- /// UnknownFirstSuccProbability and 1 - UnknownFirstSuccProbability
+ /// Rebalance unknown subgraphs so that the flow is split evenly across the
+ /// outgoing branches of every block of the subgraph. The method iterates over
+ /// blocks with known weight and identifies unknown subgraphs rooted at the
+ /// blocks. Then it verifies if flow rebalancing is feasible and applies it.
void rebalanceUnknownSubgraphs() {
- assert(UnknownFirstSuccProbability >= 0.0 &&
- UnknownFirstSuccProbability <= 1.0 &&
- "the share of the unknown successor should be between 0 and 1");
- // Try to find unknown subgraphs from each non-unknown block
+ // Try to find unknown subgraphs from each block
for (uint64_t I = 0; I < Func.Blocks.size(); I++) {
auto SrcBlock = &Func.Blocks[I];
- // Do not attempt to find unknown successors from a unknown or a
- // zero-flow block
- if (SrcBlock->UnknownWeight || SrcBlock->Flow == 0)
+ // Verify if rebalancing rooted at SrcBlock is feasible
+ if (!canRebalanceAtRoot(SrcBlock))
continue;
- std::vector<FlowBlock *> UnknownSuccs;
+ // Find an unknown subgraph starting at SrcBlock. Along the way,
+ // fill in known destinations and intermediate unknown blocks.
+ std::vector<FlowBlock *> UnknownBlocks;
+ std::vector<FlowBlock *> KnownDstBlocks;
+ findUnknownSubgraph(SrcBlock, KnownDstBlocks, UnknownBlocks);
+
+ // Verify if rebalancing of the subgraph is feasible. If the search is
+ // successful, find the unique destination block (which can be null)
FlowBlock *DstBlock = nullptr;
- // Find a unknown subgraphs starting at block SrcBlock
- if (!findUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs))
+ if (!canRebalanceSubgraph(SrcBlock, KnownDstBlocks, UnknownBlocks,
+ DstBlock))
continue;
- // At the moment, we do not rebalance subgraphs containing cycles among
- // unknown blocks
- if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownSuccs))
+
+ // We cannot rebalance subgraphs containing cycles among unknown blocks
+ if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownBlocks))
continue;
// Rebalance the flow
- rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs);
+ rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownBlocks);
}
}
- /// Find a unknown subgraph starting at block SrcBlock.
- /// If the search is successful, the method sets DstBlock and UnknownSuccs.
- bool findUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *&DstBlock,
- std::vector<FlowBlock *> &UnknownSuccs) {
+ /// Verify if rebalancing rooted at a given block is possible.
+ bool canRebalanceAtRoot(const FlowBlock *SrcBlock) {
+ // Do not attempt to find unknown subgraphs from an unknown or a
+ // zero-flow block
+ if (SrcBlock->UnknownWeight || SrcBlock->Flow == 0)
+ return false;
+
+ // Do not attempt to process subgraphs from a block w/o unknown successors
+ bool HasUnknownSuccs = false;
+ for (auto Jump : SrcBlock->SuccJumps) {
+ if (Func.Blocks[Jump->Target].UnknownWeight) {
+ HasUnknownSuccs = true;
+ break;
+ }
+ }
+ if (!HasUnknownSuccs)
+ return false;
+
+ return true;
+ }
+
+ /// Find an unknown subgraph starting at block SrcBlock. The method sets
+ /// identified destinations, KnownDstBlocks, and intermediate UnknownBlocks.
+ void findUnknownSubgraph(const FlowBlock *SrcBlock,
+ std::vector<FlowBlock *> &KnownDstBlocks,
+ std::vector<FlowBlock *> &UnknownBlocks) {
// Run BFS from SrcBlock and make sure all paths are going through unknown
// blocks and end at a non-unknown DstBlock
- auto Visited = std::vector<bool>(NumBlocks(), false);
+ auto Visited = BitVector(NumBlocks(), false);
std::queue<uint64_t> Queue;
- DstBlock = nullptr;
Queue.push(SrcBlock->Index);
Visited[SrcBlock->Index] = true;
@@ -498,52 +522,105 @@ private:
Queue.pop();
// Process blocks reachable from Block
for (auto Jump : Block.SuccJumps) {
+ // If Jump can be ignored, skip it
+ if (ignoreJump(SrcBlock, nullptr, Jump))
+ continue;
+
uint64_t Dst = Jump->Target;
+ // If Dst has been visited, skip Jump
if (Visited[Dst])
continue;
+ // Process block Dst
Visited[Dst] = true;
if (!Func.Blocks[Dst].UnknownWeight) {
- // If we see non-unique non-unknown block reachable from SrcBlock,
- // stop processing and skip rebalancing
- FlowBlock *CandidateDstBlock = &Func.Blocks[Dst];
- if (DstBlock != nullptr && DstBlock != CandidateDstBlock)
- return false;
- DstBlock = CandidateDstBlock;
+ KnownDstBlocks.push_back(&Func.Blocks[Dst]);
} else {
Queue.push(Dst);
- UnknownSuccs.push_back(&Func.Blocks[Dst]);
+ UnknownBlocks.push_back(&Func.Blocks[Dst]);
}
}
}
+ }
+ /// Verify if rebalancing of the subgraph is feasible. If the checks are
+ /// successful, set the unique destination block, DstBlock (can be null).
+ bool canRebalanceSubgraph(const FlowBlock *SrcBlock,
+ const std::vector<FlowBlock *> &KnownDstBlocks,
+ const std::vector<FlowBlock *> &UnknownBlocks,
+ FlowBlock *&DstBlock) {
// If the list of unknown blocks is empty, we don't need rebalancing
- if (UnknownSuccs.empty())
+ if (UnknownBlocks.empty())
return false;
- // If all reachable nodes from SrcBlock are unknown, skip rebalancing
- if (DstBlock == nullptr)
+
+ // If there are multiple known sinks, we can't rebalance
+ if (KnownDstBlocks.size() > 1)
return false;
- // If any of the unknown blocks is an exit block, skip rebalancing
- for (auto Block : UnknownSuccs) {
- if (Block->isExit())
+ DstBlock = KnownDstBlocks.empty() ? nullptr : KnownDstBlocks.front();
+
+ // Verify sinks of the subgraph
+ for (auto Block : UnknownBlocks) {
+ if (Block->SuccJumps.empty()) {
+ // If there are multiple (known and unknown) sinks, we can't rebalance
+ if (DstBlock != nullptr)
+ return false;
+ continue;
+ }
+ size_t NumIgnoredJumps = 0;
+ for (auto Jump : Block->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ NumIgnoredJumps++;
+ }
+ // If there is a non-sink block in UnknownBlocks with all jumps ignored,
+ // then we can't rebalance
+ if (NumIgnoredJumps == Block->SuccJumps.size())
return false;
}
return true;
}
+ /// Decide whether the Jump is ignored while processing an unknown subgraph
+ /// rooted at basic block SrcBlock with the destination block, DstBlock.
+ bool ignoreJump(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
+ const FlowJump *Jump) {
+ // Ignore unlikely jumps with zero flow
+ if (Jump->IsUnlikely && Jump->Flow == 0)
+ return true;
+
+ auto JumpSource = &Func.Blocks[Jump->Source];
+ auto JumpTarget = &Func.Blocks[Jump->Target];
+
+ // Do not ignore jumps coming into DstBlock
+ if (DstBlock != nullptr && JumpTarget == DstBlock)
+ return false;
+
+ // Ignore jumps out of SrcBlock to known blocks
+ if (!JumpTarget->UnknownWeight && JumpSource == SrcBlock)
+ return true;
+
+ // Ignore jumps to known blocks with zero flow
+ if (!JumpTarget->UnknownWeight && JumpTarget->Flow == 0)
+ return true;
+
+ return false;
+ }
+
/// Verify if the given unknown subgraph is acyclic, and if yes, reorder
- /// UnknownSuccs in the topological order (so that all jumps are "forward").
- bool isAcyclicSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock,
- std::vector<FlowBlock *> &UnknownSuccs) {
+ /// UnknownBlocks in the topological order (so that all jumps are "forward").
+ bool isAcyclicSubgraph(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
+ std::vector<FlowBlock *> &UnknownBlocks) {
// Extract local in-degrees in the considered subgraph
auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0);
- for (auto Jump : SrcBlock->SuccJumps) {
- LocalInDegree[Jump->Target]++;
- }
- for (uint64_t I = 0; I < UnknownSuccs.size(); I++) {
- for (auto Jump : UnknownSuccs[I]->SuccJumps) {
+ auto fillInDegree = [&](const FlowBlock *Block) {
+ for (auto Jump : Block->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ continue;
LocalInDegree[Jump->Target]++;
}
+ };
+ fillInDegree(SrcBlock);
+ for (auto Block : UnknownBlocks) {
+ fillInDegree(Block);
}
// A loop containing SrcBlock
if (LocalInDegree[SrcBlock->Index] > 0)
@@ -553,15 +630,20 @@ private:
std::queue<uint64_t> Queue;
Queue.push(SrcBlock->Index);
while (!Queue.empty()) {
- auto &Block = Func.Blocks[Queue.front()];
+ FlowBlock *Block = &Func.Blocks[Queue.front()];
Queue.pop();
- // Stop propagation once we reach DstBlock
- if (Block.Index == DstBlock->Index)
+ // Stop propagation once we reach DstBlock, if any
+ if (DstBlock != nullptr && Block == DstBlock)
break;
- AcyclicOrder.push_back(&Block);
+ // Keep an acyclic order of unknown blocks
+ if (Block->UnknownWeight && Block != SrcBlock)
+ AcyclicOrder.push_back(Block);
+
// Add to the queue all successors with zero local in-degree
- for (auto Jump : Block.SuccJumps) {
+ for (auto Jump : Block->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ continue;
uint64_t Dst = Jump->Target;
LocalInDegree[Dst]--;
if (LocalInDegree[Dst] == 0) {
@@ -572,42 +654,69 @@ private:
// If there is a cycle in the subgraph, AcyclicOrder contains only a subset
// of all blocks
- if (UnknownSuccs.size() + 1 != AcyclicOrder.size())
+ if (UnknownBlocks.size() != AcyclicOrder.size())
return false;
- UnknownSuccs = AcyclicOrder;
+ UnknownBlocks = AcyclicOrder;
return true;
}
- /// Rebalance a given subgraph.
- void rebalanceUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock,
- std::vector<FlowBlock *> &UnknownSuccs) {
+ /// Rebalance a given subgraph rooted at SrcBlock, ending at DstBlock and
+ /// having UnknownBlocks as intermediate blocks.
+ void rebalanceUnknownSubgraph(const FlowBlock *SrcBlock,
+ const FlowBlock *DstBlock,
+ const std::vector<FlowBlock *> &UnknownBlocks) {
assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph");
- assert(UnknownSuccs.front() == SrcBlock && "incorrect order of unknowns");
- for (auto Block : UnknownSuccs) {
+ // Distribute flow from the source block
+ uint64_t BlockFlow = 0;
+ // SrcBlock's flow is the sum of outgoing flows along non-ignored jumps
+ for (auto Jump : SrcBlock->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ continue;
+ BlockFlow += Jump->Flow;
+ }
+ rebalanceBlock(SrcBlock, DstBlock, SrcBlock, BlockFlow);
+
+ // Distribute flow from the remaining blocks
+ for (auto Block : UnknownBlocks) {
+ assert(Block->UnknownWeight && "incorrect unknown subgraph");
+ uint64_t BlockFlow = 0;
// Block's flow is the sum of incoming flows
- uint64_t TotalFlow = 0;
- if (Block == SrcBlock) {
- TotalFlow = Block->Flow;
- } else {
- for (auto Jump : Block->PredJumps) {
- TotalFlow += Jump->Flow;
- }
- Block->Flow = TotalFlow;
+ for (auto Jump : Block->PredJumps) {
+ BlockFlow += Jump->Flow;
}
+ Block->Flow = BlockFlow;
+ rebalanceBlock(SrcBlock, DstBlock, Block, BlockFlow);
+ }
+ }
- // Process all successor jumps and update corresponding flow values
- for (uint64_t I = 0; I < Block->SuccJumps.size(); I++) {
- auto Jump = Block->SuccJumps[I];
- if (I + 1 == Block->SuccJumps.size()) {
- Jump->Flow = TotalFlow;
- continue;
- }
- uint64_t Flow = uint64_t(TotalFlow * UnknownFirstSuccProbability);
- Jump->Flow = Flow;
- TotalFlow -= Flow;
- }
+ /// Redistribute flow for a block in a subgraph rooted at SrcBlock,
+ /// and ending at DstBlock.
+ void rebalanceBlock(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
+ const FlowBlock *Block, uint64_t BlockFlow) {
+ // Process all successor jumps and update corresponding flow values
+ size_t BlockDegree = 0;
+ for (auto Jump : Block->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ continue;
+ BlockDegree++;
+ }
+ // If all successor jumps of the block are ignored, skip it
+ if (DstBlock == nullptr && BlockDegree == 0)
+ return;
+ assert(BlockDegree > 0 && "all outgoing jumps are ignored");
+
+ // Each of the Block's successors gets the following amount of flow.
+ // Rounding the value up so that all flow is propagated
+ uint64_t SuccFlow = (BlockFlow + BlockDegree - 1) / BlockDegree;
+ for (auto Jump : Block->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ continue;
+ uint64_t Flow = std::min(SuccFlow, BlockFlow);
+ Jump->Flow = Flow;
+ BlockFlow -= Flow;
}
+ assert(BlockFlow == 0 && "not all flow is propagated");
}
/// A constant indicating an arbitrary exit block of a function.
@@ -799,7 +908,7 @@ void verifyWeights(const FlowFunction &Func) {
// Run BFS from the source along edges with positive flow
std::queue<uint64_t> Queue;
- auto Visited = std::vector<bool>(NumBlocks, false);
+ auto Visited = BitVector(NumBlocks, false);
Queue.push(Func.Entry);
Visited[Func.Entry] = true;
while (!Queue.empty()) {
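The distinctive piece of the new rebalancing is rebalanceBlock's even split: a block's flow is divided over its non-ignored successor jumps with a ceiling division, so no unit of flow is lost to integer truncation and the final jumps absorb whatever remains. A self-contained model (the function name is made up for illustration):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<uint64_t> splitFlowEvenly(uint64_t BlockFlow, size_t Degree) {
  assert(Degree > 0 && "all outgoing jumps are ignored");
  uint64_t SuccFlow = (BlockFlow + Degree - 1) / Degree; // round up
  std::vector<uint64_t> JumpFlows;
  while (JumpFlows.size() < Degree) {
    uint64_t Flow = std::min(SuccFlow, BlockFlow);
    JumpFlows.push_back(Flow);
    BlockFlow -= Flow;
  }
  assert(BlockFlow == 0 && "not all flow is propagated");
  return JumpFlows;
}

// splitFlowEvenly(10, 3) yields {4, 4, 2}: two ceiling shares, then the remainder.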
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index c840ee85795f..5363a851fc27 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -173,7 +173,7 @@ Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) {
auto *PtrTy = cast<PointerType>(Ty);
if (DL.isNonIntegralPointerType(PtrTy)) {
auto *Int8PtrTy = Builder.getInt8PtrTy(PtrTy->getAddressSpace());
- assert(DL.getTypeAllocSize(Int8PtrTy->getElementType()) == 1 &&
+ assert(DL.getTypeAllocSize(Builder.getInt8Ty()) == 1 &&
"alloc size of i8 must by 1 byte for the GEP to be correct");
auto *GEP = Builder.CreateGEP(
Builder.getInt8Ty(), Constant::getNullValue(Int8PtrTy), V, "uglygep");
@@ -471,7 +471,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
// indexes into the array implied by the pointer operand; the rest of
// the indices index into the element or field type selected by the
// preceding index.
- Type *ElTy = PTy->getElementType();
+ Type *ElTy = PTy->getNonOpaquePointerElementType();
for (;;) {
// If the scale size is not 0, attempt to factor out a scale for
// array indexing.
@@ -640,8 +640,8 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
Value *Casted = V;
if (V->getType() != PTy)
Casted = InsertNoopCastOfTo(Casted, PTy);
- Value *GEP = Builder.CreateGEP(PTy->getElementType(), Casted, GepIndices,
- "scevgep");
+ Value *GEP = Builder.CreateGEP(PTy->getNonOpaquePointerElementType(),
+ Casted, GepIndices, "scevgep");
Ops.push_back(SE.getUnknown(GEP));
}
@@ -1671,7 +1671,7 @@ Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) {
return Builder.CreateSExt(V, Ty);
}
-Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
+Value *SCEVExpander::expandSMaxExpr(const SCEVNAryExpr *S) {
Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
Type *Ty = LHS->getType();
for (int i = S->getNumOperands()-2; i >= 0; --i) {
@@ -1700,7 +1700,7 @@ Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
return LHS;
}
-Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
+Value *SCEVExpander::expandUMaxExpr(const SCEVNAryExpr *S) {
Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
Type *Ty = LHS->getType();
for (int i = S->getNumOperands()-2; i >= 0; --i) {
@@ -1729,7 +1729,7 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
return LHS;
}
-Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) {
+Value *SCEVExpander::expandSMinExpr(const SCEVNAryExpr *S) {
Value *LHS = expand(S->getOperand(S->getNumOperands() - 1));
Type *Ty = LHS->getType();
for (int i = S->getNumOperands() - 2; i >= 0; --i) {
@@ -1758,7 +1758,7 @@ Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) {
return LHS;
}
-Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) {
+Value *SCEVExpander::expandUMinExpr(const SCEVNAryExpr *S) {
Value *LHS = expand(S->getOperand(S->getNumOperands() - 1));
Type *Ty = LHS->getType();
for (int i = S->getNumOperands() - 2; i >= 0; --i) {
@@ -1787,6 +1787,40 @@ Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) {
return LHS;
}
+Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
+ return expandSMaxExpr(S);
+}
+
+Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
+ return expandUMaxExpr(S);
+}
+
+Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) {
+ return expandSMinExpr(S);
+}
+
+Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) {
+ return expandUMinExpr(S);
+}
+
+Value *SCEVExpander::visitSequentialUMinExpr(const SCEVSequentialUMinExpr *S) {
+ SmallVector<Value *> Ops;
+ for (const SCEV *Op : S->operands())
+ Ops.emplace_back(expand(Op));
+
+ Value *SaturationPoint =
+ MinMaxIntrinsic::getSaturationPoint(Intrinsic::umin, S->getType());
+
+ SmallVector<Value *> OpIsZero;
+ for (Value *Op : ArrayRef<Value *>(Ops).drop_back())
+ OpIsZero.emplace_back(Builder.CreateICmpEQ(Op, SaturationPoint));
+
+ Value *AnyOpIsZero = Builder.CreateLogicalOr(OpIsZero);
+
+ Value *NaiveUMin = expandUMinExpr(S);
+ return Builder.CreateSelect(AnyOpIsZero, SaturationPoint, NaiveUMin);
+}
+
Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty,
Instruction *IP, bool Root) {
setInsertPoint(IP);
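The new visitSequentialUMinExpr guards the plain umin reduction with a check against umin's saturation point (0): umin_seq conceptually evaluates operands left to right and stops at the first zero, so later operands must not be able to poison the result. A scalar model of both the semantics and the shape of the expansion emitted above (helper names are illustrative, and at least two operands are assumed, as the SCEV node guarantees):

#include <algorithm>
#include <cstdint>
#include <vector>

// Sequential semantics: stop as soon as an operand hits the saturation point.
uint64_t uminSeq(const std::vector<uint64_t> &Ops) {
  uint64_t Result = UINT64_MAX;
  for (uint64_t Op : Ops) {
    Result = std::min(Result, Op);
    if (Op == 0)
      return 0; // later operands are never inspected
  }
  return Result;
}

// Shape of the expansion: "any leading operand is zero ? 0 : naive umin".
uint64_t uminSeqExpanded(const std::vector<uint64_t> &Ops) {
  bool AnyLeadingZero = std::any_of(Ops.begin(), Ops.end() - 1,
                                    [](uint64_t V) { return V == 0; });
  uint64_t Naive = *std::min_element(Ops.begin(), Ops.end());
  return AnyLeadingZero ? 0 : Naive;
}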
@@ -1809,8 +1843,8 @@ Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) {
// instruction.
Instruction *Tmp;
if (Inst->getType()->isIntegerTy())
- Tmp =
- cast<Instruction>(Builder.CreateAdd(Inst, Inst, "tmp.lcssa.user"));
+ Tmp = cast<Instruction>(Builder.CreateIntToPtr(
+ Inst, Inst->getType()->getPointerTo(), "tmp.lcssa.user"));
else {
assert(Inst->getType()->isPointerTy());
Tmp = cast<Instruction>(Builder.CreatePtrToInt(
@@ -1947,22 +1981,14 @@ Value *SCEVExpander::expand(const SCEV *S) {
if (VO.second) {
if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) {
- Type *Ety = Vty->getPointerElementType();
int64_t Offset = VO.second->getSExtValue();
- int64_t ESize = SE.getTypeSizeInBits(Ety);
- if ((Offset * 8) % ESize == 0) {
- ConstantInt *Idx =
- ConstantInt::getSigned(VO.second->getType(), -(Offset * 8) / ESize);
- V = Builder.CreateGEP(Ety, V, Idx, "scevgep");
- } else {
- ConstantInt *Idx =
- ConstantInt::getSigned(VO.second->getType(), -Offset);
- unsigned AS = Vty->getAddressSpace();
- V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS));
- V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx,
- "uglygep");
- V = Builder.CreateBitCast(V, Vty);
- }
+ ConstantInt *Idx =
+ ConstantInt::getSigned(VO.second->getType(), -Offset);
+ unsigned AS = Vty->getAddressSpace();
+ V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS));
+ V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx,
+ "uglygep");
+ V = Builder.CreateBitCast(V, Vty);
} else {
V = Builder.CreateSub(V, VO.second);
}
@@ -2271,10 +2297,27 @@ template<typename T> static InstructionCost costAndCollectOperands(
case scSMaxExpr:
case scUMaxExpr:
case scSMinExpr:
- case scUMinExpr: {
+ case scUMinExpr:
+ case scSequentialUMinExpr: {
// FIXME: should this ask the cost for Intrinsic's?
+ // The reduction tree.
Cost += CmpSelCost(Instruction::ICmp, S->getNumOperands() - 1, 0, 1);
Cost += CmpSelCost(Instruction::Select, S->getNumOperands() - 1, 0, 2);
+ switch (S->getSCEVType()) {
+ case scSequentialUMinExpr: {
+ // The safety net against poison.
+ // FIXME: this is broken.
+ Cost += CmpSelCost(Instruction::ICmp, S->getNumOperands() - 1, 0, 0);
+ Cost += ArithCost(Instruction::Or,
+ S->getNumOperands() > 2 ? S->getNumOperands() - 2 : 0);
+ Cost += CmpSelCost(Instruction::Select, 1, 0, 1);
+ break;
+ }
+ default:
+ assert(!isa<SCEVSequentialMinMaxExpr>(S) &&
+ "Unhandled SCEV expression type?");
+ break;
+ }
break;
}
case scAddRecExpr: {
@@ -2362,7 +2405,7 @@ bool SCEVExpander::isHighCostExpansionHelper(
case scConstant: {
// Only evalulate the costs of constants when optimizing for size.
if (CostKind != TargetTransformInfo::TCK_CodeSize)
- return 0;
+ return false;
const APInt &Imm = cast<SCEVConstant>(S)->getAPInt();
Type *Ty = S->getType();
Cost += TTI.getIntImmCostInst(
@@ -2399,7 +2442,8 @@ bool SCEVExpander::isHighCostExpansionHelper(
case scUMaxExpr:
case scSMaxExpr:
case scUMinExpr:
- case scSMinExpr: {
+ case scSMinExpr:
+ case scSequentialUMinExpr: {
assert(cast<SCEVNAryExpr>(S)->getNumOperands() > 1 &&
"Nary expr should have more than 1 operand.");
// The simple nary expr will require one less op (or pair of ops)
@@ -2490,49 +2534,73 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
Value *StepCompare = Builder.CreateICmp(ICmpInst::ICMP_SLT, StepValue, Zero);
Value *AbsStep = Builder.CreateSelect(StepCompare, NegStepValue, StepValue);
- // Get the backedge taken count and truncate or extended to the AR type.
- Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty);
-
// Compute |Step| * Backedge
- Value *MulV, *OfMul;
- if (Step->isOne()) {
- // Special-case Step of one. Potentially-costly `umul_with_overflow` isn't
- // needed, there is never an overflow, so to avoid artificially inflating
- // the cost of the check, directly emit the optimized IR.
- MulV = TruncTripCount;
- OfMul = ConstantInt::getFalse(MulV->getContext());
- } else {
- auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
- Intrinsic::umul_with_overflow, Ty);
- CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
- MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
- OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
- }
-
// Compute:
- // Start + |Step| * Backedge < Start
- // Start - |Step| * Backedge > Start
- Value *Add = nullptr, *Sub = nullptr;
- if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARTy)) {
- StartValue = InsertNoopCastOfTo(
- StartValue, Builder.getInt8PtrTy(ARPtrTy->getAddressSpace()));
- Value *NegMulV = Builder.CreateNeg(MulV);
- Add = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, MulV);
- Sub = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, NegMulV);
- } else {
- Add = Builder.CreateAdd(StartValue, MulV);
- Sub = Builder.CreateSub(StartValue, MulV);
- }
-
- Value *EndCompareGT = Builder.CreateICmp(
- Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue);
+ // 1. Start + |Step| * Backedge < Start
+ // 2. Start - |Step| * Backedge > Start
+ //
+ // And select either 1. or 2. depending on whether step is positive or
+ // negative. If Step is known to be positive or negative, only create
+ // either 1. or 2.
+ auto ComputeEndCheck = [&]() -> Value * {
+ // Checking <u 0 is always false.
+ if (!Signed && Start->isZero() && SE.isKnownPositive(Step))
+ return ConstantInt::getFalse(Loc->getContext());
+
+ // Get the backedge taken count and truncate or extend it to the AR type.
+ Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty);
+
+ Value *MulV, *OfMul;
+ if (Step->isOne()) {
+ // Special-case Step of one. Potentially-costly `umul_with_overflow` isn't
+ // needed, there is never an overflow, so to avoid artificially inflating
+ // the cost of the check, directly emit the optimized IR.
+ MulV = TruncTripCount;
+ OfMul = ConstantInt::getFalse(MulV->getContext());
+ } else {
+ auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
+ Intrinsic::umul_with_overflow, Ty);
+ CallInst *Mul =
+ Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
+ MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
+ OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
+ }
- Value *EndCompareLT = Builder.CreateICmp(
- Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, Add, StartValue);
+ Value *Add = nullptr, *Sub = nullptr;
+ bool NeedPosCheck = !SE.isKnownNegative(Step);
+ bool NeedNegCheck = !SE.isKnownPositive(Step);
+
+ if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARTy)) {
+ StartValue = InsertNoopCastOfTo(
+ StartValue, Builder.getInt8PtrTy(ARPtrTy->getAddressSpace()));
+ Value *NegMulV = Builder.CreateNeg(MulV);
+ if (NeedPosCheck)
+ Add = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, MulV);
+ if (NeedNegCheck)
+ Sub = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, NegMulV);
+ } else {
+ if (NeedPosCheck)
+ Add = Builder.CreateAdd(StartValue, MulV);
+ if (NeedNegCheck)
+ Sub = Builder.CreateSub(StartValue, MulV);
+ }
- // Select the answer based on the sign of Step.
- Value *EndCheck =
- Builder.CreateSelect(StepCompare, EndCompareGT, EndCompareLT);
+ Value *EndCompareLT = nullptr;
+ Value *EndCompareGT = nullptr;
+ Value *EndCheck = nullptr;
+ if (NeedPosCheck)
+ EndCheck = EndCompareLT = Builder.CreateICmp(
+ Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, Add, StartValue);
+ if (NeedNegCheck)
+ EndCheck = EndCompareGT = Builder.CreateICmp(
+ Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue);
+ if (NeedPosCheck && NeedNegCheck) {
+ // Select the answer based on the sign of Step.
+ EndCheck = Builder.CreateSelect(StepCompare, EndCompareGT, EndCompareLT);
+ }
+ return Builder.CreateOr(EndCheck, OfMul);
+ };
+ Value *EndCheck = ComputeEndCheck();
// If the backedge taken count type is larger than the AR type,
// check that we don't drop any bits by truncating it. If we are
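What the restructured end-check computes, modelled on plain integers for the unsigned 32-bit case: the AddRec wraps iff |Step| * BTC itself overflows, or adding it (for positive steps) or subtracting it (for negative steps) moves past Start; when the sign of Step is provable at construction time, only the relevant comparison is emitted. This is an illustrative helper, not the expander itself:

#include <cstdint>

bool addRecWraps(uint32_t Start, int32_t Step, uint32_t BTC) {
  uint64_t AbsStep = Step < 0 ? uint64_t(-int64_t(Step)) : uint64_t(Step);
  uint64_t Wide = AbsStep * uint64_t(BTC);
  bool OfMul = Wide > UINT32_MAX;  // umul_with_overflow's overflow bit
  uint32_t MulV = uint32_t(Wide);

  uint32_t Add = Start + MulV;     // end value for a positive step
  uint32_t Sub = Start - MulV;     // end value for a negative step
  bool EndCompareLT = Add < Start; // wrapped past the top
  bool EndCompareGT = Sub > Start; // wrapped past the bottom
  bool EndCheck = Step < 0 ? EndCompareGT : EndCompareLT;
  return EndCheck || OfMul;
}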
@@ -2548,7 +2616,7 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
EndCheck = Builder.CreateOr(EndCheck, BackedgeCheck);
}
- return Builder.CreateOr(EndCheck, OfMul);
+ return EndCheck;
}
Value *SCEVExpander::expandWrapPredicate(const SCEVWrapPredicate *Pred,
@@ -2578,17 +2646,16 @@ Value *SCEVExpander::expandWrapPredicate(const SCEVWrapPredicate *Pred,
Value *SCEVExpander::expandUnionPredicate(const SCEVUnionPredicate *Union,
Instruction *IP) {
- auto *BoolType = IntegerType::get(IP->getContext(), 1);
- Value *Check = ConstantInt::getNullValue(BoolType);
-
// Loop over all checks in this set.
+ SmallVector<Value *> Checks;
for (auto Pred : Union->getPredicates()) {
- auto *NextCheck = expandCodeForPredicate(Pred, IP);
+ Checks.push_back(expandCodeForPredicate(Pred, IP));
Builder.SetInsertPoint(IP);
- Check = Builder.CreateOr(Check, NextCheck);
}
- return Check;
+ if (Checks.empty())
+ return ConstantInt::getFalse(IP->getContext());
+ return Builder.CreateOr(Checks);
}
Value *SCEVExpander::fixupLCSSAFormFor(Instruction *User, unsigned OpIdx) {
@@ -2720,13 +2787,8 @@ void SCEVExpanderCleaner::cleanup() {
// Remove sets with value handles.
Expander.clear();
- // Sort so that earlier instructions do not dominate later instructions.
- stable_sort(InsertedInstructions, [this](Instruction *A, Instruction *B) {
- return DT.dominates(B, A);
- });
// Remove all inserted instructions.
- for (Instruction *I : InsertedInstructions) {
-
+ for (Instruction *I : reverse(InsertedInstructions)) {
#ifndef NDEBUG
assert(all_of(I->users(),
[&InsertedSet](Value *U) {
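Dropping the dominance-based sort works because the expander records instructions in creation order and an instruction's operands are created before it, so erasing the list back to front always removes users before their definitions. A tiny standalone model of that invariant (the Node type is hypothetical):

#include <cassert>
#include <memory>
#include <vector>

struct Node {
  Node *Operand = nullptr; // the value this node uses, if any
  int NumUses = 0;
};

int main() {
  std::vector<std::unique_ptr<Node>> Inserted; // creation order
  for (int I = 0; I < 4; ++I) {
    auto N = std::make_unique<Node>();
    if (!Inserted.empty()) {
      N->Operand = Inserted.back().get();
      N->Operand->NumUses++;
    }
    Inserted.push_back(std::move(N));
  }
  // Erase back to front: every node is use-free at the moment it dies.
  while (!Inserted.empty()) {
    Node *N = Inserted.back().get();
    assert(N->NumUses == 0 && "erasing a node that still has users");
    if (N->Operand)
      N->Operand->NumUses--;
    Inserted.pop_back();
  }
  return 0;
}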
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 1046998c26de..335ac03ccb52 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2052,109 +2052,119 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB,
if (ScanIdx == 0)
return false;
- // Okay, we *could* sink last ScanIdx instructions. But how many can we
- // actually sink before encountering instruction that is unprofitable to sink?
- auto ProfitableToSinkInstruction = [&](LockstepReverseIterator &LRI) {
- unsigned NumPHIdValues = 0;
- for (auto *I : *LRI)
- for (auto *V : PHIOperands[I]) {
- if (!InstructionsToSink.contains(V))
- ++NumPHIdValues;
- // FIXME: this check is overly optimistic. We may end up not sinking
- // said instruction, due to the very same profitability check.
- // See @creating_too_many_phis in sink-common-code.ll.
- }
- LLVM_DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
- unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
- if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
+ bool followedByDeoptOrUnreachable = IsBlockFollowedByDeoptOrUnreachable(BB);
+
+ if (!followedByDeoptOrUnreachable) {
+ // Okay, we *could* sink last ScanIdx instructions. But how many can we
+ // actually sink before encountering an instruction that is unprofitable to
+ // sink?
+ auto ProfitableToSinkInstruction = [&](LockstepReverseIterator &LRI) {
+ unsigned NumPHIdValues = 0;
+ for (auto *I : *LRI)
+ for (auto *V : PHIOperands[I]) {
+ if (!InstructionsToSink.contains(V))
+ ++NumPHIdValues;
+ // FIXME: this check is overly optimistic. We may end up not sinking
+ // said instruction, due to the very same profitability check.
+ // See @creating_too_many_phis in sink-common-code.ll.
+ }
+ LLVM_DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
+ unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
+ if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
NumPHIInsts++;
- return NumPHIInsts <= 1;
- };
+ return NumPHIInsts <= 1;
+ };
- // We've determined that we are going to sink last ScanIdx instructions,
- // and recorded them in InstructionsToSink. Now, some instructions may be
- // unprofitable to sink. But that determination depends on the instructions
- // that we are going to sink.
-
- // First, forward scan: find the first instruction unprofitable to sink,
- // recording all the ones that are profitable to sink.
- // FIXME: would it be better, after we detect that not all are profitable.
- // to either record the profitable ones, or erase the unprofitable ones?
- // Maybe we need to choose (at runtime) the one that will touch least instrs?
- LRI.reset();
- int Idx = 0;
- SmallPtrSet<Value *, 4> InstructionsProfitableToSink;
- while (Idx < ScanIdx) {
- if (!ProfitableToSinkInstruction(LRI)) {
- // Too many PHIs would be created.
- LLVM_DEBUG(
- dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
- break;
+ // We've determined that we are going to sink last ScanIdx instructions,
+ // and recorded them in InstructionsToSink. Now, some instructions may be
+ // unprofitable to sink. But that determination depends on the instructions
+ // that we are going to sink.
+
+ // First, forward scan: find the first instruction unprofitable to sink,
+ // recording all the ones that are profitable to sink.
+ // FIXME: would it be better, after we detect that not all are profitable.
+ // to either record the profitable ones, or erase the unprofitable ones?
+ // Maybe we need to choose (at runtime) the one that will touch least
+ // instrs?
+ LRI.reset();
+ int Idx = 0;
+ SmallPtrSet<Value *, 4> InstructionsProfitableToSink;
+ while (Idx < ScanIdx) {
+ if (!ProfitableToSinkInstruction(LRI)) {
+ // Too many PHIs would be created.
+ LLVM_DEBUG(
+ dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
+ break;
+ }
+ InstructionsProfitableToSink.insert((*LRI).begin(), (*LRI).end());
+ --LRI;
+ ++Idx;
}
- InstructionsProfitableToSink.insert((*LRI).begin(), (*LRI).end());
- --LRI;
- ++Idx;
- }
- // If no instructions can be sunk, early-return.
- if (Idx == 0)
- return false;
+ // If no instructions can be sunk, early-return.
+ if (Idx == 0)
+ return false;
- // Did we determine that (only) some instructions are unprofitable to sink?
- if (Idx < ScanIdx) {
- // Okay, some instructions are unprofitable.
- ScanIdx = Idx;
- InstructionsToSink = InstructionsProfitableToSink;
-
- // But, that may make other instructions unprofitable, too.
- // So, do a backward scan, do any earlier instructions become unprofitable?
- assert(!ProfitableToSinkInstruction(LRI) &&
- "We already know that the last instruction is unprofitable to sink");
- ++LRI;
- --Idx;
- while (Idx >= 0) {
- // If we detect that an instruction becomes unprofitable to sink,
- // all earlier instructions won't be sunk either,
- // so preemptively keep InstructionsProfitableToSink in sync.
- // FIXME: is this the most performant approach?
- for (auto *I : *LRI)
- InstructionsProfitableToSink.erase(I);
- if (!ProfitableToSinkInstruction(LRI)) {
- // Everything starting with this instruction won't be sunk.
- ScanIdx = Idx;
- InstructionsToSink = InstructionsProfitableToSink;
- }
+ // Did we determine that (only) some instructions are unprofitable to sink?
+ if (Idx < ScanIdx) {
+ // Okay, some instructions are unprofitable.
+ ScanIdx = Idx;
+ InstructionsToSink = InstructionsProfitableToSink;
+
+ // But, that may make other instructions unprofitable, too.
+ // So, do a backward scan, do any earlier instructions become
+ // unprofitable?
+ assert(
+ !ProfitableToSinkInstruction(LRI) &&
+ "We already know that the last instruction is unprofitable to sink");
++LRI;
--Idx;
+ while (Idx >= 0) {
+ // If we detect that an instruction becomes unprofitable to sink,
+ // all earlier instructions won't be sunk either,
+ // so preemptively keep InstructionsProfitableToSink in sync.
+ // FIXME: is this the most performant approach?
+ for (auto *I : *LRI)
+ InstructionsProfitableToSink.erase(I);
+ if (!ProfitableToSinkInstruction(LRI)) {
+ // Everything starting with this instruction won't be sunk.
+ ScanIdx = Idx;
+ InstructionsToSink = InstructionsProfitableToSink;
+ }
+ ++LRI;
+ --Idx;
+ }
}
- }
- // If no instructions can be sunk, early-return.
- if (ScanIdx == 0)
- return false;
+ // If no instructions can be sunk, early-return.
+ if (ScanIdx == 0)
+ return false;
+ }
bool Changed = false;
if (HaveNonUnconditionalPredecessors) {
- // It is always legal to sink common instructions from unconditional
- // predecessors. However, if not all predecessors are unconditional,
- // this transformation might be pessimizing. So as a rule of thumb,
- // don't do it unless we'd sink at least one non-speculatable instruction.
- // See https://bugs.llvm.org/show_bug.cgi?id=30244
- LRI.reset();
- int Idx = 0;
- bool Profitable = false;
- while (Idx < ScanIdx) {
- if (!isSafeToSpeculativelyExecute((*LRI)[0])) {
- Profitable = true;
- break;
+ if (!followedByDeoptOrUnreachable) {
+ // It is always legal to sink common instructions from unconditional
+ // predecessors. However, if not all predecessors are unconditional,
+ // this transformation might be pessimizing. So as a rule of thumb,
+ // don't do it unless we'd sink at least one non-speculatable instruction.
+ // See https://bugs.llvm.org/show_bug.cgi?id=30244
+ LRI.reset();
+ int Idx = 0;
+ bool Profitable = false;
+ while (Idx < ScanIdx) {
+ if (!isSafeToSpeculativelyExecute((*LRI)[0])) {
+ Profitable = true;
+ break;
+ }
+ --LRI;
+ ++Idx;
}
- --LRI;
- ++Idx;
+ if (!Profitable)
+ return false;
}
- if (!Profitable)
- return false;
LLVM_DEBUG(dbgs() << "SINK: Splitting edge\n");
// We have a conditional edge and we're going to sink some instructions.
@@ -4935,14 +4945,13 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
AssumptionCache *AC,
const DataLayout &DL) {
Value *Cond = SI->getCondition();
- unsigned Bits = Cond->getType()->getIntegerBitWidth();
KnownBits Known = computeKnownBits(Cond, DL, 0, AC, SI);
// We can also eliminate cases by determining that their values are outside of
// the limited range of the condition based on how many significant (non-sign)
// bits are in the condition value.
- unsigned ExtraSignBits = ComputeNumSignBits(Cond, DL, 0, AC, SI) - 1;
- unsigned MaxSignificantBitsInCond = Bits - ExtraSignBits;
+ unsigned MaxSignificantBitsInCond =
+ ComputeMaxSignificantBits(Cond, DL, 0, AC, SI);
// Gather dead cases.
SmallVector<ConstantInt *, 8> DeadCases;
@@ -4973,8 +4982,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
bool HasDefault =
!isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
const unsigned NumUnknownBits =
- Bits - (Known.Zero | Known.One).countPopulation();
- assert(NumUnknownBits <= Bits);
+ Known.getBitWidth() - (Known.Zero | Known.One).countPopulation();
+ assert(NumUnknownBits <= Known.getBitWidth());
if (HasDefault && DeadCases.empty() &&
NumUnknownBits < 64 /* avoid overflow */ &&
SI->getNumCases() == (1ULL << NumUnknownBits)) {
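The two bit counts used above, modelled on plain 32-bit integers: the maximum number of significant bits is the width minus the redundant copies of the sign bit, and the unknown-bit count is the width minus the bits pinned by either the known-zero or known-one mask. These are hypothetical helpers, not the ValueTracking API:

#include <cassert>
#include <cstdint>

unsigned maxSignificantBits(int32_t V) {
  uint32_t U = uint32_t(V);
  unsigned Bits = 32;
  // Drop leading bits that merely repeat the sign bit (keep at least one).
  while (Bits > 1 && ((U >> (Bits - 1)) & 1) == ((U >> (Bits - 2)) & 1))
    --Bits;
  return Bits;
}

unsigned numUnknownBits(uint32_t KnownZero, uint32_t KnownOne) {
  unsigned Pinned = 0;
  for (uint32_t M = KnownZero | KnownOne; M; M &= M - 1)
    ++Pinned; // population count without relying on compiler builtins
  return 32 - Pinned;
}

int main() {
  assert(maxSignificantBits(-3) == 3); // ...11111101 needs only 3 bits
  assert(maxSignificantBits(5) == 4);  // 0101: sign bit plus three value bits
  assert(numUnknownBits(0xFFFF0000u, 0x000000FFu) == 8);
  return 0;
}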
@@ -5796,10 +5805,9 @@ static void reuseTableCompare(
for (auto ValuePair : Values) {
Constant *CaseConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
ValuePair.second, CmpOp1, true);
- if (!CaseConst || CaseConst == DefaultConst || isa<UndefValue>(CaseConst))
+ if (!CaseConst || CaseConst == DefaultConst ||
+ (CaseConst != TrueConst && CaseConst != FalseConst))
return;
- assert((CaseConst == TrueConst || CaseConst == FalseConst) &&
- "Expect true or false as compare result.");
}
// Check if the branch instruction dominates the phi node. It's a simple
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 02727a3dbf9c..e02d02a05752 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -602,7 +602,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) {
Align MemSetAlign =
CI->getAttributes().getParamAttrs(0).getAlignment().valueOrOne();
CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, MemSetAlign);
- AttrBuilder ArgAttrs(CI->getAttributes().getParamAttrs(0));
+ AttrBuilder ArgAttrs(CI->getContext(), CI->getAttributes().getParamAttrs(0));
NewCI->setAttributes(NewCI->getAttributes().addParamAttributes(
CI->getContext(), 0, ArgAttrs));
copyFlags(*CI, NewCI);
@@ -2515,8 +2515,9 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI,
} else if (Value *V = emitStpCpy(Dest, CI->getArgOperand(2), B, TLI)) {
// sprintf(dest, "%s", str) -> stpcpy(dest, str) - dest
// Handle mismatched pointer types (goes away with typeless pointers?).
- V = B.CreatePointerCast(V, Dest->getType());
- Value *PtrDiff = B.CreatePtrDiff(V, Dest);
+ V = B.CreatePointerCast(V, B.getInt8PtrTy());
+ Dest = B.CreatePointerCast(Dest, B.getInt8PtrTy());
+ Value *PtrDiff = B.CreatePtrDiff(B.getInt8Ty(), V, Dest);
return B.CreateIntCast(PtrDiff, CI->getType(), false);
}
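Why the pointer-difference form is equivalent: sprintf(dest, "%s", str) returns the number of characters written, i.e. strlen(str), while stpcpy returns a pointer to the terminating NUL it wrote, so stpcpy(dest, str) - dest yields the same count. A small demonstration (stpcpy is POSIX/C23, and the buffer sizes are arbitrary):

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <cstring>

int main() {
  char A[32], B[32];
  const char *Str = "hello";
  int FromSprintf = std::sprintf(A, "%s", Str);
  std::ptrdiff_t FromStpcpy = stpcpy(B, Str) - B; // POSIX stpcpy
  assert(FromSprintf == FromStpcpy && std::strcmp(A, B) == 0);
  return 0;
}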
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index b822db938af8..8947303674ee 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -398,13 +398,17 @@ Value *Mapper::mapValue(const Value *V) {
SmallVector<ValueAsMetadata *, 4> MappedArgs;
for (auto *VAM : AL->getArgs()) {
// Map both Local and Constant VAMs here; they will both ultimately
- // be mapped via mapValue (apart from constants when we have no
- // module level changes, which have an identity mapping).
+ // be mapped via mapValue. The exceptions are constants when we have no
+ // module level changes and locals when they have no existing mapped
+ // value and RF_IgnoreMissingLocals is set; these have identity
+ // mappings.
if ((Flags & RF_NoModuleLevelChanges) && isa<ConstantAsMetadata>(VAM)) {
MappedArgs.push_back(VAM);
} else if (Value *LV = mapValue(VAM->getValue())) {
MappedArgs.push_back(
LV == VAM->getValue() ? VAM : ValueAsMetadata::get(LV));
+ } else if ((Flags & RF_IgnoreMissingLocals) && isa<LocalAsMetadata>(VAM)) {
+ MappedArgs.push_back(VAM);
} else {
// If we cannot map the value, set the argument as undef.
MappedArgs.push_back(ValueAsMetadata::get(
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 5a4a2f0924f6..97c2acb7d4c7 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -698,8 +698,9 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
ChainInstrs.push_back(&I);
continue;
}
- if (I.mayThrow()) {
- LLVM_DEBUG(dbgs() << "LSV: Found may-throw operation: " << I << '\n');
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I)) {
+ LLVM_DEBUG(dbgs() << "LSV: Found instruction that may not transfer execution: "
+ << I << '\n');
break;
}
if (I.mayReadOrWriteMemory())
@@ -853,13 +854,6 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
(VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
- // Make sure all the users of a vector are constant-index extracts.
- if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
- const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
- return EEI && isa<ConstantInt>(EEI->getOperand(1));
- }))
- continue;
-
// Save the load locations.
const ChainID ID = getChainID(Ptr);
LoadRefs[ID].push_back(LI);
@@ -900,12 +894,6 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
(VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
- if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
- const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
- return EEI && isa<ConstantInt>(EEI->getOperand(1));
- }))
- continue;
-
// Save store location.
const ChainID ID = getChainID(Ptr);
StoreRefs[ID].push_back(SI);
@@ -1289,52 +1277,32 @@ bool Vectorizer::vectorizeLoadChain(
Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
propagateMetadata(LI, Chain);
- if (VecLoadTy) {
- SmallVector<Instruction *, 16> InstrsToErase;
-
- unsigned VecWidth = VecLoadTy->getNumElements();
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- for (auto Use : Chain[I]->users()) {
- // All users of vector loads are ExtractElement instructions with
- // constant indices, otherwise we would have bailed before now.
- Instruction *UI = cast<Instruction>(Use);
- unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
- unsigned NewIdx = Idx + I * VecWidth;
- Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
- UI->getName());
- if (V->getType() != UI->getType())
- V = Builder.CreateBitCast(V, UI->getType());
-
- // Replace the old instruction.
- UI->replaceAllUsesWith(V);
- InstrsToErase.push_back(UI);
- }
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ Value *CV = Chain[I];
+ Value *V;
+ if (VecLoadTy) {
+ // Extract a subvector using shufflevector.
+ unsigned VecWidth = VecLoadTy->getNumElements();
+ auto Mask =
+ llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth));
+ V = Builder.CreateShuffleVector(LI, Mask, CV->getName());
+ } else {
+ V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
}
- // Bitcast might not be an Instruction, if the value being loaded is a
- // constant. In that case, no need to reorder anything.
- if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
- reorder(BitcastInst);
-
- for (auto I : InstrsToErase)
- I->eraseFromParent();
- } else {
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- Value *CV = Chain[I];
- Value *V =
- Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
- if (V->getType() != CV->getType()) {
- V = Builder.CreateBitOrPointerCast(V, CV->getType());
- }
-
- // Replace the old instruction.
- CV->replaceAllUsesWith(V);
+ if (V->getType() != CV->getType()) {
+ V = Builder.CreateBitOrPointerCast(V, CV->getType());
}
- if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
- reorder(BitcastInst);
+ // Replace the old instruction.
+ CV->replaceAllUsesWith(V);
}
+ // Bitcast might not be an Instruction, if the value being loaded is a
+ // constant. In that case, no need to reorder anything.
+ if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
+ reorder(BitcastInst);
+
eraseInstructions(Chain);
++NumVectorInstructions;
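// Editor's note (illustrative sketch, not part of the patch): the rewritten
// load-chain path above rebuilds each original value from the single wide
// load with one shufflevector per chain element instead of rewriting each
// extractelement user. For two <4 x i32> loads merged into an <8 x i32>
// load, llvm::seq produces the masks {0,1,2,3} and {4,5,6,7}, i.e. roughly:
//   %v0 = shufflevector <8 x i32> %wide.load, <8 x i32> poison,
//                       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
//   %v1 = shufflevector <8 x i32> %wide.load, <8 x i32> poison,
//                       <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// Since the whole chain value is replaced via replaceAllUsesWith, this is
// what allows the patch to drop the constant-index-extract user restriction
// removed from collectInstructions above.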
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4747f34fcc62..d11f4146b590 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -470,10 +470,11 @@ public:
/// on, while the old loop will be used as the scalar remainder. Control flow
/// is generated around the vectorized (and scalar epilogue) loops consisting
/// of various checks and bypasses. Return the pre-header block of the new
- /// loop.
- /// In the case of epilogue vectorization, this function is overriden to
- /// handle the more complex control flow around the loops.
- virtual BasicBlock *createVectorizedLoopSkeleton();
+ /// loop and the start value for the canonical induction, if it is != 0. The
+ /// latter is the case when vectorizing the epilogue loop. In the case of
+ /// epilogue vectorization, this function is overriden to handle the more
+ /// epilogue vectorization, this function is overridden to handle the more
+ /// complex control flow around the loops.
+ virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
/// Widen a single call instruction within the innermost loop.
void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
@@ -507,10 +508,10 @@ public:
/// Widen an integer or floating-point induction variable \p IV. If \p Trunc
/// is provided, the integer induction variable will first be truncated to
- /// the corresponding type.
- void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
- Value *Start, TruncInst *Trunc, VPValue *Def,
- VPTransformState &State);
+ /// the corresponding type. \p CanonicalIV is the scalar value generated for
+ /// the canonical induction variable.
+ void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
+ VPTransformState &State, Value *CanonicalIV);
/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
@@ -556,6 +557,10 @@ public:
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
+ // Returns the resume value (bc.merge.rdx) for a reduction as
+ // generated by fixReduction.
+ PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
+
protected:
friend class LoopVectorizationPlanner;
@@ -573,16 +578,18 @@ protected:
Value *CountRoundDown, Value *EndValue,
BasicBlock *MiddleBlock);
- /// Create a new induction variable inside L.
- PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
- Value *Step, Instruction *DL);
+ /// Introduce a conditional branch (on true, condition to be set later) at the
+ /// end of the header=latch connecting it to itself (across the backedge) and
+ /// to the exit block of \p L.
+ void createHeaderBranch(Loop *L);
/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs(VPTransformState &State);
/// Create the exit value of first order recurrences in the middle block and
/// update their users.
- void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
+ void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
+ VPTransformState &State);
/// Create code for the loop exit value of the reduction.
void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
@@ -606,14 +613,6 @@ protected:
/// represented as.
void truncateToMinimalBitwidths(VPTransformState &State);
- /// This function adds
- /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
- /// to each vector element of Val. The sequence starts at StartIndex.
- /// \p Opcode is relevant for FP induction variable.
- virtual Value *
- getStepVector(Value *Val, Value *StartIdx, Value *Step,
- Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);
-
/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, \p Step is the size of the step, and
/// \p EntryVal is the value from the original loop that maps to the steps.
@@ -640,9 +639,6 @@ protected:
/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;
- /// Generate a shuffle sequence that will reverse the vector Vec.
- virtual Value *reverseVector(Value *Vec);
-
/// Returns (and creates if needed) the original loop trip count.
Value *getOrCreateTripCount(Loop *NewLoop);
@@ -685,14 +681,13 @@ protected:
Loop *createVectorLoopSkeleton(StringRef Prefix);
/// Create new phi nodes for the induction variables to resume iteration count
- /// in the scalar epilogue, from where the vectorized loop left off (given by
- /// \p VectorTripCount).
+ /// in the scalar epilogue, from where the vectorized loop left off.
/// In cases where the loop skeleton is more complicated (eg. epilogue
/// vectorization) and the resume values can come from an additional bypass
/// block, the \p AdditionalBypass pair provides information about the bypass
/// block and the end value on the edge from bypass to this loop.
void createInductionResumeValues(
- Loop *L, Value *VectorTripCount,
+ Loop *L,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
/// Complete the loop skeleton by adding debug MDs, creating appropriate
@@ -795,12 +790,6 @@ protected:
/// A list of all bypass blocks. The first block is the entry of the loop.
SmallVector<BasicBlock *, 4> LoopBypassBlocks;
- /// The new Induction variable which was added to the new block.
- PHINode *Induction = nullptr;
-
- /// The induction variable of the old basic block.
- PHINode *OldInduction = nullptr;
-
/// Store instructions that were predicated.
SmallVector<Instruction *, 4> PredicatedInstructions;
@@ -838,6 +827,11 @@ protected:
/// Structure to hold information about generated runtime checks, responsible
/// for cleaning the checks, if vectorization turns out unprofitable.
GeneratedRTChecks &RTChecks;
+
+ // Holds the resume values for reductions in the loops, used to set the
+ // correct start value of reduction PHIs when vectorizing the epilogue.
+ SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
+ ReductionResumeValues;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -856,10 +850,6 @@ public:
private:
Value *getBroadcastInstrs(Value *V) override;
- Value *getStepVector(
- Value *Val, Value *StartIdx, Value *Step,
- Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
- Value *reverseVector(Value *Vec) override;
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -909,14 +899,16 @@ public:
// Override this function to handle the more complex control flow around the
// three loops.
- BasicBlock *createVectorizedLoopSkeleton() final override {
+ std::pair<BasicBlock *, Value *>
+ createVectorizedLoopSkeleton() final override {
return createEpilogueVectorizedLoopSkeleton();
}
/// The interface for creating a vectorized skeleton using one of two
/// different strategies, each corresponding to one execution of the vplan
/// as described above.
- virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
+ virtual std::pair<BasicBlock *, Value *>
+ createEpilogueVectorizedLoopSkeleton() = 0;
/// Holds and updates state information required to vectorize the main loop
/// and its epilogue in two separate passes. This setup helps us avoid
@@ -944,7 +936,8 @@ public:
EPI, LVL, CM, BFI, PSI, Check) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *main loop* strategy (ie the first pass of vplan execution).
- BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
+ std::pair<BasicBlock *, Value *>
+ createEpilogueVectorizedLoopSkeleton() final override;
protected:
/// Emits an iteration count bypass check once for the main loop (when \p
@@ -973,7 +966,8 @@ public:
EPI, LVL, CM, BFI, PSI, Checks) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *epilogue loop* strategy (ie the second pass of vplan execution).
- BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
+ std::pair<BasicBlock *, Value *>
+ createEpilogueVectorizedLoopSkeleton() final override;
protected:
/// Emits an iteration count bypass check after the main vector loop has
@@ -1069,16 +1063,16 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}
+namespace llvm {
+
/// Return a value for Step multiplied by VF.
-static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
- int64_t Step) {
+Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
+ int64_t Step) {
assert(Ty->isIntegerTy() && "Expected an integer step");
Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
-namespace llvm {
-
/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
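// Editor's note (illustrative, not part of the patch): createStepForVF is
// moved into namespace llvm next to getRuntimeVF so it can be shared with
// VPlan code. A sketch of the values these helpers produce for an i64 type,
// assuming the signatures shown above:
//   createStepForVF(B, i64, ElementCount::getFixed(4), 2)    -> i64 8
//   createStepForVF(B, i64, ElementCount::getScalable(4), 2) -> vscale * 8
//   getRuntimeVF(B, i64, ElementCount::getScalable(4))       -> vscale * 4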
@@ -1163,7 +1157,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
// will lead to gather/scatter instructions, which don't need to be
// handled.
if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
- isa<VPInterleaveRecipe>(CurRec))
+ isa<VPInterleaveRecipe>(CurRec) ||
+ isa<VPCanonicalIVPHIRecipe>(CurRec))
continue;
// This recipe contributes to the address computation of a widen
@@ -1232,6 +1227,14 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
}
}
+PHINode *InnerLoopVectorizer::getReductionResumeValue(
+ const RecurrenceDescriptor &RdxDesc) {
+ auto It = ReductionResumeValues.find(&RdxDesc);
+ assert(It != ReductionResumeValues.end() &&
+ "Expected to find a resume value for the reduction.");
+ return It->second;
+}
+
namespace llvm {
// Loop vectorization cost-model hints how the scalar epilogue loop should be
@@ -1556,13 +1559,16 @@ public:
/// Returns true if the target machine can represent \p V as a masked gather
/// or scatter operation.
- bool isLegalGatherOrScatter(Value *V) {
+ bool isLegalGatherOrScatter(Value *V,
+ ElementCount VF = ElementCount::getFixed(1)) {
bool LI = isa<LoadInst>(V);
bool SI = isa<StoreInst>(V);
if (!LI && !SI)
return false;
auto *Ty = getLoadStoreType(V);
Align Align = getLoadStoreAlignment(V);
+ if (VF.isVector())
+ Ty = VectorType::get(Ty, VF);
return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
(SI && TTI.isLegalMaskedScatter(Ty, Align));
}
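// Editor's note (illustrative, not part of the patch): legality is now
// queried with the widened type when a vector VF is passed, e.g. for a load
// of i32 and VF = 4 the query becomes
//   TTI.isLegalMaskedGather(<4 x i32>, Align)
// rather than being asked about the scalar i32 type. The defaulted
// ElementCount::getFixed(1) argument keeps the old behaviour for callers
// that do not pass a VF.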
@@ -1577,16 +1583,17 @@ public:
}
/// Returns true if \p I is an instruction that will be scalarized with
- /// predication. Such instructions include conditional stores and
- /// instructions that may divide by zero.
- /// If a non-zero VF has been calculated, we check if I will be scalarized
- /// predication for that VF.
- bool isScalarWithPredication(Instruction *I) const;
+ /// predication when vectorizing \p I with vectorization factor \p VF. Such
+ /// instructions include conditional stores and instructions that may divide
+ /// by zero.
+ bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
// Returns true if \p I is an instruction that will be predicated either
// through scalar predication or masked load/store or masked gather/scatter.
+ // \p VF is the vectorization factor that will be used to vectorize \p I.
// Superset of instructions that return true for isScalarWithPredication.
- bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
+ bool isPredicatedInst(Instruction *I, ElementCount VF,
+ bool IsKnownUniform = false) {
// When we know the load is uniform and the original scalar loop was not
// predicated we don't need to mark it as a predicated instruction. Any
// vectorised blocks created when tail-folding are something artificial we
@@ -1602,7 +1609,7 @@ public:
// instructions.
if (isa<LoadInst>(I) || isa<StoreInst>(I))
return Legal->isMaskRequired(I);
- return isScalarWithPredication(I);
+ return isScalarWithPredication(I, VF);
}
/// Returns true if \p I is a memory instruction with consecutive memory
@@ -1794,7 +1801,7 @@ private:
/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
- bool useEmulatedMaskMemRefHack(Instruction *I);
+ bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
@@ -2078,8 +2085,8 @@ public:
/// Remove the created SCEV & memory runtime check blocks & instructions, if
/// unused.
~GeneratedRTChecks() {
- SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
- SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
+ SCEVExpanderCleaner SCEVCleaner(SCEVExp);
+ SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
if (!SCEVCheckCond)
SCEVCleaner.markResultUsed();
@@ -2335,6 +2342,60 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
return Shuf;
}
+/// This function adds
+/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
+/// to each vector element of Val. The sequence starts at StartIdx.
+/// \p Opcode is relevant for FP induction variable.
+static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
+ Instruction::BinaryOps BinOp, ElementCount VF,
+ IRBuilder<> &Builder) {
+ assert(VF.isVector() && "only vector VFs are supported");
+
+ // Create and check the types.
+ auto *ValVTy = cast<VectorType>(Val->getType());
+ ElementCount VLen = ValVTy->getElementCount();
+
+ Type *STy = Val->getType()->getScalarType();
+ assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
+ "Induction Step must be an integer or FP");
+ assert(Step->getType() == STy && "Step has wrong type");
+
+ SmallVector<Constant *, 8> Indices;
+
+ // Create a vector of consecutive numbers from zero to VF.
+ VectorType *InitVecValVTy = ValVTy;
+ Type *InitVecValSTy = STy;
+ if (STy->isFloatingPointTy()) {
+ InitVecValSTy =
+ IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
+ InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
+ }
+ Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
+
+ // Splat the StartIdx
+ Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
+
+ if (STy->isIntegerTy()) {
+ InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ assert(Step->getType() == Val->getType() && "Invalid step vec");
+ // FIXME: The newly created binary instructions should contain nsw/nuw
+ // flags, which can be found from the original scalar operations.
+ Step = Builder.CreateMul(InitVec, Step);
+ return Builder.CreateAdd(Val, Step, "induction");
+ }
+
+ // Floating point induction.
+ assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
+ "Binary Opcode should be specified for FP induction");
+ InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
+ InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
+
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ Value *MulOp = Builder.CreateFMul(InitVec, Step);
+ return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
+}
+
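// Editor's note (worked example, not part of the patch): the hoisted
// getStepVector above adds (StartIdx + lane) * Step to every lane of Val.
// For Val = splat(10), StartIdx = 4, Step = 2 and a fixed VF of 4:
//   stepvector         -> <0, 1, 2, 3>
//   + splat(StartIdx)  -> <4, 5, 6, 7>
//   * splat(Step)      -> <8, 10, 12, 14>
//   + Val              -> <18, 20, 22, 24>   ; returned "induction" value
// The FP path is analogous, using uitofp on the step vector and the
// FAdd/FSub opcode selected by BinOp.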
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
const InductionDescriptor &II, Value *Step, Value *Start,
Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
@@ -2355,8 +2416,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
- Value *SteppedStart =
- getStepVector(SplatStart, Zero, Step, II.getInductionOpcode());
+ Value *SteppedStart = getStepVector(
+ SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder);
// We create vector phi nodes for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
@@ -2411,8 +2472,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// placement of all induction updates.
auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
- auto *ICmp = cast<Instruction>(Br->getCondition());
- LastInduction->moveBefore(ICmp);
+ LastInduction->moveBefore(Br);
LastInduction->setName("vec.ind.next");
VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
@@ -2434,15 +2494,15 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
return llvm::any_of(IV->users(), isScalarInst);
}
-void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
- const InductionDescriptor &ID,
- Value *Start, TruncInst *Trunc,
- VPValue *Def,
- VPTransformState &State) {
+void InnerLoopVectorizer::widenIntOrFpInduction(
+ PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State,
+ Value *CanonicalIV) {
+ Value *Start = Def->getStartValue()->getLiveInIRValue();
+ const InductionDescriptor &ID = Def->getInductionDescriptor();
+ TruncInst *Trunc = Def->getTruncInst();
IRBuilder<> &Builder = State.Builder;
- assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
- "Primary induction variable must have an integer type");
assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
+ assert(!State.VF.isZero() && "VF must be non-zero");
// The value from the original loop to which we are mapping the new induction
// variable.
@@ -2468,12 +2528,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
// induction variable and step. Otherwise, derive these values from the
// induction descriptor.
auto CreateScalarIV = [&](Value *&Step) -> Value * {
- Value *ScalarIV = Induction;
- if (IV != OldInduction) {
- ScalarIV = IV->getType()->isIntegerTy()
- ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
- : Builder.CreateCast(Instruction::SIToFP, Induction,
- IV->getType());
+ Value *ScalarIV = CanonicalIV;
+ Type *NeededType = IV->getType();
+ if (!Def->isCanonical() || ScalarIV->getType() != NeededType) {
+ ScalarIV =
+ NeededType->isIntegerTy()
+ ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType)
+ : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType);
ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
State.CFG.PrevBB);
ScalarIV->setName("offset.idx");
@@ -2493,7 +2554,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part) {
- assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
Value *StartIdx;
if (Step->getType()->isFloatingPointTy())
StartIdx =
@@ -2502,7 +2562,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part);
Value *EntryPart =
- getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode());
+ getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(),
+ State.VF, State.Builder);
State.set(Def, EntryPart, Part);
if (Trunc)
addMetadata(EntryPart, Trunc);
@@ -2516,9 +2577,31 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
// Now do the actual transformations, and start with creating the step value.
Value *Step = CreateStepValue(ID.getStep());
- if (State.VF.isZero() || State.VF.isScalar()) {
+ if (State.VF.isScalar()) {
Value *ScalarIV = CreateScalarIV(Step);
- CreateSplatIV(ScalarIV, Step);
+ Type *ScalarTy = IntegerType::get(ScalarIV->getContext(),
+ Step->getType()->getScalarSizeInBits());
+
+ Instruction::BinaryOps IncOp = ID.getInductionOpcode();
+ if (IncOp == Instruction::BinaryOpsEnd)
+ IncOp = Instruction::Add;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *StartIdx = ConstantInt::get(ScalarTy, Part);
+ Instruction::BinaryOps MulOp = Instruction::Mul;
+ if (Step->getType()->isFloatingPointTy()) {
+ StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType());
+ MulOp = Instruction::FMul;
+ }
+
+ Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
+ Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction");
+ State.set(Def, EntryPart, Part);
+ if (Trunc) {
+ assert(!Step->getType()->isFloatingPointTy() &&
+ "fp inductions shouldn't be truncated");
+ addMetadata(EntryPart, Trunc);
+ }
+ }
return;
}
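// Editor's note (illustrative, not part of the patch): with a scalar VF the
// widened induction is now materialized directly per unrolled part as
//   EntryPart = ScalarIV IncOp (Part * Step)
// e.g. for UF = 3 and an integer step of 4 the parts are
//   part 0: %iv    part 1: %iv + 4    part 2: %iv + 8
// replacing the old CreateSplatIV path that handled VF == 1.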
@@ -2554,54 +2637,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
}
-Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
- Value *Step,
- Instruction::BinaryOps BinOp) {
- // Create and check the types.
- auto *ValVTy = cast<VectorType>(Val->getType());
- ElementCount VLen = ValVTy->getElementCount();
-
- Type *STy = Val->getType()->getScalarType();
- assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
- "Induction Step must be an integer or FP");
- assert(Step->getType() == STy && "Step has wrong type");
-
- SmallVector<Constant *, 8> Indices;
-
- // Create a vector of consecutive numbers from zero to VF.
- VectorType *InitVecValVTy = ValVTy;
- Type *InitVecValSTy = STy;
- if (STy->isFloatingPointTy()) {
- InitVecValSTy =
- IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
- InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
- }
- Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
-
- // Splat the StartIdx
- Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
-
- if (STy->isIntegerTy()) {
- InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
- Step = Builder.CreateVectorSplat(VLen, Step);
- assert(Step->getType() == Val->getType() && "Invalid step vec");
- // FIXME: The newly created binary instructions should contain nsw/nuw flags,
- // which can be found from the original scalar operations.
- Step = Builder.CreateMul(InitVec, Step);
- return Builder.CreateAdd(Val, Step, "induction");
- }
-
- // Floating point induction.
- assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
- "Binary Opcode should be specified for FP induction");
- InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
- InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
-
- Step = Builder.CreateVectorSplat(VLen, Step);
- Value *MulOp = Builder.CreateFMul(InitVec, Step);
- return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
-}
-
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
Instruction *EntryVal,
const InductionDescriptor &ID,
@@ -2691,11 +2726,6 @@ void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
State.set(Def, VectorValue, Instance.Part);
}
-Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
- assert(Vec->getType()->isVectorTy() && "Invalid type");
- return Builder.CreateVectorReverse(Vec, "reverse");
-}
-
// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
@@ -2858,7 +2888,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
if (Group->isReverse())
- StridedVec = reverseVector(StridedVec);
+ StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
State.set(VPDefs[J], StridedVec, Part);
}
@@ -2894,7 +2924,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
Value *StoredVec = State.get(StoredValues[i], Part);
if (Group->isReverse())
- StoredVec = reverseVector(StoredVec);
+ StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
// If this member has different type, cast it to a unified type.
@@ -2993,43 +3023,21 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
PredicatedInstructions.push_back(Cloned);
}
-PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
- Value *End, Value *Step,
- Instruction *DL) {
+void InnerLoopVectorizer::createHeaderBranch(Loop *L) {
BasicBlock *Header = L->getHeader();
- BasicBlock *Latch = L->getLoopLatch();
- // As we're just creating this loop, it's possible no latch exists
- // yet. If so, use the header as this will be a single block loop.
- if (!Latch)
- Latch = Header;
-
- IRBuilder<> B(&*Header->getFirstInsertionPt());
- Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
- setDebugLocFromInst(OldInst, &B);
- auto *Induction = B.CreatePHI(Start->getType(), 2, "index");
+ assert(!L->getLoopLatch() && "loop should not have a latch at this point");
- B.SetInsertPoint(Latch->getTerminator());
+ IRBuilder<> B(Header->getTerminator());
+ Instruction *OldInst =
+ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
setDebugLocFromInst(OldInst, &B);
- // Create i+1 and fill the PHINode.
- //
- // If the tail is not folded, we know that End - Start >= Step (either
- // statically or through the minimum iteration checks). We also know that both
- // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
- // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
- // overflows and we can mark the induction increment as NUW.
- Value *Next = B.CreateAdd(Induction, Step, "index.next",
- /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
- Induction->addIncoming(Start, L->getLoopPreheader());
- Induction->addIncoming(Next, Latch);
- // Create the compare.
- Value *ICmp = B.CreateICmpEQ(Next, End);
- B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
+ // Connect the header to the exit and header blocks and replace the old
+ // terminator.
+ B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header);
// Now we have two terminators. Remove the old one from the block.
- Latch->getTerminator()->eraseFromParent();
-
- return Induction;
+ Header->getTerminator()->eraseFromParent();
}
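// Editor's note (illustrative, not part of the patch): after
// createHeaderBranch the still-empty vector loop body is a single block of
// roughly this shape:
//   vector.body:
//     ...                                      ; filled in by VPlan execution
//     br i1 true, label %exit.block, label %vector.body
// The constant true condition is a placeholder; the real exit condition is
// created later from the VPlan's canonical induction, which is why no IR
// induction variable is built here any more.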
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
@@ -3099,10 +3107,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
if (Cost->foldTailByMasking()) {
assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
"VF*UF must be a power of 2 when folding tail by masking");
- assert(!VF.isScalable() &&
- "Tail folding not yet supported for scalable vectors");
+ Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
TC = Builder.CreateAdd(
- TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
+ TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
}
// Now we need to generate the expression for the part of the loop that the
@@ -3436,12 +3443,13 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
}
void InnerLoopVectorizer::createInductionResumeValues(
- Loop *L, Value *VectorTripCount,
- std::pair<BasicBlock *, Value *> AdditionalBypass) {
- assert(VectorTripCount && L && "Expected valid arguments");
+ Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) {
assert(((AdditionalBypass.first && AdditionalBypass.second) ||
(!AdditionalBypass.first && !AdditionalBypass.second)) &&
"Inconsistent information about additional bypass.");
+
+ Value *VectorTripCount = getOrCreateVectorTripCount(L);
+ assert(VectorTripCount && L && "Expected valid arguments");
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
// PHIs that are left in the scalar version of the loop.
@@ -3449,6 +3457,7 @@ void InnerLoopVectorizer::createInductionResumeValues(
// iteration in the vectorized loop.
// If we come from a bypass edge then we need to start from the original
// start value.
+ Instruction *OldInduction = Legal->getPrimaryInduction();
for (auto &InductionEntry : Legal->getInductionVars()) {
PHINode *OrigPhi = InductionEntry.first;
InductionDescriptor II = InductionEntry.second;
@@ -3546,25 +3555,6 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
"Inconsistent vector loop preheader");
Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
- Optional<MDNode *> VectorizedLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
- LLVMLoopVectorizeFollowupVectorized});
- if (VectorizedLoopID.hasValue()) {
- L->setLoopID(VectorizedLoopID.getValue());
-
- // Do not setAlreadyVectorized if loop attributes have been defined
- // explicitly.
- return LoopVectorPreHeader;
- }
-
- // Keep all loop hints from the original loop on the vector loop (we'll
- // replace the vectorizer-specific hints below).
- if (MDNode *LID = OrigLoop->getLoopID())
- L->setLoopID(LID);
-
- LoopVectorizeHints Hints(L, true, *ORE, TTI);
- Hints.setAlreadyVectorized();
-
#ifdef EXPENSIVE_CHECKS
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
LI->verify(*DT);
@@ -3573,7 +3563,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
return LoopVectorPreHeader;
}
-BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
+std::pair<BasicBlock *, Value *>
+InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
@@ -3638,33 +3629,12 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// faster.
emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
- // Some loops have a single integer induction variable, while other loops
- // don't. One example is c++ iterators that often have multiple pointer
- // induction variables. In the code below we also support a case where we
- // don't have a single induction variable.
- //
- // We try to obtain an induction variable from the original loop as hard
- // as possible. However if we don't find one that:
- // - is an integer
- // - counts from zero, stepping by one
- // - is the size of the widest induction variable type
- // then we create a new one.
- OldInduction = Legal->getPrimaryInduction();
- Type *IdxTy = Legal->getWidestInductionType();
- Value *StartIdx = ConstantInt::get(IdxTy, 0);
- // The loop step is equal to the vectorization factor (num of SIMD elements)
- // times the unroll factor (num of SIMD instructions).
- Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
- Value *Step = createStepForVF(Builder, IdxTy, VF, UF);
- Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
- Induction =
- createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
- getDebugLocFromInstOrOperands(OldInduction));
+ createHeaderBranch(Lp);
// Emit phis for the new starting index of the scalar loop.
- createInductionResumeValues(Lp, CountRoundDown);
+ createInductionResumeValues(Lp);
- return completeLoopSkeleton(Lp, OrigLoopID);
+ return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
}
// Fix up external users of the induction variable. At this point, we are
@@ -4088,8 +4058,8 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
}
}
-void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
- VPTransformState &State) {
+void InnerLoopVectorizer::fixFirstOrderRecurrence(
+ VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
// This is the second phase of vectorizing first-order recurrences. An
// overview of the transformation is described below. Suppose we have the
// following loop.
@@ -4334,13 +4304,29 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
: Builder.CreateZExt(ReducedPartRdx, PhiTy);
}
+ PHINode *ResumePhi =
+ dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
+
// Create a phi node that merges control-flow from the backedge-taken check
// block and the middle block.
PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
LoopScalarPreHeader->getTerminator());
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
- BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
- BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
+ // If we are fixing reductions in the epilogue loop then we should already
+ // have created a bc.merge.rdx Phi after the main vector body. Ensure that
+ // we carry over the incoming values correctly.
+ for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
+ if (Incoming == LoopMiddleBlock)
+ BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
+ else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
+ BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
+ Incoming);
+ else
+ BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
+ }
+
+ // Set the resume value for this reduction
+ ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
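// Editor's note (illustrative sketch, not part of the patch; block and value
// names are made up): when fixing reductions for the epilogue loop, the
// scalar preheader may already be reached through a resume phi from the main
// vector loop, and the rebuilt merge phi picks its incoming values roughly as
//   %bc.merge.rdx = phi [ %rdx.epilog, %middle.block ],          ; ReducedPartRdx
//                       [ %rdx.main, %vec.epilog.iter.check ],   ; via ResumePhi
//                       [ %rdx.start, %other.bypass ]            ; ReductionStartValue
// The phi is then recorded in ReductionResumeValues so that
// getReductionResumeValue() can hand it to the epilogue vectorizer.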
@@ -4557,6 +4543,9 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
InductionDescriptor II = Legal->getInductionVars().lookup(P);
const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV();
+ PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
+
// FIXME: The newly created binary instructions should contain nsw/nuw flags,
// which can be found from the original scalar operations.
switch (II.getKind()) {
@@ -4572,7 +4561,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
if (Cost->isScalarAfterVectorization(P, State.VF)) {
// This is the normalized GEP that starts counting at zero.
Value *PtrInd =
- Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
+ Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType());
// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
@@ -4602,10 +4591,10 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
Type *PhiType = II.getStep()->getType();
// Build a pointer phi
- Value *ScalarStartValue = II.getStartValue();
+ Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue();
Type *ScStValueType = ScalarStartValue->getType();
PHINode *NewPointerPhi =
- PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
+ PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
// A pointer induction, performed by using a gep
@@ -4916,7 +4905,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
-bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
+bool LoopVectorizationCostModel::isScalarWithPredication(
+ Instruction *I, ElementCount VF) const {
if (!blockNeedsPredicationForAnyReason(I->getParent()))
return false;
switch(I->getOpcode()) {
@@ -4928,11 +4918,14 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
return false;
auto *Ptr = getLoadStorePointerOperand(I);
auto *Ty = getLoadStoreType(I);
+ Type *VTy = Ty;
+ if (VF.isVector())
+ VTy = VectorType::get(Ty, VF);
const Align Alignment = getLoadStoreAlignment(I);
return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
- TTI.isLegalMaskedGather(Ty, Alignment))
+ TTI.isLegalMaskedGather(VTy, Alignment))
: !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
- TTI.isLegalMaskedScatter(Ty, Alignment));
+ TTI.isLegalMaskedScatter(VTy, Alignment));
}
case Instruction::UDiv:
case Instruction::SDiv:
@@ -5005,7 +4998,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
// If the instruction is a store located in a predicated block, it will be
// scalarized.
- if (isScalarWithPredication(I))
+ if (isScalarWithPredication(I, VF))
return false;
// If the instruction's allocated size doesn't equal it's type size, it
@@ -5056,7 +5049,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
<< *I << "\n");
return;
}
- if (isScalarWithPredication(I)) {
+ if (isScalarWithPredication(I, VF)) {
LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
<< *I << "\n");
return;
@@ -5531,10 +5524,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
}
- // For scalable vectors, don't use tail folding as this is currently not yet
- // supported. The code is likely to have ended up here if the tripcount is
- // low, in which case it makes sense not to use scalable vectors.
- if (MaxFactors.ScalableVF.isVector())
+ // For scalable vectors, don't use tail folding for low trip counts or when
+ // optimizing for code size. We only permit it if the user has explicitly
+ // requested it.
+ if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
+ ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
+ MaxFactors.ScalableVF.isVector())
MaxFactors.ScalableVF = ElementCount::getScalable(0);
// If we don't know the precise trip count, or if the trip count that we
@@ -5849,10 +5844,8 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
const Loop &L, ElementCount VF) const {
// Cross iteration phis such as reductions need special handling and are
// currently unsupported.
- if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
- return Legal->isFirstOrderRecurrence(&Phi) ||
- Legal->isReductionVariable(&Phi);
- }))
+ if (any_of(L.getHeader()->phis(),
+ [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
return false;
// Phis with uses outside of the loop require special handling and are
@@ -5978,11 +5971,29 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
unsigned MinWidth = -1U;
unsigned MaxWidth = 8;
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
- for (Type *T : ElementTypesInLoop) {
- MinWidth = std::min<unsigned>(
- MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
- MaxWidth = std::max<unsigned>(
- MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+ // For in-loop reductions, no element types are added to ElementTypesInLoop
+ // if there are no loads/stores in the loop. In this case, check through the
+ // reduction variables to determine the maximum width.
+ if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
+ // Reset MaxWidth so that we can find the smallest type used by recurrences
+ // in the loop.
+ MaxWidth = -1U;
+ for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
+ const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
+ // When finding the min width used by the recurrence we need to account
+ // for casts on the input operands of the recurrence.
+ MaxWidth = std::min<unsigned>(
+ MaxWidth, std::min<unsigned>(
+ RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
+ RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
+ }
+ } else {
+ for (Type *T : ElementTypesInLoop) {
+ MinWidth = std::min<unsigned>(
+ MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+ MaxWidth = std::max<unsigned>(
+ MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+ }
}
return {MinWidth, MaxWidth};
}
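// Editor's note (illustrative, not part of the patch): for a loop whose only
// recurrence is an in-loop reduction and which has no loads or stores,
// ElementTypesInLoop stays empty, so the width is now derived from the
// reduction itself. Assuming an i32 add reduction whose inputs are extended
// from i8 values:
//   MaxWidth = min(getMinWidthCastToRecurrenceTypeInBits() /* 8 */,
//                  RdxDesc.getRecurrenceType() width        /* 32 */) = 8
// Taking the minimum over all reductions lets the cost model size the VF by
// the narrowest value feeding a recurrence instead of defaulting to 8/-1U.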
@@ -6022,18 +6033,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
if (auto *ST = dyn_cast<StoreInst>(&I))
T = ST->getValueOperand()->getType();
- // Ignore loaded pointer types and stored pointer types that are not
- // vectorizable.
- //
- // FIXME: The check here attempts to predict whether a load or store will
- // be vectorized. We only know this for certain after a VF has
- // been selected. Here, we assume that if an access can be
- // vectorized, it will be. We should also look at extending this
- // optimization to non-pointer types.
- //
- if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
- !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
- continue;
+ assert(T->isSized() &&
+ "Expected the load/store/recurrence type to be sized");
ElementTypesInLoop.insert(T);
}
@@ -6475,7 +6476,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
return RUs;
}
-bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
+ ElementCount VF) {
// TODO: Cost model for emulated masked load/store is completely
// broken. This hack guides the cost model to use an artificially
// high enough value to practically disable vectorization with such
@@ -6484,8 +6486,7 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
// from moving "masked load/store" check from legality to cost model.
// Masked Load/Gather emulation was previously never allowed.
// Limited number of Masked Store/Scatter emulation was allowed.
- assert(isPredicatedInst(I) &&
- "Expecting a scalar emulated instruction");
+ assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
return isa<LoadInst>(I) ||
(isa<StoreInst>(I) &&
NumPredStores > NumberOfStoresToPredicate);
@@ -6512,13 +6513,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
if (!blockNeedsPredicationForAnyReason(BB))
continue;
for (Instruction &I : *BB)
- if (isScalarWithPredication(&I)) {
+ if (isScalarWithPredication(&I, VF)) {
ScalarCostsTy ScalarCosts;
// Do not apply discount if scalable, because that would lead to
// invalid scalarization costs.
// Do not apply discount logic if hacked cost is needed
// for emulated masked memrefs.
- if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
+ if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
// Remember that BB will remain after vectorization.
@@ -6554,7 +6555,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// If the instruction is scalar with predication, it will be analyzed
// separately. We ignore it within the context of PredInst.
- if (isScalarWithPredication(I))
+ if (isScalarWithPredication(I, VF))
return false;
// If any of the instruction's operands are uniform after vectorization,
@@ -6601,7 +6602,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
- if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+ if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(I->getType(), VF)),
APInt::getAllOnes(VF.getFixedValue()), true, false);
@@ -6764,7 +6765,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// If we have a predicated load/store, it will need extra i1 extracts and
// conditional branches, but may not be executed for each vector lane. Scale
// the cost by the probability of executing the predicated block.
- if (isPredicatedInst(I)) {
+ if (isPredicatedInst(I, VF)) {
Cost /= getReciprocalPredBlockProb();
// Add the cost of an i1 extract and a branch
@@ -6775,7 +6776,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
/*Insert=*/false, /*Extract=*/true);
Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
- if (useEmulatedMaskMemRefHack(I))
+ if (useEmulatedMaskMemRefHack(I, VF))
// Artificially setting to a high enough value to practically disable
// vectorization with such operations.
Cost = 3000000;
@@ -7182,7 +7183,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// predicated uniform stores. Today they are treated as any other
// predicated store (see added test cases in
// invariant-store-vectorization.ll).
- if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+ if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
NumPredStores++;
if (Legal->isUniformMemOp(I)) {
@@ -7192,7 +7193,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
InstructionCost Cost;
if (isa<StoreInst>(&I) && VF.isScalable() &&
- isLegalGatherOrScatter(&I)) {
+ isLegalGatherOrScatter(&I, VF)) {
Cost = getGatherScatterCost(&I, VF);
setWideningDecision(&I, VF, CM_GatherScatter, Cost);
} else {
@@ -7234,7 +7235,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
}
InstructionCost GatherScatterCost =
- isLegalGatherOrScatter(&I)
+ isLegalGatherOrScatter(&I, VF)
? getGatherScatterCost(&I, VF) * NumAccesses
: InstructionCost::getInvalid();
@@ -7437,7 +7438,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// vector lane. Get the scalarization cost and scale this amount by the
// probability of executing the predicated block. If the instruction is not
// predicated, we fall through to the next case.
- if (VF.isVector() && isScalarWithPredication(I)) {
+ if (VF.isVector() && isScalarWithPredication(I, VF)) {
InstructionCost Cost = 0;
// These instructions have a non-void type, so account for the phi nodes
@@ -7941,6 +7942,40 @@ VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
llvm_unreachable("No plan found!");
}
+static void AddRuntimeUnrollDisableMetaData(Loop *L) {
+ SmallVector<Metadata *, 4> MDs;
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDs.push_back(nullptr);
+ bool IsUnrollMetadata = false;
+ MDNode *LoopID = L->getLoopID();
+ if (LoopID) {
+ // First find existing loop unrolling disable metadata.
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD) {
+ const auto *S = dyn_cast<MDString>(MD->getOperand(0));
+ IsUnrollMetadata =
+ S && S->getString().startswith("llvm.loop.unroll.disable");
+ }
+ MDs.push_back(LoopID->getOperand(i));
+ }
+ }
+
+ if (!IsUnrollMetadata) {
+ // Add runtime unroll disable metadata.
+ LLVMContext &Context = L->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(
+ MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L->setLoopID(NewLoopID);
+ }
+}
+
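// Editor's note (illustrative, not part of the patch): when no
// llvm.loop.unroll.* operand exists yet, the helper above leaves the loop
// with metadata of roughly this form:
//   br i1 %cond, ..., !llvm.loop !0
//   !0 = distinct !{!0, <any pre-existing operands>, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}
// replaceOperandWith(0, NewLoopID) is what establishes the required
// self-reference in operand 0. The function body is unchanged; it has only
// been moved ahead of executePlan so the epilogue path can call it there.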
void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
VPlan &BestVPlan,
InnerLoopVectorizer &ILV,
@@ -7952,9 +7987,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
// 1. Create a new empty loop. Unlink the old loop and connect the new one.
VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
- State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
- State.TripCount = ILV.getOrCreateTripCount(nullptr);
- State.CanonicalIV = ILV.Induction;
+ Value *CanonicalIVStartValue;
+ std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
+ ILV.createVectorizedLoopSkeleton();
ILV.collectPoisonGeneratingRecipes(State);
ILV.printDebugTracesAtStart();
@@ -7968,8 +8003,35 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
+ BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
+ ILV.getOrCreateVectorTripCount(nullptr),
+ CanonicalIVStartValue, State);
BestVPlan.execute(&State);
+ // Keep all loop hints from the original loop on the vector loop (we'll
+ // replace the vectorizer-specific hints below).
+ MDNode *OrigLoopID = OrigLoop->getLoopID();
+
+ Optional<MDNode *> VectorizedLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+ LLVMLoopVectorizeFollowupVectorized});
+
+ Loop *L = LI->getLoopFor(State.CFG.PrevBB);
+ if (VectorizedLoopID.hasValue())
+ L->setLoopID(VectorizedLoopID.getValue());
+ else {
+ // Keep all loop hints from the original loop on the vector loop (we'll
+ // replace the vectorizer-specific hints below).
+ if (MDNode *LID = OrigLoop->getLoopID())
+ L->setLoopID(LID);
+
+ LoopVectorizeHints Hints(L, true, *ORE);
+ Hints.setAlreadyVectorized();
+ }
+ // Disable runtime unrolling when vectorizing the epilogue loop.
+ if (CanonicalIVStartValue)
+ AddRuntimeUnrollDisableMetaData(L);
+
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
ILV.fixVectorizedLoop(State);
@@ -8032,66 +8094,16 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
}
}
-Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
-
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
-Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx,
- Value *Step,
- Instruction::BinaryOps BinOp) {
- // When unrolling and the VF is 1, we only need to add a simple scalar.
- Type *Ty = Val->getType();
- assert(!Ty->isVectorTy() && "Val must be a scalar");
-
- if (Ty->isFloatingPointTy()) {
- // Floating-point operations inherit FMF via the builder's flags.
- Value *MulOp = Builder.CreateFMul(StartIdx, Step);
- return Builder.CreateBinOp(BinOp, Val, MulOp);
- }
- return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction");
-}
-
-static void AddRuntimeUnrollDisableMetaData(Loop *L) {
- SmallVector<Metadata *, 4> MDs;
- // Reserve first location for self reference to the LoopID metadata node.
- MDs.push_back(nullptr);
- bool IsUnrollMetadata = false;
- MDNode *LoopID = L->getLoopID();
- if (LoopID) {
- // First find existing loop unrolling disable metadata.
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
- if (MD) {
- const auto *S = dyn_cast<MDString>(MD->getOperand(0));
- IsUnrollMetadata =
- S && S->getString().startswith("llvm.loop.unroll.disable");
- }
- MDs.push_back(LoopID->getOperand(i));
- }
- }
-
- if (!IsUnrollMetadata) {
- // Add runtime unroll disable metadata.
- LLVMContext &Context = L->getHeader()->getContext();
- SmallVector<Metadata *, 1> DisableOperands;
- DisableOperands.push_back(
- MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
- MDNode *DisableNode = MDNode::get(Context, DisableOperands);
- MDs.push_back(DisableNode);
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- L->setLoopID(NewLoopID);
- }
-}
-
//===--------------------------------------------------------------------===//
// EpilogueVectorizerMainLoop
//===--------------------------------------------------------------------===//
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
+std::pair<BasicBlock *, Value *>
+EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
MDNode *OrigLoopID = OrigLoop->getLoopID();
Loop *Lp = createVectorLoopSkeleton("");
@@ -8120,24 +8132,16 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
// Generate the induction variable.
- OldInduction = Legal->getPrimaryInduction();
- Type *IdxTy = Legal->getWidestInductionType();
- Value *StartIdx = ConstantInt::get(IdxTy, 0);
-
- IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt());
- Value *Step = getRuntimeVF(B, IdxTy, VF * UF);
Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
EPI.VectorTripCount = CountRoundDown;
- Induction =
- createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
- getDebugLocFromInstOrOperands(OldInduction));
+ createHeaderBranch(Lp);
// Skip induction resume value creation here because they will be created in
// the second pass. If we created them here, they wouldn't be used anyway,
// because the vplan in the second pass still contains the inductions from the
// original loop.
- return completeLoopSkeleton(Lp, OrigLoopID);
+ return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
}
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -8219,7 +8223,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-BasicBlock *
+std::pair<BasicBlock *, Value *>
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
MDNode *OrigLoopID = OrigLoop->getLoopID();
Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
@@ -8275,6 +8279,25 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
+ // The vec.epilog.iter.check block may contain Phi nodes from reductions which
+ // merge control-flow from the latch block and the middle block. Update the
+ // incoming values here and move the Phi into the preheader.
+ SmallVector<PHINode *, 4> PhisInBlock;
+ for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
+ PhisInBlock.push_back(&Phi);
+
+ for (PHINode *Phi : PhisInBlock) {
+ Phi->replaceIncomingBlockWith(
+ VecEpilogueIterationCountCheck->getSinglePredecessor(),
+ VecEpilogueIterationCountCheck);
+ Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
+ if (EPI.SCEVSafetyCheck)
+ Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
+ if (EPI.MemSafetyCheck)
+ Phi->removeIncomingValue(EPI.MemSafetyCheck);
+ Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
+ }
+
// Generate a resume induction for the vector epilogue and put it in the
// vector epilogue preheader
Type *IdxTy = Legal->getWidestInductionType();
@@ -8285,13 +8308,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
EPI.MainLoopIterationCountCheck);
// Generate the induction variable.
- OldInduction = Legal->getPrimaryInduction();
- Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
- Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
- Value *StartIdx = EPResumeVal;
- Induction =
- createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
- getDebugLocFromInstOrOperands(OldInduction));
+ createHeaderBranch(Lp);
// Generate induction resume values. These variables save the new starting
// indexes for the scalar loop. They are used to test if there are any tail
@@ -8300,12 +8317,10 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
// check, then the resume value for the induction variable comes from
// the trip count of the main vector loop, hence passing the AdditionalBypass
// argument.
- createInductionResumeValues(Lp, CountRoundDown,
- {VecEpilogueIterationCountCheck,
- EPI.VectorTripCount} /* AdditionalBypass */);
+ createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
+ EPI.VectorTripCount} /* AdditionalBypass */);
- AddRuntimeUnrollDisableMetaData(Lp);
- return completeLoopSkeleton(Lp, OrigLoopID);
+ return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
}
BasicBlock *
@@ -8447,33 +8462,22 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
// Introduce the early-exit compare IV <= BTC to form header block mask.
- // This is used instead of IV < TC because TC may wrap, unlike BTC.
- // Start by constructing the desired canonical IV in the header block.
- VPValue *IV = nullptr;
- if (Legal->getPrimaryInduction())
- IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
- else {
- VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
- auto *IVRecipe = new VPWidenCanonicalIVRecipe();
- HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi());
- IV = IVRecipe;
- }
+ // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
+ // constructing the desired canonical IV in the header block as its first
+ // non-phi instruction.
+ assert(CM.foldTailByMasking() && "must fold the tail");
+ VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
+ auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
+ auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
+ HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
- // Create the block in mask as the first non-phi instruction in the block.
VPBuilder::InsertPointGuard Guard(Builder);
- auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
- Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
-
- VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
- bool TailFolded = !CM.isScalarEpilogueAllowed();
-
- if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
- // While ActiveLaneMask is a binary op that consumes the loop tripcount
- // as a second argument, we only pass the IV here and extract the
- // tripcount from the transform state where codegen of the VP instructions
- // happen.
- BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
+ Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
+ if (CM.TTI.emitGetActiveLaneMask()) {
+ VPValue *TC = Plan->getOrCreateTripCount();
+ BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
} else {
+ VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
}
return BlockMaskCache[BB] = BlockMask;
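The header-block mask built above makes a lane of the current vector iteration active only while its scalar iteration index is still below the trip count, either via llvm.get.active.lane.mask or via an unsigned compare against the backedge-taken count. A rough, self-contained scalar sketch of the two forms (plain C++, not part of this patch; activeLaneMask, icmpULEMask, IV, TC, BTC and VF are illustrative names):

#include <cstdint>
#include <vector>

// Scalar model of the tail-folding header mask for one vector iteration.
// activeLaneMask mirrors the semantics of llvm.get.active.lane.mask(IV, TC):
// lane L is active iff IV + L < TC, with the addition treated as non-wrapping.
std::vector<bool> activeLaneMask(uint64_t IV, uint64_t TC, unsigned VF) {
  std::vector<bool> Mask(VF);
  for (unsigned L = 0; L < VF; ++L)
    Mask[L] = IV < TC && L < TC - IV;
  return Mask;
}

// icmpULEMask mirrors the ICmpULE(WidenedIV, BTC) form, with BTC == TC - 1.
// Comparing against BTC avoids the corner case where TC itself wraps to 0,
// which is why the comment above prefers IV <= BTC over IV < TC.
std::vector<bool> icmpULEMask(uint64_t IV, uint64_t BTC, unsigned VF) {
  std::vector<bool> Mask(VF);
  for (unsigned L = 0; L < VF; ++L)
    Mask[L] = IV + L <= BTC;
  return Mask;
}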
@@ -8621,7 +8625,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
VFRange &Range) const {
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
+ [this, CI](ElementCount VF) {
+ return CM.isScalarWithPredication(CI, VF);
+ },
Range);
if (IsPredicated)
@@ -8661,7 +8667,8 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
// scalarization is profitable or it is predicated.
auto WillScalarize = [this, I](ElementCount VF) -> bool {
return CM.isScalarAfterVectorization(I, VF) ||
- CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
+ CM.isProfitableToScalarize(I, VF) ||
+ CM.isScalarWithPredication(I, VF);
};
return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
Range);
@@ -8719,7 +8726,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
void VPRecipeBuilder::fixHeaderPhis() {
BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
- for (VPWidenPHIRecipe *R : PhisToFix) {
+ for (VPHeaderPHIRecipe *R : PhisToFix) {
auto *PN = cast<PHINode>(R->getUnderlyingValue());
VPRecipeBase *IncR =
getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
@@ -8735,7 +8742,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
Range);
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
+ [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
Range);
// Even if the instruction is not marked as uniform, there are certain
@@ -8861,7 +8868,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
return toVPRecipeResult(Recipe);
- VPWidenPHIRecipe *PhiRecipe = nullptr;
+ VPHeaderPHIRecipe *PhiRecipe = nullptr;
if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
VPValue *StartV = Operands[0];
if (Legal->isReductionVariable(Phi)) {
@@ -8882,11 +8889,14 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
PhisToFix.push_back(PhiRecipe);
} else {
- // TODO: record start and backedge value for remaining pointer induction
- // phis.
+ // TODO: record backedge value for remaining pointer induction phis.
assert(Phi->getType()->isPointerTy() &&
"only pointer phis should be handled here");
- PhiRecipe = new VPWidenPHIRecipe(Phi);
+ assert(Legal->getInductionVars().count(Phi) &&
+ "Not an induction variable");
+ InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
+ VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
+ PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
}
return toVPRecipeResult(PhiRecipe);
@@ -8966,6 +8976,40 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
}
}
+// Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
+// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
+// BranchOnCount VPInstruction to the latch.
+static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
+ bool HasNUW, bool IsVPlanNative) {
+ Value *StartIdx = ConstantInt::get(IdxTy, 0);
+ auto *StartV = Plan.getOrAddVPValue(StartIdx);
+
+ auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
+ VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
+ if (IsVPlanNative)
+ Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
+ Header->insert(CanonicalIVPHI, Header->begin());
+
+ auto *CanonicalIVIncrement =
+ new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
+ : VPInstruction::CanonicalIVIncrement,
+ {CanonicalIVPHI}, DL);
+ CanonicalIVPHI->addOperand(CanonicalIVIncrement);
+
+ VPBasicBlock *EB = TopRegion->getExitBasicBlock();
+ if (IsVPlanNative) {
+ EB = cast<VPBasicBlock>(EB->getSinglePredecessor());
+ EB->setCondBit(nullptr);
+ }
+ EB->appendRecipe(CanonicalIVIncrement);
+
+ auto *BranchOnCount =
+ new VPInstruction(VPInstruction::BranchOnCount,
+ {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
+ EB->appendRecipe(BranchOnCount);
+}
+
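As a reading aid, the three recipes added by this helper (the canonical IV phi, the CanonicalIVIncrement{NUW} increment by VF * UF, and the BranchOnCount exit test in the latch) lower to roughly the following scalar control-flow skeleton. This is a hedged sketch in plain C++ with illustrative names (runVectorLoop, processVectorIteration, VFxUF), not the generated IR; when vectorizing the epilogue loop the start value is the resume value instead of 0 (see VPlan::prepareToExecute further down).

#include <cstdint>

// VectorTripCount is assumed to be a multiple of VFxUF (or the tail is folded
// by masking), so the equality-based exit test below is exact.
void runVectorLoop(uint64_t VectorTripCount, uint64_t VFxUF,
                   void (*processVectorIteration)(uint64_t Index)) {
  uint64_t Index = 0;                 // VPCanonicalIVPHIRecipe, start = 0
  do {
    processVectorIteration(Index);    // the recipes in the loop body
    Index += VFxUF;                   // CanonicalIVIncrement{NUW}
  } while (Index != VectorTripCount); // BranchOnCount: exit once equal
}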
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
const MapVector<Instruction *, Instruction *> &SinkAfter) {
@@ -9033,6 +9077,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
auto Plan = std::make_unique<VPlan>(TopRegion);
+ Instruction *DLInst =
+ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
+ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
+ DLInst ? DLInst->getDebugLoc() : DebugLoc(),
+ !CM.foldTailByMasking(), false);
+
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
@@ -9194,6 +9244,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
+ VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
VPlanTransforms::removeRedundantInductionCasts(*Plan);
// Now that sink-after is done, move induction recipes for optimized truncates
@@ -9325,6 +9376,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
OrigLoop, Plan,
[this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
DeadInstructions, *PSE.getSE());
+
+ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
+ true, true);
return Plan;
}
@@ -9414,16 +9468,19 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
}
// If tail is folded by masking, introduce selects between the phi
- // and the live-out instruction of each reduction, at the end of the latch.
+ // and the live-out instruction of each reduction, at the beginning of the
+ // dedicated latch block.
if (CM.foldTailByMasking()) {
+ Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
if (!PhiR || PhiR->isInLoop())
continue;
- Builder.setInsertPoint(LatchVPBB);
VPValue *Cond =
RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
VPValue *Red = PhiR->getBackedgeValue();
+ assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
+ "reduction recipe must be defined before latch");
Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
}
}
@@ -9682,9 +9739,8 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
- State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
- getStartValue()->getLiveInIRValue(),
- getTruncInst(), getVPValue(0), State);
+ auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
+ State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV);
}
void VPWidenPHIRecipe::execute(VPTransformState &State) {
@@ -10013,7 +10069,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
}
- State.set(getVPSingleValue(), NewLI, Part);
+ State.set(this, NewLI, Part);
}
}
@@ -10561,6 +10617,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Checks);
VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
+
+        // Ensure that the start values for any VPReductionPHIRecipes are
+        // updated before vectorizing the epilogue loop.
+ VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock();
+ for (VPRecipeBase &R : Header->phis()) {
+ if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
+ if (auto *Resume = MainILV.getReductionResumeValue(
+ ReductionPhi->getRecurrenceDescriptor())) {
+ VPValue *StartVal = new VPValue(Resume);
+ BestEpiPlan.addExternalDef(StartVal);
+ ReductionPhi->setOperand(0, StartVal);
+ }
+ }
+ }
+
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
DT);
++LoopsEpilogueVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 37ae13666f7a..99c265fc5101 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -435,7 +435,7 @@ struct InstructionsState {
}
/// Some of the instructions in the list have alternate opcodes.
- bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
+ bool isAltShuffle() const { return AltOp != MainOp; }
bool isOpcodeOrAlt(Instruction *I) const {
unsigned CheckedOpcode = I->getOpcode();
@@ -581,7 +581,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
}
/// \returns the AA location that is being access by the instruction.
-static MemoryLocation getLocation(Instruction *I, AAResults *AA) {
+static MemoryLocation getLocation(Instruction *I) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return MemoryLocation::get(SI);
if (LoadInst *LI = dyn_cast<LoadInst>(I))
@@ -1417,7 +1417,11 @@ public:
HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
} else if (NumFreeOpsHash.NumOfAPOs == Min &&
NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
- ++HashMap[NumFreeOpsHash.Hash].first;
+ auto It = HashMap.find(NumFreeOpsHash.Hash);
+ if (It == HashMap.end())
+ HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
+ else
+ ++It->second.first;
}
}
// Select the lane with the minimum counter.
@@ -2019,9 +2023,7 @@ private:
}
/// Some of the instructions in the list have alternate opcodes.
- bool isAltShuffle() const {
- return getOpcode() != getAltOpcode();
- }
+ bool isAltShuffle() const { return MainOp != AltOp; }
bool isOpcodeOrAlt(Instruction *I) const {
unsigned CheckedOpcode = I->getOpcode();
@@ -2519,12 +2521,11 @@ private:
SD->IsScheduled = true;
LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
- ScheduleData *BundleMember = SD;
- while (BundleMember) {
- if (BundleMember->Inst != BundleMember->OpValue) {
- BundleMember = BundleMember->NextInBundle;
+ for (ScheduleData *BundleMember = SD; BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
+ if (BundleMember->Inst != BundleMember->OpValue)
continue;
- }
+
// Handle the def-use chain dependencies.
// Decrement the unscheduled counter and insert to ready list if ready.
@@ -2589,7 +2590,6 @@ private:
<< "SLP: gets ready (mem): " << *DepBundle << "\n");
}
}
- BundleMember = BundleMember->NextInBundle;
}
}
@@ -2618,6 +2618,10 @@ private:
}
}
+ /// Build a bundle from the ScheduleData nodes corresponding to the
+ /// scalar instruction for each lane.
+ ScheduleData *buildBundle(ArrayRef<Value *> VL);
+
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
@@ -3040,7 +3044,7 @@ Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
void BoUpSLP::reorderTopToBottom() {
// Maps VF to the graph nodes.
- DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries;
+ DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
// ExtractElement gather nodes which can be vectorized and need to handle
// their ordering.
DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
@@ -3051,6 +3055,29 @@ void BoUpSLP::reorderTopToBottom() {
const std::unique_ptr<TreeEntry> &TE) {
if (Optional<OrdersType> CurrentOrder =
getReorderingData(*TE.get(), /*TopToBottom=*/true)) {
+      // Do not include the ordering for nodes used in alternate opcode
+      // vectorization; it is better to reorder them during the bottom-to-top
+      // stage. If we followed the order here, it would reorder the whole
+      // graph, although it is only profitable to reorder the subgraph rooted
+      // at the alternate opcode vectorization node. Such nodes already end up
+      // with a shuffle instruction, and it is enough to change that shuffle
+      // rather than rotate the scalars for the whole graph.
+ unsigned Cnt = 0;
+ const TreeEntry *UserTE = TE.get();
+ while (UserTE && Cnt < RecursionMaxDepth) {
+ if (UserTE->UserTreeIndices.size() != 1)
+ break;
+ if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
+ return EI.UserTE->State == TreeEntry::Vectorize &&
+ EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
+ }))
+ return;
+ if (UserTE->UserTreeIndices.empty())
+ UserTE = nullptr;
+ else
+ UserTE = UserTE->UserTreeIndices.back().UserTE;
+ ++Cnt;
+ }
VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
if (TE->State != TreeEntry::Vectorize)
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
@@ -3066,7 +3093,7 @@ void BoUpSLP::reorderTopToBottom() {
// Try to find the most profitable order. We just are looking for the most
// used order and reorder scalar elements in the nodes according to this
// mostly used order.
- const SmallPtrSetImpl<TreeEntry *> &OrderedEntries = It->getSecond();
+ ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
// All operands are reordered and used only in this node - propagate the
// most used order to the user node.
MapVector<OrdersType, unsigned,
@@ -4459,6 +4486,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
CurrentOrder.clear();
return false;
}
+ if (ShouldKeepOrder)
+ CurrentOrder.clear();
return ShouldKeepOrder;
}
@@ -7202,6 +7231,33 @@ void BoUpSLP::optimizeGatherSequence() {
GatherShuffleSeq.clear();
}
+BoUpSLP::ScheduleData *
+BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
+ ScheduleData *Bundle = nullptr;
+ ScheduleData *PrevInBundle = nullptr;
+ for (Value *V : VL) {
+ ScheduleData *BundleMember = getScheduleData(V);
+ assert(BundleMember &&
+ "no ScheduleData for bundle member "
+ "(maybe not in same basic block)");
+ assert(BundleMember->isSchedulingEntity() &&
+ "bundle member already part of other bundle");
+ if (PrevInBundle) {
+ PrevInBundle->NextInBundle = BundleMember;
+ } else {
+ Bundle = BundleMember;
+ }
+ BundleMember->UnscheduledDepsInBundle = 0;
+ Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
+
+ // Group the instructions to a bundle.
+ BundleMember->FirstInBundle = Bundle;
+ PrevInBundle = BundleMember;
+ }
+ assert(Bundle && "Failed to find schedule bundle");
+ return Bundle;
+}
+
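For readers unfamiliar with the ScheduleData layout, buildBundle above threads the per-lane nodes into a singly linked list headed by the first node and accumulates the pending dependency counts on that head. A generic sketch of the linking pattern (plain C++; Node and its fields are illustrative stand-ins for ScheduleData, not the real type):

#include <cassert>
#include <vector>

struct Node {
  Node *FirstInBundle = nullptr;   // head of the bundle this node belongs to
  Node *NextInBundle = nullptr;    // next lane in the same bundle
  int UnscheduledDeps = 0;         // pending dependencies of this node
  int UnscheduledDepsInBundle = 0; // aggregate, meaningful on the head only
};

// Link the nodes of all lanes into one bundle and aggregate their pending
// dependencies on the head, mirroring the shape of buildBundle above.
Node *linkBundle(const std::vector<Node *> &Lanes) {
  Node *Head = nullptr, *Prev = nullptr;
  for (Node *N : Lanes) {
    assert(N && "every lane needs a node");
    if (Prev)
      Prev->NextInBundle = N;
    else
      Head = N;
    N->UnscheduledDepsInBundle = 0;
    Head->UnscheduledDepsInBundle += N->UnscheduledDeps;
    N->FirstInBundle = Head;
    Prev = N;
  }
  return Head;
}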
// Groups the instructions to a bundle (which is then a single scheduling entity)
// and schedules instructions until the bundle gets ready.
Optional<BoUpSLP::ScheduleData *>
@@ -7214,12 +7270,9 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// Initialize the instruction bundle.
Instruction *OldScheduleEnd = ScheduleEnd;
- ScheduleData *PrevInBundle = nullptr;
- ScheduleData *Bundle = nullptr;
- bool ReSchedule = false;
LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
- auto &&TryScheduleBundle = [this, OldScheduleEnd, SLP](bool ReSchedule,
+ auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
ScheduleData *Bundle) {
// The scheduling region got new instructions at the lower end (or it is a
// new region for the first bundle). This makes it necessary to
@@ -7263,39 +7316,28 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// Otherwise the compiler may crash trying to incorrectly calculate
// dependencies and emit instruction in the wrong order at the actual
// scheduling.
- TryScheduleBundle(/*ReSchedule=*/false, nullptr);
+ TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
return None;
}
}
+ bool ReSchedule = false;
for (Value *V : VL) {
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember &&
"no ScheduleData for bundle member (maybe not in same basic block)");
- if (BundleMember->IsScheduled) {
- // A bundle member was scheduled as single instruction before and now
- // needs to be scheduled as part of the bundle. We just get rid of the
- // existing schedule.
- LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
- << " was already scheduled\n");
- ReSchedule = true;
- }
- assert(BundleMember->isSchedulingEntity() &&
- "bundle member already part of other bundle");
- if (PrevInBundle) {
- PrevInBundle->NextInBundle = BundleMember;
- } else {
- Bundle = BundleMember;
- }
- BundleMember->UnscheduledDepsInBundle = 0;
- Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
-
- // Group the instructions to a bundle.
- BundleMember->FirstInBundle = Bundle;
- PrevInBundle = BundleMember;
+ if (!BundleMember->IsScheduled)
+ continue;
+ // A bundle member was scheduled as single instruction before and now
+ // needs to be scheduled as part of the bundle. We just get rid of the
+ // existing schedule.
+ LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
+ << " was already scheduled\n");
+ ReSchedule = true;
}
- assert(Bundle && "Failed to find schedule bundle");
- TryScheduleBundle(ReSchedule, Bundle);
+
+ auto *Bundle = buildBundle(VL);
+ TryScheduleBundleImpl(ReSchedule, Bundle);
if (!Bundle->isReady()) {
cancelScheduling(VL, S.OpValue);
return None;
@@ -7464,20 +7506,33 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
while (!WorkList.empty()) {
ScheduleData *SD = WorkList.pop_back_val();
-
- ScheduleData *BundleMember = SD;
- while (BundleMember) {
+ for (ScheduleData *BundleMember = SD; BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
assert(isInSchedulingRegion(BundleMember));
- if (!BundleMember->hasValidDependencies()) {
-
- LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
- << "\n");
- BundleMember->Dependencies = 0;
- BundleMember->resetUnscheduledDeps();
+ if (BundleMember->hasValidDependencies())
+ continue;
- // Handle def-use chain dependencies.
- if (BundleMember->OpValue != BundleMember->Inst) {
- ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
+ LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
+ << "\n");
+ BundleMember->Dependencies = 0;
+ BundleMember->resetUnscheduledDeps();
+
+ // Handle def-use chain dependencies.
+ if (BundleMember->OpValue != BundleMember->Inst) {
+ ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
+ if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ }
+ } else {
+ for (User *U : BundleMember->Inst->users()) {
+ assert(isa<Instruction>(U) &&
+ "user of instruction must be instruction");
+ ScheduleData *UseSD = getScheduleData(U);
if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
BundleMember->Dependencies++;
ScheduleData *DestBundle = UseSD->FirstInBundle;
@@ -7486,89 +7541,69 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
if (!DestBundle->hasValidDependencies())
WorkList.push_back(DestBundle);
}
- } else {
- for (User *U : BundleMember->Inst->users()) {
- if (isa<Instruction>(U)) {
- ScheduleData *UseSD = getScheduleData(U);
- if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
- BundleMember->Dependencies++;
- ScheduleData *DestBundle = UseSD->FirstInBundle;
- if (!DestBundle->IsScheduled)
- BundleMember->incrementUnscheduledDeps(1);
- if (!DestBundle->hasValidDependencies())
- WorkList.push_back(DestBundle);
- }
- } else {
- // I'm not sure if this can ever happen. But we need to be safe.
- // This lets the instruction/bundle never be scheduled and
- // eventually disable vectorization.
- BundleMember->Dependencies++;
- BundleMember->incrementUnscheduledDeps(1);
- }
- }
}
+ }
- // Handle the memory dependencies.
- ScheduleData *DepDest = BundleMember->NextLoadStore;
- if (DepDest) {
- Instruction *SrcInst = BundleMember->Inst;
- MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
- bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
- unsigned numAliased = 0;
- unsigned DistToSrc = 1;
-
- while (DepDest) {
- assert(isInSchedulingRegion(DepDest));
-
- // We have two limits to reduce the complexity:
- // 1) AliasedCheckLimit: It's a small limit to reduce calls to
- // SLP->isAliased (which is the expensive part in this loop).
- // 2) MaxMemDepDistance: It's for very large blocks and it aborts
- // the whole loop (even if the loop is fast, it's quadratic).
- // It's important for the loop break condition (see below) to
- // check this limit even between two read-only instructions.
- if (DistToSrc >= MaxMemDepDistance ||
- ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
- (numAliased >= AliasedCheckLimit ||
- SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
-
- // We increment the counter only if the locations are aliased
- // (instead of counting all alias checks). This gives a better
- // balance between reduced runtime and accurate dependencies.
- numAliased++;
-
- DepDest->MemoryDependencies.push_back(BundleMember);
- BundleMember->Dependencies++;
- ScheduleData *DestBundle = DepDest->FirstInBundle;
- if (!DestBundle->IsScheduled) {
- BundleMember->incrementUnscheduledDeps(1);
- }
- if (!DestBundle->hasValidDependencies()) {
- WorkList.push_back(DestBundle);
- }
- }
- DepDest = DepDest->NextLoadStore;
-
- // Example, explaining the loop break condition: Let's assume our
- // starting instruction is i0 and MaxMemDepDistance = 3.
- //
- // +--------v--v--v
- // i0,i1,i2,i3,i4,i5,i6,i7,i8
- // +--------^--^--^
- //
- // MaxMemDepDistance let us stop alias-checking at i3 and we add
- // dependencies from i0 to i3,i4,.. (even if they are not aliased).
- // Previously we already added dependencies from i3 to i6,i7,i8
- // (because of MaxMemDepDistance). As we added a dependency from
- // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
- // and we can abort this loop at i6.
- if (DistToSrc >= 2 * MaxMemDepDistance)
- break;
- DistToSrc++;
+ // Handle the memory dependencies (if any).
+ ScheduleData *DepDest = BundleMember->NextLoadStore;
+ if (!DepDest)
+ continue;
+ Instruction *SrcInst = BundleMember->Inst;
+      assert(SrcInst->mayReadOrWriteMemory() &&
+             "NextLoadStore list for non-memory-affecting bundle?");
+ MemoryLocation SrcLoc = getLocation(SrcInst);
+ bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+ unsigned numAliased = 0;
+ unsigned DistToSrc = 1;
+
+      for (; DepDest; DepDest = DepDest->NextLoadStore) {
+ assert(isInSchedulingRegion(DepDest));
+
+ // We have two limits to reduce the complexity:
+ // 1) AliasedCheckLimit: It's a small limit to reduce calls to
+ // SLP->isAliased (which is the expensive part in this loop).
+ // 2) MaxMemDepDistance: It's for very large blocks and it aborts
+ // the whole loop (even if the loop is fast, it's quadratic).
+ // It's important for the loop break condition (see below) to
+ // check this limit even between two read-only instructions.
+ if (DistToSrc >= MaxMemDepDistance ||
+ ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
+ (numAliased >= AliasedCheckLimit ||
+ SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
+
+ // We increment the counter only if the locations are aliased
+ // (instead of counting all alias checks). This gives a better
+ // balance between reduced runtime and accurate dependencies.
+ numAliased++;
+
+ DepDest->MemoryDependencies.push_back(BundleMember);
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = DepDest->FirstInBundle;
+ if (!DestBundle->IsScheduled) {
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ if (!DestBundle->hasValidDependencies()) {
+ WorkList.push_back(DestBundle);
}
}
+
+ // Example, explaining the loop break condition: Let's assume our
+ // starting instruction is i0 and MaxMemDepDistance = 3.
+ //
+ // +--------v--v--v
+ // i0,i1,i2,i3,i4,i5,i6,i7,i8
+ // +--------^--^--^
+ //
+        // MaxMemDepDistance lets us stop alias-checking at i3 and we add
+ // dependencies from i0 to i3,i4,.. (even if they are not aliased).
+ // Previously we already added dependencies from i3 to i6,i7,i8
+ // (because of MaxMemDepDistance). As we added a dependency from
+ // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
+ // and we can abort this loop at i6.
+ if (DistToSrc >= 2 * MaxMemDepDistance)
+ break;
+ DistToSrc++;
}
- BundleMember = BundleMember->NextInBundle;
}
if (InsertInReadyList && SD->isReady()) {
ReadyInsts.push_back(SD);
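The memory-dependency scan above bounds its worst-case quadratic cost with two limits, and the i0..i8 comment explains why the scan can stop once it has gone 2 * MaxMemDepDistance past the source. A stripped-down sketch of that windowing scheme (plain C++; the alias query is abstracted to a callback, and the names and the accesses-by-index model are illustrative, not the ScheduleData machinery):

#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

// Scan the accesses after Src and record conservative dependencies.
//  - AliasedCheckLimit caps how many times the (expensive) alias query may
//    report aliasing before dependencies are added unconditionally.
//  - Once DistToSrc reaches MaxMemDepDistance, dependencies are added without
//    querying at all; past 2 * MaxMemDepDistance the dependencies recorded by
//    earlier sources already cover the rest transitively, so the scan stops.
void scanMemDeps(size_t Src, const std::vector<bool> &MayWrite,
                 const std::function<bool(size_t, size_t)> &IsAliased,
                 unsigned AliasedCheckLimit, unsigned MaxMemDepDistance,
                 std::vector<std::pair<size_t, size_t>> &Deps) {
  unsigned NumAliased = 0;
  unsigned DistToSrc = 1;
  for (size_t Dst = Src + 1; Dst < MayWrite.size(); ++Dst) {
    if (DistToSrc >= MaxMemDepDistance ||
        ((MayWrite[Src] || MayWrite[Dst]) &&
         (NumAliased >= AliasedCheckLimit || IsAliased(Src, Dst)))) {
      ++NumAliased;
      Deps.emplace_back(Src, Dst); // Dst must be scheduled after Src
    }
    if (DistToSrc >= 2 * MaxMemDepDistance)
      break;
    ++DistToSrc;
  }
}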
@@ -7638,8 +7673,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
// Move the scheduled instruction(s) to their dedicated places, if not
// there yet.
- ScheduleData *BundleMember = picked;
- while (BundleMember) {
+ for (ScheduleData *BundleMember = picked; BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
Instruction *pickedInst = BundleMember->Inst;
if (pickedInst->getNextNode() != LastScheduledInst) {
BS->BB->getInstList().remove(pickedInst);
@@ -7647,7 +7682,6 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
pickedInst);
}
LastScheduledInst = pickedInst;
- BundleMember = BundleMember->NextInBundle;
}
BS->schedule(picked, ReadyInsts);
@@ -8045,8 +8079,11 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// If the target claims to have no vector registers don't attempt
// vectorization.
- if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)))
+ if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
+ LLVM_DEBUG(
+ dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
return false;
+ }
// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
@@ -8693,7 +8730,6 @@ class HorizontalReduction {
static RecurKind getRdxKind(Instruction *I) {
assert(I && "Expected instruction for reduction matching");
- TargetTransformInfo::ReductionFlags RdxFlags;
if (match(I, m_Add(m_Value(), m_Value())))
return RecurKind::Add;
if (match(I, m_Mul(m_Value(), m_Value())))
@@ -8767,7 +8803,6 @@ class HorizontalReduction {
return RecurKind::None;
}
- TargetTransformInfo::ReductionFlags RdxFlags;
switch (Pred) {
default:
return RecurKind::None;
@@ -9206,7 +9241,7 @@ private:
auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
- /*unsigned=*/false, CostKind);
+ /*IsUnsigned=*/false, CostKind);
CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
SclCondTy, RdxPred, CostKind) +
@@ -9571,8 +9606,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
return false;
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
- // Aggregate value is unlikely to be processed in vector register, we need to
- // extract scalars into scalar registers, so NeedExtraction is set true.
+ // Aggregate value is unlikely to be processed in vector register.
return tryToVectorizeList(BuildVectorOpds, R);
}
@@ -9598,7 +9632,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
function_ref<unsigned(T *)> Limit,
function_ref<bool(T *, T *)> Comparator,
function_ref<bool(T *, T *)> AreCompatible,
- function_ref<bool(ArrayRef<T *>, bool)> TryToVectorize,
+ function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
bool LimitForRegisterSize) {
bool Changed = false;
// Sort by type, parent, operands.
@@ -9627,7 +9661,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
// same/alternate ops only, this may result in some extra final
// vectorization.
if (NumElts > 1 &&
- TryToVectorize(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) {
+ TryToVectorizeHelper(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) {
// Success start over because instructions might have been changed.
Changed = true;
} else if (NumElts < Limit(*IncIt) &&
@@ -9638,7 +9672,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
// Final attempt to vectorize instructions with the same types.
if (Candidates.size() > 1 &&
(SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
- if (TryToVectorize(Candidates, /*LimitForRegisterSize=*/false)) {
+ if (TryToVectorizeHelper(Candidates, /*LimitForRegisterSize=*/false)) {
// Success start over because instructions might have been changed.
Changed = true;
} else if (LimitForRegisterSize) {
@@ -9649,7 +9683,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
++SameTypeIt;
unsigned NumElts = (SameTypeIt - It);
- if (NumElts > 1 && TryToVectorize(makeArrayRef(It, NumElts),
+ if (NumElts > 1 && TryToVectorizeHelper(makeArrayRef(It, NumElts),
/*LimitForRegisterSize=*/false))
Changed = true;
It = SameTypeIt;
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 65857f034210..e5dded3c0f1e 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -59,7 +59,7 @@ class VPRecipeBuilder {
/// Cross-iteration reduction & first-order recurrence phis for which we need
/// to add the incoming value from the backedge after all recipes have been
/// created.
- SmallVector<VPWidenPHIRecipe *, 4> PhisToFix;
+ SmallVector<VPHeaderPHIRecipe *, 4> PhisToFix;
/// Check if \p I can be widened at the start of \p Range and possibly
/// decrease the range such that the returned value holds for the entire \p
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1d9e71663cd2..a96c122db2a9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -677,10 +677,10 @@ void VPInstruction::generateInstruction(VPTransformState &State,
// Get first lane of vector induction variable.
Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
// Get the original loop tripcount.
- Value *ScalarTC = State.TripCount;
+ Value *ScalarTC = State.get(getOperand(1), Part);
auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
- auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue());
+ auto *PredTy = VectorType::get(Int1Ty, State.VF);
Instruction *Call = Builder.CreateIntrinsic(
Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, "active.lane.mask");
@@ -711,6 +711,51 @@ void VPInstruction::generateInstruction(VPTransformState &State,
}
break;
}
+
+ case VPInstruction::CanonicalIVIncrement:
+ case VPInstruction::CanonicalIVIncrementNUW: {
+ Value *Next = nullptr;
+ if (Part == 0) {
+ bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
+ auto *Phi = State.get(getOperand(0), 0);
+ // The loop step is equal to the vectorization factor (num of SIMD
+ // elements) times the unroll factor (num of SIMD instructions).
+ Value *Step =
+ createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
+ Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false);
+ } else {
+ Next = State.get(this, 0);
+ }
+
+ State.set(this, Next, Part);
+ break;
+ }
+ case VPInstruction::BranchOnCount: {
+ if (Part != 0)
+ break;
+ // First create the compare.
+ Value *IV = State.get(getOperand(0), Part);
+ Value *TC = State.get(getOperand(1), Part);
+ Value *Cond = Builder.CreateICmpEQ(IV, TC);
+
+ // Now create the branch.
+ auto *Plan = getParent()->getPlan();
+ VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
+ VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
+ if (Header->empty()) {
+ assert(EnableVPlanNativePath &&
+ "empty entry block only expected in VPlanNativePath");
+ Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
+ }
+ // TODO: Once the exit block is modeled in VPlan, use it instead of going
+ // through State.CFG.LastBB.
+ BasicBlock *Exit =
+ cast<BranchInst>(State.CFG.LastBB->getTerminator())->getSuccessor(0);
+
+ Builder.CreateCondBr(Cond, Exit, State.CFG.VPBB2IRBB[Header]);
+ Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
+ break;
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
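Note that only part 0 materialises the increment and the exit branch; the remaining parts reuse the part-0 value. The step added per vector iteration is VF * UF, which for scalable vectors is a runtime quantity (this is what createStepForVF, declared in VPlan.h below, computes as IR). A hedged scalar model of that arithmetic, with ElementCountModel as an illustrative stand-in for llvm::ElementCount:

#include <cstdint>

// Simplified stand-in for llvm::ElementCount: the real lane count is
// MinValue for fixed-width vectors and MinValue * vscale for scalable ones.
struct ElementCountModel {
  unsigned MinValue;
  bool Scalable;
};

// Scalar model of the VF * UF step used by CanonicalIVIncrement{NUW}.
uint64_t stepForVF(ElementCountModel VF, unsigned UF, uint64_t VScale) {
  uint64_t Lanes = VF.Scalable ? uint64_t(VF.MinValue) * VScale : VF.MinValue;
  return Lanes * UF;
}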
@@ -758,6 +803,15 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::FirstOrderRecurrenceSplice:
O << "first-order splice";
break;
+ case VPInstruction::CanonicalIVIncrement:
+ O << "VF * UF + ";
+ break;
+ case VPInstruction::CanonicalIVIncrementNUW:
+ O << "VF * UF +(nuw) ";
+ break;
+ case VPInstruction::BranchOnCount:
+ O << "branch-on-count ";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -786,23 +840,55 @@ void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
FMF = FMFNew;
}
-/// Generate the code inside the body of the vectorized loop. Assumes a single
-/// LoopVectorBody basic-block was created for this. Introduce additional
-/// basic-blocks as needed, and fill them all.
-void VPlan::execute(VPTransformState *State) {
- // -1. Check if the backedge taken count is needed, and if so build it.
+void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
+ Value *CanonicalIVStartValue,
+ VPTransformState &State) {
+ // Check if the trip count is needed, and if so build it.
+ if (TripCount && TripCount->getNumUsers()) {
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+ State.set(TripCount, TripCountV, Part);
+ }
+
+ // Check if the backedge taken count is needed, and if so build it.
if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
- Value *TC = State->TripCount;
- IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
- auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
+ IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
+ auto *TCMO = Builder.CreateSub(TripCountV,
+ ConstantInt::get(TripCountV->getType(), 1),
"trip.count.minus.1");
- auto VF = State->VF;
+ auto VF = State.VF;
Value *VTCMO =
VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
- for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part)
- State->set(BackedgeTakenCount, VTCMO, Part);
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+ State.set(BackedgeTakenCount, VTCMO, Part);
}
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+ State.set(&VectorTripCount, VectorTripCountV, Part);
+
+ // When vectorizing the epilogue loop, the canonical induction start value
+ // needs to be changed from zero to the value after the main vector loop.
+ if (CanonicalIVStartValue) {
+ VPValue *VPV = new VPValue(CanonicalIVStartValue);
+ addExternalDef(VPV);
+ auto *IV = getCanonicalIV();
+ assert(all_of(IV->users(),
+ [](const VPUser *U) {
+ auto *VPI = cast<VPInstruction>(U);
+ return VPI->getOpcode() ==
+ VPInstruction::CanonicalIVIncrement ||
+ VPI->getOpcode() ==
+ VPInstruction::CanonicalIVIncrementNUW;
+ }) &&
+ "the canonical IV should only be used by its increments when "
+ "resetting the start value");
+ IV->setOperand(0, VPV);
+ }
+}
+
+/// Generate the code inside the body of the vectorized loop. Assumes a single
+/// LoopVectorBody basic-block was created for this. Introduce additional
+/// basic-blocks as needed, and fill them all.
+void VPlan::execute(VPTransformState *State) {
// 0. Set the reverse mapping from VPValues to Values for code generation.
for (auto &Entry : Value2VPValue)
State->VPValue2Value[Entry.second] = Entry.first;
@@ -834,28 +920,6 @@ void VPlan::execute(VPTransformState *State) {
for (VPBlockBase *Block : depth_first(Entry))
Block->execute(State);
- // Fix the latch value of reduction and first-order recurrences phis in the
- // vector loop.
- VPBasicBlock *Header = Entry->getEntryBasicBlock();
- for (VPRecipeBase &R : Header->phis()) {
- auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R);
- if (!PhiR || !(isa<VPFirstOrderRecurrencePHIRecipe>(&R) ||
- isa<VPReductionPHIRecipe>(&R)))
- continue;
- // For first-order recurrences and in-order reduction phis, only a single
- // part is generated, which provides the last part from the previous
- // iteration. Otherwise all UF parts are generated.
- bool SinglePartNeeded = isa<VPFirstOrderRecurrencePHIRecipe>(&R) ||
- cast<VPReductionPHIRecipe>(&R)->isOrdered();
- unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
- for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
- Value *VecPhi = State->get(PhiR, Part);
- Value *Val = State->get(PhiR->getBackedgeValue(),
- SinglePartNeeded ? State->UF - 1 : Part);
- cast<PHINode>(VecPhi)->addIncoming(Val, VectorLatchBB);
- }
- }
-
// Setup branch terminator successors for VPBBs in VPBBsToFix based on
// VPBB's successors.
for (auto VPBB : State->CFG.VPBBsToFix) {
@@ -876,13 +940,19 @@ void VPlan::execute(VPTransformState *State) {
// 3. Merge the temporary latch created with the last basic-block filled.
BasicBlock *LastBB = State->CFG.PrevBB;
+ assert(isa<BranchInst>(LastBB->getTerminator()) &&
+ "Expected VPlan CFG to terminate with branch");
+
+ // Move both the branch and check from LastBB to VectorLatchBB.
+ auto *LastBranch = cast<BranchInst>(LastBB->getTerminator());
+ LastBranch->moveBefore(VectorLatchBB->getTerminator());
+ VectorLatchBB->getTerminator()->eraseFromParent();
+  // Move the condition so it is guaranteed to be next to the branch. This is
+  // only done to avoid excessive test updates.
+ // TODO: Remove special handling once the increments for all inductions are
+ // modeled explicitly in VPlan.
+ cast<Instruction>(LastBranch->getCondition())->moveBefore(LastBranch);
// Connect LastBB to VectorLatchBB to facilitate their merge.
- assert((EnableVPlanNativePath ||
- isa<UnreachableInst>(LastBB->getTerminator())) &&
- "Expected InnerLoop VPlan CFG to terminate with unreachable");
- assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) &&
- "Expected VPlan CFG to terminate with branch in NativePath");
- LastBB->getTerminator()->eraseFromParent();
BranchInst::Create(VectorLatchBB, LastBB);
// Merge LastBB with Latch.
@@ -891,6 +961,37 @@ void VPlan::execute(VPTransformState *State) {
assert(Merged && "Could not merge last basic block with latch.");
VectorLatchBB = LastBB;
+ // Fix the latch value of canonical, reduction and first-order recurrences
+ // phis in the vector loop.
+ VPBasicBlock *Header = Entry->getEntryBasicBlock();
+ if (Header->empty()) {
+ assert(EnableVPlanNativePath);
+ Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
+ }
+ for (VPRecipeBase &R : Header->phis()) {
+    // Skip phi-like recipes that generate their backedge values themselves.
+ // TODO: Model their backedge values explicitly.
+ if (isa<VPWidenIntOrFpInductionRecipe>(&R) || isa<VPWidenPHIRecipe>(&R))
+ continue;
+
+ auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
+ // For canonical IV, first-order recurrences and in-order reduction phis,
+ // only a single part is generated, which provides the last part from the
+ // previous iteration. For non-ordered reductions all UF parts are
+ // generated.
+ bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
+ isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
+ cast<VPReductionPHIRecipe>(PhiR)->isOrdered();
+ unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
+
+ for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
+ Value *Phi = State->get(PhiR, Part);
+ Value *Val = State->get(PhiR->getBackedgeValue(),
+ SinglePartNeeded ? State->UF - 1 : Part);
+ cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
+ }
+ }
+
// We do not attempt to preserve DT for outer loop vectorization currently.
if (!EnableVPlanNativePath)
updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB,
@@ -904,6 +1005,12 @@ void VPlan::print(raw_ostream &O) const {
O << "VPlan '" << Name << "' {";
+ if (VectorTripCount.getNumUsers() > 0) {
+ O << "\nLive-in ";
+ VectorTripCount.printAsOperand(O, SlotTracker);
+ O << " = vector-trip-count\n";
+ }
+
if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
O << "\nLive-in ";
BackedgeTakenCount->printAsOperand(O, SlotTracker);
@@ -1155,7 +1262,15 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
} else
O << " " << VPlanIngredient(IV);
}
+#endif
+bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
+ auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
+ auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep());
+ return StartC && StartC->isZero() && StepC && StepC->isOne();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-GEP ";
@@ -1255,7 +1370,7 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
O << Indent << "WIDEN ";
if (!isStore()) {
- getVPSingleValue()->printAsOperand(O, SlotTracker);
+ printAsOperand(O, SlotTracker);
O << " = ";
}
O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
@@ -1264,26 +1379,39 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
+ Value *Start = getStartValue()->getLiveInIRValue();
+ PHINode *EntryPart = PHINode::Create(
+ Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt());
+ EntryPart->addIncoming(Start, State.CFG.VectorPreHeader);
+ EntryPart->setDebugLoc(DL);
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+ State.set(this, EntryPart, Part);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+ printAsOperand(O, SlotTracker);
+ O << " = CANONICAL-INDUCTION";
+}
+#endif
+
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
- Value *CanonicalIV = State.CanonicalIV;
+ Value *CanonicalIV = State.get(getOperand(0), 0);
Type *STy = CanonicalIV->getType();
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
ElementCount VF = State.VF;
- assert(!VF.isScalable() && "the code following assumes non scalables ECs");
Value *VStart = VF.isScalar()
? CanonicalIV
- : Builder.CreateVectorSplat(VF.getKnownMinValue(),
- CanonicalIV, "broadcast");
+ : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
- SmallVector<Constant *, 8> Indices;
- for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
- Indices.push_back(
- ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane));
- // If VF == 1, there is only one iteration in the loop above, thus the
- // element pushed back into Indices is ConstantInt::get(STy, Part)
- Constant *VStep =
- VF.isScalar() ? Indices.back() : ConstantVector::get(Indices);
- // Add the consecutive indices to the vector value.
+ Value *VStep = createStepForVF(Builder, STy, VF, Part);
+ if (VF.isVector()) {
+ VStep = Builder.CreateVectorSplat(VF, VStep);
+      VStep = Builder.CreateAdd(VStep,
+                                Builder.CreateStepVector(VStep->getType()));
+ }
Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
State.set(this, CanonicalVectorIV, Part);
}
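Concretely, with a fixed VF of 4, UF of 2 and the scalar canonical IV equal to iv, the recipe produces {iv, iv+1, iv+2, iv+3} for part 0 and {iv+4, ..., iv+7} for part 1. A small illustrative sketch of the per-part lane values (plain C++ for a fixed-width VF; the real code builds a splat plus a step vector so that scalable VFs are covered as well):

#include <cstdint>
#include <vector>

// Lane L of unrolled part Part holds IV + Part * VF + L.
std::vector<uint64_t> widenCanonicalIV(uint64_t IV, unsigned VF,
                                       unsigned Part) {
  std::vector<uint64_t> Lanes(VF);
  for (unsigned L = 0; L < VF; ++L)
    Lanes[L] = IV + uint64_t(Part) * VF + L;
  return Lanes;
}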
@@ -1294,7 +1422,8 @@ void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EMIT ";
printAsOperand(O, SlotTracker);
- O << " = WIDEN-CANONICAL-INDUCTION";
+ O << " = WIDEN-CANONICAL-INDUCTION ";
+ printOperands(O, SlotTracker);
}
#endif
@@ -1461,7 +1590,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
InterleavedAccessInfo &IAI) {
if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
for (VPRecipeBase &VPI : *VPBB) {
- if (isa<VPWidenPHIRecipe>(&VPI))
+ if (isa<VPHeaderPHIRecipe>(&VPI))
continue;
assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
auto *VPInst = cast<VPInstruction>(&VPI);
@@ -1506,6 +1635,7 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) {
for (const VPValue *V : Plan.VPExternalDefs)
assignSlot(V);
+ assignSlot(&Plan.VectorTripCount);
if (Plan.BackedgeTakenCount)
assignSlot(Plan.BackedgeTakenCount);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f4a1883e35d5..824440f98a8b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -69,6 +69,9 @@ class VPlanSlp;
/// vectors it is an expression determined at runtime.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF);
+/// Return a value for Step multiplied by VF.
+Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, int64_t Step);
+
/// A range of powers-of-2 vectorization factors with fixed start and
/// adjustable end. The range includes start and excludes end, e.g.,:
/// [1, 9) = {1, 2, 4, 8}
@@ -198,8 +201,8 @@ struct VPTransformState {
VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
DominatorTree *DT, IRBuilder<> &Builder,
InnerLoopVectorizer *ILV, VPlan *Plan)
- : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), ILV(ILV),
- Plan(Plan) {}
+ : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan) {
+ }
/// The chosen Vectorization and Unroll Factors of the loop being vectorized.
ElementCount VF;
@@ -341,9 +344,6 @@ struct VPTransformState {
/// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
Value *CanonicalIV = nullptr;
- /// Hold the trip count of the scalar loop.
- Value *TripCount = nullptr;
-
/// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
InnerLoopVectorizer *ILV;
@@ -793,6 +793,9 @@ public:
SLPLoad,
SLPStore,
ActiveLaneMask,
+ CanonicalIVIncrement,
+ CanonicalIVIncrementNUW,
+ BranchOnCount,
};
private:
@@ -833,6 +836,16 @@ public:
return R->getVPDefID() == VPRecipeBase::VPInstructionSC;
}
+ /// Extra classof implementations to allow directly casting from VPUser ->
+ /// VPInstruction.
+ static inline bool classof(const VPUser *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && R->getVPDefID() == VPRecipeBase::VPInstructionSC;
+ }
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPDefID() == VPRecipeBase::VPInstructionSC;
+ }
+
unsigned getOpcode() const { return Opcode; }
/// Generate the instruction.
@@ -871,6 +884,7 @@ public:
case Instruction::Unreachable:
case Instruction::Fence:
case Instruction::AtomicRMW:
+ case VPInstruction::BranchOnCount:
return false;
default:
return true;
@@ -1045,6 +1059,7 @@ public:
/// Returns the start value of the induction.
VPValue *getStartValue() { return getOperand(0); }
+ const VPValue *getStartValue() const { return getOperand(0); }
/// Returns the first defined value as TruncInst, if it is one or nullptr
/// otherwise.
@@ -1057,66 +1072,65 @@ public:
/// Returns the induction descriptor for the recipe.
const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
-};
-/// A recipe for handling first order recurrences and pointer inductions. For
-/// first-order recurrences, the start value is the first operand of the recipe
-/// and the incoming value from the backedge is the second operand. It also
-/// serves as base class for VPReductionPHIRecipe. In the VPlan native path, all
-/// incoming VPValues & VPBasicBlock pairs are managed in the recipe directly.
-class VPWidenPHIRecipe : public VPRecipeBase, public VPValue {
- /// List of incoming blocks. Only used in the VPlan native path.
- SmallVector<VPBasicBlock *, 2> IncomingBlocks;
+  /// Returns true if the induction is canonical, i.e. it starts at 0 and the
+  /// original IV steps by 1 (so the vector loop increments it by UF * VF).
+ bool isCanonical() const;
+
+ /// Returns the scalar type of the induction.
+ const Type *getScalarType() const {
+ const TruncInst *TruncI = getTruncInst();
+ return TruncI ? TruncI->getType() : IV->getType();
+ }
+};
+/// A pure virtual base class for all recipes modeling header phis, including
+/// phis for first order recurrences, pointer inductions and reductions. The
+/// start value is the first operand of the recipe and the incoming value from
+/// the backedge is the second operand.
+class VPHeaderPHIRecipe : public VPRecipeBase, public VPValue {
protected:
- VPWidenPHIRecipe(unsigned char VPVID, unsigned char VPDefID, PHINode *Phi,
- VPValue *Start = nullptr)
+ VPHeaderPHIRecipe(unsigned char VPVID, unsigned char VPDefID, PHINode *Phi,
+ VPValue *Start = nullptr)
: VPRecipeBase(VPDefID, {}), VPValue(VPVID, Phi, this) {
if (Start)
addOperand(Start);
}
public:
- /// Create a VPWidenPHIRecipe for \p Phi
- VPWidenPHIRecipe(PHINode *Phi)
- : VPWidenPHIRecipe(VPVWidenPHISC, VPWidenPHISC, Phi) {}
-
- /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start.
- VPWidenPHIRecipe(PHINode *Phi, VPValue &Start) : VPWidenPHIRecipe(Phi) {
- addOperand(&Start);
- }
-
- ~VPWidenPHIRecipe() override = default;
+ ~VPHeaderPHIRecipe() override = default;
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPRecipeBase *B) {
- return B->getVPDefID() == VPRecipeBase::VPWidenPHISC ||
+ return B->getVPDefID() == VPRecipeBase::VPCanonicalIVPHISC ||
B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC ||
- B->getVPDefID() == VPRecipeBase::VPReductionPHISC;
+ B->getVPDefID() == VPRecipeBase::VPReductionPHISC ||
+ B->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC ||
+ B->getVPDefID() == VPRecipeBase::VPWidenPHISC;
}
static inline bool classof(const VPValue *V) {
- return V->getVPValueID() == VPValue::VPVWidenPHISC ||
+ return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC ||
V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC ||
- V->getVPValueID() == VPValue::VPVReductionPHISC;
+ V->getVPValueID() == VPValue::VPVReductionPHISC ||
+ V->getVPValueID() == VPValue::VPVWidenIntOrFpInductionSC ||
+ V->getVPValueID() == VPValue::VPVWidenPHISC;
}
- /// Generate the phi/select nodes.
- void execute(VPTransformState &State) override;
+ /// Generate the phi nodes.
+ void execute(VPTransformState &State) override = 0;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
+ VPSlotTracker &SlotTracker) const override = 0;
#endif
- /// Returns the start value of the phi, if it is a reduction or first-order
- /// recurrence.
+ /// Returns the start value of the phi, if one is set.
VPValue *getStartValue() {
return getNumOperands() == 0 ? nullptr : getOperand(0);
}
- /// Returns the incoming value from the loop backedge, if it is a reduction or
- /// first-order recurrence.
+ /// Returns the incoming value from the loop backedge.
VPValue *getBackedgeValue() {
return getOperand(1);
}
@@ -1126,6 +1140,44 @@ public:
VPRecipeBase *getBackedgeRecipe() {
return cast<VPRecipeBase>(getBackedgeValue()->getDef());
}
+};
+
+/// A recipe for handling header phis that are widened in the vector loop.
+/// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are
+/// managed in the recipe directly.
+class VPWidenPHIRecipe : public VPHeaderPHIRecipe {
+ /// List of incoming blocks. Only used in the VPlan native path.
+ SmallVector<VPBasicBlock *, 2> IncomingBlocks;
+
+public:
+ /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start.
+ VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr)
+ : VPHeaderPHIRecipe(VPVWidenPHISC, VPWidenPHISC, Phi) {
+ if (Start)
+ addOperand(Start);
+ }
+
+ ~VPWidenPHIRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *B) {
+ return B->getVPDefID() == VPRecipeBase::VPWidenPHISC;
+ }
+ static inline bool classof(const VPHeaderPHIRecipe *R) {
+ return R->getVPDefID() == VPRecipeBase::VPWidenPHISC;
+ }
+ static inline bool classof(const VPValue *V) {
+ return V->getVPValueID() == VPValue::VPVWidenPHISC;
+ }
+
+ /// Generate the phi/select nodes.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
/// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi.
void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) {
@@ -1133,27 +1185,27 @@ public:
IncomingBlocks.push_back(IncomingBlock);
}
- /// Returns the \p I th incoming VPValue.
- VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
-
/// Returns the \p I th incoming VPBasicBlock.
VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; }
+
+ /// Returns the \p I th incoming VPValue.
+ VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
};
/// A recipe for handling first-order recurrence phis. The start value is the
/// first operand of the recipe and the incoming value from the backedge is the
/// second operand.
-struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe {
+struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
VPFirstOrderRecurrencePHIRecipe(PHINode *Phi, VPValue &Start)
- : VPWidenPHIRecipe(VPVFirstOrderRecurrencePHISC,
- VPFirstOrderRecurrencePHISC, Phi, &Start) {}
+ : VPHeaderPHIRecipe(VPVFirstOrderRecurrencePHISC,
+ VPFirstOrderRecurrencePHISC, Phi, &Start) {}
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC;
}
- static inline bool classof(const VPWidenPHIRecipe *D) {
- return D->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC;
+ static inline bool classof(const VPHeaderPHIRecipe *R) {
+ return R->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC;
}
static inline bool classof(const VPValue *V) {
return V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC;
@@ -1171,7 +1223,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe {
/// A recipe for handling reduction phis. The start value is the first operand
/// of the recipe and the incoming value from the backedge is the second
/// operand.
-class VPReductionPHIRecipe : public VPWidenPHIRecipe {
+class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
/// Descriptor for the reduction.
const RecurrenceDescriptor &RdxDesc;
@@ -1187,7 +1239,7 @@ public:
VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
VPValue &Start, bool IsInLoop = false,
bool IsOrdered = false)
- : VPWidenPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start),
+ : VPHeaderPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start),
RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
}
@@ -1198,12 +1250,12 @@ public:
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPReductionPHISC;
}
+ static inline bool classof(const VPHeaderPHIRecipe *R) {
+ return R->getVPDefID() == VPRecipeBase::VPReductionPHISC;
+ }
static inline bool classof(const VPValue *V) {
return V->getVPValueID() == VPValue::VPVReductionPHISC;
}
- static inline bool classof(const VPWidenPHIRecipe *R) {
- return R->getVPDefID() == VPRecipeBase::VPReductionPHISC;
- }
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
@@ -1601,11 +1653,46 @@ public:
#endif
};
+/// Canonical scalar induction phi of the vector loop, starting at the
+/// specified start value (either 0, or the resume value when vectorizing the
+/// epilogue loop). VPWidenCanonicalIVRecipe represents the vector version of
+/// the canonical induction variable.
+class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
+ DebugLoc DL;
+
+public:
+ VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL)
+ : VPHeaderPHIRecipe(VPValue::VPVCanonicalIVPHISC, VPCanonicalIVPHISC,
+ nullptr, StartV),
+ DL(DL) {}
+
+ ~VPCanonicalIVPHIRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPCanonicalIVPHISC;
+ }
+
+ /// Generate the canonical scalar induction phi of the vector loop.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// Returns the scalar type of the induction.
+ const Type *getScalarType() const {
+ return getOperand(0)->getLiveInIRValue()->getType();
+ }
+};
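A hedged sketch of how the new recipe might be created and placed when the plan is built; Plan, HeaderVPBB, IdxTy and DL are assumed names, and the actual construction site is not part of this hunk:

  // The canonical IV phi starts at 0 (or at the resume value for the epilogue
  // loop) and is expected to be the first recipe in the vector loop header.
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  VPValue *StartV = Plan.getOrAddVPValue(StartIdx);
  auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
  HeaderVPBB->insert(CanonicalIVPHI, HeaderVPBB->begin());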
+
/// A Recipe for widening the canonical induction variable of the vector loop.
class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue {
public:
- VPWidenCanonicalIVRecipe()
- : VPRecipeBase(VPWidenCanonicalIVSC, {}),
+ VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV)
+ : VPRecipeBase(VPWidenCanonicalIVSC, {CanonicalIV}),
VPValue(VPValue::VPVWidenCanonicalIVSC, nullptr, this) {}
~VPWidenCanonicalIVRecipe() override = default;
@@ -1615,6 +1702,16 @@ public:
return D->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC;
}
+ /// Extra classof implementations to allow directly casting from VPUser ->
+ /// VPWidenCanonicalIVRecipe.
+ static inline bool classof(const VPUser *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && R->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC;
+ }
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC;
+ }
+
/// Generate a canonical vector induction variable of the vector loop, with
/// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
/// step = <VF*UF, VF*UF, ..., VF*UF>.
@@ -1625,6 +1722,12 @@ public:
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
+
+ /// Returns the scalar type of the induction.
+ const Type *getScalarType() const {
+ return cast<VPCanonicalIVPHIRecipe>(getOperand(0)->getDef())
+ ->getScalarType();
+ }
};
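Since the widened canonical IV now takes the scalar canonical IV phi as its only operand, its scalar type is forwarded from that phi rather than stored separately. A small sketch of the relationship, assuming CanonicalIVPHI and HeaderVPBB exist in the surrounding builder code:

  // Widen the scalar canonical IV; the scalar type is derived via getOperand(0).
  auto *WidenIV = new VPWidenCanonicalIVRecipe(CanonicalIVPHI);
  HeaderVPBB->appendRecipe(WidenIV);
  assert(WidenIV->getScalarType() == CanonicalIVPHI->getScalarType());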
/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
@@ -2112,10 +2215,17 @@ class VPlan {
// (operators '==' and '<').
SetVector<VPValue *> VPExternalDefs;
- /// Represents the backedge taken count of the original loop, for folding
+ /// Represents the trip count of the original loop, for folding
/// the tail.
+ VPValue *TripCount = nullptr;
+
+ /// Represents the backedge taken count of the original loop, for folding
+ /// the tail. It equals TripCount - 1.
VPValue *BackedgeTakenCount = nullptr;
+ /// Represents the vector trip count.
+ VPValue VectorTripCount;
+
/// Holds a mapping between Values and their corresponding VPValue inside
/// VPlan.
Value2VPValueTy Value2VPValue;
@@ -2147,12 +2257,18 @@ public:
}
for (VPValue *VPV : VPValuesToFree)
delete VPV;
+ if (TripCount)
+ delete TripCount;
if (BackedgeTakenCount)
delete BackedgeTakenCount;
for (VPValue *Def : VPExternalDefs)
delete Def;
}
+ /// Prepare the plan for execution, setting up the required live-in values.
+ void prepareToExecute(Value *TripCount, Value *VectorTripCount,
+ Value *CanonicalIVStartValue, VPTransformState &State);
+
/// Generate the IR code for this VPlan.
void execute(struct VPTransformState *State);
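A hypothetical call-site sketch for the two entry points above; TripCountIR, VectorTripCountIR and ResumeValue are illustrative names only, not values defined in this patch:

  // Bind the live-in IR values the plan now depends on, then generate code.
  Plan.prepareToExecute(TripCountIR, VectorTripCountIR,
                        /*CanonicalIVStartValue=*/ResumeValue, State);
  Plan.execute(&State);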
@@ -2165,6 +2281,13 @@ public:
return Entry;
}
+ /// The trip count of the original loop.
+ VPValue *getOrCreateTripCount() {
+ if (!TripCount)
+ TripCount = new VPValue();
+ return TripCount;
+ }
+
/// The backedge taken count of the original loop.
VPValue *getOrCreateBackedgeTakenCount() {
if (!BackedgeTakenCount)
@@ -2172,6 +2295,9 @@ public:
return BackedgeTakenCount;
}
+ /// The vector trip count.
+ VPValue &getVectorTripCount() { return VectorTripCount; }
+
/// Mark the plan to indicate that using Value2VPValue is not safe any
/// longer, because it may be stale.
void disableValue2VPValue() { Value2VPValueEnabled = false; }
@@ -2264,6 +2390,21 @@ public:
return !VPV->getDef() || (RepR && RepR->isUniform());
}
+ /// Returns the VPRegionBlock of the vector loop.
+ VPRegionBlock *getVectorLoopRegion() {
+ return cast<VPRegionBlock>(getEntry());
+ }
+
+ /// Returns the canonical induction recipe of the vector loop.
+ VPCanonicalIVPHIRecipe *getCanonicalIV() {
+ VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock();
+ if (EntryVPBB->empty()) {
+ // VPlan native path.
+ EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor());
+ }
+ return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
+ }
+
private:
/// Add to the given dominator tree the header block and every new basic block
/// that was created between it and the latch block, inclusive.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 86ecd6817873..e879a33db6ee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -231,7 +231,7 @@ void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) {
}
// Entry point. The driver function for the predicator.
-void VPlanPredicator::predicate(void) {
+void VPlanPredicator::predicate() {
// Predicate the blocks within Region.
predicateRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.h b/llvm/lib/Transforms/Vectorize/VPlanPredicator.h
index 692afd2978d5..a5db9a54da3c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.h
@@ -68,7 +68,7 @@ public:
VPlanPredicator(VPlan &Plan);
/// Predicate Plan's HCFG.
- void predicate(void);
+ void predicate();
};
} // end namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d2daf558c2c5..fb5f3d428189 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -324,3 +324,30 @@ void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) {
E.first->eraseFromParent();
}
}
+
+void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) {
+ VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+ VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
+ for (VPUser *U : CanonicalIV->users()) {
+ WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
+ if (WidenNewIV)
+ break;
+ }
+
+ if (!WidenNewIV)
+ return;
+
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
+ auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
+
+ // If the induction recipe is canonical and the types match, use it
+ // directly.
+ if (WidenOriginalIV && WidenOriginalIV->isCanonical() &&
+ WidenOriginalIV->getScalarType() == WidenNewIV->getScalarType()) {
+ WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
+ WidenNewIV->eraseFromParent();
+ return;
+ }
+ }
+}
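The transform prefers an existing canonical VPWidenIntOrFpInductionRecipe over materializing a second, identical vector IV. A plausible (assumed) invocation, run once after the initial VPlan has been built, alongside the existing cast cleanup:

  // Assumed pipeline position; the actual call site is outside this hunk.
  VPlanTransforms::removeRedundantInductionCasts(*Plan);
  VPlanTransforms::removeRedundantCanonicalIVs(*Plan);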
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index a82a562d5e35..e74409a86466 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -45,6 +45,10 @@ struct VPlanTransforms {
/// in the vectorized loop. There is no need to vectorize the cast - the same
/// value can be used for both the phi and casts in the vector loop.
static void removeRedundantInductionCasts(VPlan &Plan);
+
+ /// Try to replace VPWidenCanonicalIVRecipes with a widened canonical
+ /// induction recipe that already exists in the plan, if there is one.
+ static void removeRedundantCanonicalIVs(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index fd92201614df..5296d2b9485c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -96,14 +96,15 @@ public:
VPVReplicateSC,
VPVWidenSC,
VPVWidenCallSC,
+ VPVWidenCanonicalIVSC,
VPVWidenGEPSC,
VPVWidenSelectSC,
// Phi-like VPValues. Need to be kept together.
VPVBlendSC,
+ VPVCanonicalIVPHISC,
VPVFirstOrderRecurrencePHISC,
VPVWidenPHISC,
- VPVWidenCanonicalIVSC,
VPVWidenIntOrFpInductionSC,
VPVPredInstPHI,
VPVReductionPHISC,
@@ -177,6 +178,7 @@ public:
void replaceAllUsesWith(VPValue *New);
VPDef *getDef() { return Def; }
+ const VPDef *getDef() const { return Def; }
/// Returns the underlying IR value, if this VPValue is defined outside the
/// scope of VPlan. Returns nullptr if the VPValue is defined by a VPDef
@@ -186,6 +188,11 @@ public:
"VPValue is not a live-in; it is defined by a VPDef inside a VPlan");
return getUnderlyingValue();
}
+ const Value *getLiveInIRValue() const {
+ assert(!getDef() &&
+ "VPValue is not a live-in; it is defined by a VPDef inside a VPlan");
+ return getUnderlyingValue();
+ }
};
typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
@@ -325,6 +332,7 @@ public:
VPReductionSC,
VPReplicateSC,
VPWidenCallSC,
+ VPWidenCanonicalIVSC,
VPWidenGEPSC,
VPWidenMemoryInstructionSC,
VPWidenSC,
@@ -332,9 +340,9 @@ public:
// Phi-like recipes. Need to be kept together.
VPBlendSC,
+ VPCanonicalIVPHISC,
VPFirstOrderRecurrencePHISC,
VPWidenPHISC,
- VPWidenCanonicalIVSC,
VPWidenIntOrFpInductionSC,
VPPredInstPHISC,
VPReductionPHISC,
@@ -403,7 +411,6 @@ public:
class VPlan;
class VPBasicBlock;
-class VPRegionBlock;
/// This class can be used to assign consecutive numbers to all VPValues in a
/// VPlan and allows querying the numbering for printing, similar to the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 7732d9367985..d36f250995e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -163,12 +163,32 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) {
errs() << "VPlan entry block is not a VPBasicBlock\n";
return false;
}
+
+ if (!isa<VPCanonicalIVPHIRecipe>(&*Entry->begin())) {
+ errs() << "VPlan vector loop header does not start with a "
+ "VPCanonicalIVPHIRecipe\n";
+ return false;
+ }
+
const VPBasicBlock *Exit = dyn_cast<VPBasicBlock>(TopRegion->getExit());
if (!Exit) {
errs() << "VPlan exit block is not a VPBasicBlock\n";
return false;
}
+ if (Exit->empty()) {
+ errs() << "VPlan vector loop exit must end with BranchOnCount "
+ "VPInstruction but is empty\n";
+ return false;
+ }
+
+ auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exit->end()));
+ if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) {
+ errs() << "VPlan vector loop exit must end with BranchOnCount "
+ "VPInstruction\n";
+ return false;
+ }
+
for (const VPRegionBlock *Region :
VPBlockUtils::blocksOnly<const VPRegionBlock>(
depth_first(VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(
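The checks added above encode a structural invariant: the vector loop header must begin with the canonical IV phi, and the exiting block must terminate in a BranchOnCount VPInstruction. A sketch of terminator construction that would satisfy the second check; Latch and CanonicalIVIncrement are assumed names, and the operand order is an assumption as well:

  // Branch back while the counted canonical IV is below the vector trip count.
  auto *BranchBack = new VPInstruction(
      VPInstruction::BranchOnCount,
      {CanonicalIVIncrement, &Plan.getVectorTripCount()});
  Latch->appendRecipe(BranchBack);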
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index c0aedab2fed0..620d388199e0 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -881,7 +881,8 @@ static ScalarizationResult canScalarizeAccess(FixedVectorType *VecTy,
ConstantRange IdxRange(IntWidth, true);
if (isGuaranteedNotToBePoison(Idx, &AC)) {
- if (ValidIndices.contains(computeConstantRange(Idx, true, &AC, CtxI, &DT)))
+ if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
+ true, &AC, CtxI, &DT)))
return ScalarizationResult::safe();
return ScalarizationResult::unsafe();
}