author    Dimitry Andric <dim@FreeBSD.org>  2017-12-18 20:10:56 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2017-12-18 20:10:56 +0000
commit    044eb2f6afba375a914ac9d8024f8f5142bb912e (patch)
tree      1475247dc9f9fe5be155ebd4c9069c75aadf8c20 /lib/Transforms/Scalar
parent    eb70dddbd77e120e5d490bd8fbe7ff3f8fa81c6b (diff)
Diffstat (limited to 'lib/Transforms/Scalar')
-rw-r--r--  lib/Transforms/Scalar/ADCE.cpp  142
-rw-r--r--  lib/Transforms/Scalar/BDCE.cpp  18
-rw-r--r--  lib/Transforms/Scalar/CMakeLists.txt  4
-rw-r--r--  lib/Transforms/Scalar/CallSiteSplitting.cpp  428
-rw-r--r--  lib/Transforms/Scalar/ConstantHoisting.cpp  57
-rw-r--r--  lib/Transforms/Scalar/CorrelatedValuePropagation.cpp  115
-rw-r--r--  lib/Transforms/Scalar/DeadStoreElimination.cpp  159
-rw-r--r--  lib/Transforms/Scalar/DivRemPairs.cpp  206
-rw-r--r--  lib/Transforms/Scalar/EarlyCSE.cpp  168
-rw-r--r--  lib/Transforms/Scalar/GVN.cpp  425
-rw-r--r--  lib/Transforms/Scalar/GVNHoist.cpp  794
-rw-r--r--  lib/Transforms/Scalar/GVNSink.cpp  129
-rw-r--r--  lib/Transforms/Scalar/GuardWidening.cpp  2
-rw-r--r--  lib/Transforms/Scalar/IndVarSimplify.cpp  129
-rw-r--r--  lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp  681
-rw-r--r--  lib/Transforms/Scalar/InferAddressSpaces.cpp  83
-rw-r--r--  lib/Transforms/Scalar/JumpThreading.cpp  212
-rw-r--r--  lib/Transforms/Scalar/LICM.cpp  666
-rw-r--r--  lib/Transforms/Scalar/LoopDataPrefetch.cpp  17
-rw-r--r--  lib/Transforms/Scalar/LoopDeletion.cpp  161
-rw-r--r--  lib/Transforms/Scalar/LoopDistribute.cpp  104
-rw-r--r--  lib/Transforms/Scalar/LoopIdiomRecognize.cpp  167
-rw-r--r--  lib/Transforms/Scalar/LoopInstSimplify.cpp  25
-rw-r--r--  lib/Transforms/Scalar/LoopInterchange.cpp  356
-rw-r--r--  lib/Transforms/Scalar/LoopLoadElimination.cpp  43
-rw-r--r--  lib/Transforms/Scalar/LoopPredication.cpp  498
-rw-r--r--  lib/Transforms/Scalar/LoopRerollPass.cpp  70
-rw-r--r--  lib/Transforms/Scalar/LoopRotation.cpp  157
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp  271
-rw-r--r--  lib/Transforms/Scalar/LoopUnrollPass.cpp  440
-rw-r--r--  lib/Transforms/Scalar/LoopUnswitch.cpp  188
-rw-r--r--  lib/Transforms/Scalar/LoopVersioningLICM.cpp  102
-rw-r--r--  lib/Transforms/Scalar/LowerAtomic.cpp  1
-rw-r--r--  lib/Transforms/Scalar/MemCpyOptimizer.cpp  41
-rw-r--r--  lib/Transforms/Scalar/MergeICmps.cpp  650
-rw-r--r--  lib/Transforms/Scalar/MergedLoadStoreMotion.cpp  4
-rw-r--r--  lib/Transforms/Scalar/NaryReassociate.cpp  35
-rw-r--r--  lib/Transforms/Scalar/NewGVN.cpp  803
-rw-r--r--  lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp  26
-rw-r--r--  lib/Transforms/Scalar/PlaceSafepoints.cpp  29
-rw-r--r--  lib/Transforms/Scalar/Reassociate.cpp  220
-rw-r--r--  lib/Transforms/Scalar/RewriteStatepointsForGC.cpp  290
-rw-r--r--  lib/Transforms/Scalar/SCCP.cpp  319
-rw-r--r--  lib/Transforms/Scalar/SROA.cpp  382
-rw-r--r--  lib/Transforms/Scalar/Scalar.cpp  14
-rw-r--r--  lib/Transforms/Scalar/Scalarizer.cpp  94
-rw-r--r--  lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp  62
-rw-r--r--  lib/Transforms/Scalar/SimpleLoopUnswitch.cpp  1515
-rw-r--r--  lib/Transforms/Scalar/SimplifyCFGPass.cpp  159
-rw-r--r--  lib/Transforms/Scalar/Sink.cpp  4
-rw-r--r--  lib/Transforms/Scalar/SpeculateAroundPHIs.cpp  811
-rw-r--r--  lib/Transforms/Scalar/StraightLineStrengthReduce.cpp  80
-rw-r--r--  lib/Transforms/Scalar/StructurizeCFG.cpp  60
-rw-r--r--  lib/Transforms/Scalar/TailRecursionElimination.cpp  71
54 files changed, 9667 insertions, 3020 deletions
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index 5b467dc9fe12..1e683db50206 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -15,8 +15,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/ADCE.h"
-
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -27,13 +29,29 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include <cassert>
+#include <cstddef>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "adce"
@@ -52,10 +70,12 @@ static cl::opt<bool> RemoveLoops("adce-remove-loops", cl::init(false),
cl::Hidden);
namespace {
+
/// Information about Instructions
struct InstInfoType {
/// True if the associated instruction is live.
bool Live = false;
+
/// Quick access to information for block containing associated Instruction.
struct BlockInfoType *Block = nullptr;
};
@@ -64,10 +84,13 @@ struct InstInfoType {
struct BlockInfoType {
/// True when this block contains a live instruction.
bool Live = false;
+
/// True when this block ends in an unconditional branch.
bool UnconditionalBranch = false;
+
/// True when this block is known to have live PHI nodes.
bool HasLivePhiNodes = false;
+
/// Control dependence sources need to be live for this block.
bool CFLive = false;
@@ -75,8 +98,6 @@ struct BlockInfoType {
/// holds the value &InstInfo[Terminator]
InstInfoType *TerminatorLiveInfo = nullptr;
- bool terminatorIsLive() const { return TerminatorLiveInfo->Live; }
-
/// Corresponding BasicBlock.
BasicBlock *BB = nullptr;
@@ -85,14 +106,21 @@ struct BlockInfoType {
/// Post-order numbering of reverse control flow graph.
unsigned PostOrder;
+
+ bool terminatorIsLive() const { return TerminatorLiveInfo->Live; }
};
class AggressiveDeadCodeElimination {
Function &F;
+
+ // ADCE does not use DominatorTree per se, but it updates it to preserve the
+ // analysis.
+ DominatorTree &DT;
PostDominatorTree &PDT;
/// Mapping of blocks to associated information, an element in BlockInfoVec.
- DenseMap<BasicBlock *, BlockInfoType> BlockInfo;
+ /// Use MapVector to get deterministic iteration order.
+ MapVector<BasicBlock *, BlockInfoType> BlockInfo;
bool isLive(BasicBlock *BB) { return BlockInfo[BB].Live; }
/// Mapping of instructions to associated information.
@@ -102,6 +130,7 @@ class AggressiveDeadCodeElimination {
/// Instructions known to be live where we need to mark
/// reaching definitions as live.
SmallVector<Instruction *, 128> Worklist;
+
/// Debug info scopes around a live instruction.
SmallPtrSet<const Metadata *, 32> AliveScopes;
@@ -116,15 +145,19 @@ class AggressiveDeadCodeElimination {
/// Set up auxiliary data structures for Instructions and BasicBlocks and
/// initialize the Worklist to the set of must-be-live Instructions.
void initialize();
+
/// Return true for operations which are always treated as live.
bool isAlwaysLive(Instruction &I);
+
/// Return true for instrumentation instructions for value profiling.
bool isInstrumentsConstant(Instruction &I);
/// Propagate liveness to reaching definitions.
void markLiveInstructions();
+
/// Mark an instruction as live.
void markLive(Instruction *I);
+
/// Mark a block as live.
void markLive(BlockInfoType &BB);
void markLive(BasicBlock *BB) { markLive(BlockInfo[BB]); }
@@ -157,11 +190,14 @@ class AggressiveDeadCodeElimination {
void makeUnconditional(BasicBlock *BB, BasicBlock *Target);
public:
- AggressiveDeadCodeElimination(Function &F, PostDominatorTree &PDT)
- : F(F), PDT(PDT) {}
+ AggressiveDeadCodeElimination(Function &F, DominatorTree &DT,
+ PostDominatorTree &PDT)
+ : F(F), DT(DT), PDT(PDT) {}
+
bool performDeadCodeElimination();
};
-}
+
+} // end anonymous namespace
bool AggressiveDeadCodeElimination::performDeadCodeElimination() {
initialize();
@@ -175,7 +211,6 @@ static bool isUnconditionalBranch(TerminatorInst *Term) {
}
void AggressiveDeadCodeElimination::initialize() {
-
auto NumBlocks = F.size();
// We will have an entry in the map for each block so we grow the
@@ -217,7 +252,8 @@ void AggressiveDeadCodeElimination::initialize() {
// to recording which nodes have been visited we also record whether
// a node is currently on the "stack" of active ancestors of the current
// node.
- typedef DenseMap<BasicBlock *, bool> StatusMap ;
+ using StatusMap = DenseMap<BasicBlock *, bool>;
+
class DFState : public StatusMap {
public:
std::pair<StatusMap::iterator, bool> insert(BasicBlock *BB) {
@@ -253,27 +289,23 @@ void AggressiveDeadCodeElimination::initialize() {
}
}
- // Mark blocks live if there is no path from the block to the
- // return of the function or a successor for which this is true.
- // This protects IDFCalculator which cannot handle such blocks.
- for (auto &BBInfoPair : BlockInfo) {
- auto &BBInfo = BBInfoPair.second;
- if (BBInfo.terminatorIsLive())
- continue;
- auto *BB = BBInfo.BB;
- if (!PDT.getNode(BB)) {
- DEBUG(dbgs() << "Not post-dominated by return: " << BB->getName()
+ // Mark blocks live if there is no path from the block to a
+ // return of the function.
+ // We do this by seeing which of the postdomtree root children exit the
+ // program, and for all others, mark the subtree live.
+ for (auto &PDTChild : children<DomTreeNode *>(PDT.getRootNode())) {
+ auto *BB = PDTChild->getBlock();
+ auto &Info = BlockInfo[BB];
+ // Real function return
+ if (isa<ReturnInst>(Info.Terminator)) {
+ DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName()
<< '\n';);
- markLive(BBInfo.Terminator);
continue;
}
- for (auto *Succ : successors(BB))
- if (!PDT.getNode(Succ)) {
- DEBUG(dbgs() << "Successor not post-dominated by return: "
- << BB->getName() << '\n';);
- markLive(BBInfo.Terminator);
- break;
- }
+
+ // This child is something else, like an infinite loop.
+ for (auto DFNode : depth_first(PDTChild))
+ markLive(BlockInfo[DFNode->getBlock()].Terminator);
}
// Treat the entry block as always live
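The hunk above seeds liveness from the post-dominator tree roots instead of scanning every block's successors. A standalone sketch of the same pattern, assuming only the children<DomTreeNode *> and depth_first helpers this patch already pulls in (the helper itself is hypothetical, and markLive is private in the real pass):

// Keep every post-dominator subtree that does not end in a real return
// alive, e.g. an infinite loop: no path from such a block reaches the end
// of the function, so ADCE must not remove it.
static void markNonReturningSubtreesLive(PostDominatorTree &PDT,
                                         AggressiveDeadCodeElimination &ADCE) {
  for (auto &Child : children<DomTreeNode *>(PDT.getRootNode())) {
    if (isa<ReturnInst>(Child->getBlock()->getTerminator()))
      continue; // normal liveness propagation handles returning subtrees
    for (auto *DFNode : depth_first(Child))
      ADCE.markLive(DFNode->getBlock()); // assumes markLive were accessible
  }
}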
@@ -318,7 +350,6 @@ bool AggressiveDeadCodeElimination::isInstrumentsConstant(Instruction &I) {
}
void AggressiveDeadCodeElimination::markLiveInstructions() {
-
// Propagate liveness backwards to operands.
do {
// Worklist holds newly discovered live instructions
@@ -343,7 +374,6 @@ void AggressiveDeadCodeElimination::markLiveInstructions() {
}
void AggressiveDeadCodeElimination::markLive(Instruction *I) {
-
auto &Info = InstInfo[I];
if (Info.Live)
return;
@@ -430,7 +460,6 @@ void AggressiveDeadCodeElimination::markPhiLive(PHINode *PN) {
}
void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
-
if (BlocksWithDeadTerminators.empty())
return;
@@ -469,7 +498,6 @@ void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
//
//===----------------------------------------------------------------------===//
bool AggressiveDeadCodeElimination::removeDeadInstructions() {
-
// Updates control and dataflow around dead blocks
updateDeadRegions();
@@ -527,7 +555,6 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() {
// A dead region is the set of dead blocks with a common live post-dominator.
void AggressiveDeadCodeElimination::updateDeadRegions() {
-
DEBUG({
dbgs() << "final dead terminator blocks: " << '\n';
for (auto *BB : BlocksWithDeadTerminators)
@@ -561,21 +588,40 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
}
assert((PreferredSucc && PreferredSucc->PostOrder > 0) &&
"Failed to find safe successor for dead branch");
+
+ // Collect removed successors to update the (Post)DominatorTrees.
+ SmallPtrSet<BasicBlock *, 4> RemovedSuccessors;
bool First = true;
for (auto *Succ : successors(BB)) {
- if (!First || Succ != PreferredSucc->BB)
+ if (!First || Succ != PreferredSucc->BB) {
Succ->removePredecessor(BB);
- else
+ RemovedSuccessors.insert(Succ);
+ } else
First = false;
}
makeUnconditional(BB, PreferredSucc->BB);
+
+ // Inform the dominators about the deleted CFG edges.
+ SmallVector<DominatorTree::UpdateType, 4> DeletedEdges;
+ for (auto *Succ : RemovedSuccessors) {
+ // It might have happened that the same successor appeared multiple times
+ // and the CFG edge wasn't really removed.
+ if (Succ != PreferredSucc->BB) {
+ DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion"
+ << BB->getName() << " -> " << Succ->getName() << "\n");
+ DeletedEdges.push_back({DominatorTree::Delete, BB, Succ});
+ }
+ }
+
+ DT.applyUpdates(DeletedEdges);
+ PDT.applyUpdates(DeletedEdges);
+
NumBranchesRemoved += 1;
}
}
// reverse top-sort order
void AggressiveDeadCodeElimination::computeReversePostOrder() {
-
// This provides a post-order numbering of the reverse control flow graph
// Note that it is incomplete in the presence of infinite loops but we don't
// need to number blocks which don't reach the end of the function since
@@ -613,6 +659,9 @@ void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
InstInfo[NewTerm].Live = true;
if (const DILocation *DL = PredTerm->getDebugLoc())
NewTerm->setDebugLoc(DL);
+
+ InstInfo.erase(PredTerm);
+ PredTerm->eraseFromParent();
}
//===----------------------------------------------------------------------===//
@@ -621,19 +670,24 @@ void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
//
//===----------------------------------------------------------------------===//
PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) {
+ auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
auto &PDT = FAM.getResult<PostDominatorTreeAnalysis>(F);
- if (!AggressiveDeadCodeElimination(F, PDT).performDeadCodeElimination())
+ if (!AggressiveDeadCodeElimination(F, DT, PDT).performDeadCodeElimination())
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<PostDominatorTreeAnalysis>();
return PA;
}
namespace {
+
struct ADCELegacyPass : public FunctionPass {
static char ID; // Pass identification, replacement for typeid
+
ADCELegacyPass() : FunctionPass(ID) {
initializeADCELegacyPassPass(*PassRegistry::getPassRegistry());
}
@@ -641,22 +695,34 @@ struct ADCELegacyPass : public FunctionPass {
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- return AggressiveDeadCodeElimination(F, PDT).performDeadCodeElimination();
+ return AggressiveDeadCodeElimination(F, DT, PDT)
+ .performDeadCodeElimination();
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // We require DominatorTree here only to update and thus preserve it.
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<PostDominatorTreeWrapperPass>();
if (!RemoveControlFlowFlag)
AU.setPreservesCFG();
+ else {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ }
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
-}
+
+} // end anonymous namespace
char ADCELegacyPass::ID = 0;
+
INITIALIZE_PASS_BEGIN(ADCELegacyPass, "adce",
"Aggressive Dead Code Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination",
false, false)
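A pattern worth noting in this ADCE change is how CFG edge deletions are batched and handed to both trees at once instead of recomputing them from scratch. A hedged sketch of that pattern in isolation (BB and RemovedSuccs are illustrative names):

// BB's terminator was rewritten; every block in RemovedSuccs lost its
// incoming edge from BB. Batch the deletions and update incrementally.
SmallVector<DominatorTree::UpdateType, 4> Updates;
for (BasicBlock *Succ : RemovedSuccs)
  Updates.push_back({DominatorTree::Delete, BB, Succ});
DT.applyUpdates(Updates);  // incremental update, no full recomputation
PDT.applyUpdates(Updates); // the post-dominator tree takes the same list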
diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp
index 2e5618686ec2..851efa000f65 100644
--- a/lib/Transforms/Scalar/BDCE.cpp
+++ b/lib/Transforms/Scalar/BDCE.cpp
@@ -20,11 +20,8 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/IR/CFG.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Operator.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -48,8 +45,18 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
// If all bits of a user are demanded, then we know that nothing below that
// in the def-use chain needs to be changed.
auto *J = dyn_cast<Instruction>(JU);
- if (J && !DB.getDemandedBits(J).isAllOnesValue())
+ if (J && J->getType()->isSized() &&
+ !DB.getDemandedBits(J).isAllOnesValue())
WorkList.push_back(J);
+
+ // Note that we need to check for unsized types above before asking for
+ // demanded bits. Normally, the only way to reach an instruction with an
+ // unsized type is via an instruction that has side effects (or otherwise
+ // will demand its input bits). However, if we have a readnone function
+ // that returns an unsized type (e.g., void), we must avoid asking for the
+ // demanded bits of the function call's return value. A void-returning
+ // readnone function is always dead (and so we can stop walking the use/def
+ // chain here), but the check is necessary to avoid asserting.
}
// DFS through subsequent users while tracking visits to avoid cycles.
@@ -70,7 +77,8 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
// If all bits of a user are demanded, then we know that nothing below
// that in the def-use chain needs to be changed.
auto *K = dyn_cast<Instruction>(KU);
- if (K && !Visited.count(K) && !DB.getDemandedBits(K).isAllOnesValue())
+ if (K && !Visited.count(K) && K->getType()->isSized() &&
+ !DB.getDemandedBits(K).isAllOnesValue())
WorkList.push_back(K);
}
}
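The ordering of the new checks matters: DemandedBits cannot be queried for a value whose type has no bit width, which is exactly what the added comment warns about. A minimal sketch of the safe ordering, reusing the names from the hunk above (DB is the DemandedBits result, JU a visited use):

// Check the type before querying demanded bits: a readnone call returning
// void reaches this point, and asking for its demanded bits would assert.
if (auto *J = dyn_cast<Instruction>(JU)) {
  if (J->getType()->isSized() && // must come before getDemandedBits()
      !DB.getDemandedBits(J).isAllOnesValue())
    WorkList.push_back(J);
}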
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index 457c9427ab9a..0562d3882f8b 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -2,11 +2,13 @@ add_llvm_library(LLVMScalarOpts
ADCE.cpp
AlignmentFromAssumptions.cpp
BDCE.cpp
+ CallSiteSplitting.cpp
ConstantHoisting.cpp
ConstantProp.cpp
CorrelatedValuePropagation.cpp
DCE.cpp
DeadStoreElimination.cpp
+ DivRemPairs.cpp
EarlyCSE.cpp
FlattenCFGPass.cpp
Float2Int.cpp
@@ -42,6 +44,7 @@ add_llvm_library(LLVMScalarOpts
LowerExpectIntrinsic.cpp
LowerGuardIntrinsic.cpp
MemCpyOptimizer.cpp
+ MergeICmps.cpp
MergedLoadStoreMotion.cpp
NaryReassociate.cpp
NewGVN.cpp
@@ -59,6 +62,7 @@ add_llvm_library(LLVMScalarOpts
SimplifyCFGPass.cpp
Sink.cpp
SpeculativeExecution.cpp
+ SpeculateAroundPHIs.cpp
StraightLineStrengthReduce.cpp
StructurizeCFG.cpp
TailRecursionElimination.cpp
diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp
new file mode 100644
index 000000000000..d8c408035038
--- /dev/null
+++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -0,0 +1,428 @@
+//===- CallSiteSplitting.cpp ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a transformation that tries to split a call-site to pass
+// more constrained arguments if its argument is predicated in the control flow
+// so that we can expose better context to the later passes (e.g., inliner,
+// jump threading, or IPA-CP based function cloning, etc.).
+// As of now we support two cases:
+//
+// 1) If a call site is dominated by an OR condition and if any of its arguments
+// are predicated on this OR condition, try to split the condition with more
+// constrained arguments. For example, in the code below, we try to split the
+// call site since we can predicate the argument (ptr) based on the OR condition.
+//
+// Split from :
+// if (!ptr || c)
+// callee(ptr);
+// to :
+// if (!ptr)
+// callee(null) // set the known constant value
+// else if (c)
+// callee(nonnull ptr) // set non-null attribute in the argument
+//
+// 2) We can also split a call-site based on constant incoming values of a PHI
+// For example,
+// from :
+// Header:
+// %c = icmp eq i32 %i1, %i2
+// br i1 %c, label %Tail, label %TBB
+// TBB:
+// br label %Tail
+// Tail:
+// %p = phi i32 [ 0, %Header], [ 1, %TBB]
+// call void @bar(i32 %p)
+// to
+// Header:
+// %c = icmp eq i32 %i1, %i2
+// br i1 %c, label %Tail-split0, label %TBB
+// TBB:
+// br label %Tail-split1
+// Tail-split0:
+// call void @bar(i32 0)
+// br label %Tail
+// Tail-split1:
+// call void @bar(i32 1)
+// br label %Tail
+// Tail:
+// %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ]
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "callsite-splitting"
+
+STATISTIC(NumCallSiteSplit, "Number of call-sites split");
+
+static void addNonNullAttribute(Instruction *CallI, Instruction *NewCallI,
+ Value *Op) {
+ CallSite CS(NewCallI);
+ unsigned ArgNo = 0;
+ for (auto &I : CS.args()) {
+ if (&*I == Op)
+ CS.addParamAttr(ArgNo, Attribute::NonNull);
+ ++ArgNo;
+ }
+}
+
+static void setConstantInArgument(Instruction *CallI, Instruction *NewCallI,
+ Value *Op, Constant *ConstValue) {
+ CallSite CS(NewCallI);
+ unsigned ArgNo = 0;
+ for (auto &I : CS.args()) {
+ if (&*I == Op)
+ CS.setArgument(ArgNo, ConstValue);
+ ++ArgNo;
+ }
+}
+
+static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) {
+ assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand.");
+ Value *Op0 = Cmp->getOperand(0);
+ unsigned ArgNo = 0;
+ for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
+ ++I, ++ArgNo) {
+ // Don't consider constant arguments or arguments already known to be non-null.
+ if (isa<Constant>(*I) || CS.paramHasAttr(ArgNo, Attribute::NonNull))
+ continue;
+
+ if (*I == Op0)
+ return true;
+ }
+ return false;
+}
+
+/// If From has a conditional jump to To, add the condition to Conditions,
+/// if it is relevant to any argument at CS.
+static void
+recordCondition(const CallSite &CS, BasicBlock *From, BasicBlock *To,
+ SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
+ auto *BI = dyn_cast<BranchInst>(From->getTerminator());
+ if (!BI || !BI->isConditional())
+ return;
+
+ CmpInst::Predicate Pred;
+ Value *Cond = BI->getCondition();
+ if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant())))
+ return;
+
+ ICmpInst *Cmp = cast<ICmpInst>(Cond);
+ if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
+ if (isCondRelevantToAnyCallArgument(Cmp, CS))
+ Conditions.push_back({Cmp, From->getTerminator()->getSuccessor(0) == To
+ ? Pred
+ : Cmp->getInversePredicate()});
+}
+
+/// Record ICmp conditions relevant to any argument in CS by walking the chain
+/// of Pred's single predecessors. If there are conflicting conditions along a
+/// path, like x == 1 and x == 0, the first condition recorded will be used.
+static void
+recordConditions(const CallSite &CS, BasicBlock *Pred,
+ SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
+ recordCondition(CS, Pred, CS.getInstruction()->getParent(), Conditions);
+ BasicBlock *From = Pred;
+ BasicBlock *To = Pred;
+ SmallPtrSet<BasicBlock *, 4> Visited = {From};
+ while (!Visited.count(From->getSinglePredecessor()) &&
+ (From = From->getSinglePredecessor())) {
+ recordCondition(CS, From, To, Conditions);
+ To = From;
+ }
+}
+
+static Instruction *
+addConditions(CallSite &CS,
+ SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
+ if (Conditions.empty())
+ return nullptr;
+
+ Instruction *NewCI = CS.getInstruction()->clone();
+ for (auto &Cond : Conditions) {
+ Value *Arg = Cond.first->getOperand(0);
+ Constant *ConstVal = cast<Constant>(Cond.first->getOperand(1));
+ if (Cond.second == ICmpInst::ICMP_EQ)
+ setConstantInArgument(CS.getInstruction(), NewCI, Arg, ConstVal);
+ else if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
+ assert(Cond.second == ICmpInst::ICMP_NE);
+ addNonNullAttribute(CS.getInstruction(), NewCI, Arg);
+ }
+ }
+ return NewCI;
+}
+
+static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
+ SmallVector<BasicBlock *, 2> Preds(predecessors((BB)));
+ assert(Preds.size() == 2 && "Expected exactly 2 predecessors!");
+ return Preds;
+}
+
+static bool canSplitCallSite(CallSite CS) {
+ // FIXME: As of now we handle only CallInst. InvokeInst could be handled
+ // without too much effort.
+ Instruction *Instr = CS.getInstruction();
+ if (!isa<CallInst>(Instr))
+ return false;
+
+ // Allow splitting a call-site only when there is no instruction before the
+ // call-site in the basic block. Based on this constraint, we only clone the
+ // call instruction, and we do not move a call-site across any other
+ // instruction.
+ BasicBlock *CallSiteBB = Instr->getParent();
+ if (Instr != CallSiteBB->getFirstNonPHIOrDbg())
+ return false;
+
+ // Need 2 predecessors and cannot split an edge from an IndirectBrInst.
+ SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB));
+ if (Preds.size() != 2 || isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
+ isa<IndirectBrInst>(Preds[1]->getTerminator()))
+ return false;
+
+ return CallSiteBB->canSplitPredecessors();
+}
+
+/// Split the call-site CS into two new call-sites in new predecessors that are
+/// directly hooked to each of its original predecessors, pointed to by PredBB1
+/// and PredBB2. In the OR-predicated case, PredBB1 will point to the header and
+/// PredBB2 to the second compare block. CallInst1 and CallInst2 will be the new
+/// call-sites placed in the new predecessors split for PredBB1 and PredBB2,
+/// respectively. Therefore, CallInst1 will be the call-site placed
+/// between Header and Tail, and CallInst2 will be the call-site between TBB and
+/// Tail. For example, in the IR below with an OR condition, the call-site can
+/// be split
+///
+/// from :
+///
+/// Header:
+/// %c = icmp eq i32* %a, null
+/// br i1 %c %Tail, %TBB
+/// TBB:
+/// %c2 = icmp eq i32* %b, null
+/// br i1 %c %Tail, %End
+/// Tail:
+/// %ca = call i1 @callee (i32* %a, i32* %b)
+///
+/// to :
+///
+/// Header: // PredBB1 is Header
+/// %c = icmp eq i32* %a, null
+/// br i1 %c %Tail-split1, %TBB
+/// TBB: // PredBB2 is TBB
+/// %c2 = icmp eq i32* %b, null
+/// br i1 %c %Tail-split2, %End
+/// Tail-split1:
+/// %ca1 = call @callee (i32* null, i32* %b) // CallInst1
+/// br %Tail
+/// Tail-split2:
+/// %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2
+/// br %Tail
+/// Tail:
+/// %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]
+///
+/// Note that for an OR predicated case, CallInst1 and CallInst2 should be
+/// created with more constrained arguments in
+/// createCallSitesOnOrPredicatedArgument().
+static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
+ Instruction *CallInst1, Instruction *CallInst2) {
+ Instruction *Instr = CS.getInstruction();
+ BasicBlock *TailBB = Instr->getParent();
+ assert(Instr == (TailBB->getFirstNonPHIOrDbg()) && "Unexpected call-site");
+
+ BasicBlock *SplitBlock1 =
+ SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split");
+ BasicBlock *SplitBlock2 =
+ SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split");
+
+ assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split.");
+
+ if (!CallInst1)
+ CallInst1 = Instr->clone();
+ if (!CallInst2)
+ CallInst2 = Instr->clone();
+
+ CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt());
+ CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt());
+
+ CallSite CS1(CallInst1);
+ CallSite CS2(CallInst2);
+
+ // Handle PHIs used as arguments in the call-site.
+ for (auto &PI : *TailBB) {
+ PHINode *PN = dyn_cast<PHINode>(&PI);
+ if (!PN)
+ break;
+ unsigned ArgNo = 0;
+ for (auto &CI : CS.args()) {
+ if (&*CI == PN) {
+ CS1.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock1));
+ CS2.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock2));
+ }
+ ++ArgNo;
+ }
+ }
+
+ // Replace users of the original call with a PHI merging the split call-sites.
+ if (Instr->getNumUses()) {
+ PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call",
+ TailBB->getFirstNonPHI());
+ PN->addIncoming(CallInst1, SplitBlock1);
+ PN->addIncoming(CallInst2, SplitBlock2);
+ Instr->replaceAllUsesWith(PN);
+ }
+ DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
+ DEBUG(dbgs() << " " << *CallInst1 << " in " << SplitBlock1->getName()
+ << "\n");
+ DEBUG(dbgs() << " " << *CallInst2 << " in " << SplitBlock2->getName()
+ << "\n");
+ Instr->eraseFromParent();
+ NumCallSiteSplit++;
+}
+
+// Return true if the call-site has an argument which is a PHI with only
+// constant incoming values.
+static bool isPredicatedOnPHI(CallSite CS) {
+ Instruction *Instr = CS.getInstruction();
+ BasicBlock *Parent = Instr->getParent();
+ if (Instr != Parent->getFirstNonPHIOrDbg())
+ return false;
+
+ for (auto &BI : *Parent) {
+ if (PHINode *PN = dyn_cast<PHINode>(&BI)) {
+ for (auto &I : CS.args())
+ if (&*I == PN) {
+ assert(PN->getNumIncomingValues() == 2 &&
+ "Unexpected number of incoming values");
+ if (PN->getIncomingBlock(0) == PN->getIncomingBlock(1))
+ return false;
+ if (PN->getIncomingValue(0) == PN->getIncomingValue(1))
+ continue;
+ if (isa<Constant>(PN->getIncomingValue(0)) &&
+ isa<Constant>(PN->getIncomingValue(1)))
+ return true;
+ }
+ }
+ break;
+ }
+ return false;
+}
+
+static bool tryToSplitOnPHIPredicatedArgument(CallSite CS) {
+ if (!isPredicatedOnPHI(CS))
+ return false;
+
+ auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
+ splitCallSite(CS, Preds[0], Preds[1], nullptr, nullptr);
+ return true;
+}
+// Check if one of the predecessors is a single predecessor of the other.
+// This is a requirement for control flow modeling an OR. HeaderBB points to
+// the single predecessor and OrBB points to other node. HeaderBB potentially
+// contains the first compare of the OR and OrBB the second.
+static bool isOrHeader(BasicBlock *HeaderBB, BasicBlock *OrBB) {
+ return OrBB->getSinglePredecessor() == HeaderBB &&
+ HeaderBB->getTerminator()->getNumSuccessors() == 2;
+}
+
+static bool tryToSplitOnOrPredicatedArgument(CallSite CS) {
+ auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
+ if (!isOrHeader(Preds[0], Preds[1]) && !isOrHeader(Preds[1], Preds[0]))
+ return false;
+
+ SmallVector<std::pair<ICmpInst *, unsigned>, 2> C1, C2;
+ recordConditions(CS, Preds[0], C1);
+ recordConditions(CS, Preds[1], C2);
+
+ Instruction *CallInst1 = addConditions(CS, C1);
+ Instruction *CallInst2 = addConditions(CS, C2);
+ if (!CallInst1 && !CallInst2)
+ return false;
+
+ splitCallSite(CS, Preds[1], Preds[0], CallInst2, CallInst1);
+ return true;
+}
+
+static bool tryToSplitCallSite(CallSite CS) {
+ if (!CS.arg_size() || !canSplitCallSite(CS))
+ return false;
+ return tryToSplitOnOrPredicatedArgument(CS) ||
+ tryToSplitOnPHIPredicatedArgument(CS);
+}
+
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
+ bool Changed = false;
+ for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
+ BasicBlock &BB = *BI++;
+ for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+ Instruction *I = &*II++;
+ CallSite CS(cast<Value>(I));
+ if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI))
+ continue;
+
+ Function *Callee = CS.getCalledFunction();
+ if (!Callee || Callee->isDeclaration())
+ continue;
+ Changed |= tryToSplitCallSite(CS);
+ }
+ }
+ return Changed;
+}
+
+namespace {
+struct CallSiteSplittingLegacyPass : public FunctionPass {
+ static char ID;
+ CallSiteSplittingLegacyPass() : FunctionPass(ID) {
+ initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return doCallSiteSplitting(F, TLI);
+ }
+};
+} // namespace
+
+char CallSiteSplittingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
+ "Call-site splitting", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
+ "Call-site splitting", false, false)
+FunctionPass *llvm::createCallSiteSplittingPass() {
+ return new CallSiteSplittingLegacyPass();
+}
+
+PreservedAnalyses CallSiteSplittingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+ if (!doCallSiteSplitting(F, TLI))
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ return PA;
+}
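With both entry points defined above, enabling the pass is a one-liner in either pass manager. A sketch; the surrounding pipeline setup is illustrative and not part of this commit:

// Legacy pass manager:
legacy::PassManager PM;
PM.add(createCallSiteSplittingPass());

// New pass manager:
FunctionPassManager FPM;
FPM.addPass(CallSiteSplittingPass());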
diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp
index 122c9314e022..e4b08c5ed305 100644
--- a/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -34,18 +34,39 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/ConstantHoisting.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
#include <tuple>
+#include <utility>
using namespace llvm;
using namespace consthoist;
@@ -62,10 +83,12 @@ static cl::opt<bool> ConstHoistWithBlockFrequency(
"without hoisting."));
namespace {
+
/// \brief The constant hoisting pass.
class ConstantHoistingLegacyPass : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid
+
ConstantHoistingLegacyPass() : FunctionPass(ID) {
initializeConstantHoistingLegacyPassPass(*PassRegistry::getPassRegistry());
}
@@ -87,9 +110,11 @@ public:
private:
ConstantHoistingPass Impl;
};
-}
+
+} // end anonymous namespace
char ConstantHoistingLegacyPass::ID = 0;
+
INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist",
"Constant Hoisting", false, false)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
@@ -128,7 +153,6 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
return MadeChange;
}
-
/// \brief Find the constant materialization insertion point.
Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
unsigned Idx) const {
@@ -217,8 +241,9 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
}
// Visit Orders in bottom-up order.
- typedef std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency>
- InsertPtsCostPair;
+ using InsertPtsCostPair =
+ std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency>;
+
// InsertPtsMap is a map from a BB to the best insertion points for the
// subtree of BB (subtree not including the BB itself).
DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap;
@@ -310,7 +335,6 @@ SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
return InsertPts;
}
-
/// \brief Record constant integer ConstInt for instruction Inst at operand
/// index Idx.
///
@@ -351,7 +375,6 @@ void ConstantHoistingPass::collectConstantCandidates(
}
}
-
/// \brief Check the operand for instruction Inst at index Idx.
void ConstantHoistingPass::collectConstantCandidates(
ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) {
@@ -393,7 +416,6 @@ void ConstantHoistingPass::collectConstantCandidates(
}
}
-
/// \brief Scan the instruction for expensive integer constants and record them
/// in the constant candidate vector.
void ConstantHoistingPass::collectConstantCandidates(
@@ -427,9 +449,8 @@ void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
// bit widths (APInt Operator- does not like that). If the value cannot be
// represented in uint64 we return an "empty" APInt. This is then interpreted
// as the value is not in range.
-static llvm::Optional<APInt> calculateOffsetDiff(const APInt &V1,
- const APInt &V2) {
- llvm::Optional<APInt> Res = None;
+static Optional<APInt> calculateOffsetDiff(const APInt &V1, const APInt &V2) {
+ Optional<APInt> Res = None;
unsigned BW = V1.getBitWidth() > V2.getBitWidth() ?
V1.getBitWidth() : V2.getBitWidth();
uint64_t LimVal1 = V1.getLimitedValue();
@@ -496,9 +517,9 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
DEBUG(dbgs() << "Cost: " << Cost << "\n");
for (auto C2 = S; C2 != E; ++C2) {
- llvm::Optional<APInt> Diff = calculateOffsetDiff(
- C2->ConstInt->getValue(),
- ConstCand->ConstInt->getValue());
+ Optional<APInt> Diff = calculateOffsetDiff(
+ C2->ConstInt->getValue(),
+ ConstCand->ConstInt->getValue());
if (Diff) {
const int ImmCosts =
TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty);
@@ -696,6 +717,9 @@ bool ConstantHoistingPass::emitBaseConstants() {
IntegerType *Ty = ConstInfo.BaseConstant->getType();
Instruction *Base =
new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP);
+
+ Base->setDebugLoc(IP->getDebugLoc());
+
DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant
<< ") to BB " << IP->getParent()->getName() << '\n'
<< *Base << '\n');
@@ -714,6 +738,8 @@ bool ConstantHoistingPass::emitBaseConstants() {
emitBaseConstants(Base, RCI.Offset, U);
ReBasesNum++;
}
+
+ Base->setDebugLoc(DILocation::getMergedLocation(Base->getDebugLoc(), U.Inst->getDebugLoc()));
}
}
UsesNum = Uses;
@@ -722,7 +748,6 @@ bool ConstantHoistingPass::emitBaseConstants() {
assert(!Base->use_empty() && "The use list is empty!?");
assert(isa<Instruction>(Base->user_back()) &&
"All uses should be instructions.");
- Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc());
}
(void)UsesNum;
(void)ReBasesNum;
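The debug-location change above deserves a note: rather than copying the last user's location onto the hoisted bitcast, the base now starts from the insertion point's location and merges in each rebased use via DILocation::getMergedLocation, so the hoisted constant no longer claims one arbitrary source line. A hedged sketch of the same fold (RebasedUsers is an illustrative name):

// Fold the locations of all rebased uses into the hoisted base's location.
Base->setDebugLoc(IP->getDebugLoc());
for (Instruction *UserInst : RebasedUsers)
  Base->setDebugLoc(DILocation::getMergedLocation(Base->getDebugLoc(),
                                                  UserInst->getDebugLoc()));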
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 28157783daa7..8f468ebf8949 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -12,22 +12,41 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "correlated-value-propagation"
@@ -41,13 +60,16 @@ STATISTIC(NumDeadCases, "Number of switch cases removed");
STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
STATISTIC(NumAShrs, "Number of ashr converted to lshr");
STATISTIC(NumSRems, "Number of srem converted to urem");
+STATISTIC(NumOverflows, "Number of overflow checks removed");
static cl::opt<bool> DontProcessAdds("cvp-dont-process-adds", cl::init(true));
namespace {
+
class CorrelatedValuePropagation : public FunctionPass {
public:
static char ID;
+
CorrelatedValuePropagation(): FunctionPass(ID) {
initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry());
}
@@ -59,9 +81,11 @@ namespace {
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
-}
+
+} // end anonymous namespace
char CorrelatedValuePropagation::ID = 0;
+
INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
"Value Propagation", false, false)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
@@ -302,11 +326,72 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
return Changed;
}
+// See if we can prove that the given overflow intrinsic will not overflow.
+static bool willNotOverflow(IntrinsicInst *II, LazyValueInfo *LVI) {
+ using OBO = OverflowingBinaryOperator;
+ auto NoWrap = [&] (Instruction::BinaryOps BinOp, unsigned NoWrapKind) {
+ Value *RHS = II->getOperand(1);
+ ConstantRange RRange = LVI->getConstantRange(RHS, II->getParent(), II);
+ ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinOp, RRange, NoWrapKind);
+ // As an optimization, do not compute LRange if we do not need it.
+ if (NWRegion.isEmptySet())
+ return false;
+ Value *LHS = II->getOperand(0);
+ ConstantRange LRange = LVI->getConstantRange(LHS, II->getParent(), II);
+ return NWRegion.contains(LRange);
+ };
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::uadd_with_overflow:
+ return NoWrap(Instruction::Add, OBO::NoUnsignedWrap);
+ case Intrinsic::sadd_with_overflow:
+ return NoWrap(Instruction::Add, OBO::NoSignedWrap);
+ case Intrinsic::usub_with_overflow:
+ return NoWrap(Instruction::Sub, OBO::NoUnsignedWrap);
+ case Intrinsic::ssub_with_overflow:
+ return NoWrap(Instruction::Sub, OBO::NoSignedWrap);
+ }
+ return false;
+}
+
+static void processOverflowIntrinsic(IntrinsicInst *II) {
+ Value *NewOp = nullptr;
+ switch (II->getIntrinsicID()) {
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::sadd_with_overflow:
+ NewOp = BinaryOperator::CreateAdd(II->getOperand(0), II->getOperand(1),
+ II->getName(), II);
+ break;
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ NewOp = BinaryOperator::CreateSub(II->getOperand(0), II->getOperand(1),
+ II->getName(), II);
+ break;
+ }
+ ++NumOverflows;
+ IRBuilder<> B(II);
+ Value *NewI = B.CreateInsertValue(UndefValue::get(II->getType()), NewOp, 0);
+ NewI = B.CreateInsertValue(NewI, ConstantInt::getFalse(II->getContext()), 1);
+ II->replaceAllUsesWith(NewI);
+ II->eraseFromParent();
+}
+
/// Infer nonnull attributes for the arguments at the specified callsite.
static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
SmallVector<unsigned, 4> ArgNos;
unsigned ArgNo = 0;
+ if (auto *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
+ if (willNotOverflow(II, LVI)) {
+ processOverflowIntrinsic(II);
+ return true;
+ }
+ }
+
for (Value *V : CS.args()) {
PointerType *Type = dyn_cast<PointerType>(V->getType());
// Try to mark pointer typed parameters as non-null. We skip the
@@ -335,18 +420,6 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
return true;
}
-// Helper function to rewrite srem and sdiv. As a policy choice, we choose not
-// to waste compile time on anything where the operands are local defs. While
-// LVI can sometimes reason about such cases, it's not its primary purpose.
-static bool hasLocalDefs(BinaryOperator *SDI) {
- for (Value *O : SDI->operands()) {
- auto *I = dyn_cast<Instruction>(O);
- if (I && I->getParent() == SDI->getParent())
- return true;
- }
- return false;
-}
-
static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) {
Constant *Zero = ConstantInt::get(SDI->getType(), 0);
for (Value *O : SDI->operands()) {
@@ -358,7 +431,7 @@ static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) {
}
static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
- if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) ||
+ if (SDI->getType()->isVectorTy() ||
!hasPositiveOperands(SDI, LVI))
return false;
@@ -376,7 +449,7 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
/// conditions, this can sometimes prove conditions instcombine can't by
/// exploiting range information.
static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
- if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) ||
+ if (SDI->getType()->isVectorTy() ||
!hasPositiveOperands(SDI, LVI))
return false;
@@ -391,7 +464,7 @@ static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
}
static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
- if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI))
+ if (SDI->getType()->isVectorTy())
return false;
Constant *Zero = ConstantInt::get(SDI->getType(), 0);
@@ -410,12 +483,12 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
}
static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
- typedef OverflowingBinaryOperator OBO;
+ using OBO = OverflowingBinaryOperator;
if (DontProcessAdds)
return false;
- if (AddOp->getType()->isVectorTy() || hasLocalDefs(AddOp))
+ if (AddOp->getType()->isVectorTy())
return false;
bool NSW = AddOp->hasNoSignedWrap();
@@ -492,7 +565,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
// blocks before querying later blocks (which require us to analyze early
// blocks). Eagerly simplifying shallow blocks means there is strictly less
// work to do for deep blocks. This also means we don't visit unreachable
- // blocks.
+ // blocks.
for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
bool BBChanged = false;
for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
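The willNotOverflow check added above reduces to a constant-range containment test. A worked example with illustrative ranges (assume LVI has proven them):

// Prove that an i8 uadd.with.overflow cannot wrap.
// Suppose LVI shows RHS in [0, 16) and LHS in [0, 100).
ConstantRange RRange(APInt(8, 0), APInt(8, 16));
ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
    Instruction::Add, RRange, OverflowingBinaryOperator::NoUnsignedWrap);
// The largest RHS is 15, so LHS may be at most 240: NWRegion is [0, 241).
ConstantRange LRange(APInt(8, 0), APInt(8, 100));
bool Safe = NWRegion.contains(LRange); // true: 99 + 15 = 114 fits in 8 bits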
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 1ec38e56aa4c..e703014bb0e6 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -16,31 +16,55 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstddef>
+#include <iterator>
#include <map>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "dse"
@@ -49,18 +73,23 @@ STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
STATISTIC(NumFastStores, "Number of stores deleted");
STATISTIC(NumFastOther , "Number of other instrs removed");
STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
+STATISTIC(NumModifiedStores, "Number of stores modified");
static cl::opt<bool>
EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
cl::init(true), cl::Hidden,
cl::desc("Enable partial-overwrite tracking in DSE"));
+static cl::opt<bool>
+EnablePartialStoreMerging("enable-dse-partial-store-merging",
+ cl::init(true), cl::Hidden,
+ cl::desc("Enable partial store merging in DSE"));
//===----------------------------------------------------------------------===//
// Helper functions
//===----------------------------------------------------------------------===//
-typedef std::map<int64_t, int64_t> OverlapIntervalsTy;
-typedef DenseMap<Instruction *, OverlapIntervalsTy> InstOverlapIntervalsTy;
+using OverlapIntervalsTy = std::map<int64_t, int64_t>;
+using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>;
/// Delete this instruction. Before we do, go through and zero out all the
/// operands of this instruction. If any of them become dead, delete them and
@@ -209,7 +238,6 @@ static bool isRemovable(Instruction *I) {
case Intrinsic::init_trampoline:
// Always safe to remove init_trampoline.
return true;
-
case Intrinsic::memset:
case Intrinsic::memmove:
case Intrinsic::memcpy:
@@ -224,7 +252,6 @@ static bool isRemovable(Instruction *I) {
return false;
}
-
/// Returns true if the end of this instruction can be safely shortened in
/// length.
static bool isShortenableAtTheEnd(Instruction *I) {
@@ -287,14 +314,24 @@ static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
}
namespace {
-enum OverwriteResult { OW_Begin, OW_Complete, OW_End, OW_Unknown };
-}
+
+enum OverwriteResult {
+ OW_Begin,
+ OW_Complete,
+ OW_End,
+ OW_PartialEarlierWithFullLater,
+ OW_Unknown
+};
+
+} // end anonymous namespace
/// Return 'OW_Complete' if a store to the 'Later' location completely
/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the
/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the
-/// beginning of the 'Earlier' location is overwritten by 'Later', or
-/// 'OW_Unknown' if nothing can be determined.
+/// beginning of the 'Earlier' location is overwritten by 'Later'.
+/// 'OW_PartialEarlierWithFullLater' means that an earlier (big) store was
+/// overwritten by a later (smaller) store which doesn't write outside the big
+/// store's memory locations. Returns 'OW_Unknown' if nothing can be determined.
static OverwriteResult isOverwrite(const MemoryLocation &Later,
const MemoryLocation &Earlier,
const DataLayout &DL,
@@ -427,6 +464,19 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
}
}
+ // Check for an earlier store which writes to all the memory locations that
+ // the later store writes to.
+ if (EnablePartialStoreMerging && LaterOff >= EarlierOff &&
+ int64_t(EarlierOff + Earlier.Size) > LaterOff &&
+ uint64_t(LaterOff - EarlierOff) + Later.Size <= Earlier.Size) {
+ DEBUG(dbgs() << "DSE: Partial overwrite an earlier load [" << EarlierOff
+ << ", " << int64_t(EarlierOff + Earlier.Size)
+ << ") by a later store [" << LaterOff << ", "
+ << int64_t(LaterOff + Later.Size) << ")\n");
+ // TODO: Maybe come up with a better name?
+ return OW_PartialEarlierWithFullLater;
+ }
+
// Another interesting case is if the later store overwrites the end of the
// earlier store.
//
@@ -544,11 +594,9 @@ static bool memoryIsNotModifiedBetween(Instruction *FirstI,
}
for (; BI != EI; ++BI) {
Instruction *I = &*BI;
- if (I->mayWriteToMemory() && I != SecondI) {
- auto Res = AA->getModRefInfo(I, MemLoc);
- if (Res & MRI_Mod)
+ if (I->mayWriteToMemory() && I != SecondI)
+ if (isModSet(AA->getModRefInfo(I, MemLoc)))
return false;
- }
}
if (B != FirstBB) {
assert(B != &FirstBB->getParent()->getEntryBlock() &&
@@ -772,9 +820,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
// the call is live.
DeadStackObjects.remove_if([&](Value *I) {
// See if the call site touches the value.
- ModRefInfo A = AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI));
-
- return A == MRI_ModRef || A == MRI_Ref;
+ return isRefSet(AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI)));
});
// If all of the allocas were clobbered by the call then we're not going
@@ -840,7 +886,7 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
if (!IsOverwriteEnd)
LaterOffset = int64_t(LaterOffset + LaterSize);
- if (!(llvm::isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) &&
+ if (!(isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) &&
!((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
return false;
@@ -1094,6 +1140,8 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
// If we find a write that is a) removable (i.e., non-volatile), b) is
// completely obliterated by the store to 'Loc', and c) which we know that
// 'Inst' doesn't load from, then we can remove it.
+ // Also try to merge two stores if a later one only touches memory written
+ // to by the earlier one.
if (isRemovable(DepWrite) &&
!isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
int64_t InstWriteOffset, DepWriteOffset;
@@ -1123,6 +1171,72 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
bool IsOverwriteEnd = (OR == OW_End);
MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
InstWriteOffset, LaterSize, IsOverwriteEnd);
+ } else if (EnablePartialStoreMerging &&
+ OR == OW_PartialEarlierWithFullLater) {
+ auto *Earlier = dyn_cast<StoreInst>(DepWrite);
+ auto *Later = dyn_cast<StoreInst>(Inst);
+ if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) &&
+ Later && isa<ConstantInt>(Later->getValueOperand())) {
+ // If the store we find is:
+ // a) partially overwritten by the store to 'Loc'
+ // b) the later store is fully contained in the earlier one and
+ // c) they both have a constant value
+ // Merge the two stores, replacing the earlier store's value with a
+ // merge of both values.
+ // TODO: Deal with other constant types (vectors, etc), and probably
+ // some mem intrinsics (if needed)
+
+ APInt EarlierValue =
+ cast<ConstantInt>(Earlier->getValueOperand())->getValue();
+ APInt LaterValue =
+ cast<ConstantInt>(Later->getValueOperand())->getValue();
+ unsigned LaterBits = LaterValue.getBitWidth();
+ assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth());
+ LaterValue = LaterValue.zext(EarlierValue.getBitWidth());
+
+ // Offset of the smaller store inside the larger store
+ unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8;
+ unsigned LShiftAmount =
+ DL.isBigEndian()
+ ? EarlierValue.getBitWidth() - BitOffsetDiff - LaterBits
+ : BitOffsetDiff;
+ APInt Mask =
+ APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount,
+ LShiftAmount + LaterBits);
+ // Clear the bits we'll be replacing, then OR with the smaller
+ // store, shifted appropriately.
+ APInt Merged =
+ (EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
+ DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *DepWrite
+ << "\n Later: " << *Inst
+ << "\n Merged Value: " << Merged << '\n');
+
+ auto *SI = new StoreInst(
+ ConstantInt::get(Earlier->getValueOperand()->getType(), Merged),
+ Earlier->getPointerOperand(), false, Earlier->getAlignment(),
+ Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite);
+
+ unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_nontemporal};
+ SI->copyMetadata(*DepWrite, MDToKeep);
+ ++NumModifiedStores;
+
+ // Remove earlier, wider, store
+ size_t Idx = InstrOrdering.lookup(DepWrite);
+ InstrOrdering.erase(DepWrite);
+ InstrOrdering.insert(std::make_pair(SI, Idx));
+
+ // Delete the old stores and now-dead instructions that feed them.
+ deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, &InstrOrdering);
+ deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL,
+ &InstrOrdering);
+ MadeChange = true;
+
+ // We erased DepWrite and Inst (Loc); start over.
+ break;
+ }
}
}
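// A minimal standalone sketch of the merge arithmetic above, with fixed-width
// integers standing in for APInt and only the little-endian case (where
// BitOffsetDiff is the shift amount directly); names are illustrative:
#include <cassert>
#include <cstdint>

// Merge an earlier i32 constant store with a later i8 constant store that
// lands ByteOffset bytes into it.
uint32_t mergeConstantStores(uint32_t Earlier, uint8_t Later,
                             unsigned ByteOffset) {
  unsigned LShiftAmount = ByteOffset * 8;         // bit offset of later store
  uint32_t Mask = uint32_t(0xFF) << LShiftAmount; // bits the later store owns
  // Clear the replaced bits, then OR in the shifted later value.
  return (Earlier & ~Mask) | (uint32_t(Later) << LShiftAmount);
}

int main() {
  // Earlier: store i32 0xAABBCCDD; later: store i8 0x11 at byte offset 2.
  assert(mergeConstantStores(0xAABBCCDD, 0x11, 2) == 0xAA11CCDD);
}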
@@ -1137,7 +1251,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
if (DepWrite == &BB.front()) break;
// Can't look past this instruction if it might read 'Loc'.
- if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref)
+ if (isRefSet(AA->getModRefInfo(DepWrite, Loc)))
break;
InstDep = MD->getPointerDependencyFrom(Loc, /*isLoad=*/ false,
@@ -1190,9 +1304,12 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
}
namespace {
+
/// A legacy pass for the legacy pass manager that wraps \c DSEPass.
class DSELegacyPass : public FunctionPass {
public:
+ static char ID; // Pass identification, replacement for typeid
+
DSELegacyPass() : FunctionPass(ID) {
initializeDSELegacyPassPass(*PassRegistry::getPassRegistry());
}
@@ -1221,12 +1338,12 @@ public:
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addPreserved<MemoryDependenceWrapperPass>();
}
-
- static char ID; // Pass identification, replacement for typeid
};
+
} // end anonymous namespace
char DSELegacyPass::ID = 0;
+
INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false,
false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
diff --git a/lib/Transforms/Scalar/DivRemPairs.cpp b/lib/Transforms/Scalar/DivRemPairs.cpp
new file mode 100644
index 000000000000..e383af89a384
--- /dev/null
+++ b/lib/Transforms/Scalar/DivRemPairs.cpp
@@ -0,0 +1,206 @@
+//===- DivRemPairs.cpp - Hoist/decompose division and remainder -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists and/or decomposes integer division and remainder
+// instructions to enable CFG improvements and better codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/DivRemPairs.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "div-rem-pairs"
+STATISTIC(NumPairs, "Number of div/rem pairs");
+STATISTIC(NumHoisted, "Number of instructions hoisted");
+STATISTIC(NumDecomposed, "Number of instructions decomposed");
+
+/// Find matching pairs of integer div/rem ops (they have the same numerator,
+/// denominator, and signedness). If they exist in different basic blocks,
+/// bring them together by hoisting one of them, or decompose the remainder
+/// using the division that is implicit in it:
+/// X % Y <--> X - ((X / Y) * Y).
+///
+/// We can largely ignore the normal safety and cost constraints on speculation
+/// of these ops when we find a matching pair, because any exceptions and most
+/// of the cost are already incurred by the first member of the pair.
+///
+/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or
+/// SimplifyCFG, but it's split off on its own because it's different enough
+/// that it doesn't quite match the stated objectives of those passes.
+static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
+ const DominatorTree &DT) {
+ bool Changed = false;
+
+ // Insert all divide and remainder instructions into maps keyed by their
+ // operands and opcode (signed or unsigned).
+ DenseMap<DivRemMapKey, Instruction *> DivMap, RemMap;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ if (I.getOpcode() == Instruction::SDiv)
+ DivMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
+ else if (I.getOpcode() == Instruction::UDiv)
+ DivMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
+ else if (I.getOpcode() == Instruction::SRem)
+ RemMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
+ else if (I.getOpcode() == Instruction::URem)
+ RemMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
+ }
+ }
+
+  // We can iterate over either map because we are only looking for matched
+  // pairs. Choose the remainder map for efficiency, because remainders are
+  // usually even rarer than divisions.
+ for (auto &RemPair : RemMap) {
+ // Find the matching division instruction from the division map.
+ Instruction *DivInst = DivMap[RemPair.getFirst()];
+ if (!DivInst)
+ continue;
+
+ // We have a matching pair of div/rem instructions. If one dominates the
+ // other, hoist and/or replace one.
+ NumPairs++;
+ Instruction *RemInst = RemPair.getSecond();
+ bool IsSigned = DivInst->getOpcode() == Instruction::SDiv;
+ bool HasDivRemOp = TTI.hasDivRemOp(DivInst->getType(), IsSigned);
+
+ // If the target supports div+rem and the instructions are in the same block
+ // already, there's nothing to do. The backend should handle this. If the
+ // target does not support div+rem, then we will decompose the rem.
+ if (HasDivRemOp && RemInst->getParent() == DivInst->getParent())
+ continue;
+
+ bool DivDominates = DT.dominates(DivInst, RemInst);
+ if (!DivDominates && !DT.dominates(RemInst, DivInst))
+ continue;
+
+ if (HasDivRemOp) {
+ // The target has a single div/rem operation. Hoist the lower instruction
+ // to make the matched pair visible to the backend.
+ if (DivDominates)
+ RemInst->moveAfter(DivInst);
+ else
+ DivInst->moveAfter(RemInst);
+ NumHoisted++;
+ } else {
+ // The target does not have a single div/rem operation. Decompose the
+ // remainder calculation as:
+ // X % Y --> X - ((X / Y) * Y).
+ Value *X = RemInst->getOperand(0);
+ Value *Y = RemInst->getOperand(1);
+ Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y);
+ Instruction *Sub = BinaryOperator::CreateSub(X, Mul);
+
+ // If the remainder dominates, then hoist the division up to that block:
+ //
+ // bb1:
+ // %rem = srem %x, %y
+ // bb2:
+ // %div = sdiv %x, %y
+ // -->
+ // bb1:
+ // %div = sdiv %x, %y
+ // %mul = mul %div, %y
+ // %rem = sub %x, %mul
+ //
+ // If the division dominates, it's already in the right place. The mul+sub
+ // will be in a different block because we don't assume that they are
+ // cheap to speculatively execute:
+ //
+ // bb1:
+ // %div = sdiv %x, %y
+ // bb2:
+ // %rem = srem %x, %y
+ // -->
+ // bb1:
+ // %div = sdiv %x, %y
+ // bb2:
+ // %mul = mul %div, %y
+ // %rem = sub %x, %mul
+ //
+ // If the div and rem are in the same block, we do the same transform,
+ // but any code movement would be within the same block.
+
+ if (!DivDominates)
+ DivInst->moveBefore(RemInst);
+ Mul->insertAfter(RemInst);
+ Sub->insertAfter(Mul);
+
+ // Now kill the explicit remainder. We have replaced it with:
+      //   (sub X, (mul (div X, Y), Y))
+ RemInst->replaceAllUsesWith(Sub);
+ RemInst->eraseFromParent();
+ NumDecomposed++;
+ }
+ Changed = true;
+ }
+
+ return Changed;
+}
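// A quick standalone check of the decomposition identity used above (a
// sketch, not part of the pass): C++'s truncating integer division matches
// LLVM's sdiv/srem, so X % Y == X - (X / Y) * Y holds for every Y != 0.
#include <cassert>
#include <initializer_list>

int main() {
  for (int X = -7; X <= 7; ++X)
    for (int Y : {-3, -1, 1, 3})
      assert(X % Y == X - (X / Y) * Y);
}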
+
+// Pass manager boilerplate below here.
+
+namespace {
+struct DivRemPairsLegacyPass : public FunctionPass {
+ static char ID;
+ DivRemPairsLegacyPass() : FunctionPass(ID) {
+ initializeDivRemPairsLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return optimizeDivRem(F, TTI, DT);
+ }
+};
+} // end anonymous namespace
+
+char DivRemPairsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(DivRemPairsLegacyPass, "div-rem-pairs",
+ "Hoist/decompose integer division and remainder", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(DivRemPairsLegacyPass, "div-rem-pairs",
+ "Hoist/decompose integer division and remainder", false,
+ false)
+FunctionPass *llvm::createDivRemPairsPass() {
+ return new DivRemPairsLegacyPass();
+}
+
+PreservedAnalyses DivRemPairsPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ if (!optimizeDivRem(F, TTI, DT))
+ return PreservedAnalyses::all();
+ // TODO: This pass just hoists/replaces math ops - all analyses are preserved?
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index c5c9b2c185d6..5798e1c4ee99 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -13,9 +13,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopedHashTable.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
@@ -24,18 +27,37 @@
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/RecyclingAllocator.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
#include <deque>
+#include <memory>
+#include <utility>
+
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -53,6 +75,7 @@ STATISTIC(NumDSE, "Number of trivial dead stores removed");
//===----------------------------------------------------------------------===//
namespace {
+
/// \brief Struct representing the available values in the scoped hash table.
struct SimpleValue {
Instruction *Inst;
@@ -77,20 +100,25 @@ struct SimpleValue {
isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst);
}
};
-}
+
+} // end anonymous namespace
namespace llvm {
+
template <> struct DenseMapInfo<SimpleValue> {
static inline SimpleValue getEmptyKey() {
return DenseMapInfo<Instruction *>::getEmptyKey();
}
+
static inline SimpleValue getTombstoneKey() {
return DenseMapInfo<Instruction *>::getTombstoneKey();
}
+
static unsigned getHashValue(SimpleValue Val);
static bool isEqual(SimpleValue LHS, SimpleValue RHS);
};
-}
+
+} // end namespace llvm
unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
Instruction *Inst = Val.Inst;
@@ -115,6 +143,21 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
return hash_combine(Inst->getOpcode(), Pred, LHS, RHS);
}
+ // Hash min/max/abs (cmp + select) to allow for commuted operands.
+  // Min/max may also have a non-canonical compare predicate (e.g., the
+  // compare for smin may use 'sgt' rather than 'slt') and non-canonical
+  // operands in the compare.
+ Value *A, *B;
+ SelectPatternFlavor SPF = matchSelectPattern(Inst, A, B).Flavor;
+ // TODO: We should also detect FP min/max.
+ if (SPF == SPF_SMIN || SPF == SPF_SMAX ||
+ SPF == SPF_UMIN || SPF == SPF_UMAX ||
+ SPF == SPF_ABS || SPF == SPF_NABS) {
+ if (A > B)
+ std::swap(A, B);
+ return hash_combine(Inst->getOpcode(), SPF, A, B);
+ }
+
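// A standalone sketch of the canonicalization above: order the operand pair
// by a fixed key before hashing so that commuted forms collide. Pointer
// identity stands in for LLVM's Value*, and the combiner below is only
// illustrative, not llvm::hash_combine:
#include <cassert>
#include <cstddef>
#include <functional>
#include <utility>

std::size_t hashCommutedPair(const void *A, const void *B) {
  if (A > B)
    std::swap(A, B); // canonical operand order, as in the code above
  std::size_t H = std::hash<const void *>()(A);
  return H ^ (std::hash<const void *>()(B) + 0x9e3779b9 + (H << 6) + (H >> 2));
}

int main() {
  int X, Y;
  // smin(X, Y) and smin(Y, X) now hash identically.
  assert(hashCommutedPair(&X, &Y) == hashCommutedPair(&Y, &X));
}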
if (CastInst *CI = dyn_cast<CastInst>(Inst))
return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));
@@ -173,6 +216,20 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
}
+ // Min/max/abs can occur with commuted operands, non-canonical predicates,
+ // and/or non-canonical operands.
+ Value *LHSA, *LHSB;
+ SelectPatternFlavor LSPF = matchSelectPattern(LHSI, LHSA, LHSB).Flavor;
+ // TODO: We should also detect FP min/max.
+ if (LSPF == SPF_SMIN || LSPF == SPF_SMAX ||
+ LSPF == SPF_UMIN || LSPF == SPF_UMAX ||
+ LSPF == SPF_ABS || LSPF == SPF_NABS) {
+ Value *RHSA, *RHSB;
+ SelectPatternFlavor RSPF = matchSelectPattern(RHSI, RHSA, RHSB).Flavor;
+ return (LSPF == RSPF && ((LHSA == RHSA && LHSB == RHSB) ||
+ (LHSA == RHSB && LHSB == RHSA)));
+ }
+
return false;
}
@@ -181,6 +238,7 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
//===----------------------------------------------------------------------===//
namespace {
+
/// \brief Struct representing the available call values in the scoped hash
/// table.
struct CallValue {
@@ -206,20 +264,25 @@ struct CallValue {
return true;
}
};
-}
+
+} // end anonymous namespace
namespace llvm {
+
template <> struct DenseMapInfo<CallValue> {
static inline CallValue getEmptyKey() {
return DenseMapInfo<Instruction *>::getEmptyKey();
}
+
static inline CallValue getTombstoneKey() {
return DenseMapInfo<Instruction *>::getTombstoneKey();
}
+
static unsigned getHashValue(CallValue Val);
static bool isEqual(CallValue LHS, CallValue RHS);
};
-}
+
+} // end namespace llvm
unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
Instruction *Inst = Val.Inst;
@@ -241,6 +304,7 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
//===----------------------------------------------------------------------===//
namespace {
+
/// \brief A simple and fast domtree-based CSE pass.
///
/// This pass does a simple depth-first walk over the dominator tree,
@@ -257,10 +321,13 @@ public:
const SimplifyQuery SQ;
MemorySSA *MSSA;
std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
- typedef RecyclingAllocator<
- BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy;
- typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
- AllocatorTy> ScopedHTType;
+
+ using AllocatorTy =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<SimpleValue, Value *>>;
+ using ScopedHTType =
+ ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
+ AllocatorTy>;
/// \brief A scoped hash table of the current values of all of our simple
/// scalar expressions.
@@ -285,44 +352,45 @@ public:
/// present the table; it is the responsibility of the consumer to inspect
/// the atomicity/volatility if needed.
struct LoadValue {
- Instruction *DefInst;
- unsigned Generation;
- int MatchingId;
- bool IsAtomic;
- bool IsInvariant;
- LoadValue()
- : DefInst(nullptr), Generation(0), MatchingId(-1), IsAtomic(false),
- IsInvariant(false) {}
+ Instruction *DefInst = nullptr;
+ unsigned Generation = 0;
+ int MatchingId = -1;
+ bool IsAtomic = false;
+ bool IsInvariant = false;
+
+ LoadValue() = default;
LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId,
bool IsAtomic, bool IsInvariant)
: DefInst(Inst), Generation(Generation), MatchingId(MatchingId),
IsAtomic(IsAtomic), IsInvariant(IsInvariant) {}
};
- typedef RecyclingAllocator<BumpPtrAllocator,
- ScopedHashTableVal<Value *, LoadValue>>
- LoadMapAllocator;
- typedef ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>,
- LoadMapAllocator> LoadHTType;
+
+ using LoadMapAllocator =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<Value *, LoadValue>>;
+ using LoadHTType =
+ ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>,
+ LoadMapAllocator>;
+
LoadHTType AvailableLoads;
/// \brief A scoped hash table of the current values of read-only call
/// values.
///
/// It uses the same generation count as loads.
- typedef ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>
- CallHTType;
+ using CallHTType =
+ ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
CallHTType AvailableCalls;
/// \brief This is the current generation of the memory value.
- unsigned CurrentGeneration;
+ unsigned CurrentGeneration = 0;
/// \brief Set up the EarlyCSE runner for a particular function.
EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI,
const TargetTransformInfo &TTI, DominatorTree &DT,
AssumptionCache &AC, MemorySSA *MSSA)
: TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA),
- MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)), CurrentGeneration(0) {
- }
+ MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {}
bool run();
@@ -336,11 +404,10 @@ private:
CallHTType &AvailableCalls)
: Scope(AvailableValues), LoadScope(AvailableLoads),
CallScope(AvailableCalls) {}
-
- private:
NodeScope(const NodeScope &) = delete;
- void operator=(const NodeScope &) = delete;
+ NodeScope &operator=(const NodeScope &) = delete;
+ private:
ScopedHTType::ScopeTy Scope;
LoadHTType::ScopeTy LoadScope;
CallHTType::ScopeTy CallScope;
@@ -356,8 +423,10 @@ private:
CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
DomTreeNode::iterator child, DomTreeNode::iterator end)
: CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
- EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls),
- Processed(false) {}
+ EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls)
+ {}
+ StackNode(const StackNode &) = delete;
+ StackNode &operator=(const StackNode &) = delete;
// Accessors.
unsigned currentGeneration() { return CurrentGeneration; }
@@ -365,27 +434,25 @@ private:
void childGeneration(unsigned generation) { ChildGeneration = generation; }
DomTreeNode *node() { return Node; }
DomTreeNode::iterator childIter() { return ChildIter; }
+
DomTreeNode *nextChild() {
DomTreeNode *child = *ChildIter;
++ChildIter;
return child;
}
+
DomTreeNode::iterator end() { return EndIter; }
bool isProcessed() { return Processed; }
void process() { Processed = true; }
private:
- StackNode(const StackNode &) = delete;
- void operator=(const StackNode &) = delete;
-
- // Members.
unsigned CurrentGeneration;
unsigned ChildGeneration;
DomTreeNode *Node;
DomTreeNode::iterator ChildIter;
DomTreeNode::iterator EndIter;
NodeScope Scopes;
- bool Processed;
+ bool Processed = false;
};
/// \brief Wrapper class to handle memory instructions, including loads,
@@ -393,24 +460,28 @@ private:
class ParseMemoryInst {
public:
ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
- : IsTargetMemInst(false), Inst(Inst) {
+ : Inst(Inst) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
if (TTI.getTgtMemIntrinsic(II, Info))
IsTargetMemInst = true;
}
+
bool isLoad() const {
if (IsTargetMemInst) return Info.ReadMem;
return isa<LoadInst>(Inst);
}
+
bool isStore() const {
if (IsTargetMemInst) return Info.WriteMem;
return isa<StoreInst>(Inst);
}
+
bool isAtomic() const {
if (IsTargetMemInst)
return Info.Ordering != AtomicOrdering::NotAtomic;
return Inst->isAtomic();
}
+
bool isUnordered() const {
if (IsTargetMemInst)
return Info.isUnordered();
@@ -447,6 +518,7 @@ private:
return (getPointerOperand() == Inst.getPointerOperand() &&
getMatchingId() == Inst.getMatchingId());
}
+
bool isValid() const { return getPointerOperand() != nullptr; }
// For regular (non-intrinsic) loads/stores, this is set to -1. For
@@ -457,6 +529,7 @@ private:
if (IsTargetMemInst) return Info.MatchingId;
return -1;
}
+
Value *getPointerOperand() const {
if (IsTargetMemInst) return Info.PtrVal;
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
@@ -466,17 +539,19 @@ private:
}
return nullptr;
}
+
bool mayReadFromMemory() const {
if (IsTargetMemInst) return Info.ReadMem;
return Inst->mayReadFromMemory();
}
+
bool mayWriteToMemory() const {
if (IsTargetMemInst) return Info.WriteMem;
return Inst->mayWriteToMemory();
}
private:
- bool IsTargetMemInst;
+ bool IsTargetMemInst = false;
MemIntrinsicInfo Info;
Instruction *Inst;
};
@@ -524,8 +599,8 @@ private:
for (MemoryPhi *MP : PhisToCheck) {
MemoryAccess *FirstIn = MP->getIncomingValue(0);
- if (all_of(MP->incoming_values(),
- [=](Use &In) { return In == FirstIn; }))
+ if (llvm::all_of(MP->incoming_values(),
+ [=](Use &In) { return In == FirstIn; }))
WorkQueue.push_back(MP);
}
PhisToCheck.clear();
@@ -533,7 +608,8 @@ private:
}
}
};
-}
+
+} // end anonymous namespace
/// Determine if the memory referenced by LaterInst is from the same heap
/// version as EarlierInst.
@@ -663,6 +739,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}
+ // Skip sideeffect intrinsics, for the same reason as assume intrinsics.
+ if (match(Inst, m_Intrinsic<Intrinsic::sideeffect>())) {
+ DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << *Inst << '\n');
+ continue;
+ }
+
// Skip invariant.start intrinsics since they only read memory, and we can
     // forward values across them. Also, we don't need to consume the last store
// since the semantics of invariant.start allow us to perform DSE of the
@@ -1014,6 +1096,7 @@ PreservedAnalyses EarlyCSEPass::run(Function &F,
}
namespace {
+
/// \brief A simple and fast domtree-based CSE pass.
///
/// This pass does a simple depth-first walk over the dominator tree,
@@ -1062,7 +1145,8 @@ public:
AU.setPreservesCFG();
}
};
-}
+
+} // end anonymous namespace
using EarlyCSELegacyPass = EarlyCSELegacyCommonPass</*UseMemorySSA=*/false>;
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index ea28705e684d..e2c1eaf58e43 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -20,39 +20,64 @@
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Dominators.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/VNCoercion.h"
-
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
#include <vector>
+
using namespace llvm;
using namespace llvm::gvn;
using namespace llvm::VNCoercion;
@@ -80,6 +105,7 @@ MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore,
struct llvm::GVN::Expression {
uint32_t opcode;
Type *type;
+ bool commutative = false;
SmallVector<uint32_t, 4> varargs;
Expression(uint32_t o = ~2U) : opcode(o) {}
@@ -104,20 +130,23 @@ struct llvm::GVN::Expression {
};
namespace llvm {
+
template <> struct DenseMapInfo<GVN::Expression> {
static inline GVN::Expression getEmptyKey() { return ~0U; }
-
static inline GVN::Expression getTombstoneKey() { return ~1U; }
static unsigned getHashValue(const GVN::Expression &e) {
using llvm::hash_value;
+
return static_cast<unsigned>(hash_value(e));
}
+
static bool isEqual(const GVN::Expression &LHS, const GVN::Expression &RHS) {
return LHS == RHS;
}
};
-} // End llvm namespace.
+
+} // end namespace llvm
/// Represents a particular available value that we know how to materialize.
/// Materialization of an AvailableValue never fails. An AvailableValue is
@@ -216,6 +245,7 @@ struct llvm::gvn::AvailableValueInBlock {
unsigned Offset = 0) {
return get(BB, AvailableValue::get(V, Offset));
}
+
static AvailableValueInBlock getUndef(BasicBlock *BB) {
return get(BB, AvailableValue::getUndef());
}
@@ -246,6 +276,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {
assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
if (e.varargs[0] > e.varargs[1])
std::swap(e.varargs[0], e.varargs[1]);
+ e.commutative = true;
}
if (CmpInst *C = dyn_cast<CmpInst>(I)) {
@@ -256,6 +287,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {
Predicate = CmpInst::getSwappedPredicate(Predicate);
}
e.opcode = (C->getOpcode() << 8) | Predicate;
+ e.commutative = true;
} else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) {
for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end();
II != IE; ++II)
@@ -281,6 +313,7 @@ GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode,
Predicate = CmpInst::getSwappedPredicate(Predicate);
}
e.opcode = (Opcode << 8) | Predicate;
+ e.commutative = true;
return e;
}
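// A standalone sanity check for the canonicalization above: a commuted
// integer compare stays equivalent when its predicate is mirrored, which is
// why operand swapping is paired with getSwappedPredicate.
#include <cassert>

int main() {
  for (int A = -2; A <= 2; ++A)
    for (int B = -2; B <= 2; ++B) {
      assert((A > B) == (B < A));   // sgt <-> slt
      assert((A >= B) == (B <= A)); // sge <-> sle
    }
}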
@@ -340,7 +373,7 @@ GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
// ValueTable External Functions
//===----------------------------------------------------------------------===//
-GVN::ValueTable::ValueTable() : nextValueNumber(1) {}
+GVN::ValueTable::ValueTable() = default;
GVN::ValueTable::ValueTable(const ValueTable &) = default;
GVN::ValueTable::ValueTable(ValueTable &&) = default;
GVN::ValueTable::~ValueTable() = default;
@@ -348,25 +381,25 @@ GVN::ValueTable::~ValueTable() = default;
/// add - Insert a value into the table with a specified value number.
void GVN::ValueTable::add(Value *V, uint32_t num) {
valueNumbering.insert(std::make_pair(V, num));
+ if (PHINode *PN = dyn_cast<PHINode>(V))
+ NumberingPhi[num] = PN;
}
uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {
if (AA->doesNotAccessMemory(C)) {
Expression exp = createExpr(C);
- uint32_t &e = expressionNumbering[exp];
- if (!e) e = nextValueNumber++;
+ uint32_t e = assignExpNewValueNum(exp).first;
valueNumbering[C] = e;
return e;
} else if (AA->onlyReadsMemory(C)) {
Expression exp = createExpr(C);
- uint32_t &e = expressionNumbering[exp];
- if (!e) {
- e = nextValueNumber++;
- valueNumbering[C] = e;
- return e;
+ auto ValNum = assignExpNewValueNum(exp);
+ if (ValNum.second) {
+ valueNumbering[C] = ValNum.first;
+ return ValNum.first;
}
if (!MD) {
- e = nextValueNumber++;
+ uint32_t e = assignExpNewValueNum(exp).first;
valueNumbering[C] = e;
return e;
}
@@ -452,7 +485,6 @@ uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {
uint32_t v = lookupOrAdd(cdep);
valueNumbering[C] = v;
return v;
-
} else {
valueNumbering[C] = nextValueNumber;
return nextValueNumber++;
@@ -522,23 +554,29 @@ uint32_t GVN::ValueTable::lookupOrAdd(Value *V) {
case Instruction::ExtractValue:
exp = createExtractvalueExpr(cast<ExtractValueInst>(I));
break;
+ case Instruction::PHI:
+ valueNumbering[V] = nextValueNumber;
+ NumberingPhi[nextValueNumber] = cast<PHINode>(V);
+ return nextValueNumber++;
default:
valueNumbering[V] = nextValueNumber;
return nextValueNumber++;
}
- uint32_t& e = expressionNumbering[exp];
- if (!e) e = nextValueNumber++;
+ uint32_t e = assignExpNewValueNum(exp).first;
valueNumbering[V] = e;
return e;
}
/// Returns the value number of the specified value. Fails if
/// the value has not yet been numbered.
-uint32_t GVN::ValueTable::lookup(Value *V) const {
+uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const {
DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
- assert(VI != valueNumbering.end() && "Value not numbered?");
- return VI->second;
+ if (Verify) {
+ assert(VI != valueNumbering.end() && "Value not numbered?");
+ return VI->second;
+ }
+ return (VI != valueNumbering.end()) ? VI->second : 0;
}
/// Returns the value number of the given comparison,
@@ -549,21 +587,28 @@ uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode,
CmpInst::Predicate Predicate,
Value *LHS, Value *RHS) {
Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS);
- uint32_t& e = expressionNumbering[exp];
- if (!e) e = nextValueNumber++;
- return e;
+ return assignExpNewValueNum(exp).first;
}
/// Remove all entries from the ValueTable.
void GVN::ValueTable::clear() {
valueNumbering.clear();
expressionNumbering.clear();
+ NumberingPhi.clear();
+ PhiTranslateTable.clear();
nextValueNumber = 1;
+ Expressions.clear();
+ ExprIdx.clear();
+ nextExprNumber = 0;
}
/// Remove a value from the value numbering.
void GVN::ValueTable::erase(Value *V) {
+ uint32_t Num = valueNumbering.lookup(V);
valueNumbering.erase(V);
+  // If V is a PHINode, V <--> value number is a one-to-one mapping.
+ if (isa<PHINode>(V))
+ NumberingPhi.erase(Num);
}
/// verifyRemoved - Verify that the value is removed from all internal data
@@ -693,9 +738,6 @@ SpeculationFailure:
return false;
}
-
-
-
/// Given a set of loads specified by ValuesPerBlock,
/// construct SSA form, allowing us to eliminate LI. This returns the value
/// that should be used at LI's definition site.
@@ -789,6 +831,7 @@ static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo,
DominatorTree *DT,
OptimizationRemarkEmitter *ORE) {
using namespace ore;
+
User *OtherAccess = nullptr;
OptimizationRemarkMissed R(DEBUG_TYPE, "LoadClobbered", LI);
@@ -817,7 +860,6 @@ static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo,
bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
Value *Address, AvailableValue &Res) {
-
assert((DepInfo.isDef() || DepInfo.isClobber()) &&
"expected a local dependence");
assert(LI->isUnordered() && "rules below are incorrect for ordered access");
@@ -879,8 +921,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
Instruction *I = DepInfo.getInst();
dbgs() << " is clobbered by " << *I << '\n';
);
-
- if (ORE->allowExtraAnalysis())
+ if (ORE->allowExtraAnalysis(DEBUG_TYPE))
reportMayClobberedLoad(LI, DepInfo, DT, ORE);
return false;
@@ -949,7 +990,6 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
AvailValInBlkVect &ValuesPerBlock,
UnavailBlkVect &UnavailableBlocks) {
-
// Filter out useless results (non-locals, etc). Keep track of the blocks
// where we have a value available in repl, also keep track of whether we see
// dependencies that produce an unknown value for the load (such as a call
@@ -1009,7 +1049,32 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// backwards through predecessors if needed.
BasicBlock *LoadBB = LI->getParent();
BasicBlock *TmpBB = LoadBB;
+ bool IsSafeToSpeculativelyExecute = isSafeToSpeculativelyExecute(LI);
+ // Check that there is no implicit control flow instructions above our load in
+ // its block. If there is an instruction that doesn't always pass the
+ // execution to the following instruction, then moving through it may become
+ // invalid. For example:
+ //
+ // int arr[LEN];
+ // int index = ???;
+ // ...
+ // guard(0 <= index && index < LEN);
+ // use(arr[index]);
+ //
+ // It is illegal to move the array access to any point above the guard,
+ // because if the index is out of bounds we should deoptimize rather than
+ // access the array.
+  // Check that there is no guard in this block above our instruction.
+ if (!IsSafeToSpeculativelyExecute) {
+ auto It = FirstImplicitControlFlowInsts.find(TmpBB);
+ if (It != FirstImplicitControlFlowInsts.end()) {
+ assert(It->second->getParent() == TmpBB &&
+ "Implicit control flow map broken?");
+ if (OI->dominates(It->second, LI))
+ return false;
+ }
+ }
while (TmpBB->getSinglePredecessor()) {
TmpBB = TmpBB->getSinglePredecessor();
if (TmpBB == LoadBB) // Infinite (unreachable) loop.
@@ -1024,6 +1089,11 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// which it was not previously executed.
if (TmpBB->getTerminator()->getNumSuccessors() != 1)
return false;
+
+ // Check that there is no implicit control flow in a block above.
+ if (!IsSafeToSpeculativelyExecute &&
+ FirstImplicitControlFlowInsts.count(TmpBB))
+ return false;
}
assert(TmpBB);
@@ -1128,8 +1198,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
if (!CanDoPRE) {
while (!NewInsts.empty()) {
Instruction *I = NewInsts.pop_back_val();
- if (MD) MD->removeInstruction(I);
- I->eraseFromParent();
+ markInstructionForDeletion(I);
}
// HINT: Don't revert the edge-splitting as following transformation may
// also need to split these critical edges.
@@ -1206,8 +1275,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
if (V->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(V);
markInstructionForDeletion(LI);
- ORE->emit(OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI)
- << "load eliminated by PRE");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI)
+ << "load eliminated by PRE";
+ });
++NumPRELoad;
return true;
}
@@ -1215,17 +1286,23 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
static void reportLoadElim(LoadInst *LI, Value *AvailableValue,
OptimizationRemarkEmitter *ORE) {
using namespace ore;
- ORE->emit(OptimizationRemark(DEBUG_TYPE, "LoadElim", LI)
- << "load of type " << NV("Type", LI->getType()) << " eliminated"
- << setExtraArgs() << " in favor of "
- << NV("InfavorOfValue", AvailableValue));
+
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "LoadElim", LI)
+ << "load of type " << NV("Type", LI->getType()) << " eliminated"
+ << setExtraArgs() << " in favor of "
+ << NV("InfavorOfValue", AvailableValue);
+ });
}
/// Attempt to eliminate a load whose dependencies are
/// non-local by performing PHI construction.
bool GVN::processNonLocalLoad(LoadInst *LI) {
// non-local speculations are not allowed under asan.
- if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeAddress))
+ if (LI->getParent()->getParent()->hasFnAttribute(
+ Attribute::SanitizeAddress) ||
+ LI->getParent()->getParent()->hasFnAttribute(
+ Attribute::SanitizeHWAddress))
return false;
// Step 1: Find the non-local dependencies of the load.
@@ -1322,6 +1399,11 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
}
markInstructionForDeletion(IntrinsicI);
return false;
+ } else if (isa<Constant>(V)) {
+    // If it's constant and not false, it must evaluate to true. This means
+    // our assume is assume(true) and thus pointless, so we don't want to do
+    // anything more here.
+ return false;
}
Constant *True = ConstantInt::getTrue(V->getContext());
@@ -1452,6 +1534,106 @@ bool GVN::processLoad(LoadInst *L) {
return false;
}
+/// Return a pair: the first field is the value number of \p Exp, and the
+/// second field indicates whether that value number was newly created.
+std::pair<uint32_t, bool>
+GVN::ValueTable::assignExpNewValueNum(Expression &Exp) {
+ uint32_t &e = expressionNumbering[Exp];
+ bool CreateNewValNum = !e;
+ if (CreateNewValNum) {
+ Expressions.push_back(Exp);
+ if (ExprIdx.size() < nextValueNumber + 1)
+ ExprIdx.resize(nextValueNumber * 2);
+ e = nextValueNumber;
+ ExprIdx[nextValueNumber++] = nextExprNumber++;
+ }
+ return {e, CreateNewValNum};
+}
+
+/// Return whether all the values related to the same \p Num are
+/// defined in \p BB.
+bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB,
+ GVN &Gvn) {
+ LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
+ while (Vals && Vals->BB == BB)
+ Vals = Vals->Next;
+ return !Vals;
+}
+
+/// Wrap phiTranslateImpl to provide caching functionality.
+uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred,
+ const BasicBlock *PhiBlock, uint32_t Num,
+ GVN &Gvn) {
+ auto FindRes = PhiTranslateTable.find({Num, Pred});
+ if (FindRes != PhiTranslateTable.end())
+ return FindRes->second;
+ uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn);
+ PhiTranslateTable.insert({{Num, Pred}, NewNum});
+ return NewNum;
+}
+
+/// Translate value number \p Num through the phis in \p PhiBlock, so that it
+/// refers to the corresponding incoming values along the edge from \p Pred.
+uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
+ const BasicBlock *PhiBlock,
+ uint32_t Num, GVN &Gvn) {
+ if (PHINode *PN = NumberingPhi[Num]) {
+ for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+ if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred)
+ if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false))
+ return TransVal;
+ }
+ return Num;
+ }
+
+  // If any value related to Num is defined in a BB other than PhiBlock, it
+  // cannot depend on a phi in PhiBlock without going through a backedge. We
+  // can exit early in that case to save compile time.
+ if (!areAllValsInBB(Num, PhiBlock, Gvn))
+ return Num;
+
+ if (Num >= ExprIdx.size() || ExprIdx[Num] == 0)
+ return Num;
+ Expression Exp = Expressions[ExprIdx[Num]];
+
+ for (unsigned i = 0; i < Exp.varargs.size(); i++) {
+ // For InsertValue and ExtractValue, some varargs are index numbers
+ // instead of value numbers. Those index numbers should not be
+ // translated.
+ if ((i > 1 && Exp.opcode == Instruction::InsertValue) ||
+ (i > 0 && Exp.opcode == Instruction::ExtractValue))
+ continue;
+ Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn);
+ }
+
+ if (Exp.commutative) {
+ assert(Exp.varargs.size() == 2 && "Unsupported commutative expression!");
+ if (Exp.varargs[0] > Exp.varargs[1]) {
+ std::swap(Exp.varargs[0], Exp.varargs[1]);
+ uint32_t Opcode = Exp.opcode >> 8;
+ if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)
+ Exp.opcode = (Opcode << 8) |
+ CmpInst::getSwappedPredicate(
+ static_cast<CmpInst::Predicate>(Exp.opcode & 255));
+ }
+ }
+
+ if (uint32_t NewNum = expressionNumbering[Exp])
+ return NewNum;
+ return Num;
+}
+
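// A toy sketch of the translation above, with plain maps standing in for the
// ValueTable machinery (all names here are illustrative): an expression's
// value number in a predecessor is found by substituting each phi operand
// with its incoming value number for that edge, then re-doing the lookup.
#include <cassert>
#include <map>
#include <tuple>

using VN = unsigned;
using Expr = std::tuple<unsigned /*opcode*/, VN, VN>;

int main() {
  std::map<Expr, VN> ExprNum;        // expression -> value number
  std::map<VN, VN> IncomingFromPred; // phi's VN -> incoming VN along Pred

  VN PhiX = 1, XIn = 2, Y = 3;       // %x = phi [%x.in, %pred], ...
  IncomingFromPred[PhiX] = XIn;
  ExprNum[std::make_tuple(13u, PhiX, Y)] = 4; // %a     = add %x,    %y -> VN 4
  ExprNum[std::make_tuple(13u, XIn, Y)] = 5;  // %a.pre = add %x.in, %y -> VN 5

  // Translate VN 4 along the edge %pred -> phi block.
  Expr E = std::make_tuple(13u, PhiX, Y);
  auto It = IncomingFromPred.find(std::get<1>(E));
  if (It != IncomingFromPred.end())
    std::get<1>(E) = It->second;
  assert(ExprNum.at(E) == 5); // the leader to look for in %pred has VN 5
}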
+/// Erase stale entry from phiTranslate cache so phiTranslate can be computed
+/// again.
+void GVN::ValueTable::eraseTranslateCacheEntry(uint32_t Num,
+ const BasicBlock &CurrBlock) {
+ for (const BasicBlock *Pred : predecessors(&CurrBlock)) {
+ auto FindRes = PhiTranslateTable.find({Num, Pred});
+ if (FindRes != PhiTranslateTable.end())
+ PhiTranslateTable.erase(FindRes);
+ }
+}
+
// In order to find a leader for a given value number at a
// specific basic block, we first obtain the list of all Values for that number,
// and then scan the list to find one whose block dominates the block in
@@ -1496,6 +1678,13 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
return Pred != nullptr;
}
+void GVN::assignBlockRPONumber(Function &F) {
+ uint32_t NextBlockNumber = 1;
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ for (BasicBlock *BB : RPOT)
+ BlockRPONumber[BB] = NextBlockNumber++;
+}
+
// Tries to replace instruction with const, using information from
// ReplaceWithConstMap.
bool GVN::replaceOperandsWithConsts(Instruction *Instr) const {
@@ -1827,6 +2016,8 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
TLI = &RunTLI;
VN.setAliasAnalysis(&RunAA);
MD = RunMD;
+ OrderedInstructions OrderedInstrs(DT);
+ OI = &OrderedInstrs;
VN.setMemDep(MD);
ORE = RunORE;
@@ -1857,6 +2048,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
// Fabricate val-num for dead-code in order to suppress assertion in
// performPRE().
assignValNumForDeadCode();
+ assignBlockRPONumber(F);
bool PREChanged = true;
while (PREChanged) {
PREChanged = performPRE(F);
@@ -1908,14 +2100,26 @@ bool GVN::processBlock(BasicBlock *BB) {
if (!AtStart)
--BI;
- for (SmallVectorImpl<Instruction *>::iterator I = InstrsToErase.begin(),
- E = InstrsToErase.end(); I != E; ++I) {
- DEBUG(dbgs() << "GVN removed: " << **I << '\n');
- if (MD) MD->removeInstruction(*I);
- DEBUG(verifyRemoved(*I));
- (*I)->eraseFromParent();
+ bool InvalidateImplicitCF = false;
+ const Instruction *MaybeFirstICF = FirstImplicitControlFlowInsts.lookup(BB);
+ for (auto *I : InstrsToErase) {
+ assert(I->getParent() == BB && "Removing instruction from wrong block?");
+ DEBUG(dbgs() << "GVN removed: " << *I << '\n');
+ if (MD) MD->removeInstruction(I);
+ DEBUG(verifyRemoved(I));
+ if (MaybeFirstICF == I) {
+        // We have erased the first ICF instruction in the block; the map
+        // needs to be updated.
+        InvalidateImplicitCF = true;
+        // Do not keep a dangling pointer to the erased instruction.
+ MaybeFirstICF = nullptr;
+ }
+ I->eraseFromParent();
}
+
+ OI->invalidateBlock(BB);
InstrsToErase.clear();
+ if (InvalidateImplicitCF)
+ fillImplicitControlFlowInfo(BB);
if (AtStart)
BI = BB->begin();
@@ -1928,7 +2132,7 @@ bool GVN::processBlock(BasicBlock *BB) {
// Instantiate an expression in a predecessor that lacked it.
bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
- unsigned int ValNo) {
+ BasicBlock *Curr, unsigned int ValNo) {
// Because we are going top-down through the block, all value numbers
// will be available in the predecessor by the time we need them. Any
// that weren't originally present will have been instantiated earlier
@@ -1946,7 +2150,9 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
success = false;
break;
}
- if (Value *V = findLeader(Pred, VN.lookup(Op))) {
+ uint32_t TValNo =
+ VN.phiTranslate(Pred, Curr, VN.lookup(Op), *this);
+ if (Value *V = findLeader(Pred, TValNo)) {
Instr->setOperand(i, V);
} else {
success = false;
@@ -1963,10 +2169,12 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
Instr->insertBefore(Pred->getTerminator());
Instr->setName(Instr->getName() + ".pre");
Instr->setDebugLoc(Instr->getDebugLoc());
- VN.add(Instr, ValNo);
+
+ unsigned Num = VN.lookupOrAdd(Instr);
+ VN.add(Instr, Num);
// Update the availability map to include the new instruction.
- addToLeaderTable(ValNo, Instr, Pred);
+ addToLeaderTable(Num, Instr, Pred);
return true;
}
@@ -2004,18 +2212,27 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
SmallVector<std::pair<Value *, BasicBlock *>, 8> predMap;
for (BasicBlock *P : predecessors(CurrentBlock)) {
- // We're not interested in PRE where the block is its
- // own predecessor, or in blocks with predecessors
- // that are not reachable.
- if (P == CurrentBlock) {
+    // We're not interested in PRE where the predecessor block is not
+    // reachable.
+ if (!DT->isReachableFromEntry(P)) {
NumWithout = 2;
break;
- } else if (!DT->isReachableFromEntry(P)) {
+ }
+    // It is not safe to do PRE when P->CurrentBlock is a loop backedge and
+    // CurInst has an operand defined in CurrentBlock (so it may be defined
+    // by a phi in the loop header).
+ if (BlockRPONumber[P] >= BlockRPONumber[CurrentBlock] &&
+ llvm::any_of(CurInst->operands(), [&](const Use &U) {
+ if (auto *Inst = dyn_cast<Instruction>(U.get()))
+ return Inst->getParent() == CurrentBlock;
+ return false;
+ })) {
NumWithout = 2;
break;
}
- Value *predV = findLeader(P, ValNo);
+ uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this);
+ Value *predV = findLeader(P, TValNo);
if (!predV) {
predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
PREPred = P;
@@ -2041,6 +2258,20 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
Instruction *PREInstr = nullptr;
if (NumWithout != 0) {
+ if (!isSafeToSpeculativelyExecute(CurInst)) {
+ // It is only valid to insert a new instruction if the current instruction
+ // is always executed. An instruction with implicit control flow could
+ // prevent us from doing it. If we cannot speculate the execution, then
+ // PRE should be prohibited.
+ auto It = FirstImplicitControlFlowInsts.find(CurrentBlock);
+ if (It != FirstImplicitControlFlowInsts.end()) {
+ assert(It->second->getParent() == CurrentBlock &&
+ "Implicit control flow map broken?");
+ if (OI->dominates(It->second, CurInst))
+ return false;
+ }
+ }
+
// Don't do PRE across indirect branch.
if (isa<IndirectBrInst>(PREPred->getTerminator()))
return false;
@@ -2055,7 +2286,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
}
// We need to insert somewhere, so let's give it a shot
PREInstr = CurInst->clone();
- if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) {
+ if (!performScalarPREInsertion(PREInstr, PREPred, CurrentBlock, ValNo)) {
// If we failed insertion, make sure we remove the instruction.
DEBUG(verifyRemoved(PREInstr));
PREInstr->deleteValue();
@@ -2065,7 +2296,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
// Either we should have filled in the PRE instruction, or we should
// not have needed insertions.
- assert (PREInstr != nullptr || NumWithout == 0);
+ assert(PREInstr != nullptr || NumWithout == 0);
++NumGVNPRE;
@@ -2074,13 +2305,19 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
PHINode::Create(CurInst->getType(), predMap.size(),
CurInst->getName() + ".pre-phi", &CurrentBlock->front());
for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
- if (Value *V = predMap[i].first)
+ if (Value *V = predMap[i].first) {
+ // If we use an existing value in this phi, we have to patch the original
+ // value because the phi will be used to replace a later value.
+ patchReplacementInstruction(CurInst, V);
Phi->addIncoming(V, predMap[i].second);
- else
+ } else
Phi->addIncoming(PREInstr, PREPred);
}
VN.add(Phi, ValNo);
+  // After creating a new PHI for ValNo, the phi translate result for ValNo
+  // will change, so erase the related stale entries from the phi translate
+  // cache.
+ VN.eraseTranslateCacheEntry(ValNo, *CurrentBlock);
addToLeaderTable(ValNo, Phi, CurrentBlock);
Phi->setDebugLoc(CurInst->getDebugLoc());
CurInst->replaceAllUsesWith(Phi);
@@ -2093,7 +2330,14 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
if (MD)
MD->removeInstruction(CurInst);
DEBUG(verifyRemoved(CurInst));
+ bool InvalidateImplicitCF =
+ FirstImplicitControlFlowInsts.lookup(CurInst->getParent()) == CurInst;
+ // FIXME: Intended to be markInstructionForDeletion(CurInst), but it causes
+ // some assertion failures.
+ OI->invalidateBlock(CurrentBlock);
CurInst->eraseFromParent();
+ if (InvalidateImplicitCF)
+ fillImplicitControlFlowInfo(CurrentBlock);
++NumGVNInstr;
return true;
@@ -2160,6 +2404,9 @@ bool GVN::iterateOnFunction(Function &F) {
// RPOT walks the graph in its constructor and will not be invalidated during
// processBlock.
ReversePostOrderTraversal<Function *> RPOT(&F);
+
+ for (BasicBlock *BB : RPOT)
+ fillImplicitControlFlowInfo(BB);
for (BasicBlock *BB : RPOT)
Changed |= processBlock(BB);
@@ -2169,7 +2416,50 @@ bool GVN::iterateOnFunction(Function &F) {
void GVN::cleanupGlobalSets() {
VN.clear();
LeaderTable.clear();
+ BlockRPONumber.clear();
TableAllocator.Reset();
+ FirstImplicitControlFlowInsts.clear();
+}
+
+void GVN::fillImplicitControlFlowInfo(BasicBlock *BB) {
+ // Make sure that all marked instructions are actually deleted by this point,
+ // so that we don't need to care about omitting them.
+ assert(InstrsToErase.empty() && "Filling before removed all marked insns?");
+ auto MayNotTransferExecutionToSuccessor = [&](const Instruction *I) {
+    // If an instruction in a block doesn't always pass control to its
+    // successor instruction, mark the block as having implicit control flow.
+    // We use this to avoid wrong assumptions of the sort "if A is executed and
+    // B post-dominates A, then B is also executed". This is not true if there
+    // is an implicit control flow instruction (e.g. a guard) between them.
+ //
+ // TODO: Currently, isGuaranteedToTransferExecutionToSuccessor returns false
+ // for volatile stores and loads because they can trap. The discussion on
+ // whether or not it is correct is still ongoing. We might want to get rid
+    // of this logic in the future. Anyway, trapping instructions shouldn't
+ // introduce implicit control flow, so we explicitly allow them here. This
+ // must be removed once isGuaranteedToTransferExecutionToSuccessor is fixed.
+ if (isGuaranteedToTransferExecutionToSuccessor(I))
+ return false;
+ if (isa<LoadInst>(I)) {
+ assert(cast<LoadInst>(I)->isVolatile() &&
+ "Non-volatile load should transfer execution to successor!");
+ return false;
+ }
+ if (isa<StoreInst>(I)) {
+ assert(cast<StoreInst>(I)->isVolatile() &&
+ "Non-volatile store should transfer execution to successor!");
+ return false;
+ }
+ return true;
+ };
+ FirstImplicitControlFlowInsts.erase(BB);
+
+ for (auto &I : *BB)
+ if (MayNotTransferExecutionToSuccessor(&I)) {
+ FirstImplicitControlFlowInsts[BB] = &I;
+ break;
+ }
}
/// Verify that the specified instruction does not occur in our
@@ -2317,6 +2607,7 @@ void GVN::assignValNumForDeadCode() {
class llvm::gvn::GVNLegacyPass : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid
+
explicit GVNLegacyPass(bool NoLoads = false)
: FunctionPass(ID), NoLoads(NoLoads) {
initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry());
@@ -2360,11 +2651,6 @@ private:
char GVNLegacyPass::ID = 0;
-// The public interface to this file...
-FunctionPass *llvm::createGVNPass(bool NoLoads) {
- return new GVNLegacyPass(NoLoads);
-}
-
INITIALIZE_PASS_BEGIN(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
@@ -2374,3 +2660,8 @@ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
+
+// The public interface to this file...
+FunctionPass *llvm::createGVNPass(bool NoLoads) {
+ return new GVNLegacyPass(NoLoads);
+}
diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp
index 29de792bd248..c0cd1ea74a74 100644
--- a/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/lib/Transforms/Scalar/GVNHoist.cpp
@@ -13,44 +13,72 @@
// 1. To reduce the code size.
// 2. In some cases reduce critical path (by exposing more ILP).
//
+// The algorithm factors out the reachability of values so that repeated
+// reachability queries are fast. This is based on finding the ANTIC points in
+// the CFG, which do not change during hoisting. The ANTIC points are basically
+// the dominance frontiers in the inverse graph. So we introduce a data
+// structure (CHI nodes) to keep track of values flowing out of a basic block.
+// We only do this for values with multiple occurrences in the function, as
+// they are the potential hoistable candidates. This approach allows us to
+// hoist instructions to a basic block with more than two successors, as well
+// as to deal with infinite loops in a trivial way.
+//
+// Limitations: This pass does not hoist fully redundant expressions because
+// they are already handled by GVN-PRE. It is advisable to run gvn-hoist before
+// and after gvn-pre because gvn-pre creates opportunities for more instructions
+// to be hoisted.
+//
// Hoisting may affect the performance in some cases. To mitigate that, hoisting
// is disabled in the following cases.
// 1. Scalars across calls.
// 2. geps when corresponding load/store cannot be hoisted.
-//
-// TODO: Hoist from >2 successors. Currently GVNHoist will not hoist stores
-// in this case because it works on two instructions at a time.
-// entry:
-// switch i32 %c1, label %exit1 [
-// i32 0, label %sw0
-// i32 1, label %sw1
-// ]
-//
-// sw0:
-// store i32 1, i32* @G
-// br label %exit
-//
-// sw1:
-// store i32 1, i32* @G
-// br label %exit
-//
-// exit1:
-// store i32 1, i32* @G
-// ret void
-// exit:
-// ret void
//===----------------------------------------------------------------------===//
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -69,6 +97,7 @@ static cl::opt<int>
MaxHoistedThreshold("gvn-max-hoisted", cl::Hidden, cl::init(-1),
cl::desc("Max number of instructions to hoist "
"(default unlimited = -1)"));
+
static cl::opt<int> MaxNumberOfBBSInPath(
"gvn-hoist-max-bbs", cl::Hidden, cl::init(4),
cl::desc("Max number of basic blocks on the path between "
@@ -86,34 +115,50 @@ static cl::opt<int>
namespace llvm {
-// Provides a sorting function based on the execution order of two instructions.
-struct SortByDFSIn {
-private:
- DenseMap<const Value *, unsigned> &DFSNumber;
+using BBSideEffectsSet = DenseMap<const BasicBlock *, bool>;
+using SmallVecInsn = SmallVector<Instruction *, 4>;
+using SmallVecImplInsn = SmallVectorImpl<Instruction *>;
-public:
- SortByDFSIn(DenseMap<const Value *, unsigned> &D) : DFSNumber(D) {}
+// Each element of a hoisting list contains the basic block to hoist into and
+// a list of instructions to be hoisted.
+using HoistingPointInfo = std::pair<BasicBlock *, SmallVecInsn>;
- // Returns true when A executes before B.
- bool operator()(const Instruction *A, const Instruction *B) const {
- const BasicBlock *BA = A->getParent();
- const BasicBlock *BB = B->getParent();
- unsigned ADFS, BDFS;
- if (BA == BB) {
- ADFS = DFSNumber.lookup(A);
- BDFS = DFSNumber.lookup(B);
- } else {
- ADFS = DFSNumber.lookup(BA);
- BDFS = DFSNumber.lookup(BB);
- }
- assert(ADFS && BDFS);
- return ADFS < BDFS;
- }
-};
+using HoistingPointList = SmallVector<HoistingPointInfo, 4>;
// A map from a pair of VNs to all the instructions with those VNs.
-typedef DenseMap<std::pair<unsigned, unsigned>, SmallVector<Instruction *, 4>>
- VNtoInsns;
+using VNType = std::pair<unsigned, unsigned>;
+
+using VNtoInsns = DenseMap<VNType, SmallVector<Instruction *, 4>>;
+
+// CHI keeps information about values flowing out of a basic block. It is
+// similar to PHI but in the inverse graph, and used for outgoing values on each
+// edge. For conciseness, it is computed only for instructions with multiple
+// occurrences in the CFG because they are the only hoistable candidates.
+//        A (CHI[{V, B, I1}, {V, C, I2}])
+//       /   \
+//      /     \
+//  B(I1)     C (I2)
+// The value number for both I1 and I2 is V; the CHI node saves the
+// instruction as well as the edge along which the value flows.
+struct CHIArg {
+ VNType VN;
+
+ // Edge destination (shows the direction of flow), may not be where the I is.
+ BasicBlock *Dest;
+
+ // The instruction (VN) which uses the values flowing out of CHI.
+ Instruction *I;
+
+ bool operator==(const CHIArg &A) { return VN == A.VN; }
+ bool operator!=(const CHIArg &A) { return !(*this == A); }
+};
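
// Editor's sketch (not part of the commit): a minimal standalone model of a
// CHI node, using plain STL types in place of the LLVM ones above. Block
// names and value numbers are illustrative. Block A branches to B and C,
// each holding an occurrence of value number V, so A's CHI list gets one
// argument per outgoing edge.
#include <cassert>
#include <string>
#include <utility>
#include <vector>

using VN = std::pair<unsigned, unsigned>;

struct Chi {
  VN Num;            // value number shared by all arguments
  std::string Dest;  // edge destination the value flows toward
  std::string Inst;  // the occurrence flowing along this edge
};

int main() {
  VN V{42, 0};
  // One CHI argument per edge out of A: {V, B, I1} and {V, C, I2}.
  std::vector<Chi> AtA = {{V, "B", "I1"}, {V, "C", "I2"}};
  // V is fully anticipable at A: every successor of A appears as a Dest,
  // so I1 and I2 may be merged and hoisted into A.
  assert(AtA.size() == 2 && AtA[0].Dest == "B" && AtA[1].Dest == "C");
  return 0;
}
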
+
+using CHIIt = SmallVectorImpl<CHIArg>::iterator;
+using CHIArgs = iterator_range<CHIIt>;
+using OutValuesType = DenseMap<BasicBlock *, SmallVector<CHIArg, 2>>;
+using InValuesType =
+ DenseMap<BasicBlock *, SmallVector<std::pair<VNType, Instruction *>, 2>>;
+
// An invalid value number, used when inserting a single value number into
// VNtoInsns.
enum : unsigned { InvalidVN = ~2U };
@@ -192,16 +237,10 @@ public:
}
const VNtoInsns &getScalarVNTable() const { return VNtoCallsScalars; }
-
const VNtoInsns &getLoadVNTable() const { return VNtoCallsLoads; }
-
const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; }
};
-typedef DenseMap<const BasicBlock *, bool> BBSideEffectsSet;
-typedef SmallVector<Instruction *, 4> SmallVecInsn;
-typedef SmallVectorImpl<Instruction *> SmallVecImplInsn;
-
static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) {
static const unsigned KnownIDs[] = {
LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
@@ -216,15 +255,13 @@ static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) {
// cases reduce critical path (by exposing more ILP).
class GVNHoist {
public:
- GVNHoist(DominatorTree *DT, AliasAnalysis *AA, MemoryDependenceResults *MD,
- MemorySSA *MSSA)
- : DT(DT), AA(AA), MD(MD), MSSA(MSSA),
- MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)),
- HoistingGeps(false),
- HoistedCtr(0)
- { }
+ GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, MemorySSA *MSSA)
+ : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA),
+ MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {}
bool run(Function &F) {
+ NumFuncArgs = F.arg_size();
VN.setDomTree(DT);
VN.setAliasAnalysis(AA);
VN.setMemDep(MD);
@@ -241,7 +278,7 @@ public:
int ChainLength = 0;
// FIXME: use lazy evaluation of VN to avoid the fix-point computation.
- while (1) {
+ while (true) {
if (MaxChainLength != -1 && ++ChainLength >= MaxChainLength)
return Res;
@@ -261,18 +298,48 @@ public:
return Res;
}
+ // Copied from NewGVN.cpp
+ // This function provides global ranking of operations so that we can place
+ // them in a canonical order. Note that rank alone is not necessarily enough
+ // for a complete ordering, as constants all have the same rank. However,
+ // generally, we will simplify an operation with all constants so that it
+ // doesn't matter what order they appear in.
+ unsigned int rank(const Value *V) const {
+ // Prefer constants to undef to anything else
+ // Undef is a constant, have to check it first.
+ // Prefer smaller constants to constantexprs
+ if (isa<ConstantExpr>(V))
+ return 2;
+ if (isa<UndefValue>(V))
+ return 1;
+ if (isa<Constant>(V))
+ return 0;
+ else if (auto *A = dyn_cast<Argument>(V))
+ return 3 + A->getArgNo();
+
+ // Need to shift the instruction DFS by number of arguments + 3 to account
+ // for the constant and argument ranking above.
+ auto Result = DFSNumber.lookup(V);
+ if (Result > 0)
+ return 4 + NumFuncArgs + Result;
+ // Unreachable or something else, just return a really large number.
+ return ~0;
+ }
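
// Editor's sketch (not part of the commit): a standalone model of the rank()
// scheme above, with a plain enum in place of LLVM Value kinds. All names are
// illustrative. The resulting order is: plain constants < undef < constant
// expressions < arguments (by index) < instructions (by shifted DFS number).
#include <cassert>

enum class Kind { Constant, Undef, ConstantExpr, Argument, Instruction };

unsigned rank(Kind K, unsigned NumFuncArgs, unsigned ArgNoOrDFS = 0) {
  switch (K) {
  case Kind::Constant:     return 0;
  case Kind::Undef:        return 1;
  case Kind::ConstantExpr: return 2;
  case Kind::Argument:     return 3 + ArgNoOrDFS;               // 3 + arg index
  case Kind::Instruction:  return 4 + NumFuncArgs + ArgNoOrDFS; // shifted DFS
  }
  return ~0u; // unreachable values rank last
}

int main() {
  const unsigned NumFuncArgs = 2;
  assert(rank(Kind::Constant, NumFuncArgs) < rank(Kind::Undef, NumFuncArgs));
  assert(rank(Kind::Undef, NumFuncArgs) <
         rank(Kind::ConstantExpr, NumFuncArgs));
  assert(rank(Kind::ConstantExpr, NumFuncArgs) <
         rank(Kind::Argument, NumFuncArgs, /*ArgNo=*/0));
  // Any argument ranks below any instruction, whatever its DFS number.
  assert(rank(Kind::Argument, NumFuncArgs, NumFuncArgs - 1) <
         rank(Kind::Instruction, NumFuncArgs, /*DFS=*/1));
  return 0;
}
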
+
private:
GVN::ValueTable VN;
DominatorTree *DT;
+ PostDominatorTree *PDT;
AliasAnalysis *AA;
MemoryDependenceResults *MD;
MemorySSA *MSSA;
std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
- const bool HoistingGeps;
DenseMap<const Value *, unsigned> DFSNumber;
BBSideEffectsSet BBSideEffects;
- DenseSet<const BasicBlock*> HoistBarrier;
- int HoistedCtr;
+ DenseSet<const BasicBlock *> HoistBarrier;
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ unsigned NumFuncArgs;
+ const bool HoistingGeps = false;
enum InsKind { Unknown, Scalar, Load, Store };
@@ -305,45 +372,7 @@ private:
return false;
}
- // Return true when all paths from HoistBB to the end of the function pass
- // through one of the blocks in WL.
- bool hoistingFromAllPaths(const BasicBlock *HoistBB,
- SmallPtrSetImpl<const BasicBlock *> &WL) {
-
- // Copy WL as the loop will remove elements from it.
- SmallPtrSet<const BasicBlock *, 2> WorkList(WL.begin(), WL.end());
-
- for (auto It = df_begin(HoistBB), E = df_end(HoistBB); It != E;) {
- // There exists a path from HoistBB to the exit of the function if we are
- // still iterating in DF traversal and we removed all instructions from
- // the work list.
- if (WorkList.empty())
- return false;
-
- const BasicBlock *BB = *It;
- if (WorkList.erase(BB)) {
- // Stop DFS traversal when BB is in the work list.
- It.skipChildren();
- continue;
- }
-
- // We reached the leaf Basic Block => not all paths have this instruction.
- if (!BB->getTerminator()->getNumSuccessors())
- return false;
-
- // When reaching the back-edge of a loop, there may be a path through the
- // loop that does not pass through B or C before exiting the loop.
- if (successorDominate(BB, HoistBB))
- return false;
-
- // Increment DFS traversal when not skipping children.
- ++It;
- }
-
- return true;
- }
-
- /* Return true when I1 appears before I2 in the instructions of BB. */
+ // Return true when I1 appears before I2 in the instructions of BB.
bool firstInBB(const Instruction *I1, const Instruction *I2) {
assert(I1->getParent() == I2->getParent());
unsigned I1DFS = DFSNumber.lookup(I1);
@@ -387,6 +416,25 @@ private:
return false;
}
+ bool hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB,
+ int &NBBsOnAllPaths) {
+ // Stop walk once the limit is reached.
+ if (NBBsOnAllPaths == 0)
+ return true;
+
+ // Impossible to hoist with exceptions on the path.
+ if (hasEH(BB))
+ return true;
+
+ // Instructions selected for hoisting always precede any hoist barrier in
+ // their own block, so the source block is exempt; any other block on the
+ // path that contains a hoist barrier makes hoisting unsafe.
+ if ((BB != SrcBB) && HoistBarrier.count(BB))
+ return true;
+
+ return false;
+ }
+
// Return true when there are exception handling or loads of memory Def
// between Def and NewPt. This function is only called for stores: Def is
// the MemoryDef of the store to be hoisted.
@@ -414,18 +462,7 @@ private:
continue;
}
- // Stop walk once the limit is reached.
- if (NBBsOnAllPaths == 0)
- return true;
-
- // Impossible to hoist with exceptions on the path.
- if (hasEH(BB))
- return true;
-
- // No such instruction after HoistBarrier in a basic block was
- // selected for hoisting so instructions selected within basic block with
- // a hoist barrier can be hoisted.
- if ((BB != OldBB) && HoistBarrier.count(BB))
+ if (hasEHhelper(BB, OldBB, NBBsOnAllPaths))
return true;
// Check that we do not move a store past loads.
@@ -463,18 +500,7 @@ private:
continue;
}
- // Stop walk once the limit is reached.
- if (NBBsOnAllPaths == 0)
- return true;
-
- // Impossible to hoist with exceptions on the path.
- if (hasEH(BB))
- return true;
-
- // No such instruction after HoistBarrier in a basic block was
- // selected for hoisting so instructions selected within basic block with
- // a hoist barrier can be hoisted.
- if ((BB != SrcBB) && HoistBarrier.count(BB))
+ if (hasEHhelper(BB, SrcBB, NBBsOnAllPaths))
return true;
// -1 is unlimited number of blocks on all paths.
@@ -491,7 +517,6 @@ private:
// to NewPt.
bool safeToHoistLdSt(const Instruction *NewPt, const Instruction *OldPt,
MemoryUseOrDef *U, InsKind K, int &NBBsOnAllPaths) {
-
// In place hoisting is safe.
if (NewPt == OldPt)
return true;
@@ -533,141 +558,258 @@ private:
// Return true when it is safe to hoist scalar instructions from BB to
// HoistBB.
- bool safeToHoistScalar(const BasicBlock *HoistBB,
- SmallPtrSetImpl<const BasicBlock *> &WL,
+ bool safeToHoistScalar(const BasicBlock *HoistBB, const BasicBlock *BB,
int &NBBsOnAllPaths) {
- // Check that the hoisted expression is needed on all paths.
- if (!hoistingFromAllPaths(HoistBB, WL))
- return false;
+ return !hasEHOnPath(HoistBB, BB, NBBsOnAllPaths);
+ }
- for (const BasicBlock *BB : WL)
- if (hasEHOnPath(HoistBB, BB, NBBsOnAllPaths))
- return false;
+ // In the inverse CFG, the dominance frontier of basic block (BB) is the
+ // point where ANTIC needs to be computed for instructions which are going
+ // to be hoisted. Since this point does not change during gvn-hoist,
+ // we compute it only once (on demand).
+ // The idea is inspired by:
+ // "Partial Redundancy Elimination in SSA Form"
+ // ROBERT KENNEDY, SUN CHAN, SHIN-MING LIU, RAYMOND LO, PENG TU and FRED CHOW
+ // They use a similar idea in the forward graph to find fully redundant and
+ // partially redundant expressions; here it is used in the inverse graph to
+ // find fully anticipable instructions at merge points (post-dominators in
+ // the inverse CFG).
+ // Returns the edge via which an instruction in BB will get the values.
+
+ // Returns true when the values are flowing out to each edge.
+ bool valueAnticipable(CHIArgs C, TerminatorInst *TI) const {
+ if (TI->getNumSuccessors() > (unsigned)std::distance(C.begin(), C.end()))
+ return false; // Not enough args in this CHI.
+ for (auto CHI : C) {
+ BasicBlock *Dest = CHI.Dest;
+ // Find if all the edges have values flowing out of BB.
+ bool Found = llvm::any_of(TI->successors(), [Dest](const BasicBlock *BB) {
+ return BB == Dest; });
+ if (!Found)
+ return false;
+ }
return true;
}
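
// Editor's sketch (not part of the commit): valueAnticipable above reduces
// to "every successor of the terminator appears as the Dest of some CHI
// argument". A standalone model with strings for blocks; names illustrative.
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

bool valueAnticipable(const std::vector<std::string> &ChiDests,
                      const std::vector<std::string> &Succs) {
  if (Succs.size() > ChiDests.size())
    return false; // not enough CHI arguments to cover every edge
  return std::all_of(Succs.begin(), Succs.end(), [&](const std::string &S) {
    return std::find(ChiDests.begin(), ChiDests.end(), S) != ChiDests.end();
  });
}

int main() {
  // Both edges out of A carry the value: anticipable, safe to hoist into A.
  assert(valueAnticipable({"B", "C"}, {"B", "C"}));
  // Only the edge to B carries it: hoisting into A would speculate on A->C.
  assert(!valueAnticipable({"B"}, {"B", "C"}));
  return 0;
}
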
- // Each element of a hoisting list contains the basic block where to hoist and
- // a list of instructions to be hoisted.
- typedef std::pair<BasicBlock *, SmallVecInsn> HoistingPointInfo;
- typedef SmallVector<HoistingPointInfo, 4> HoistingPointList;
+ // Check if it is safe to hoist values tracked by CHI in the range
+ // [Begin, End) and accumulate them in Safe.
+ void checkSafety(CHIArgs C, BasicBlock *BB, InsKind K,
+ SmallVectorImpl<CHIArg> &Safe) {
+ int NumBBsOnAllPaths = MaxNumberOfBBSInPath;
+ for (auto CHI : C) {
+ Instruction *Insn = CHI.I;
+ if (!Insn) // No instruction was inserted in this CHI.
+ continue;
+ if (K == InsKind::Scalar) {
+ if (safeToHoistScalar(BB, Insn->getParent(), NumBBsOnAllPaths))
+ Safe.push_back(CHI);
+ } else {
+ MemoryUseOrDef *UD = MSSA->getMemoryAccess(Insn);
+ if (safeToHoistLdSt(BB->getTerminator(), Insn, UD, K, NumBBsOnAllPaths))
+ Safe.push_back(CHI);
+ }
+ }
+ }
+
+ using RenameStackType = DenseMap<VNType, SmallVector<Instruction *, 2>>;
- // Partition InstructionsToHoist into a set of candidates which can share a
- // common hoisting point. The partitions are collected in HPL. IsScalar is
- // true when the instructions in InstructionsToHoist are scalars. IsLoad is
- // true when the InstructionsToHoist are loads, false when they are stores.
- void partitionCandidates(SmallVecImplInsn &InstructionsToHoist,
- HoistingPointList &HPL, InsKind K) {
- // No need to sort for two instructions.
- if (InstructionsToHoist.size() > 2) {
- SortByDFSIn Pred(DFSNumber);
- std::sort(InstructionsToHoist.begin(), InstructionsToHoist.end(), Pred);
+ // Push all the VNs corresponding to BB into RenameStack.
+ void fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs,
+ RenameStackType &RenameStack) {
+ auto it1 = ValueBBs.find(BB);
+ if (it1 != ValueBBs.end()) {
+ // Iterate in reverse order to keep lower ranked values on the top.
+ for (std::pair<VNType, Instruction *> &VI : reverse(it1->second)) {
+ // Push the instruction onto the stack keyed by its value number.
+ DEBUG(dbgs() << "\nPushing on stack: " << *VI.second);
+ RenameStack[VI.first].push_back(VI.second);
+ }
}
+ }
- int NumBBsOnAllPaths = MaxNumberOfBBSInPath;
+ void fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs,
+ RenameStackType &RenameStack) {
+ // For each CFG *predecessor* of BB (we walk the post-dominator tree),
+ // check if it has a CHI.
+ for (auto Pred : predecessors(BB)) {
+ auto P = CHIBBs.find(Pred);
+ if (P == CHIBBs.end()) {
+ continue;
+ }
+ DEBUG(dbgs() << "\nLooking at CHIs in: " << Pred->getName());
+ // A CHI is found (Pred -> BB is an edge in the CFG).
+ // Pop the stack until Top(V) = Ve.
+ auto &VCHI = P->second;
+ for (auto It = VCHI.begin(), E = VCHI.end(); It != E;) {
+ CHIArg &C = *It;
+ if (!C.Dest) {
+ auto si = RenameStack.find(C.VN);
+ // The basic block containing the CHI must dominate the value we want
+ // to track in that CHI. In the PDom walk, there can be values on the
+ // stack which are not control dependent, e.g., in a nested loop.
+ if (si != RenameStack.end() && si->second.size() &&
+ DT->dominates(Pred, si->second.back()->getParent())) {
+ C.Dest = BB; // Assign the edge
+ C.I = si->second.pop_back_val(); // Assign the argument
+ DEBUG(dbgs() << "\nCHI Inserted in BB: " << C.Dest->getName()
+ << *C.I << ", VN: " << C.VN.first << ", "
+ << C.VN.second);
+ }
+ // Move to next CHI of a different value
+ It = std::find_if(It, VCHI.end(),
+ [It](CHIArg &A) { return A != *It; });
+ } else
+ ++It;
+ }
+ }
+ }
- SmallVecImplInsn::iterator II = InstructionsToHoist.begin();
- SmallVecImplInsn::iterator Start = II;
- Instruction *HoistPt = *II;
- BasicBlock *HoistBB = HoistPt->getParent();
- MemoryUseOrDef *UD;
- if (K != InsKind::Scalar)
- UD = MSSA->getMemoryAccess(HoistPt);
+ // Walk the post-dominator tree top-down and use a stack for each value to
+ // store the last value you see. When you hit a CHI from a given edge, the
+ // value to use as the argument is at the top of the stack; add the value
+ // to the CHI and pop.
+ void insertCHI(InValuesType &ValueBBs, OutValuesType &CHIBBs) {
+ auto Root = PDT->getNode(nullptr);
+ if (!Root)
+ return;
+ // Depth first walk on PDom tree to fill the CHIargs at each PDF.
+ RenameStackType RenameStack;
+ for (auto Node : depth_first(Root)) {
+ BasicBlock *BB = Node->getBlock();
+ if (!BB)
+ continue;
- for (++II; II != InstructionsToHoist.end(); ++II) {
- Instruction *Insn = *II;
- BasicBlock *BB = Insn->getParent();
- BasicBlock *NewHoistBB;
- Instruction *NewHoistPt;
+ // Collect all values in BB and push to stack.
+ fillRenameStack(BB, ValueBBs, RenameStack);
- if (BB == HoistBB) { // Both are in the same Basic Block.
- NewHoistBB = HoistBB;
- NewHoistPt = firstInBB(Insn, HoistPt) ? Insn : HoistPt;
- } else {
- // If the hoisting point contains one of the instructions,
- // then hoist there, otherwise hoist before the terminator.
- NewHoistBB = DT->findNearestCommonDominator(HoistBB, BB);
- if (NewHoistBB == BB)
- NewHoistPt = Insn;
- else if (NewHoistBB == HoistBB)
- NewHoistPt = HoistPt;
- else
- NewHoistPt = NewHoistBB->getTerminator();
- }
+ // Fill outgoing values in each CHI corresponding to BB.
+ fillChiArgs(BB, CHIBBs, RenameStack);
+ }
+ }
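
// Editor's sketch (not part of the commit): the rename-stack idea above,
// modeled with a hard-coded walk order instead of a real PDom tree. Each
// block first pushes its own occurrences of a value number; a CHI slot at
// one of its CFG predecessors then takes the value on top of that VN's
// stack. All names are illustrative.
#include <cassert>
#include <map>
#include <string>
#include <vector>

int main() {
  using VN = unsigned;
  std::map<VN, std::vector<std::string>> RenameStack;

  auto Push = [&](VN N, const std::string &Occurrence) {
    RenameStack[N].push_back(Occurrence);
  };
  auto FillChi = [&](VN N) -> std::string {
    auto &Stack = RenameStack[N];
    assert(!Stack.empty() && "CHI slot with no reaching occurrence");
    std::string Top = Stack.back();
    Stack.pop_back(); // consumed by this CHI argument
    return Top;
  };

  // Walk: visit B, push its occurrence, then fill the CHI slot at B's
  // CFG predecessor A; likewise for C. Each slot takes the stack top.
  Push(7, "I1 in B");
  assert(FillChi(7) == "I1 in B"); // CHI argument for edge A->B
  Push(7, "I2 in C");
  assert(FillChi(7) == "I2 in C"); // CHI argument for edge A->C
  return 0;
}
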
- SmallPtrSet<const BasicBlock *, 2> WL;
- WL.insert(HoistBB);
- WL.insert(BB);
+ // Walk all the CHI-nodes to find ones which have an empty entry and remove
+ // them. Then collect all the instructions which are safe to hoist and see if
+ // they form a list of anticipable values. OutValues contains CHIs
+ // corresponding to each basic block.
+ void findHoistableCandidates(OutValuesType &CHIBBs, InsKind K,
+ HoistingPointList &HPL) {
+ auto cmpVN = [](const CHIArg &A, const CHIArg &B) { return A.VN < B.VN; };
- if (K == InsKind::Scalar) {
- if (safeToHoistScalar(NewHoistBB, WL, NumBBsOnAllPaths)) {
- // Extend HoistPt to NewHoistPt.
- HoistPt = NewHoistPt;
- HoistBB = NewHoistBB;
- continue;
- }
- } else {
- // When NewBB already contains an instruction to be hoisted, the
- // expression is needed on all paths.
- // Check that the hoisted expression is needed on all paths: it is
- // unsafe to hoist loads to a place where there may be a path not
- // loading from the same address: for instance there may be a branch on
- // which the address of the load may not be initialized.
- if ((HoistBB == NewHoistBB || BB == NewHoistBB ||
- hoistingFromAllPaths(NewHoistBB, WL)) &&
- // Also check that it is safe to move the load or store from HoistPt
- // to NewHoistPt, and from Insn to NewHoistPt.
- safeToHoistLdSt(NewHoistPt, HoistPt, UD, K, NumBBsOnAllPaths) &&
- safeToHoistLdSt(NewHoistPt, Insn, MSSA->getMemoryAccess(Insn),
- K, NumBBsOnAllPaths)) {
- // Extend HoistPt to NewHoistPt.
- HoistPt = NewHoistPt;
- HoistBB = NewHoistBB;
- continue;
- }
- }
+ // CHIArgs now have the outgoing values, so check for anticipability and
+ // accumulate hoistable candidates in HPL.
+ for (std::pair<BasicBlock *, SmallVector<CHIArg, 2>> &A : CHIBBs) {
+ BasicBlock *BB = A.first;
+ SmallVectorImpl<CHIArg> &CHIs = A.second;
+ // The CHI vector may contain CHIs for several different instructions.
+ // Sort the args according to their VNs, such that identical
+ // instructions are together.
+ std::stable_sort(CHIs.begin(), CHIs.end(), cmpVN);
+ auto TI = BB->getTerminator();
+ auto B = CHIs.begin();
+ // [PrevIt, PHIIt) forms a range of CHIs which have identical VNs.
+ auto PHIIt = std::find_if(CHIs.begin(), CHIs.end(),
+ [B](CHIArg &A) { return A != *B; });
+ auto PrevIt = CHIs.begin();
+ while (PrevIt != PHIIt) {
+ // Collect values which satisfy safety checks.
+ SmallVector<CHIArg, 2> Safe;
+ // We check for safety first because there might be multiple values in
+ // the same path, some of which are not safe to be hoisted, but overall
+ // each edge has at least one value which can be hoisted, making the
+ // value anticipable along that path.
+ checkSafety(make_range(PrevIt, PHIIt), BB, K, Safe);
- // At this point it is not safe to extend the current hoisting to
- // NewHoistPt: save the hoisting list so far.
- if (std::distance(Start, II) > 1)
- HPL.push_back({HoistBB, SmallVecInsn(Start, II)});
+ // List of safe values should be anticipable at TI.
+ if (valueAnticipable(make_range(Safe.begin(), Safe.end()), TI)) {
+ HPL.push_back({BB, SmallVecInsn()});
+ SmallVecInsn &V = HPL.back().second;
+ for (auto B : Safe)
+ V.push_back(B.I);
+ }
- // Start over from BB.
- Start = II;
- if (K != InsKind::Scalar)
- UD = MSSA->getMemoryAccess(*Start);
- HoistPt = Insn;
- HoistBB = BB;
- NumBBsOnAllPaths = MaxNumberOfBBSInPath;
+ // Check other VNs
+ PrevIt = PHIIt;
+ PHIIt = std::find_if(PrevIt, CHIs.end(),
+ [PrevIt](CHIArg &A) { return A != *PrevIt; });
+ }
}
-
- // Save the last partition.
- if (std::distance(Start, II) > 1)
- HPL.push_back({HoistBB, SmallVecInsn(Start, II)});
}
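
// Editor's sketch (not part of the commit): the grouping pattern used above,
// a stable_sort by VN followed by walking [PrevIt, PHIIt) runs of equal keys
// with std::find_if, demonstrated on plain ints.
#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

int main() {
  std::vector<int> VNs = {3, 1, 3, 2, 1};
  std::stable_sort(VNs.begin(), VNs.end()); // 1 1 2 3 3

  std::vector<std::pair<int, int>> Runs; // (key, count) per run
  auto PrevIt = VNs.begin();
  while (PrevIt != VNs.end()) {
    auto PHIIt = std::find_if(PrevIt, VNs.end(),
                              [PrevIt](int V) { return V != *PrevIt; });
    Runs.push_back({*PrevIt, static_cast<int>(PHIIt - PrevIt)});
    PrevIt = PHIIt; // move on to the next distinct key
  }
  assert((Runs == std::vector<std::pair<int, int>>{{1, 2}, {2, 1}, {3, 2}}));
  return 0;
}
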
- // Initialize HPL from Map.
+ // Compute insertion points for each value which can be fully anticipated at
+ // a dominator. HPL contains all such values.
void computeInsertionPoints(const VNtoInsns &Map, HoistingPointList &HPL,
InsKind K) {
+ // Sort VNs based on their rankings
+ std::vector<VNType> Ranks;
for (const auto &Entry : Map) {
- if (MaxHoistedThreshold != -1 && ++HoistedCtr > MaxHoistedThreshold)
- return;
+ Ranks.push_back(Entry.first);
+ }
+
+ // TODO: Remove fully-redundant expressions.
+ // Get the instruction from the Map; assume that all the instructions
+ // with the same VN have the same rank (this is an approximation).
+ std::sort(Ranks.begin(), Ranks.end(),
+ [this, &Map](const VNType &r1, const VNType &r2) {
+ return (rank(*Map.lookup(r1).begin()) <
+ rank(*Map.lookup(r2).begin()));
+ });
- const SmallVecInsn &V = Entry.second;
+ // - Sort VNs according to their rank, and start with lowest ranked VN
+ // - Take a VN and for each instruction with same VN
+ // - Find the dominance frontier in the inverse graph (PDF)
+ // - Insert the chi-node at PDF
+ // - Remove the chi-nodes with missing entries
+ // - Remove values from CHI-nodes which do not truly flow out, e.g.,
+ // modified along the path.
+ // - Collect the remaining values that are still anticipable
+ SmallVector<BasicBlock *, 2> IDFBlocks;
+ ReverseIDFCalculator IDFs(*PDT);
+ OutValuesType OutValue;
+ InValuesType InValue;
+ for (const auto &R : Ranks) {
+ const SmallVecInsn &V = Map.lookup(R);
if (V.size() < 2)
continue;
+ const VNType &VN = R;
+ SmallPtrSet<BasicBlock *, 2> VNBlocks;
+ for (auto &I : V) {
+ BasicBlock *BBI = I->getParent();
+ if (!hasEH(BBI))
+ VNBlocks.insert(BBI);
+ }
+ // Compute the post-dominance frontier of each basic block.
+ // The dominance frontier of a block X in the reverse control-flow
+ // graph is the set of blocks upon which X is control dependent. The
+ // following sequence computes the set of blocks upon which the
+ // blocks in VNBlocks are control dependent.
+ IDFs.setDefiningBlocks(VNBlocks);
+ IDFs.calculate(IDFBlocks);
- // Compute the insertion point and the list of expressions to be hoisted.
- SmallVecInsn InstructionsToHoist;
- for (auto I : V)
- // We don't need to check for hoist-barriers here because if
- // I->getParent() is a barrier then I precedes the barrier.
- if (!hasEH(I->getParent()))
- InstructionsToHoist.push_back(I);
-
- if (!InstructionsToHoist.empty())
- partitionCandidates(InstructionsToHoist, HPL, K);
+ // Make a map of BB vs instructions to be hoisted.
+ for (unsigned i = 0; i < V.size(); ++i) {
+ InValue[V[i]->getParent()].push_back(std::make_pair(VN, V[i]));
+ }
+ // Insert an empty CHI node for this VN. This is used to factor out
+ // basic blocks where the ANTIC can potentially change.
+ for (auto IDFB : IDFBlocks) { // TODO: Prune out useless CHI insertions.
+ for (unsigned i = 0; i < V.size(); ++i) {
+ CHIArg C = {VN, nullptr, nullptr};
+ // Ignore spurious PDFs.
+ if (DT->properlyDominates(IDFB, V[i]->getParent())) {
+ OutValue[IDFB].push_back(C);
+ DEBUG(dbgs() << "\nInserting a CHI for BB: " << IDFB->getName()
+ << ", for Insn: " << *V[i]);
+ }
+ }
+ }
}
+
+ // Insert CHI args at each PDF to iterate on the factored graph of
+ // control dependence.
+ insertCHI(InValue, OutValue);
+ // Using the CHI args inserted at each PDF, find fully anticipable values.
+ findHoistableCandidates(OutValue, K, HPL);
}
// Return true when all operands of Instr are available at insertion point
@@ -714,7 +856,6 @@ private:
Instruction *ClonedGep = Gep->clone();
for (unsigned i = 0, e = Gep->getNumOperands(); i != e; ++i)
if (Instruction *Op = dyn_cast<Instruction>(Gep->getOperand(i))) {
-
// Check whether the operand is already available.
if (DT->dominates(Op->getParent(), HoistPt))
continue;
@@ -748,6 +889,88 @@ private:
Repl->replaceUsesOfWith(Gep, ClonedGep);
}
+ void updateAlignment(Instruction *I, Instruction *Repl) {
+ if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) {
+ ReplacementLoad->setAlignment(
+ std::min(ReplacementLoad->getAlignment(),
+ cast<LoadInst>(I)->getAlignment()));
+ ++NumLoadsRemoved;
+ } else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) {
+ ReplacementStore->setAlignment(
+ std::min(ReplacementStore->getAlignment(),
+ cast<StoreInst>(I)->getAlignment()));
+ ++NumStoresRemoved;
+ } else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) {
+ ReplacementAlloca->setAlignment(
+ std::max(ReplacementAlloca->getAlignment(),
+ cast<AllocaInst>(I)->getAlignment()));
+ } else if (isa<CallInst>(Repl)) {
+ ++NumCallsRemoved;
+ }
+ }
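
// Editor's sketch (not part of the commit): why updateAlignment above takes
// std::min for loads/stores but std::max for allocas. The surviving access
// must not claim more alignment than any access it replaces, while a shared
// alloca must satisfy the strictest requester. Plain integers stand in for
// the LLVM alignment attributes.
#include <algorithm>
#include <cassert>

int main() {
  // Two hoistable loads of the same address, promising 8- and 4-byte
  // alignment: the surviving load may only promise 4.
  unsigned ReplLoadAlign = 8, RemovedLoadAlign = 4;
  ReplLoadAlign = std::min(ReplLoadAlign, RemovedLoadAlign);
  assert(ReplLoadAlign == 4);

  // Two allocas being merged, requesting 4- and 16-byte alignment: the
  // surviving alloca must provide 16 to satisfy both users.
  unsigned ReplAllocaAlign = 4, RemovedAllocaAlign = 16;
  ReplAllocaAlign = std::max(ReplAllocaAlign, RemovedAllocaAlign);
  assert(ReplAllocaAlign == 16);
  return 0;
}
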
+
+ // Remove all the instructions in Candidates and replace their usage with Repl.
+ // Returns the number of instructions removed.
+ unsigned rauw(const SmallVecInsn &Candidates, Instruction *Repl,
+ MemoryUseOrDef *NewMemAcc) {
+ unsigned NR = 0;
+ for (Instruction *I : Candidates) {
+ if (I != Repl) {
+ ++NR;
+ updateAlignment(I, Repl);
+ if (NewMemAcc) {
+ // Update the uses of the old MSSA access with NewMemAcc.
+ MemoryAccess *OldMA = MSSA->getMemoryAccess(I);
+ OldMA->replaceAllUsesWith(NewMemAcc);
+ MSSAUpdater->removeMemoryAccess(OldMA);
+ }
+
+ Repl->andIRFlags(I);
+ combineKnownMetadata(Repl, I);
+ I->replaceAllUsesWith(Repl);
+ // Also invalidate the Alias Analysis cache.
+ MD->removeInstruction(I);
+ I->eraseFromParent();
+ }
+ }
+ return NR;
+ }
+
+ // Replace all Memory PHI usage with NewMemAcc.
+ void raMPHIuw(MemoryUseOrDef *NewMemAcc) {
+ SmallPtrSet<MemoryPhi *, 4> UsePhis;
+ for (User *U : NewMemAcc->users())
+ if (MemoryPhi *Phi = dyn_cast<MemoryPhi>(U))
+ UsePhis.insert(Phi);
+
+ for (MemoryPhi *Phi : UsePhis) {
+ auto In = Phi->incoming_values();
+ if (llvm::all_of(In, [&](Use &U) { return U == NewMemAcc; })) {
+ Phi->replaceAllUsesWith(NewMemAcc);
+ MSSAUpdater->removeMemoryAccess(Phi);
+ }
+ }
+ }
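
// Editor's sketch (not part of the commit): the test raMPHIuw applies --
// a MemoryPhi whose incoming values all equal the hoisted access is
// redundant -- modeled with std::all_of over plain ids.
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  int NewMemAcc = 5;
  std::vector<int> PhiIncoming = {5, 5, 5};
  bool Redundant = std::all_of(PhiIncoming.begin(), PhiIncoming.end(),
                               [&](int U) { return U == NewMemAcc; });
  assert(Redundant); // replace the phi with NewMemAcc and remove it

  PhiIncoming = {5, 7};
  Redundant = std::all_of(PhiIncoming.begin(), PhiIncoming.end(),
                          [&](int U) { return U == NewMemAcc; });
  assert(!Redundant); // the phi still merges distinct states; keep it
  return 0;
}
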
+
+ // Remove all other instructions and replace them with Repl.
+ unsigned removeAndReplace(const SmallVecInsn &Candidates, Instruction *Repl,
+ BasicBlock *DestBB, bool MoveAccess) {
+ MemoryUseOrDef *NewMemAcc = MSSA->getMemoryAccess(Repl);
+ if (MoveAccess && NewMemAcc) {
+ // The definition of this ld/st will not change: ld/st hoisting is
+ // legal when the ld/st is not moved past its current definition.
+ MSSAUpdater->moveToPlace(NewMemAcc, DestBB, MemorySSA::End);
+ }
+
+ // Replace all the other instructions with Repl, using NewMemAcc as the
+ // memory access of Repl.
+ unsigned NR = rauw(Candidates, Repl, NewMemAcc);
+
+ // Remove MemorySSA phi nodes with the same arguments.
+ if (NewMemAcc)
+ raMPHIuw(NewMemAcc);
+ return NR;
+ }
+
// In the case Repl is a load or a store, we make all their GEPs
// available: GEPs are not hoisted by default to avoid the address
// computations to be hoisted without the associated load or store.
@@ -789,11 +1012,11 @@ private:
for (const HoistingPointInfo &HP : HPL) {
// Find out whether we already have one of the instructions in HoistPt,
// in which case we do not have to move it.
- BasicBlock *HoistPt = HP.first;
+ BasicBlock *DestBB = HP.first;
const SmallVecInsn &InstructionsToHoist = HP.second;
Instruction *Repl = nullptr;
for (Instruction *I : InstructionsToHoist)
- if (I->getParent() == HoistPt)
+ if (I->getParent() == DestBB)
// If there are two instructions in HoistPt to be hoisted in place:
// update Repl to be the first one, such that we can rename the uses
// of the second based on the first.
@@ -805,7 +1028,7 @@ private:
bool MoveAccess = true;
if (Repl) {
// Repl is already in HoistPt: it remains in place.
- assert(allOperandsAvailable(Repl, HoistPt) &&
+ assert(allOperandsAvailable(Repl, DestBB) &&
"instruction depends on operands that are not available");
MoveAccess = false;
} else {
@@ -816,40 +1039,26 @@ private:
// We can move Repl in HoistPt only when all operands are available.
// The order in which hoistings are done may influence the availability
// of operands.
- if (!allOperandsAvailable(Repl, HoistPt)) {
-
+ if (!allOperandsAvailable(Repl, DestBB)) {
// When HoistingGeps there is nothing more we can do to make the
// operands available: just continue.
if (HoistingGeps)
continue;
// When not HoistingGeps we need to copy the GEPs.
- if (!makeGepOperandsAvailable(Repl, HoistPt, InstructionsToHoist))
+ if (!makeGepOperandsAvailable(Repl, DestBB, InstructionsToHoist))
continue;
}
// Move the instruction at the end of HoistPt.
- Instruction *Last = HoistPt->getTerminator();
+ Instruction *Last = DestBB->getTerminator();
MD->removeInstruction(Repl);
Repl->moveBefore(Last);
DFSNumber[Repl] = DFSNumber[Last]++;
}
- MemoryAccess *NewMemAcc = MSSA->getMemoryAccess(Repl);
-
- if (MoveAccess) {
- if (MemoryUseOrDef *OldMemAcc =
- dyn_cast_or_null<MemoryUseOrDef>(NewMemAcc)) {
- // The definition of this ld/st will not change: ld/st hoisting is
- // legal when the ld/st is not moved past its current definition.
- MemoryAccess *Def = OldMemAcc->getDefiningAccess();
- NewMemAcc =
- MSSAUpdater->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End);
- OldMemAcc->replaceAllUsesWith(NewMemAcc);
- MSSAUpdater->removeMemoryAccess(OldMemAcc);
- }
- }
+ NR += removeAndReplace(InstructionsToHoist, Repl, DestBB, MoveAccess);
if (isa<LoadInst>(Repl))
++NL;
@@ -859,59 +1068,6 @@ private:
++NC;
else // Scalar
++NI;
-
- // Remove and rename all other instructions.
- for (Instruction *I : InstructionsToHoist)
- if (I != Repl) {
- ++NR;
- if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) {
- ReplacementLoad->setAlignment(
- std::min(ReplacementLoad->getAlignment(),
- cast<LoadInst>(I)->getAlignment()));
- ++NumLoadsRemoved;
- } else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) {
- ReplacementStore->setAlignment(
- std::min(ReplacementStore->getAlignment(),
- cast<StoreInst>(I)->getAlignment()));
- ++NumStoresRemoved;
- } else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) {
- ReplacementAlloca->setAlignment(
- std::max(ReplacementAlloca->getAlignment(),
- cast<AllocaInst>(I)->getAlignment()));
- } else if (isa<CallInst>(Repl)) {
- ++NumCallsRemoved;
- }
-
- if (NewMemAcc) {
- // Update the uses of the old MSSA access with NewMemAcc.
- MemoryAccess *OldMA = MSSA->getMemoryAccess(I);
- OldMA->replaceAllUsesWith(NewMemAcc);
- MSSAUpdater->removeMemoryAccess(OldMA);
- }
-
- Repl->andIRFlags(I);
- combineKnownMetadata(Repl, I);
- I->replaceAllUsesWith(Repl);
- // Also invalidate the Alias Analysis cache.
- MD->removeInstruction(I);
- I->eraseFromParent();
- }
-
- // Remove MemorySSA phi nodes with the same arguments.
- if (NewMemAcc) {
- SmallPtrSet<MemoryPhi *, 4> UsePhis;
- for (User *U : NewMemAcc->users())
- if (MemoryPhi *Phi = dyn_cast<MemoryPhi>(U))
- UsePhis.insert(Phi);
-
- for (auto *Phi : UsePhis) {
- auto In = Phi->incoming_values();
- if (all_of(In, [&](Use &U) { return U == NewMemAcc; })) {
- Phi->replaceAllUsesWith(NewMemAcc);
- MSSAUpdater->removeMemoryAccess(Phi);
- }
- }
- }
}
NumHoisted += NL + NS + NC + NI;
@@ -935,8 +1091,8 @@ private:
// If I1 cannot guarantee progress, subsequent instructions
// in BB cannot be hoisted anyways.
if (!isGuaranteedToTransferExecutionToSuccessor(&I1)) {
- HoistBarrier.insert(BB);
- break;
+ HoistBarrier.insert(BB);
+ break;
}
// Only hoist the first instructions in BB up to MaxDepthInBB. Hoisting
// deeper may increase the register pressure and compilation time.
@@ -954,7 +1110,8 @@ private:
else if (auto *Call = dyn_cast<CallInst>(&I1)) {
if (auto *Intr = dyn_cast<IntrinsicInst>(Call)) {
if (isa<DbgInfoIntrinsic>(Intr) ||
- Intr->getIntrinsicID() == Intrinsic::assume)
+ Intr->getIntrinsicID() == Intrinsic::assume ||
+ Intr->getIntrinsicID() == Intrinsic::sideeffect)
continue;
}
if (Call->mayHaveSideEffects())
@@ -996,16 +1153,18 @@ public:
if (skipFunction(F))
return false;
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
auto &MD = getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
auto &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
- GVNHoist G(&DT, &AA, &MD, &MSSA);
+ GVNHoist G(&DT, &PDT, &AA, &MD, &MSSA);
return G.run(F);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<MemoryDependenceWrapperPass>();
AU.addRequired<MemorySSAWrapperPass>();
@@ -1014,14 +1173,16 @@ public:
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
-} // namespace
+
+} // end namespace llvm
PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) {
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
AliasAnalysis &AA = AM.getResult<AAManager>(F);
MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
- GVNHoist G(&DT, &AA, &MD, &MSSA);
+ GVNHoist G(&DT, &PDT, &AA, &MD, &MSSA);
if (!G.run(F))
return PreservedAnalyses::all();
@@ -1033,6 +1194,7 @@ PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) {
}
char GVNHoistLegacyPass::ID = 0;
+
INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist",
"Early GVN Hoisting of Expressions", false, false)
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
diff --git a/lib/Transforms/Scalar/GVNSink.cpp b/lib/Transforms/Scalar/GVNSink.cpp
index 5fd2dfc118b4..814a62cd7d65 100644
--- a/lib/Transforms/Scalar/GVNSink.cpp
+++ b/lib/Transforms/Scalar/GVNSink.cpp
@@ -1,4 +1,4 @@
-//===- GVNSink.cpp - sink expressions into successors -------------------===//
+//===- GVNSink.cpp - sink expressions into successors ---------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -31,33 +31,54 @@
/// replace %a1 with %c1, will it contribute in an equivalent way to all
/// successive instructions?". The PostValueTable class in GVN provides this
/// mapping.
-///
+//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/Support/MathExtras.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/ArrayRecycler.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/GVNExpression.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
-#include <unordered_set>
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "gvn-sink"
@@ -72,8 +93,8 @@ LLVM_DUMP_METHOD void Expression::dump() const {
dbgs() << "\n";
}
-}
-}
+} // end namespace GVNExpression
+} // end namespace llvm
namespace {
@@ -97,7 +118,7 @@ static bool isMemoryInst(const Instruction *I) {
/// list returned by operator*.
class LockstepReverseIterator {
ArrayRef<BasicBlock *> Blocks;
- SmallPtrSet<BasicBlock *, 4> ActiveBlocks;
+ SmallSetVector<BasicBlock *, 4> ActiveBlocks;
SmallVector<Instruction *, 4> Insts;
bool Fail;
@@ -115,7 +136,7 @@ public:
for (BasicBlock *BB : Blocks) {
if (BB->size() <= 1) {
// Block wasn't big enough - only contained a terminator.
- ActiveBlocks.erase(BB);
+ ActiveBlocks.remove(BB);
continue;
}
Insts.push_back(BB->getTerminator()->getPrevNode());
@@ -126,13 +147,20 @@ public:
bool isValid() const { return !Fail; }
ArrayRef<Instruction *> operator*() const { return Insts; }
- SmallPtrSet<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; }
- void restrictToBlocks(SmallPtrSetImpl<BasicBlock *> &Blocks) {
+ // Note: This needs to return a SmallSetVector as the elements of
+ // ActiveBlocks will later be copied to Blocks using std::copy. The
+ // resultant order of elements in Blocks needs to be deterministic.
+ // Using SmallPtrSet instead causes non-deterministic order while
+ // copying, and we cannot simply sort Blocks as they need to match the
+ // corresponding Values.
+ SmallSetVector<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; }
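
// Editor's sketch (not part of the commit): the determinism argument above,
// using a toy insertion-ordered set in place of SmallSetVector. Iterating a
// hash set keyed on pointers can differ from run to run; keeping a side
// vector in insertion order makes the later std::copy reproducible.
#include <algorithm>
#include <cassert>
#include <set>
#include <vector>

template <typename T> class InsertionOrderedSet {
  std::set<T> Seen;
  std::vector<T> Order;

public:
  void insert(const T &V) {
    if (Seen.insert(V).second)
      Order.push_back(V);
  }
  void remove(const T &V) {
    if (Seen.erase(V))
      Order.erase(std::find(Order.begin(), Order.end(), V));
  }
  const std::vector<T> &values() const { return Order; }
};

int main() {
  InsertionOrderedSet<int> S;
  S.insert(30); S.insert(10); S.insert(20); S.remove(10);
  // Deterministic: always the insertion order, minus removed elements.
  assert((S.values() == std::vector<int>{30, 20}));
  return 0;
}
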
+
+ void restrictToBlocks(SmallSetVector<BasicBlock *, 4> &Blocks) {
for (auto II = Insts.begin(); II != Insts.end();) {
if (std::find(Blocks.begin(), Blocks.end(), (*II)->getParent()) ==
Blocks.end()) {
- ActiveBlocks.erase((*II)->getParent());
+ ActiveBlocks.remove((*II)->getParent());
II = Insts.erase(II);
} else {
++II;
@@ -146,7 +174,7 @@ public:
SmallVector<Instruction *, 4> NewInsts;
for (auto *Inst : Insts) {
if (Inst == &Inst->getParent()->front())
- ActiveBlocks.erase(Inst->getParent());
+ ActiveBlocks.remove(Inst->getParent());
else
NewInsts.push_back(Inst->getPrevNode());
}
@@ -180,14 +208,14 @@ struct SinkingInstructionCandidate {
NumExtraPHIs) // PHIs are expensive, so make sure they're worth it.
- SplitEdgeCost;
}
+
bool operator>(const SinkingInstructionCandidate &Other) const {
return Cost > Other.Cost;
}
};
#ifndef NDEBUG
-llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
- const SinkingInstructionCandidate &C) {
+raw_ostream &operator<<(raw_ostream &OS, const SinkingInstructionCandidate &C) {
OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks
<< " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">";
return OS;
@@ -204,17 +232,20 @@ class ModelledPHI {
SmallVector<BasicBlock *, 4> Blocks;
public:
- ModelledPHI() {}
+ ModelledPHI() = default;
+
ModelledPHI(const PHINode *PN) {
+ // BasicBlock comes first so we sort by basic block pointer order,
+ // then by value pointer order.
+ SmallVector<std::pair<BasicBlock *, Value *>, 4> Ops;
for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I)
- Blocks.push_back(PN->getIncomingBlock(I));
- std::sort(Blocks.begin(), Blocks.end());
-
- // This assumes the PHI is already well-formed and there aren't conflicting
- // incoming values for the same block.
- for (auto *B : Blocks)
- Values.push_back(PN->getIncomingValueForBlock(B));
+ Ops.push_back({PN->getIncomingBlock(I), PN->getIncomingValue(I)});
+ std::sort(Ops.begin(), Ops.end());
+ for (auto &P : Ops) {
+ Blocks.push_back(P.first);
+ Values.push_back(P.second);
+ }
}
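
// Editor's sketch (not part of the commit): why the constructor above sorts
// (block, value) *pairs* instead of sorting the blocks alone -- the two
// arrays must stay in lockstep so Blocks[i] still corresponds to Values[i]
// after canonicalization. Strings stand in for pointers.
#include <algorithm>
#include <cassert>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Incoming (block, value) operands of a PHI, in IR order.
  std::vector<std::pair<std::string, std::string>> Ops = {
      {"bb2", "%y"}, {"bb1", "%x"}};
  std::sort(Ops.begin(), Ops.end()); // block first, then value

  std::vector<std::string> Blocks, Values;
  for (auto &P : Ops) {
    Blocks.push_back(P.first);
    Values.push_back(P.second);
  }
  // Correspondence preserved: bb1 still maps to %x.
  assert(Blocks[0] == "bb1" && Values[0] == "%x");
  assert(Blocks[1] == "bb2" && Values[1] == "%y");
  return 0;
}
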
+
/// Create a dummy ModelledPHI that will compare unequal to any other ModelledPHI
/// without the same ID.
/// \note This is specifically for DenseMapInfo - do not use this!
@@ -241,7 +272,7 @@ public:
/// Restrict the PHI's contents down to only \c NewBlocks.
/// \c NewBlocks must be a subset of \c this->Blocks.
- void restrictToBlocks(const SmallPtrSetImpl<BasicBlock *> &NewBlocks) {
+ void restrictToBlocks(const SmallSetVector<BasicBlock *, 4> &NewBlocks) {
auto BI = Blocks.begin();
auto VI = Values.begin();
while (BI != Blocks.end()) {
@@ -261,19 +292,23 @@ public:
ArrayRef<Value *> getValues() const { return Values; }
bool areAllIncomingValuesSame() const {
- return all_of(Values, [&](Value *V) { return V == Values[0]; });
+ return llvm::all_of(Values, [&](Value *V) { return V == Values[0]; });
}
+
bool areAllIncomingValuesSameType() const {
- return all_of(
+ return llvm::all_of(
Values, [&](Value *V) { return V->getType() == Values[0]->getType(); });
}
+
bool areAnyIncomingValuesConstant() const {
- return any_of(Values, [&](Value *V) { return isa<Constant>(V); });
+ return llvm::any_of(Values, [&](Value *V) { return isa<Constant>(V); });
}
+
// Hash functor
unsigned hash() const {
return (unsigned)hash_combine_range(Values.begin(), Values.end());
}
+
bool operator==(const ModelledPHI &Other) const {
return Values == Other.Values && Blocks == Other.Blocks;
}
@@ -284,17 +319,20 @@ template <typename ModelledPHI> struct DenseMapInfo {
static ModelledPHI Dummy = ModelledPHI::createDummy(0);
return Dummy;
}
+
static inline ModelledPHI &getTombstoneKey() {
static ModelledPHI Dummy = ModelledPHI::createDummy(1);
return Dummy;
}
+
static unsigned getHashValue(const ModelledPHI &V) { return V.hash(); }
+
static bool isEqual(const ModelledPHI &LHS, const ModelledPHI &RHS) {
return LHS == RHS;
}
};
-typedef DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>> ModelledPHISet;
+using ModelledPHISet = DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>>;
//===----------------------------------------------------------------------===//
// ValueTable
@@ -325,10 +363,11 @@ public:
op_push_back(U.getUser());
std::sort(op_begin(), op_end());
}
+
void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; }
void setVolatile(bool V) { Volatile = V; }
- virtual hash_code getHashValue() const {
+ hash_code getHashValue() const override {
return hash_combine(GVNExpression::BasicExpression::getHashValue(),
MemoryUseOrder, Volatile);
}
@@ -348,7 +387,7 @@ class ValueTable {
DenseMap<size_t, uint32_t> HashNumbering;
BumpPtrAllocator Allocator;
ArrayRecycler<Value *> Recycler;
- uint32_t nextValueNumber;
+ uint32_t nextValueNumber = 1;
/// Create an expression for I based on its opcode and its uses. If I
/// touches or reads memory, the expression is also based upon its memory
@@ -378,6 +417,8 @@ class ValueTable {
}
public:
+ ValueTable() = default;
+
/// Returns the value number for the specified value, assigning
/// it a new number if it did not have one before.
uint32_t lookupOrAdd(Value *V) {
@@ -483,8 +524,6 @@ public:
nextValueNumber = 1;
}
- ValueTable() : nextValueNumber(1) {}
-
/// \c Inst uses or touches memory. Return an ID describing the memory state
/// at \c Inst such that if getMemoryUseOrder(I1) == getMemoryUseOrder(I2),
/// the exact same memory operations happen after I1 and I2.
@@ -519,7 +558,8 @@ public:
class GVNSink {
public:
- GVNSink() : VN() {}
+ GVNSink() = default;
+
bool run(Function &F) {
DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n");
@@ -576,8 +616,9 @@ private:
void foldPointlessPHINodes(BasicBlock *BB) {
auto I = BB->begin();
while (PHINode *PN = dyn_cast<PHINode>(I++)) {
- if (!all_of(PN->incoming_values(),
- [&](const Value *V) { return V == PN->getIncomingValue(0); }))
+ if (!llvm::all_of(PN->incoming_values(), [&](const Value *V) {
+ return V == PN->getIncomingValue(0);
+ }))
continue;
if (PN->getIncomingValue(0) != PN)
PN->replaceAllUsesWith(PN->getIncomingValue(0));
@@ -624,7 +665,7 @@ Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
SmallVector<Instruction *, 4> NewInsts;
for (auto *I : Insts) {
if (VN.lookup(I) != VNumToSink)
- ActivePreds.erase(I->getParent());
+ ActivePreds.remove(I->getParent());
else
NewInsts.push_back(I);
}
@@ -794,7 +835,7 @@ void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks,
SmallVector<Value *, 4> NewOperands;
for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
- bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) {
+ bool NeedPHI = llvm::any_of(Insts, [&I0, O](const Instruction *I) {
return I->getOperand(O) != I0->getOperand(O);
});
if (!NeedPHI) {
@@ -860,7 +901,8 @@ public:
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
-} // namespace
+
+} // end anonymous namespace
PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) {
GVNSink G;
@@ -873,6 +915,7 @@ PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) {
}
char GVNSinkLegacyPass::ID = 0;
+
INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink",
"Early GVN sinking of Expressions", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
diff --git a/lib/Transforms/Scalar/GuardWidening.cpp b/lib/Transforms/Scalar/GuardWidening.cpp
index fb7c6e15758d..c4aeccb85ca7 100644
--- a/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/lib/Transforms/Scalar/GuardWidening.cpp
@@ -664,6 +664,7 @@ PreservedAnalyses GuardWideningPass::run(Function &F,
return PA;
}
+#ifndef NDEBUG
StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
switch (WS) {
case WS_IllegalOrNegative:
@@ -678,6 +679,7 @@ StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
llvm_unreachable("Fully covered switch above!");
}
+#endif
char GuardWideningLegacyPass::ID = 0;
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 10782963177c..74d6014d3e3d 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -25,27 +25,54 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/IndVarSimplify.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -53,6 +80,10 @@
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "indvars"
@@ -91,6 +122,7 @@ DisableLFTR("disable-lftr", cl::Hidden, cl::init(false),
cl::desc("Disable Linear Function Test Replace optimization"));
namespace {
+
struct RewritePhi;
class IndVarSimplify {
@@ -131,7 +163,8 @@ public:
bool run(Loop *L);
};
-}
+
+} // end anonymous namespace
/// Return true if the SCEV expansion generated by the rewriter can replace the
/// original value. SCEV guarantees that it produces the same value, but the way
@@ -251,7 +284,6 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
/// is converted into
/// for(int i = 0; i < 10000; ++i)
/// bar((double)i);
-///
void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0));
unsigned BackEdge = IncomingEdge^1;
@@ -305,7 +337,6 @@ void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
L->contains(TheBr->getSuccessor(1))))
return;
-
// If it isn't a comparison with an integer-as-fp (the exit value), we can't
// transform it.
ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1));
@@ -373,7 +404,6 @@ void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
// transform the IV.
if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue)
return;
-
} else {
// If we have a negative stride, we require the init to be greater than the
// exit value.
@@ -452,7 +482,6 @@ void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
// First step. Check to see if there are any floating-point recurrences.
// If there are, change them into integer recurrences, permitting analysis by
// the SCEV routines.
- //
BasicBlock *Header = L->getHeader();
SmallVector<WeakTrackingVH, 8> PHIs;
@@ -472,18 +501,26 @@ void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
}
namespace {
+
// Collect information about PHI nodes which can be transformed in
// rewriteLoopExitValues.
struct RewritePhi {
PHINode *PN;
- unsigned Ith; // Ith incoming value.
- Value *Val; // Exit value after expansion.
- bool HighCost; // High Cost when expansion.
+
+ // Ith incoming value.
+ unsigned Ith;
+
+ // Exit value after expansion.
+ Value *Val;
+
+ // High Cost when expansion.
+ bool HighCost;
RewritePhi(PHINode *P, unsigned I, Value *V, bool H)
: PN(P), Ith(I), Val(V), HighCost(H) {}
};
-}
+
+} // end anonymous namespace
Value *IndVarSimplify::expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S,
Loop *L, Instruction *InsertPt,
@@ -747,7 +784,6 @@ void IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
/// aggressively.
bool IndVarSimplify::canLoopBeDeleted(
Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) {
-
BasicBlock *Preheader = L->getLoopPreheader();
// If there is no preheader, the loop will not be deleted.
if (!Preheader)
@@ -790,7 +826,9 @@ bool IndVarSimplify::canLoopBeDeleted(
}
for (auto *BB : L->blocks())
- if (any_of(*BB, [](Instruction &I) { return I.mayHaveSideEffects(); }))
+ if (llvm::any_of(*BB, [](Instruction &I) {
+ return I.mayHaveSideEffects();
+ }))
return false;
return true;
@@ -801,15 +839,21 @@ bool IndVarSimplify::canLoopBeDeleted(
//===----------------------------------------------------------------------===//
namespace {
+
// Collect information about induction variables that are used by sign/zero
// extend operations. This information is recorded by CollectExtend and provides
// the input to WidenIV.
struct WideIVInfo {
PHINode *NarrowIV = nullptr;
- Type *WidestNativeType = nullptr; // Widest integer type created [sz]ext
- bool IsSigned = false; // Was a sext user seen before a zext?
+
+ // Widest integer type created [sz]ext
+ Type *WidestNativeType = nullptr;
+
+ // Was a sext user seen before a zext?
+ bool IsSigned = false;
};
-}
+
+} // end anonymous namespace
/// Update information about the induction variable that is extended by this
/// sign or zero extend operation. This is used to determine the final width of
@@ -885,7 +929,6 @@ struct NarrowIVDefUse {
/// creating any new induction variables. To do this, it creates a new phi of
/// the wider type and redirects all users, either removing extends or inserting
/// truncs whenever we stop propagating the type.
-///
class WidenIV {
// Parameters
PHINode *OrigPhi;
@@ -902,22 +945,24 @@ class WidenIV {
bool HasGuards;
// Result
- PHINode *WidePhi;
- Instruction *WideInc;
- const SCEV *WideIncExpr;
+ PHINode *WidePhi = nullptr;
+ Instruction *WideInc = nullptr;
+ const SCEV *WideIncExpr = nullptr;
SmallVectorImpl<WeakTrackingVH> &DeadInsts;
SmallPtrSet<Instruction *,16> Widened;
SmallVector<NarrowIVDefUse, 8> NarrowIVUsers;
enum ExtendKind { ZeroExtended, SignExtended, Unknown };
+
// A map tracking the kind of extension used to widen each narrow IV
// and narrow IV user.
// Key: pointer to a narrow IV or IV user.
// Value: the kind of extension used to widen this Instruction.
DenseMap<AssertingVH<Instruction>, ExtendKind> ExtendKindMap;
- typedef std::pair<AssertingVH<Value>, AssertingVH<Instruction>> DefUserPair;
+ using DefUserPair = std::pair<AssertingVH<Value>, AssertingVH<Instruction>>;
+
// A map with control-dependent ranges for post increment IV uses. The key is
// a pair of IV def and a use of this def denoting the context. The value is
// a ConstantRange representing possible values of the def at the given
@@ -935,6 +980,7 @@ class WidenIV {
void calculatePostIncRanges(PHINode *OrigPhi);
void calculatePostIncRange(Instruction *NarrowDef, Instruction *NarrowUser);
+
void updatePostIncRangeInfo(Value *Def, Instruction *UseI, ConstantRange R) {
DefUserPair Key(Def, UseI);
auto It = PostIncRangeInfos.find(Key);
@@ -950,8 +996,7 @@ public:
bool HasGuards)
: OrigPhi(WI.NarrowIV), WideType(WI.WidestNativeType), LI(LInfo),
L(LI->getLoopFor(OrigPhi->getParent())), SE(SEv), DT(DTree),
- HasGuards(HasGuards), WidePhi(nullptr), WideInc(nullptr),
- WideIncExpr(nullptr), DeadInsts(DI) {
+ HasGuards(HasGuards), DeadInsts(DI) {
assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV");
ExtendKindMap[OrigPhi] = WI.IsSigned ? SignExtended : ZeroExtended;
}
@@ -969,7 +1014,7 @@ protected:
ExtendKind getExtendKind(Instruction *I);
- typedef std::pair<const SCEVAddRecExpr *, ExtendKind> WidenedRecTy;
+ using WidenedRecTy = std::pair<const SCEVAddRecExpr *, ExtendKind>;
WidenedRecTy getWideRecurrence(NarrowIVDefUse DU);
@@ -984,7 +1029,8 @@ protected:
void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
};
-} // anonymous namespace
+
+} // end anonymous namespace
/// Perform a quick domtree based check for loop invariance assuming that V is
/// used within the loop. LoopInfo::isLoopInvariant() seems gratuitous for this
@@ -1182,7 +1228,6 @@ const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
/// operands is an AddRec for this loop, return the AddRec and the kind of
/// extension used.
WidenIV::WidenedRecTy WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) {
-
// Handle the common case of add<nsw/nuw>
const unsigned OpCode = DU.NarrowUse->getOpcode();
// Only Add/Sub/Mul instructions supported yet.
@@ -1310,7 +1355,7 @@ bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) {
Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0);
unsigned CastWidth = SE->getTypeSizeInBits(Op->getType());
unsigned IVWidth = SE->getTypeSizeInBits(WideType);
- assert (CastWidth <= IVWidth && "Unexpected width while widening compare.");
+ assert(CastWidth <= IVWidth && "Unexpected width while widening compare.");
// Widen the compare instruction.
IRBuilder<> Builder(
@@ -1461,7 +1506,6 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
}
/// Add eligible users of NarrowDef to NarrowIVUsers.
-///
void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {
const SCEV *NarrowSCEV = SE->getSCEV(NarrowDef);
bool NonNegativeDef =
@@ -1494,7 +1538,6 @@ void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {
///
/// It would be simpler to delete uses as they are processed, but we must avoid
/// invalidating SCEV expressions.
-///
PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
// Is this phi an induction variable?
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi));
@@ -1581,6 +1624,15 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
if (DU.NarrowDef->use_empty())
DeadInsts.emplace_back(DU.NarrowDef);
}
+
+ // Attach any debug information to the new PHI. Since OrigPhi and WidePHI
+ // evaluate the same recurrence, we can just copy the debug info over.
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ llvm::findDbgValues(DbgValues, OrigPhi);
+ auto *MDPhi = MetadataAsValue::get(WidePhi->getContext(),
+ ValueAsMetadata::get(WidePhi));
+ for (auto &DbgValue : DbgValues)
+ DbgValue->setOperand(0, MDPhi);
return WidePhi;
}
@@ -1696,12 +1748,12 @@ void WidenIV::calculatePostIncRanges(PHINode *OrigPhi) {
// Live IV Reduction - Minimize IVs live across the loop.
//===----------------------------------------------------------------------===//
-
//===----------------------------------------------------------------------===//
// Simplification of IV users based on SCEV evaluation.
//===----------------------------------------------------------------------===//
namespace {
+
class IndVarSimplifyVisitor : public IVVisitor {
ScalarEvolution *SE;
const TargetTransformInfo *TTI;
@@ -1721,14 +1773,14 @@ public:
// Implement the interface used by simplifyUsersOfIV.
void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); }
};
-}
+
+} // end anonymous namespace
/// Iteratively perform simplification on a worklist of IV users. Each
/// successive simplification may push more users which may themselves be
/// candidates for simplification.
///
/// Sign/Zero extend elimination is interleaved with IV simplification.
-///
void IndVarSimplify::simplifyAndExtend(Loop *L,
SCEVExpander &Rewriter,
LoopInfo *LI) {
@@ -1759,7 +1811,8 @@ void IndVarSimplify::simplifyAndExtend(Loop *L,
// Information about sign/zero extensions of CurrIV.
IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT);
- Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, &Visitor);
+ Changed |=
+ simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, Rewriter, &Visitor);
if (Visitor.WI.WidestNativeType) {
WideIVs.push_back(Visitor.WI);
@@ -2501,8 +2554,10 @@ PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
}
namespace {
+
struct IndVarSimplifyLegacyPass : public LoopPass {
static char ID; // Pass identification, replacement for typeid
+
IndVarSimplifyLegacyPass() : LoopPass(ID) {
initializeIndVarSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
}
@@ -2529,9 +2584,11 @@ struct IndVarSimplifyLegacyPass : public LoopPass {
getLoopAnalysisUsage(AU);
}
};
-}
+
+} // end anonymous namespace
char IndVarSimplifyLegacyPass::ID = 0;
+
INITIALIZE_PASS_BEGIN(IndVarSimplifyLegacyPass, "indvars",
"Induction Variable Simplification", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 99b4458ea0fa..5c4d55bfbb2b 100644
--- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -1,4 +1,4 @@
-//===-- InductiveRangeCheckElimination.cpp - ------------------------------===//
+//===- InductiveRangeCheckElimination.cpp - -------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,6 +6,7 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
+//
// The InductiveRangeCheckElimination pass splits a loop's iteration space into
// three disjoint ranges. It does that in a way such that the code running in
// the middle loop provably does not need range checks. As an example, it will
@@ -39,30 +40,61 @@
// throw_out_of_bounds();
// }
// }
+//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
using namespace llvm;
+using namespace llvm::PatternMatch;
static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden,
cl::init(64));
@@ -79,6 +111,9 @@ static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal",
static cl::opt<bool> SkipProfitabilityChecks("irce-skip-profitability-checks",
cl::Hidden, cl::init(false));
+static cl::opt<bool> AllowUnsignedLatchCondition("irce-allow-unsigned-latch",
+ cl::Hidden, cl::init(true));
+
static const char *ClonedLoopTag = "irce.loop.clone";
#define DEBUG_TYPE "irce"
@@ -114,15 +149,16 @@ class InductiveRangeCheck {
static StringRef rangeCheckKindToStr(RangeCheckKind);
- const SCEV *Offset = nullptr;
- const SCEV *Scale = nullptr;
- Value *Length = nullptr;
+ const SCEV *Begin = nullptr;
+ const SCEV *Step = nullptr;
+ const SCEV *End = nullptr;
Use *CheckUse = nullptr;
RangeCheckKind Kind = RANGE_CHECK_UNKNOWN;
+ bool IsSigned = true;
static RangeCheckKind parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
ScalarEvolution &SE, Value *&Index,
- Value *&Length);
+ Value *&Length, bool &IsSigned);
static void
extractRangeChecksFromCond(Loop *L, ScalarEvolution &SE, Use &ConditionUse,
@@ -130,20 +166,21 @@ class InductiveRangeCheck {
SmallPtrSetImpl<Value *> &Visited);
public:
- const SCEV *getOffset() const { return Offset; }
- const SCEV *getScale() const { return Scale; }
- Value *getLength() const { return Length; }
+ const SCEV *getBegin() const { return Begin; }
+ const SCEV *getStep() const { return Step; }
+ const SCEV *getEnd() const { return End; }
+ bool isSigned() const { return IsSigned; }
void print(raw_ostream &OS) const {
OS << "InductiveRangeCheck:\n";
OS << " Kind: " << rangeCheckKindToStr(Kind) << "\n";
- OS << " Offset: ";
- Offset->print(OS);
- OS << " Scale: ";
- Scale->print(OS);
- OS << " Length: ";
- if (Length)
- Length->print(OS);
+ OS << " Begin: ";
+ Begin->print(OS);
+ OS << " Step: ";
+ Step->print(OS);
+ OS << " End: ";
+ if (End)
+ End->print(OS);
else
OS << "(null)";
OS << "\n CheckUse: ";
@@ -173,6 +210,14 @@ public:
Type *getType() const { return Begin->getType(); }
const SCEV *getBegin() const { return Begin; }
const SCEV *getEnd() const { return End; }
+ bool isEmpty(ScalarEvolution &SE, bool IsSigned) const {
+ if (Begin == End)
+ return true;
+ if (IsSigned)
+ return SE.isKnownPredicate(ICmpInst::ICMP_SGE, Begin, End);
+ else
+ return SE.isKnownPredicate(ICmpInst::ICMP_UGE, Begin, End);
+ }
};
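For intuition about isEmpty, two hypothetical i8 ranges show how the signedness flag changes the answer:

    //   [5, 3)  : empty under both interpretations, since 5 >= 3 either way.
    //   [-1, 3) : non-empty when IsSigned (-1 <s 3), but empty when unsigned,
    //             because -1 reads as 255 and 255 >=u 3.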
/// This is the value the condition of the branch needs to evaluate to for the
@@ -183,7 +228,8 @@ public:
/// check is redundant and can be constant-folded away. The induction
/// variable is not required to be the canonical {0,+,1} induction variable.
Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE,
- const SCEVAddRecExpr *IndVar) const;
+ const SCEVAddRecExpr *IndVar,
+ bool IsLatchSigned) const;
/// Parse out a set of inductive range checks from \p BI and append them to \p
/// Checks.
@@ -199,6 +245,7 @@ public:
class InductiveRangeCheckElimination : public LoopPass {
public:
static char ID;
+
InductiveRangeCheckElimination() : LoopPass(ID) {
initializeInductiveRangeCheckEliminationPass(
*PassRegistry::getPassRegistry());
@@ -212,8 +259,9 @@ public:
bool runOnLoop(Loop *L, LPPassManager &LPM) override;
};
+} // end anonymous namespace
+
char InductiveRangeCheckElimination::ID = 0;
-}
INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce",
"Inductive range check elimination", false, false)
@@ -247,12 +295,10 @@ StringRef InductiveRangeCheck::rangeCheckKindToStr(
/// range checked, and set `Length` to the upper limit `Index` is being range
/// checked with if (and only if) the range check type is stronger or equal to
/// RANGE_CHECK_UPPER.
-///
InductiveRangeCheck::RangeCheckKind
InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
ScalarEvolution &SE, Value *&Index,
- Value *&Length) {
-
+ Value *&Length, bool &IsSigned) {
auto IsNonNegativeAndNotLoopVarying = [&SE, L](Value *V) {
const SCEV *S = SE.getSCEV(V);
if (isa<SCEVCouldNotCompute>(S))
@@ -262,8 +308,6 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
SE.isKnownNonNegative(S);
};
- using namespace llvm::PatternMatch;
-
ICmpInst::Predicate Pred = ICI->getPredicate();
Value *LHS = ICI->getOperand(0);
Value *RHS = ICI->getOperand(1);
@@ -276,6 +320,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ICmpInst::ICMP_SGE:
+ IsSigned = true;
if (match(RHS, m_ConstantInt<0>())) {
Index = LHS;
return RANGE_CHECK_LOWER;
@@ -286,6 +331,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ICmpInst::ICMP_SGT:
+ IsSigned = true;
if (match(RHS, m_ConstantInt<-1>())) {
Index = LHS;
return RANGE_CHECK_LOWER;
@@ -302,6 +348,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ICmpInst::ICMP_UGT:
+ IsSigned = false;
if (IsNonNegativeAndNotLoopVarying(LHS)) {
Index = RHS;
Length = LHS;
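Taken together, the rewritten cases recognize roughly the following shapes (a hedged summary; Index and Length are placeholders, and only the cases visible above are listed):

    //   Index >=s 0      --> RANGE_CHECK_LOWER, IsSigned = true
    //   Index >s  -1     --> RANGE_CHECK_LOWER, IsSigned = true
    //   Length >u Index  --> IsSigned = false; Length must be known
    //                        non-negative and loop-invariant, so the single
    //                        unsigned compare bounds Index on both sides.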
@@ -317,42 +364,16 @@ void InductiveRangeCheck::extractRangeChecksFromCond(
Loop *L, ScalarEvolution &SE, Use &ConditionUse,
SmallVectorImpl<InductiveRangeCheck> &Checks,
SmallPtrSetImpl<Value *> &Visited) {
- using namespace llvm::PatternMatch;
-
Value *Condition = ConditionUse.get();
if (!Visited.insert(Condition).second)
return;
+ // TODO: Do the same for OR, XOR, NOT etc?
if (match(Condition, m_And(m_Value(), m_Value()))) {
- SmallVector<InductiveRangeCheck, 8> SubChecks;
extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(0),
- SubChecks, Visited);
+ Checks, Visited);
extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(1),
- SubChecks, Visited);
-
- if (SubChecks.size() == 2) {
- // Handle a special case where we know how to merge two checks separately
- // checking the upper and lower bounds into a full range check.
- const auto &RChkA = SubChecks[0];
- const auto &RChkB = SubChecks[1];
- if ((RChkA.Length == RChkB.Length || !RChkA.Length || !RChkB.Length) &&
- RChkA.Offset == RChkB.Offset && RChkA.Scale == RChkB.Scale) {
-
- // If RChkA.Kind == RChkB.Kind then we just found two identical checks.
- // But if one of them is a RANGE_CHECK_LOWER and the other is a
- // RANGE_CHECK_UPPER (only possibility if they're different) then
- // together they form a RANGE_CHECK_BOTH.
- SubChecks[0].Kind =
- (InductiveRangeCheck::RangeCheckKind)(RChkA.Kind | RChkB.Kind);
- SubChecks[0].Length = RChkA.Length ? RChkA.Length : RChkB.Length;
- SubChecks[0].CheckUse = &ConditionUse;
-
- // We updated one of the checks in place, now erase the other.
- SubChecks.pop_back();
- }
- }
-
- Checks.insert(Checks.end(), SubChecks.begin(), SubChecks.end());
+ Checks, Visited);
return;
}
@@ -361,7 +382,8 @@ void InductiveRangeCheck::extractRangeChecksFromCond(
return;
Value *Length = nullptr, *Index;
- auto RCKind = parseRangeCheckICmp(L, ICI, SE, Index, Length);
+ bool IsSigned;
+ auto RCKind = parseRangeCheckICmp(L, ICI, SE, Index, Length, IsSigned);
if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN)
return;
@@ -373,18 +395,18 @@ void InductiveRangeCheck::extractRangeChecksFromCond(
return;
InductiveRangeCheck IRC;
- IRC.Length = Length;
- IRC.Offset = IndexAddRec->getStart();
- IRC.Scale = IndexAddRec->getStepRecurrence(SE);
+ IRC.End = Length ? SE.getSCEV(Length) : nullptr;
+ IRC.Begin = IndexAddRec->getStart();
+ IRC.Step = IndexAddRec->getStepRecurrence(SE);
IRC.CheckUse = &ConditionUse;
IRC.Kind = RCKind;
+ IRC.IsSigned = IsSigned;
Checks.push_back(IRC);
}
void InductiveRangeCheck::extractRangeChecksFromBranch(
BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo &BPI,
SmallVectorImpl<InductiveRangeCheck> &Checks) {
-
if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
return;
@@ -435,16 +457,16 @@ namespace {
// kinds of loops we can deal with -- ones that have a single latch that is also
// an exiting block *and* have a canonical induction variable.
struct LoopStructure {
- const char *Tag;
+ const char *Tag = "";
- BasicBlock *Header;
- BasicBlock *Latch;
+ BasicBlock *Header = nullptr;
+ BasicBlock *Latch = nullptr;
// `Latch's terminator instruction is `LatchBr', and it's `LatchBrExitIdx'th
// successor is `LatchExit', the exit block of the loop.
- BranchInst *LatchBr;
- BasicBlock *LatchExit;
- unsigned LatchBrExitIdx;
+ BranchInst *LatchBr = nullptr;
+ BasicBlock *LatchExit = nullptr;
+ unsigned LatchBrExitIdx = std::numeric_limits<unsigned>::max();
// The loop represented by this instance of LoopStructure is semantically
// equivalent to:
@@ -452,18 +474,17 @@ struct LoopStructure {
// intN_ty inc = IndVarIncreasing ? 1 : -1;
// pred_ty predicate = IndVarIncreasing ? ICMP_SLT : ICMP_SGT;
//
- // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarNext)
+ // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarBase)
// ... body ...
- Value *IndVarNext;
- Value *IndVarStart;
- Value *LoopExitAt;
- bool IndVarIncreasing;
+ Value *IndVarBase = nullptr;
+ Value *IndVarStart = nullptr;
+ Value *IndVarStep = nullptr;
+ Value *LoopExitAt = nullptr;
+ bool IndVarIncreasing = false;
+ bool IsSignedPredicate = true;
- LoopStructure()
- : Tag(""), Header(nullptr), Latch(nullptr), LatchBr(nullptr),
- LatchExit(nullptr), LatchBrExitIdx(-1), IndVarNext(nullptr),
- IndVarStart(nullptr), LoopExitAt(nullptr), IndVarIncreasing(false) {}
+ LoopStructure() = default;
template <typename M> LoopStructure map(M Map) const {
LoopStructure Result;
@@ -473,10 +494,12 @@ struct LoopStructure {
Result.LatchBr = cast<BranchInst>(Map(LatchBr));
Result.LatchExit = cast<BasicBlock>(Map(LatchExit));
Result.LatchBrExitIdx = LatchBrExitIdx;
- Result.IndVarNext = Map(IndVarNext);
+ Result.IndVarBase = Map(IndVarBase);
Result.IndVarStart = Map(IndVarStart);
+ Result.IndVarStep = Map(IndVarStep);
Result.LoopExitAt = Map(LoopExitAt);
Result.IndVarIncreasing = IndVarIncreasing;
+ Result.IsSignedPredicate = IsSignedPredicate;
return Result;
}
@@ -494,7 +517,6 @@ struct LoopStructure {
/// loops to run any remaining iterations. The pre loop runs any iterations in
/// which the induction variable is < Begin, and the post loop runs any
/// iterations in which the induction variable is >= End.
-///
class LoopConstrainer {
// The representation of a clone of the original loop we started out with.
struct ClonedLoop {
@@ -511,13 +533,12 @@ class LoopConstrainer {
// Result of rewriting the range of a loop. See changeIterationSpaceEnd for
// more details on what these fields mean.
struct RewrittenRangeInfo {
- BasicBlock *PseudoExit;
- BasicBlock *ExitSelector;
+ BasicBlock *PseudoExit = nullptr;
+ BasicBlock *ExitSelector = nullptr;
std::vector<PHINode *> PHIValuesAtPseudoExit;
- PHINode *IndVarEnd;
+ PHINode *IndVarEnd = nullptr;
- RewrittenRangeInfo()
- : PseudoExit(nullptr), ExitSelector(nullptr), IndVarEnd(nullptr) {}
+ RewrittenRangeInfo() = default;
};
// Calculated subranges we restrict the iteration space of the main loop to.
@@ -541,14 +562,12 @@ class LoopConstrainer {
// Compute a safe set of limits for the main loop to run in -- effectively the
// intersection of `Range' and the iteration space of the original loop.
// Return None if unable to compute the set of subranges.
- //
- Optional<SubRanges> calculateSubRanges() const;
+ Optional<SubRanges> calculateSubRanges(bool IsSignedPredicate) const;
// Clone `OriginalLoop' and return the result in CLResult. The IR after
// running `cloneLoop' is well formed except for the PHI nodes in CLResult --
// the PHI nodes say that there is an incoming edge from `OriginalPreheader`
// but there is no such edge.
- //
void cloneLoop(ClonedLoop &CLResult, const char *Tag) const;
// Create the appropriate loop structure needed to describe a cloned copy of
@@ -577,7 +596,6 @@ class LoopConstrainer {
// After changeIterationSpaceEnd, `Preheader' is no longer a legitimate
// preheader because it is made to branch to the loop header only
// conditionally.
- //
RewrittenRangeInfo
changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader,
Value *ExitLoopAt,
@@ -585,7 +603,6 @@ class LoopConstrainer {
// The loop denoted by `LS' has `OldPreheader' as its preheader. This
// function creates a new preheader for `LS' and returns it.
- //
BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader,
const char *Tag) const;
@@ -613,12 +630,13 @@ class LoopConstrainer {
// Information about the original loop we started out with.
Loop &OriginalLoop;
- const SCEV *LatchTakenCount;
- BasicBlock *OriginalPreheader;
+
+ const SCEV *LatchTakenCount = nullptr;
+ BasicBlock *OriginalPreheader = nullptr;
// The preheader of the main loop. This may or may not be different from
// `OriginalPreheader'.
- BasicBlock *MainLoopPreheader;
+ BasicBlock *MainLoopPreheader = nullptr;
// The range we need to run the main loop in.
InductiveRangeCheck::Range Range;
@@ -632,15 +650,14 @@ public:
const LoopStructure &LS, ScalarEvolution &SE,
DominatorTree &DT, InductiveRangeCheck::Range R)
: F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()),
- SE(SE), DT(DT), LPM(LPM), LI(LI), OriginalLoop(L),
- LatchTakenCount(nullptr), OriginalPreheader(nullptr),
- MainLoopPreheader(nullptr), Range(R), MainLoopStructure(LS) {}
+ SE(SE), DT(DT), LPM(LPM), LI(LI), OriginalLoop(L), Range(R),
+ MainLoopStructure(LS) {}
// Entry point for the algorithm. Returns true on success.
bool run();
};
-}
+} // end anonymous namespace
void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block,
BasicBlock *ReplaceBy) {
@@ -649,22 +666,55 @@ void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block,
PN->setIncomingBlock(i, ReplaceBy);
}
-static bool CanBeSMax(ScalarEvolution &SE, const SCEV *S) {
- APInt SMax =
- APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth());
- return SE.getSignedRange(S).contains(SMax) &&
- SE.getUnsignedRange(S).contains(SMax);
+static bool CanBeMax(ScalarEvolution &SE, const SCEV *S, bool Signed) {
+ APInt Max = Signed ?
+ APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth()) :
+ APInt::getMaxValue(cast<IntegerType>(S->getType())->getBitWidth());
+ return SE.getSignedRange(S).contains(Max) &&
+ SE.getUnsignedRange(S).contains(Max);
+}
+
+static bool SumCanReachMax(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2,
+ bool Signed) {
+ // S1 < INT_MAX - S2 ===> S1 + S2 < INT_MAX.
+ assert(SE.isKnownNonNegative(S2) &&
+ "We expected the 2nd arg to be non-negative!");
+ const SCEV *Max = SE.getConstant(
+ Signed ? APInt::getSignedMaxValue(
+ cast<IntegerType>(S1->getType())->getBitWidth())
+ : APInt::getMaxValue(
+ cast<IntegerType>(S1->getType())->getBitWidth()));
+ const SCEV *CapForS1 = SE.getMinusSCEV(Max, S2);
+ return !SE.isKnownPredicate(Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+ S1, CapForS1);
+}
+
+static bool CanBeMin(ScalarEvolution &SE, const SCEV *S, bool Signed) {
+ APInt Min = Signed ?
+ APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth()) :
+ APInt::getMinValue(cast<IntegerType>(S->getType())->getBitWidth());
+ return SE.getSignedRange(S).contains(Min) &&
+ SE.getUnsignedRange(S).contains(Min);
}
-static bool CanBeSMin(ScalarEvolution &SE, const SCEV *S) {
- APInt SMin =
- APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth());
- return SE.getSignedRange(S).contains(SMin) &&
- SE.getUnsignedRange(S).contains(SMin);
+static bool SumCanReachMin(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2,
+ bool Signed) {
+ // S1 > INT_MIN - S2 ===> S1 + S2 > INT_MIN.
+ assert(SE.isKnownNonPositive(S2) &&
+ "We expected the 2nd arg to be non-positive!");
+  const SCEV *Min = SE.getConstant(
+      Signed ? APInt::getSignedMinValue(
+                   cast<IntegerType>(S1->getType())->getBitWidth())
+             : APInt::getMinValue(
+                   cast<IntegerType>(S1->getType())->getBitWidth()));
+  const SCEV *CapForS1 = SE.getMinusSCEV(Min, S2);
+ return !SE.isKnownPredicate(Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT,
+ S1, CapForS1);
}
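A worked i8 instance of SumCanReachMax, with hypothetical ranges: for Signed == true and S2 == 1, the max constant is 127 and CapForS1 is 126. If SCEV proves S1 lies in [0, 100], then S1 <s 126 is known and the function returns false, i.e. S1 + S2 provably stays below SINT_MAX; if S1's range includes 126, the predicate is not provable and the function conservatively answers true. SumCanReachMin mirrors the same reasoning at the lower border.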
Optional<LoopStructure>
-LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BPI,
+LoopStructure::parseLoopStructure(ScalarEvolution &SE,
+ BranchProbabilityInfo &BPI,
Loop &L, const char *&FailureReason) {
if (!L.isLoopSimplifyForm()) {
FailureReason = "loop not in LoopSimplify form";
@@ -766,7 +816,11 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap;
};
- auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing) {
+ // Here we check whether the suggested AddRec is an induction variable that
+ // can be handled (i.e. with known constant step), and if yes, calculate its
+ // step and identify whether it is increasing or decreasing.
+ auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing,
+ ConstantInt *&StepCI) {
if (!AR->isAffine())
return false;
@@ -778,11 +832,10 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
if (const SCEVConstant *StepExpr =
dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) {
- ConstantInt *StepCI = StepExpr->getValue();
- if (StepCI->isOne() || StepCI->isMinusOne()) {
- IsIncreasing = StepCI->isOne();
- return true;
- }
+ StepCI = StepExpr->getValue();
+ assert(!StepCI->isZero() && "Zero step?");
+ IsIncreasing = !StepCI->isNegative();
+ return true;
}
return false;
@@ -791,59 +844,87 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
// `ICI` is interpreted as taking the backedge if the *next* value of the
// induction variable satisfies some constraint.
- const SCEVAddRecExpr *IndVarNext = cast<SCEVAddRecExpr>(LeftSCEV);
+ const SCEVAddRecExpr *IndVarBase = cast<SCEVAddRecExpr>(LeftSCEV);
bool IsIncreasing = false;
- if (!IsInductionVar(IndVarNext, IsIncreasing)) {
+ bool IsSignedPredicate = true;
+ ConstantInt *StepCI;
+ if (!IsInductionVar(IndVarBase, IsIncreasing, StepCI)) {
FailureReason = "LHS in icmp not induction variable";
return None;
}
- const SCEV *StartNext = IndVarNext->getStart();
- const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
+ const SCEV *StartNext = IndVarBase->getStart();
+ const SCEV *Addend = SE.getNegativeSCEV(IndVarBase->getStepRecurrence(SE));
const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
+ const SCEV *Step = SE.getSCEV(StepCI);
ConstantInt *One = ConstantInt::get(IndVarTy, 1);
- // TODO: generalize the predicates here to also match their unsigned variants.
if (IsIncreasing) {
bool DecreasedRightValueByOne = false;
- // Try to turn eq/ne predicates to those we can work with.
- if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
- // while (++i != len) { while (++i < len) {
- // ... ---> ...
- // } }
- Pred = ICmpInst::ICMP_SLT;
- else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
- !CanBeSMin(SE, RightSCEV)) {
- // while (true) { while (true) {
- // if (++i == len) ---> if (++i > len - 1)
- // break; break;
- // ... ...
- // } }
- Pred = ICmpInst::ICMP_SGT;
- RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
- DecreasedRightValueByOne = true;
+ if (StepCI->isOne()) {
+ // Try to turn eq/ne predicates to those we can work with.
+ if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+ // while (++i != len) { while (++i < len) {
+ // ... ---> ...
+ // } }
+ // If both parts are known non-negative, it is profitable to use
+      // unsigned comparison in an increasing loop. This allows us to make the
+ // comparison check against "RightSCEV + 1" more optimistic.
+ if (SE.isKnownNonNegative(IndVarStart) &&
+ SE.isKnownNonNegative(RightSCEV))
+ Pred = ICmpInst::ICMP_ULT;
+ else
+ Pred = ICmpInst::ICMP_SLT;
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
+ !CanBeMin(SE, RightSCEV, /* IsSignedPredicate */ true)) {
+ // while (true) { while (true) {
+ // if (++i == len) ---> if (++i > len - 1)
+ // break; break;
+ // ... ...
+ // } }
+ // TODO: Insert ICMP_UGT if both are non-negative?
+ Pred = ICmpInst::ICMP_SGT;
+ RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
+ DecreasedRightValueByOne = true;
+ }
}
+ bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT);
+ bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT);
bool FoundExpectedPred =
- (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) ||
- (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0);
+ (LTPred && LatchBrExitIdx == 1) || (GTPred && LatchBrExitIdx == 0);
if (!FoundExpectedPred) {
FailureReason = "expected icmp slt semantically, found something else";
return None;
}
+ IsSignedPredicate =
+ Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT;
+
+ if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
+ FailureReason = "unsigned latch conditions are explicitly prohibited";
+ return None;
+ }
+
+ // The predicate that we need to check that the induction variable lies
+ // within bounds.
+ ICmpInst::Predicate BoundPred =
+ IsSignedPredicate ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
+
if (LatchBrExitIdx == 0) {
- if (CanBeSMax(SE, RightSCEV)) {
+ const SCEV *StepMinusOne = SE.getMinusSCEV(Step,
+ SE.getOne(Step->getType()));
+ if (SumCanReachMax(SE, RightSCEV, StepMinusOne, IsSignedPredicate)) {
// TODO: this restriction is easily removable -- we just have to
// remember that the icmp was an slt and not an sle.
- FailureReason = "limit may overflow when coercing sle to slt";
+ FailureReason = "limit may overflow when coercing le to lt";
return None;
}
if (!SE.isLoopEntryGuardedByCond(
- &L, CmpInst::ICMP_SLT, IndVarStart,
- SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())))) {
+ &L, BoundPred, IndVarStart,
+ SE.getAddExpr(RightSCEV, Step))) {
FailureReason = "Induction variable start not bounded by upper limit";
return None;
}
@@ -855,8 +936,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
RightValue = B.CreateAdd(RightValue, One);
}
} else {
- if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SLT, IndVarStart,
- RightSCEV)) {
+ if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) {
FailureReason = "Induction variable start not bounded by upper limit";
return None;
}
@@ -865,43 +945,65 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
}
} else {
bool IncreasedRightValueByOne = false;
- // Try to turn eq/ne predicates to those we can work with.
- if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
- // while (--i != len) { while (--i > len) {
- // ... ---> ...
- // } }
- Pred = ICmpInst::ICMP_SGT;
- else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
- !CanBeSMax(SE, RightSCEV)) {
- // while (true) { while (true) {
- // if (--i == len) ---> if (--i < len + 1)
- // break; break;
- // ... ...
- // } }
- Pred = ICmpInst::ICMP_SLT;
- RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
- IncreasedRightValueByOne = true;
+ if (StepCI->isMinusOne()) {
+ // Try to turn eq/ne predicates to those we can work with.
+ if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+ // while (--i != len) { while (--i > len) {
+ // ... ---> ...
+ // } }
+ // We intentionally don't turn the predicate into UGT even if we know
+ // that both operands are non-negative, because it will only pessimize
+ // our check against "RightSCEV - 1".
+ Pred = ICmpInst::ICMP_SGT;
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
+ !CanBeMax(SE, RightSCEV, /* IsSignedPredicate */ true)) {
+ // while (true) { while (true) {
+ // if (--i == len) ---> if (--i < len + 1)
+ // break; break;
+ // ... ...
+ // } }
+ // TODO: Insert ICMP_ULT if both are non-negative?
+ Pred = ICmpInst::ICMP_SLT;
+ RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ IncreasedRightValueByOne = true;
+ }
}
+ bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT);
+ bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT);
+
bool FoundExpectedPred =
- (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) ||
- (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0);
+ (GTPred && LatchBrExitIdx == 1) || (LTPred && LatchBrExitIdx == 0);
if (!FoundExpectedPred) {
FailureReason = "expected icmp sgt semantically, found something else";
return None;
}
+ IsSignedPredicate =
+ Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT;
+
+ if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
+ FailureReason = "unsigned latch conditions are explicitly prohibited";
+ return None;
+ }
+
+ // The predicate that we need to check that the induction variable lies
+ // within bounds.
+ ICmpInst::Predicate BoundPred =
+ IsSignedPredicate ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
+
if (LatchBrExitIdx == 0) {
- if (CanBeSMin(SE, RightSCEV)) {
+ const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType()));
+ if (SumCanReachMin(SE, RightSCEV, StepPlusOne, IsSignedPredicate)) {
// TODO: this restriction is easily removable -- we just have to
// remember that the icmp was an sgt and not an sge.
- FailureReason = "limit may overflow when coercing sge to sgt";
+ FailureReason = "limit may overflow when coercing ge to gt";
return None;
}
if (!SE.isLoopEntryGuardedByCond(
- &L, CmpInst::ICMP_SGT, IndVarStart,
+ &L, BoundPred, IndVarStart,
SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())))) {
FailureReason = "Induction variable start not bounded by lower limit";
return None;
@@ -914,8 +1016,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
RightValue = B.CreateSub(RightValue, One);
}
} else {
- if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SGT, IndVarStart,
- RightSCEV)) {
+ if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) {
FailureReason = "Induction variable start not bounded by lower limit";
return None;
}
@@ -923,7 +1024,6 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
"Right value can be increased only for LatchBrExitIdx == 0!");
}
}
-
BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
assert(SE.getLoopDisposition(LatchCount, &L) ==
@@ -946,9 +1046,11 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
Result.LatchExit = LatchExit;
Result.LatchBrExitIdx = LatchBrExitIdx;
Result.IndVarStart = IndVarStartV;
- Result.IndVarNext = LeftValue;
+ Result.IndVarStep = StepCI;
+ Result.IndVarBase = LeftValue;
Result.IndVarIncreasing = IsIncreasing;
Result.LoopExitAt = RightValue;
+ Result.IsSignedPredicate = IsSignedPredicate;
FailureReason = nullptr;
@@ -956,7 +1058,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
}
Optional<LoopConstrainer::SubRanges>
-LoopConstrainer::calculateSubRanges() const {
+LoopConstrainer::calculateSubRanges(bool IsSignedPredicate) const {
IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType());
if (Range.getType() != Ty)
@@ -999,26 +1101,31 @@ LoopConstrainer::calculateSubRanges() const {
// that case, `Clamp` will always return `Smallest` and
// [`Result.LowLimit`, `Result.HighLimit`) = [`Smallest`, `Smallest`)
// will be an empty range. Returning an empty range is always safe.
- //
Smallest = SE.getAddExpr(End, One);
Greatest = SE.getAddExpr(Start, One);
GreatestSeen = Start;
}
- auto Clamp = [this, Smallest, Greatest](const SCEV *S) {
- return SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S));
+ auto Clamp = [this, Smallest, Greatest, IsSignedPredicate](const SCEV *S) {
+ return IsSignedPredicate
+ ? SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S))
+ : SE.getUMaxExpr(Smallest, SE.getUMinExpr(Greatest, S));
};
- // In some cases we can prove that we don't need a pre or post loop
+ // In some cases we can prove that we don't need a pre or post loop.
+ ICmpInst::Predicate PredLE =
+ IsSignedPredicate ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
+ ICmpInst::Predicate PredLT =
+ IsSignedPredicate ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
bool ProvablyNoPreloop =
- SE.isKnownPredicate(ICmpInst::ICMP_SLE, Range.getBegin(), Smallest);
+ SE.isKnownPredicate(PredLE, Range.getBegin(), Smallest);
if (!ProvablyNoPreloop)
Result.LowLimit = Clamp(Range.getBegin());
bool ProvablyNoPostLoop =
- SE.isKnownPredicate(ICmpInst::ICMP_SLT, GreatestSeen, Range.getEnd());
+ SE.isKnownPredicate(PredLT, GreatestSeen, Range.getEnd());
if (!ProvablyNoPostLoop)
Result.HighLimit = Clamp(Range.getEnd());
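A worked example with hypothetical signed values: let the loop's own iteration space be [Smallest, Greatest) = [0, 100) with GreatestSeen = 99, and let the safe range be [10, 250). The pre-loop test fails (10 <=s 0 is not provable), so LowLimit = Clamp(10) = smax(0, smin(100, 10)) = 10 and a pre-loop covers iterations [0, 10); the post-loop is provably unnecessary because 99 <s 250, so HighLimit stays unset.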
@@ -1082,7 +1189,6 @@ void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result,
LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt,
BasicBlock *ContinuationBlock) const {
-
// We start with a loop with a single latch:
//
// +--------------------+
@@ -1153,7 +1259,6 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
// | original exit <----+
// | |
// +--------------------+
- //
RewrittenRangeInfo RRI;
@@ -1165,22 +1270,35 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
BranchInst *PreheaderJump = cast<BranchInst>(Preheader->getTerminator());
bool Increasing = LS.IndVarIncreasing;
+ bool IsSignedPredicate = LS.IsSignedPredicate;
IRBuilder<> B(PreheaderJump);
// EnterLoopCond - is it okay to start executing this `LS'?
- Value *EnterLoopCond = Increasing
- ? B.CreateICmpSLT(LS.IndVarStart, ExitSubloopAt)
- : B.CreateICmpSGT(LS.IndVarStart, ExitSubloopAt);
+ Value *EnterLoopCond = nullptr;
+ if (Increasing)
+ EnterLoopCond = IsSignedPredicate
+ ? B.CreateICmpSLT(LS.IndVarStart, ExitSubloopAt)
+ : B.CreateICmpULT(LS.IndVarStart, ExitSubloopAt);
+ else
+ EnterLoopCond = IsSignedPredicate
+ ? B.CreateICmpSGT(LS.IndVarStart, ExitSubloopAt)
+ : B.CreateICmpUGT(LS.IndVarStart, ExitSubloopAt);
B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit);
PreheaderJump->eraseFromParent();
LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector);
B.SetInsertPoint(LS.LatchBr);
- Value *TakeBackedgeLoopCond =
- Increasing ? B.CreateICmpSLT(LS.IndVarNext, ExitSubloopAt)
- : B.CreateICmpSGT(LS.IndVarNext, ExitSubloopAt);
+ Value *TakeBackedgeLoopCond = nullptr;
+ if (Increasing)
+ TakeBackedgeLoopCond = IsSignedPredicate
+ ? B.CreateICmpSLT(LS.IndVarBase, ExitSubloopAt)
+ : B.CreateICmpULT(LS.IndVarBase, ExitSubloopAt);
+ else
+ TakeBackedgeLoopCond = IsSignedPredicate
+ ? B.CreateICmpSGT(LS.IndVarBase, ExitSubloopAt)
+ : B.CreateICmpUGT(LS.IndVarBase, ExitSubloopAt);
Value *CondForBranch = LS.LatchBrExitIdx == 1
? TakeBackedgeLoopCond
: B.CreateNot(TakeBackedgeLoopCond);
@@ -1192,9 +1310,15 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
// IterationsLeft - are there any more iterations left, given the original
// upper bound on the induction variable? If not, we branch to the "real"
// exit.
- Value *IterationsLeft = Increasing
- ? B.CreateICmpSLT(LS.IndVarNext, LS.LoopExitAt)
- : B.CreateICmpSGT(LS.IndVarNext, LS.LoopExitAt);
+ Value *IterationsLeft = nullptr;
+ if (Increasing)
+ IterationsLeft = IsSignedPredicate
+ ? B.CreateICmpSLT(LS.IndVarBase, LS.LoopExitAt)
+ : B.CreateICmpULT(LS.IndVarBase, LS.LoopExitAt);
+ else
+ IterationsLeft = IsSignedPredicate
+ ? B.CreateICmpSGT(LS.IndVarBase, LS.LoopExitAt)
+ : B.CreateICmpUGT(LS.IndVarBase, LS.LoopExitAt);
B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit);
BranchInst *BranchToContinuation =
@@ -1217,10 +1341,10 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
RRI.PHIValuesAtPseudoExit.push_back(NewPHI);
}
- RRI.IndVarEnd = PHINode::Create(LS.IndVarNext->getType(), 2, "indvar.end",
+ RRI.IndVarEnd = PHINode::Create(LS.IndVarBase->getType(), 2, "indvar.end",
BranchToContinuation);
RRI.IndVarEnd->addIncoming(LS.IndVarStart, Preheader);
- RRI.IndVarEnd->addIncoming(LS.IndVarNext, RRI.ExitSelector);
+ RRI.IndVarEnd->addIncoming(LS.IndVarBase, RRI.ExitSelector);
// The latch exit now has a branch from `RRI.ExitSelector' instead of
// `LS.Latch'. The PHI nodes need to be updated to reflect that.
@@ -1237,7 +1361,6 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
void LoopConstrainer::rewriteIncomingValuesForPHIs(
LoopStructure &LS, BasicBlock *ContinuationBlock,
const LoopConstrainer::RewrittenRangeInfo &RRI) const {
-
unsigned PHIIndex = 0;
for (Instruction &I : *LS.Header) {
auto *PN = dyn_cast<PHINode>(&I);
@@ -1255,7 +1378,6 @@ void LoopConstrainer::rewriteIncomingValuesForPHIs(
BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS,
BasicBlock *OldPreheader,
const char *Tag) const {
-
BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header);
BranchInst::Create(LS.Header, Preheader);
@@ -1282,7 +1404,7 @@ void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
ValueToValueMapTy &VM) {
- Loop &New = *new Loop();
+ Loop &New = *LI.AllocateLoop();
if (Parent)
Parent->addChildLoop(&New);
else
@@ -1311,7 +1433,8 @@ bool LoopConstrainer::run() {
OriginalPreheader = Preheader;
MainLoopPreheader = Preheader;
- Optional<SubRanges> MaybeSR = calculateSubRanges();
+ bool IsSignedPredicate = MainLoopStructure.IsSignedPredicate;
+ Optional<SubRanges> MaybeSR = calculateSubRanges(IsSignedPredicate);
if (!MaybeSR.hasValue()) {
DEBUG(dbgs() << "irce: could not compute subranges\n");
return false;
@@ -1320,7 +1443,7 @@ bool LoopConstrainer::run() {
SubRanges SR = MaybeSR.getValue();
bool Increasing = MainLoopStructure.IndVarIncreasing;
IntegerType *IVTy =
- cast<IntegerType>(MainLoopStructure.IndVarNext->getType());
+ cast<IntegerType>(MainLoopStructure.IndVarBase->getType());
SCEVExpander Expander(SE, F.getParent()->getDataLayout(), "irce");
Instruction *InsertPt = OriginalPreheader->getTerminator();
@@ -1345,7 +1468,7 @@ bool LoopConstrainer::run() {
if (Increasing)
ExitPreLoopAtSCEV = *SR.LowLimit;
else {
- if (CanBeSMin(SE, *SR.HighLimit)) {
+ if (CanBeMin(SE, *SR.HighLimit, IsSignedPredicate)) {
DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
<< "preloop exit limit. HighLimit = " << *(*SR.HighLimit)
<< "\n");
@@ -1354,6 +1477,13 @@ bool LoopConstrainer::run() {
ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
}
+ if (!isSafeToExpandAt(ExitPreLoopAtSCEV, InsertPt, SE)) {
+ DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+ << " preloop exit limit " << *ExitPreLoopAtSCEV
+ << " at block " << InsertPt->getParent()->getName() << "\n");
+ return false;
+ }
+
ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt);
ExitPreLoopAt->setName("exit.preloop.at");
}
@@ -1364,7 +1494,7 @@ bool LoopConstrainer::run() {
if (Increasing)
ExitMainLoopAtSCEV = *SR.HighLimit;
else {
- if (CanBeSMin(SE, *SR.LowLimit)) {
+ if (CanBeMin(SE, *SR.LowLimit, IsSignedPredicate)) {
DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
<< "mainloop exit limit. LowLimit = " << *(*SR.LowLimit)
<< "\n");
@@ -1373,6 +1503,13 @@ bool LoopConstrainer::run() {
ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
}
+ if (!isSafeToExpandAt(ExitMainLoopAtSCEV, InsertPt, SE)) {
+ DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+ << " main loop exit limit " << *ExitMainLoopAtSCEV
+ << " at block " << InsertPt->getParent()->getName() << "\n");
+ return false;
+ }
+
ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt);
ExitMainLoopAt->setName("exit.mainloop.at");
}
@@ -1463,34 +1600,27 @@ bool LoopConstrainer::run() {
/// range, returns None.
Optional<InductiveRangeCheck::Range>
InductiveRangeCheck::computeSafeIterationSpace(
- ScalarEvolution &SE, const SCEVAddRecExpr *IndVar) const {
+ ScalarEvolution &SE, const SCEVAddRecExpr *IndVar,
+ bool IsLatchSigned) const {
// IndVar is of the form "A + B * I" (where "I" is the canonical induction
// variable, that may or may not exist as a real llvm::Value in the loop) and
// this inductive range check is a range check on the "C + D * I" ("C" is
- // getOffset() and "D" is getScale()). We rewrite the value being range
+ // getBegin() and "D" is getStep()). We rewrite the value being range
// checked to "M + N * IndVar" where "N" = "D * B^(-1)" and "M" = "C - NA".
- // Currently we support this only for "B" = "D" = { 1 or -1 }, but the code
- // can be generalized as needed.
//
// The actual inequalities we solve are of the form
//
// 0 <= M + 1 * IndVar < L given L >= 0 (i.e. N == 1)
//
- // The inequality is satisfied by -M <= IndVar < (L - M) [^1]. All additions
- // and subtractions are twos-complement wrapping and comparisons are signed.
- //
- // Proof:
- //
- // If there exists IndVar such that -M <= IndVar < (L - M) then it follows
- // that -M <= (-M + L) [== Eq. 1]. Since L >= 0, if (-M + L) sign-overflows
- // then (-M + L) < (-M). Hence by [Eq. 1], (-M + L) could not have
- // overflown.
- //
- // This means IndVar = t + (-M) for t in [0, L). Hence (IndVar + M) = t.
- // Hence 0 <= (IndVar + M) < L
-
- // [^1]: Note that the solution does _not_ apply if L < 0; consider values M =
- // 127, IndVar = 126 and L = -2 in an i8 world.
+  // Here L stands for the upper limit of the safe iteration space.
+  // The inequality is satisfied by (0 - M) <= IndVar < (L - M). To avoid
+  // overflows when calculating (0 - M) and (L - M), we clamp the calculations
+  // to the borders of the IV's iteration space, depending on whether that
+  // space is signed or unsigned. For example, if IndVar is unsigned, (0 - M)
+  // overflows for any M > 0. If we figured out that "anything greater than
+  // (-M) is safe", we strengthen this to "everything greater than 0 is safe",
+  // because values between -M and 0 simply do not exist in the unsigned
+  // iteration space, and we do not want to deal with overflowed values.
if (!IndVar->isAffine())
return None;
@@ -1499,42 +1629,89 @@ InductiveRangeCheck::computeSafeIterationSpace(
const SCEVConstant *B = dyn_cast<SCEVConstant>(IndVar->getStepRecurrence(SE));
if (!B)
return None;
+ assert(!B->isZero() && "Recurrence with zero step?");
- const SCEV *C = getOffset();
- const SCEVConstant *D = dyn_cast<SCEVConstant>(getScale());
+ const SCEV *C = getBegin();
+ const SCEVConstant *D = dyn_cast<SCEVConstant>(getStep());
if (D != B)
return None;
- ConstantInt *ConstD = D->getValue();
- if (!(ConstD->isMinusOne() || ConstD->isOne()))
- return None;
+ assert(!D->getValue()->isZero() && "Recurrence with zero step?");
+ unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth();
+ const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
+  // Subtract Y from X so that it does not cross the border of the IV
+  // iteration space. Mathematically, it is equivalent to:
+  //
+  // ClampedSubstract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX). [1]
+  //
+  // In [1], 'X - Y' is a mathematical subtraction (the result is not bounded
+  // to any bit width). But after we take min/max, the result is guaranteed to
+  // be within [INT_MIN, INT_MAX].
+  //
+  // In [1], INT_MAX and INT_MIN are the signed or unsigned max/min values,
+  // depending on the type of latch condition that defines the IV iteration
+  // space.
+ auto ClampedSubstract = [&](const SCEV *X, const SCEV *Y) {
+    assert(SE.isKnownNonNegative(X) &&
+           "We can only subtract from values in [0; SINT_MAX]!");
+    if (IsLatchSigned) {
+      // X is a number from signed range, Y is interpreted as signed.
+      // Even if Y is SINT_MAX, (X - Y) does not reach SINT_MIN. So the only
+      // thing we should care about is that we didn't cross SINT_MAX.
+      // So, if Y is positive, we subtract Y safely.
+      // Rule 1: Y > 0 ---> Y.
+      // If 0 <= -Y <= (SINT_MAX - X), we subtract Y safely.
+      // Rule 2: Y >=s (X - SINT_MAX) ---> Y.
+      // If 0 <= (SINT_MAX - X) < -Y, we can only subtract (X - SINT_MAX).
+      // Rule 3: Y <s (X - SINT_MAX) ---> (X - SINT_MAX).
+      // It gives us smax(Y, X - SINT_MAX) to subtract in all cases.
+      const SCEV *XMinusSIntMax = SE.getMinusSCEV(X, SIntMax);
+      return SE.getMinusSCEV(X, SE.getSMaxExpr(Y, XMinusSIntMax),
+                             SCEV::FlagNSW);
+    } else
+      // X is a number from unsigned range, Y is interpreted as signed.
+      // Even if Y is SINT_MIN, (X - Y) does not reach UINT_MAX. So the only
+      // thing we should care about is that we didn't cross zero.
+      // So, if Y is negative, we subtract Y safely.
+      // Rule 1: Y <s 0 ---> Y.
+      // If 0 <= Y <= X, we subtract Y safely.
+      // Rule 2: Y <=s X ---> Y.
+      // If 0 <= X < Y, we should stop at 0 and can only subtract X.
+      // Rule 3: Y >s X ---> X.
+      // It gives us smin(X, Y) to subtract in all cases.
+ return SE.getMinusSCEV(X, SE.getSMinExpr(X, Y), SCEV::FlagNUW);
+ };
const SCEV *M = SE.getMinusSCEV(C, A);
-
- const SCEV *Begin = SE.getNegativeSCEV(M);
- const SCEV *UpperLimit = nullptr;
+ const SCEV *Zero = SE.getZero(M->getType());
+ const SCEV *Begin = ClampedSubstract(Zero, M);
+ const SCEV *L = nullptr;
// We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
// We can potentially do much better here.
- if (Value *V = getLength()) {
- UpperLimit = SE.getSCEV(V);
- } else {
+ if (const SCEV *EndLimit = getEnd())
+ L = EndLimit;
+ else {
assert(Kind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!");
- unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth();
- UpperLimit = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
+ L = SIntMax;
}
-
- const SCEV *End = SE.getMinusSCEV(UpperLimit, M);
+ const SCEV *End = ClampedSubstract(L, M);
return InductiveRangeCheck::Range(Begin, End);
}
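A worked unsigned instance, with hypothetical i32 values: take M = C - A = 5 and an end limit L = 10. Then Begin = ClampedSubstract(0, 5) = 0 - smin(0, 5) = 0 and End = ClampedSubstract(10, 5) = 10 - smin(10, 5) = 5, so the safe iteration space is [0, 5): exactly the unsigned IndVar values for which 0 <= 5 + IndVar < 10 holds, with the mathematically valid but unrepresentable portion [-5, 0) clamped away.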
static Optional<InductiveRangeCheck::Range>
-IntersectRange(ScalarEvolution &SE,
- const Optional<InductiveRangeCheck::Range> &R1,
- const InductiveRangeCheck::Range &R2) {
+IntersectSignedRange(ScalarEvolution &SE,
+ const Optional<InductiveRangeCheck::Range> &R1,
+ const InductiveRangeCheck::Range &R2) {
+ if (R2.isEmpty(SE, /* IsSigned */ true))
+ return None;
if (!R1.hasValue())
return R2;
auto &R1Value = R1.getValue();
+ // We never return empty ranges from this function, and R1 is supposed to be
+ // a result of intersection. Thus, R1 is never empty.
+ assert(!R1Value.isEmpty(SE, /* IsSigned */ true) &&
+ "We should never have empty R1!");
// TODO: we could widen the smaller range and have this work; but for now we
// bail out to keep things simple.
@@ -1544,7 +1721,40 @@ IntersectRange(ScalarEvolution &SE,
const SCEV *NewBegin = SE.getSMaxExpr(R1Value.getBegin(), R2.getBegin());
const SCEV *NewEnd = SE.getSMinExpr(R1Value.getEnd(), R2.getEnd());
- return InductiveRangeCheck::Range(NewBegin, NewEnd);
+ // If the resulting range is empty, just return None.
+ auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd);
+ if (Ret.isEmpty(SE, /* IsSigned */ true))
+ return None;
+ return Ret;
+}
+
+static Optional<InductiveRangeCheck::Range>
+IntersectUnsignedRange(ScalarEvolution &SE,
+ const Optional<InductiveRangeCheck::Range> &R1,
+ const InductiveRangeCheck::Range &R2) {
+ if (R2.isEmpty(SE, /* IsSigned */ false))
+ return None;
+ if (!R1.hasValue())
+ return R2;
+ auto &R1Value = R1.getValue();
+ // We never return empty ranges from this function, and R1 is supposed to be
+ // a result of intersection. Thus, R1 is never empty.
+ assert(!R1Value.isEmpty(SE, /* IsSigned */ false) &&
+ "We should never have empty R1!");
+
+ // TODO: we could widen the smaller range and have this work; but for now we
+ // bail out to keep things simple.
+ if (R1Value.getType() != R2.getType())
+ return None;
+
+ const SCEV *NewBegin = SE.getUMaxExpr(R1Value.getBegin(), R2.getBegin());
+ const SCEV *NewEnd = SE.getUMinExpr(R1Value.getEnd(), R2.getEnd());
+
+ // If the resulting range is empty, just return None.
+ auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd);
+ if (Ret.isEmpty(SE, /* IsSigned */ false))
+ return None;
+ return Ret;
}
bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
@@ -1598,24 +1808,31 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
return false;
}
LoopStructure LS = MaybeLoopStructure.getValue();
- bool Increasing = LS.IndVarIncreasing;
- const SCEV *MinusOne =
- SE.getConstant(LS.IndVarNext->getType(), Increasing ? -1 : 1, true);
const SCEVAddRecExpr *IndVar =
- cast<SCEVAddRecExpr>(SE.getAddExpr(SE.getSCEV(LS.IndVarNext), MinusOne));
+      cast<SCEVAddRecExpr>(SE.getMinusSCEV(SE.getSCEV(LS.IndVarBase),
+                                           SE.getSCEV(LS.IndVarStep)));
Optional<InductiveRangeCheck::Range> SafeIterRange;
Instruction *ExprInsertPt = Preheader->getTerminator();
SmallVector<InductiveRangeCheck, 4> RangeChecksToEliminate;
+  // Based on the type of latch predicate, we interpret the IV iteration range
+  // as a signed or unsigned range. We use different min/max functions (signed
+  // or unsigned) when intersecting this range with the safe iteration ranges
+  // implied by the range checks.
+ auto IntersectRange =
+ LS.IsSignedPredicate ? IntersectSignedRange : IntersectUnsignedRange;
IRBuilder<> B(ExprInsertPt);
for (InductiveRangeCheck &IRC : RangeChecks) {
- auto Result = IRC.computeSafeIterationSpace(SE, IndVar);
+ auto Result = IRC.computeSafeIterationSpace(SE, IndVar,
+ LS.IsSignedPredicate);
if (Result.hasValue()) {
auto MaybeSafeIterRange =
IntersectRange(SE, SafeIterRange, Result.getValue());
if (MaybeSafeIterRange.hasValue()) {
+ assert(
+ !MaybeSafeIterRange.getValue().isEmpty(SE, LS.IsSignedPredicate) &&
+ "We should never return empty ranges!");
RangeChecksToEliminate.push_back(IRC);
SafeIterRange = MaybeSafeIterRange.getValue();
}
diff --git a/lib/Transforms/Scalar/InferAddressSpaces.cpp b/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 89b28f0aeee6..7d66c0f73821 100644
--- a/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -1,4 +1,4 @@
-//===-- NVPTXInferAddressSpace.cpp - ---------------------*- C++ -*-===//
+//===- InferAddressSpace.cpp - --------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -89,26 +89,54 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
#define DEBUG_TYPE "infer-address-spaces"
using namespace llvm;
+static const unsigned UninitializedAddressSpace =
+ std::numeric_limits<unsigned>::max();
+
namespace {
-static const unsigned UninitializedAddressSpace = ~0u;
using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
@@ -146,10 +174,9 @@ private:
// Changes the flat address expressions in function F to point to specific
// address spaces if InferredAddrSpace says so. Postorder is the postorder of
// all flat expressions in the use-def graph of function F.
- bool
- rewriteWithNewAddressSpaces(ArrayRef<WeakTrackingVH> Postorder,
- const ValueToAddrSpaceMapTy &InferredAddrSpace,
- Function *F) const;
+ bool rewriteWithNewAddressSpaces(
+ const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const;
void appendsFlatAddressExpressionToPostorderStack(
Value *V, std::vector<std::pair<Value *, bool>> &PostorderStack,
@@ -170,13 +197,16 @@ private:
SmallVectorImpl<const Use *> *UndefUsesToFix) const;
unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const;
};
+
} // end anonymous namespace
char InferAddressSpaces::ID = 0;
namespace llvm {
+
void initializeInferAddressSpacesPass(PassRegistry &);
-}
+
+} // end namespace llvm
INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
false, false)
@@ -454,11 +484,10 @@ static Value *cloneInstructionWithNewAddressSpace(
NewGEP->setIsInBounds(GEP->isInBounds());
return NewGEP;
}
- case Instruction::Select: {
+ case Instruction::Select:
assert(I->getType()->isPointerTy());
return SelectInst::Create(I->getOperand(0), NewPointerOperands[1],
NewPointerOperands[2], "", nullptr, I);
- }
default:
llvm_unreachable("Unexpected opcode");
}
@@ -600,7 +629,7 @@ bool InferAddressSpaces::runOnFunction(Function &F) {
// Changes the address spaces of the flat address expressions who are inferred
// to point to a specific address space.
- return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, &F);
+ return rewriteWithNewAddressSpaces(TTI, Postorder, InferredAddrSpace, &F);
}
// Constants need to be tracked through RAUW to handle cases with nested
@@ -708,24 +737,32 @@ Optional<unsigned> InferAddressSpaces::updateAddressSpace(
/// \returns true if \p U is the pointer operand of a memory instruction with
/// a single pointer operand that can have its address space changed by simply
-/// mutating the use to a new value.
-static bool isSimplePointerUseValidToReplace(Use &U) {
+/// mutating the use to a new value. If the memory instruction is volatile,
+/// return true only if the target allows the memory instruction to be volatile
+/// in the new address space.
+static bool isSimplePointerUseValidToReplace(const TargetTransformInfo &TTI,
+ Use &U, unsigned AddrSpace) {
User *Inst = U.getUser();
unsigned OpNo = U.getOperandNo();
+ bool VolatileIsAllowed = false;
+ if (auto *I = dyn_cast<Instruction>(Inst))
+ VolatileIsAllowed = TTI.hasVolatileVariant(I, AddrSpace);
if (auto *LI = dyn_cast<LoadInst>(Inst))
- return OpNo == LoadInst::getPointerOperandIndex() && !LI->isVolatile();
+ return OpNo == LoadInst::getPointerOperandIndex() &&
+ (VolatileIsAllowed || !LI->isVolatile());
if (auto *SI = dyn_cast<StoreInst>(Inst))
- return OpNo == StoreInst::getPointerOperandIndex() && !SI->isVolatile();
+ return OpNo == StoreInst::getPointerOperandIndex() &&
+ (VolatileIsAllowed || !SI->isVolatile());
if (auto *RMW = dyn_cast<AtomicRMWInst>(Inst))
- return OpNo == AtomicRMWInst::getPointerOperandIndex() && !RMW->isVolatile();
+ return OpNo == AtomicRMWInst::getPointerOperandIndex() &&
+ (VolatileIsAllowed || !RMW->isVolatile());
- if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst))
return OpNo == AtomicCmpXchgInst::getPointerOperandIndex() &&
- !CmpX->isVolatile();
- }
+ (VolatileIsAllowed || !CmpX->isVolatile());
return false;
}
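Hedged illustration of the relaxed rule: a volatile load or store through a flat pointer may now have its pointer operand rewritten into the inferred address space, provided the target's TTI answers hasVolatileVariant(I, NewAddrSpace) == true for that instruction; previously any volatile access blocked the rewrite unconditionally. The shape of the new guard in isolation (names taken from the function above, condition shape only):

    // Volatile is tolerated iff the target has a volatile variant of the
    // instruction in the destination address space.
    bool Ok = OpNo == LoadInst::getPointerOperandIndex() &&
              (TTI.hasVolatileVariant(LI, AddrSpace) || !LI->isVolatile());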
@@ -818,7 +855,7 @@ static Value::use_iterator skipToNextUser(Value::use_iterator I,
}
bool InferAddressSpaces::rewriteWithNewAddressSpaces(
- ArrayRef<WeakTrackingVH> Postorder,
+ const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder,
const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
// For each address expression to be modified, creates a clone of it with its
// pointer operands converted to the new address space. Since the pointer
@@ -878,7 +915,8 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
// to the next instruction.
I = skipToNextUser(I, E);
- if (isSimplePointerUseValidToReplace(U)) {
+ if (isSimplePointerUseValidToReplace(
+ TTI, U, V->getType()->getPointerAddressSpace())) {
// If V is used as the pointer operand of a compatible memory operation,
// sets the pointer operand to NewV. This replacement does not change
// the element type, so the resultant load/store is still valid.
@@ -933,6 +971,11 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(CurUser)) {
unsigned NewAS = NewV->getType()->getPointerAddressSpace();
if (ASC->getDestAddressSpace() == NewAS) {
+ if (ASC->getType()->getPointerElementType() !=
+ NewV->getType()->getPointerElementType()) {
+ NewV = CastInst::Create(Instruction::BitCast, NewV,
+ ASC->getType(), "", ASC);
+ }
ASC->replaceAllUsesWith(NewV);
DeadInstructions.push_back(ASC);
continue;
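The newly inserted bitcast covers a pointee-type mismatch: NewV can carry the correct address space but a different element type from the addrspacecast it replaces (say i8 addrspace(3)* versus float addrspace(3)*, a hypothetical pairing). CastInst::Create splices a bitcast immediately before ASC so that replaceAllUsesWith only ever joins identical types; the address spaces already match, and only the pointee type is adjusted.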
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index dc9143bebc45..6b0377e0ecb3 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -14,25 +14,50 @@
#include "llvm/Transforms/Scalar/JumpThreading.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -41,8 +66,15 @@
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
#include <memory>
+#include <utility>
+
using namespace llvm;
using namespace jumpthreading;
@@ -70,6 +102,7 @@ static cl::opt<bool> PrintLVIAfterJumpThreading(
cl::Hidden);
namespace {
+
/// This pass performs 'jump threading', which looks at blocks that have
/// multiple predecessors and multiple successors. If one or more of the
/// predecessors of the block can be proven to always jump to one of the
@@ -85,12 +118,12 @@ namespace {
///
/// In this case, the unconditional branch at the end of the first if can be
/// revectored to the false side of the second if.
- ///
class JumpThreading : public FunctionPass {
JumpThreadingPass Impl;
public:
static char ID; // Pass identification
+
JumpThreading(int T = -1) : FunctionPass(ID), Impl(T) {
initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
}
@@ -108,9 +141,11 @@ namespace {
void releaseMemory() override { Impl.releaseMemory(); }
};
-}
+
+} // end anonymous namespace
char JumpThreading::ID = 0;
+
INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
"Jump Threading", false, false)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
@@ -120,14 +155,125 @@ INITIALIZE_PASS_END(JumpThreading, "jump-threading",
"Jump Threading", false, false)
// Public interface to the Jump Threading pass
-FunctionPass *llvm::createJumpThreadingPass(int Threshold) { return new JumpThreading(Threshold); }
+FunctionPass *llvm::createJumpThreadingPass(int Threshold) {
+ return new JumpThreading(Threshold);
+}
JumpThreadingPass::JumpThreadingPass(int T) {
BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
}
-/// runOnFunction - Top level algorithm.
-///
+// Update branch probability information according to conditional
+// branch probability. This is usually made possible for cloned branches
+// in inline instances by the context-specific profile in the caller.
+// For instance,
+//
+// [Block PredBB]
+// [Branch PredBr]
+// if (t) {
+// Block A;
+// } else {
+// Block B;
+// }
+//
+// [Block BB]
+// cond = PN([true, %A], [..., %B]); // PHI node
+// [Branch CondBr]
+// if (cond) {
+// ... // P(cond == true) = 1%
+// }
+//
+// Here we know that when block A is taken, cond must be true, which means
+// P(cond == true | A) = 1
+//
+// Given that P(cond == true) = P(cond == true | A) * P(A) +
+// P(cond == true | B) * P(B)
+// we get:
+//   P(cond == true) = P(A) + P(cond == true | B) * P(B)
+//
+// which gives us:
+//   P(A) <= P(cond == true), i.e.
+//   P(t == true) <= P(cond == true)
+//
+// In other words, if we know P(cond == true) is unlikely, we know
+// that P(t == true) is also unlikely.
+//
+static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!CondBr)
+ return;
+
+ BranchProbability BP;
+ uint64_t TrueWeight, FalseWeight;
+ if (!CondBr->extractProfMetadata(TrueWeight, FalseWeight))
+ return;
+
+ // Returns the outgoing edge of the dominating predecessor block
+ // that leads to the PhiNode's incoming block:
+ auto GetPredOutEdge =
+ [](BasicBlock *IncomingBB,
+ BasicBlock *PhiBB) -> std::pair<BasicBlock *, BasicBlock *> {
+ auto *PredBB = IncomingBB;
+ auto *SuccBB = PhiBB;
+ while (true) {
+ BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
+ if (PredBr && PredBr->isConditional())
+ return {PredBB, SuccBB};
+ auto *SinglePredBB = PredBB->getSinglePredecessor();
+ if (!SinglePredBB)
+ return {nullptr, nullptr};
+ SuccBB = PredBB;
+ PredBB = SinglePredBB;
+ }
+ };
+
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *PhiOpnd = PN->getIncomingValue(i);
+ ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
+
+ if (!CI || !CI->getType()->isIntegerTy(1))
+ continue;
+
+ BP = (CI->isOne() ? BranchProbability::getBranchProbability(
+ TrueWeight, TrueWeight + FalseWeight)
+ : BranchProbability::getBranchProbability(
+                            FalseWeight, TrueWeight + FalseWeight));
+
+ auto PredOutEdge = GetPredOutEdge(PN->getIncomingBlock(i), BB);
+ if (!PredOutEdge.first)
+ return;
+
+ BasicBlock *PredBB = PredOutEdge.first;
+ BranchInst *PredBr = cast<BranchInst>(PredBB->getTerminator());
+
+ uint64_t PredTrueWeight, PredFalseWeight;
+ // FIXME: We currently only set the profile data when it is missing.
+ // With PGO, this can be used to refine even existing profile data with
+ // context information. This needs to be done after more performance
+ // testing.
+ if (PredBr->extractProfMetadata(PredTrueWeight, PredFalseWeight))
+ continue;
+
+    // We cannot infer anything useful when BP >= 50%, because BP is an
+    // upper bound on the probability.
+    if (BP >= BranchProbability(50, 100))
+      continue;
+
+    SmallVector<uint32_t, 2> Weights;
+    if (PredBr->getSuccessor(0) == PredOutEdge.second) {
+      Weights.push_back(BP.getNumerator());
+      Weights.push_back(BP.getCompl().getNumerator());
+    } else {
+      Weights.push_back(BP.getCompl().getNumerator());
+      Weights.push_back(BP.getNumerator());
+    }
+    PredBr->setMetadata(LLVMContext::MD_prof,
+                        MDBuilder(PredBr->getParent()->getContext())
+                            .createBranchWeights(Weights));
+ }
+}
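
To make the derivation in the comment above concrete, here is a minimal standalone sketch (hypothetical profile weights; BranchProbability as used in the function above):

    // Suppose CondBr carries !prof weights TrueWeight = 1, FalseWeight = 99,
    // and the PHI operand for this predecessor is the constant true.
    BranchProbability BP = BranchProbability::getBranchProbability(1, 100);
    // BP == 1/100 is the upper bound for P(t == true). If the predecessor's
    // successor 0 leads to the PHI's incoming block, the new weights are
    // {BP.getNumerator(), BP.getCompl().getNumerator()}, i.e. a 1%/99% split.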
+
+/// runOnFunction - Top-level algorithm.
bool JumpThreading::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
@@ -155,7 +301,6 @@ bool JumpThreading::runOnFunction(Function &F) {
PreservedAnalyses JumpThreadingPass::run(Function &F,
FunctionAnalysisManager &AM) {
-
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &LVI = AM.getResult<LazyValueAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
@@ -184,7 +329,6 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
bool HasProfileData_,
std::unique_ptr<BlockFrequencyInfo> BFI_,
std::unique_ptr<BranchProbabilityInfo> BPI_) {
-
DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
TLI = TLI_;
LVI = LVI_;
@@ -384,7 +528,6 @@ static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
/// within the loop (forming a nested loop). This simple analysis is not rich
/// enough to track all of these properties and keep it up-to-date as the CFG
/// mutates, so we don't allow any of these transformations.
-///
void JumpThreadingPass::FindLoopHeaders(Function &F) {
SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
FindFunctionBackedges(F, Edges);
@@ -418,7 +561,6 @@ static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
/// BB in the result vector.
///
/// This returns true if there were any known values.
-///
bool JumpThreadingPass::ComputeValueKnownInPredecessors(
Value *V, BasicBlock *BB, PredValueInfo &Result,
ConstantPreference Preference, Instruction *CxtI) {
@@ -507,8 +649,6 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
return true;
}
- PredValueInfoTy LHSVals, RHSVals;
-
// Handle some boolean conditions.
if (I->getType()->getPrimitiveSizeInBits() == 1) {
assert(Preference == WantInteger && "One-bit non-integer type?");
@@ -516,6 +656,8 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
// X & false -> false
if (I->getOpcode() == Instruction::Or ||
I->getOpcode() == Instruction::And) {
+ PredValueInfoTy LHSVals, RHSVals;
+
ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
WantInteger, CxtI);
ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals,
@@ -655,6 +797,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
// x as a live-in.
{
using namespace PatternMatch;
+
Value *AddLHS;
ConstantInt *AddConst;
if (isa<ConstantInt>(CmpConst) &&
@@ -751,14 +894,11 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
return !Result.empty();
}
-
-
/// GetBestDestForBranchOnUndef - If we determine that the specified block ends
/// in an undefined jump, decide which block is best to revector to.
///
/// Since we can pick an arbitrary destination, we pick the successor with the
/// fewest predecessors. This should reduce the in-degree of the others.
-///
static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
TerminatorInst *BBTerm = BB->getTerminator();
unsigned MinSucc = 0;
@@ -979,7 +1119,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// for loads that are used by a switch or by the condition for the branch. If
// we see one, check to see if it's partially redundant. If so, insert a PHI
// which can then be used to thread the values.
- //
Value *SimplifyValue = CondInst;
if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
if (isa<Constant>(CondCmp->getOperand(1)))
@@ -991,10 +1130,14 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
if (SimplifyPartiallyRedundantLoad(LI))
return true;
+ // Before threading, try to propagate profile data backwards:
+ if (PHINode *PN = dyn_cast<PHINode>(CondInst))
+ if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+ updatePredecessorProfileMetadata(PN, BB);
+
// Handle a variety of cases where we are branching on something derived from
// a PHI node in the current block. If we can prove that any predecessors
// compute a predictable value based on a PHI node, thread those predecessors.
- //
if (ProcessThreadableEdges(CondInst, BB, Preference, Terminator))
return true;
@@ -1036,9 +1179,9 @@ bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
if (PBI->getSuccessor(0) != CurrentBB && PBI->getSuccessor(1) != CurrentBB)
return false;
- bool FalseDest = PBI->getSuccessor(1) == CurrentBB;
+ bool CondIsTrue = PBI->getSuccessor(0) == CurrentBB;
Optional<bool> Implication =
- isImpliedCondition(PBI->getCondition(), Cond, DL, FalseDest);
+ isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue);
if (Implication) {
BI->getSuccessor(*Implication ? 1 : 0)->removePredecessor(BB);
BranchInst::Create(BI->getSuccessor(*Implication ? 0 : 1), BI);
@@ -1124,7 +1267,9 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
LI->getAAMetadata(AATags);
SmallPtrSet<BasicBlock*, 8> PredsScanned;
- typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy;
+
+ using AvailablePredsTy = SmallVector<std::pair<BasicBlock *, Value *>, 8>;
+
AvailablePredsTy AvailablePreds;
BasicBlock *OneUnavailablePred = nullptr;
SmallVector<LoadInst*, 8> CSELoads;
@@ -1283,8 +1428,8 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
/// the list.
static BasicBlock *
FindMostPopularDest(BasicBlock *BB,
- const SmallVectorImpl<std::pair<BasicBlock*,
- BasicBlock*> > &PredToDestList) {
+ const SmallVectorImpl<std::pair<BasicBlock *,
+ BasicBlock *>> &PredToDestList) {
assert(!PredToDestList.empty());
// Determine popularity. If there are multiple possible destinations, we
@@ -1502,7 +1647,6 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
/// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on
/// a PHI node in the current block. See if there are any simplifications we
/// can do based on inputs to the phi node.
-///
bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) {
BasicBlock *BB = PN->getParent();
@@ -1532,7 +1676,6 @@ bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) {
/// ProcessBranchOnXOR - We have an otherwise unthreadable conditional branch on
/// a xor instruction in the current block. See if there are any
/// simplifications we can do based on inputs to the xor.
-///
bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) {
BasicBlock *BB = BO->getParent();
@@ -1637,7 +1780,6 @@ bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) {
return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);
}
-
/// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new
/// predecessor to the PHIBB block. If it has PHI nodes, add entries for
/// NewPred using the entries from OldPred (suitably mapped).
@@ -1677,10 +1819,15 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
// If threading this would thread across a loop header, don't thread the edge.
// See the comments above FindLoopHeaders for justifications and caveats.
- if (LoopHeaders.count(BB)) {
- DEBUG(dbgs() << " Not threading across loop header BB '" << BB->getName()
- << "' to dest BB '" << SuccBB->getName()
- << "' - it might create an irreducible loop!\n");
+ if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
+ DEBUG({
+ bool BBIsHeader = LoopHeaders.count(BB);
+ bool SuccIsHeader = LoopHeaders.count(SuccBB);
+ dbgs() << " Not threading across "
+ << (BBIsHeader ? "loop header BB '" : "block BB '") << BB->getName()
+ << "' to dest " << (SuccIsHeader ? "loop header BB '" : "block BB '")
+ << SuccBB->getName() << "' - it might create an irreducible loop!\n";
+ });
return false;
}
@@ -1795,7 +1942,6 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
DEBUG(dbgs() << "\n");
}
-
// Ok, NewBB is good to go. Update the terminator of PredBB to jump to
// NewBB instead of BB. This eliminates predecessors from BB, which requires
// us to simplify any PHI nodes in BB.
@@ -2194,7 +2340,7 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
/// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ...
/// %c = cmp %p, 0
/// %s = select %c, trueval, falseval
-//
+///
/// And expand the select into a branch structure. This later enables
/// jump-threading over bb in this pass.
///
@@ -2280,6 +2426,7 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
/// guard is then threaded to one of them.
bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
using namespace PatternMatch;
+
// We only want to deal with two predecessors.
BasicBlock *Pred1, *Pred2;
auto PI = pred_begin(BB), PE = pred_end(BB);
@@ -2331,8 +2478,7 @@ bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
TrueDestIsSafe = true;
else {
// False dest is safe if !BranchCond => GuardCond.
- Impl =
- isImpliedCondition(BranchCond, GuardCond, DL, /* InvertAPred */ true);
+ Impl = isImpliedCondition(BranchCond, GuardCond, DL, /* LHSIsTrue */ false);
if (Impl && *Impl)
FalseDestIsSafe = true;
}
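
For clarity, the renamed flag asks the implication in the direction sketched below (a restatement of the call above, not new behavior):

    // "Assuming BranchCond is false, is GuardCond known to be true?"
    // If so, the guard is trivially satisfied on the false destination.
    Optional<bool> Impl =
        isImpliedCondition(BranchCond, GuardCond, DL, /* LHSIsTrue */ false);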
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 37b9c4b1094e..4ea935793b80 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -42,7 +42,8 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -62,6 +63,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -88,15 +90,15 @@ static cl::opt<uint32_t> MaxNumUsesTraversed(
"invariance in loop using invariant start (default = 8)"));
static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo);
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ TargetTransformInfo *TTI, bool &FreeInLoop);
static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE);
-static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
- const Loop *CurLoop, AliasSetTracker *CurAST,
- const LoopSafetyInfo *SafetyInfo,
- OptimizationRemarkEmitter *ORE);
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+ const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE, bool FreeInLoop);
static bool isSafeToExecuteUnconditionally(Instruction &Inst,
const DominatorTree *DT,
const Loop *CurLoop,
@@ -114,7 +116,8 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
namespace {
struct LoopInvariantCodeMotion {
bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
- TargetLibraryInfo *TLI, ScalarEvolution *SE,
+ TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
+ ScalarEvolution *SE, MemorySSA *MSSA,
OptimizationRemarkEmitter *ORE, bool DeleteAST);
DenseMap<Loop *, AliasSetTracker *> &getLoopToAliasSetMap() {
@@ -146,6 +149,9 @@ struct LegacyLICMPass : public LoopPass {
}
auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ MemorySSA *MSSA = EnableMSSALoopDependency
+ ? (&getAnalysis<MemorySSAWrapperPass>().getMSSA())
+ : nullptr;
// For the old PM, we can't use OptimizationRemarkEmitter as an analysis
// pass. Function analyses need to be preserved across loop transformations
// but ORE cannot be preserved (see comment before the pass definition).
@@ -155,7 +161,9 @@ struct LegacyLICMPass : public LoopPass {
&getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
&getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
- SE ? &SE->getSE() : nullptr, &ORE, false);
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent()),
+ SE ? &SE->getSE() : nullptr, MSSA, &ORE, false);
}
/// This transformation requires natural loop information & requires that
@@ -164,6 +172,9 @@ struct LegacyLICMPass : public LoopPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ if (EnableMSSALoopDependency)
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
getLoopAnalysisUsage(AU);
}
@@ -189,7 +200,7 @@ private:
/// Simple Analysis hook. Delete loop L from alias set map.
void deleteAnalysisLoop(Loop *L) override;
};
-}
+} // namespace
PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR, LPMUpdater &) {
@@ -204,7 +215,8 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
"cached at a higher level");
LoopInvariantCodeMotion LICM;
- if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, ORE, true))
+ if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.TTI, &AR.SE,
+ AR.MSSA, ORE, true))
return PreservedAnalyses::all();
auto PA = getLoopPassPreservedAnalyses();
@@ -217,6 +229,8 @@ INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion",
false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false,
false)
@@ -228,12 +242,10 @@ Pass *llvm::createLICMPass() { return new LegacyLICMPass(); }
/// We should delete AST for inner loops in the new pass manager to avoid
/// memory leak.
///
-bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
- LoopInfo *LI, DominatorTree *DT,
- TargetLibraryInfo *TLI,
- ScalarEvolution *SE,
- OptimizationRemarkEmitter *ORE,
- bool DeleteAST) {
+bool LoopInvariantCodeMotion::runOnLoop(
+ Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
+ TargetLibraryInfo *TLI, TargetTransformInfo *TTI, ScalarEvolution *SE,
+ MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool DeleteAST) {
bool Changed = false;
assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
@@ -258,7 +270,7 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
// instructions, we perform another pass to hoist them out of the loop.
//
if (L->hasDedicatedExits())
- Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
+ Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,
CurAST, &SafetyInfo, ORE);
if (Preheader)
Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
@@ -292,10 +304,26 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
bool Promoted = false;
// Loop over all of the alias sets in the tracker object.
- for (AliasSet &AS : *CurAST)
- Promoted |=
- promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts, PIC, LI, DT,
- TLI, L, CurAST, &SafetyInfo, ORE);
+ for (AliasSet &AS : *CurAST) {
+ // We can promote this alias set if it has a store, if it is a "Must"
+ // alias set, if the pointer is loop invariant, and if we are not
+ // eliminating any volatile loads or stores.
+ if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
+ AS.isVolatile() || !L->isLoopInvariant(AS.begin()->getValue()))
+ continue;
+
+ assert(
+ !AS.empty() &&
+ "Must alias set should have at least one pointer element in it!");
+
+ SmallSetVector<Value *, 8> PointerMustAliases;
+ for (const auto &ASI : AS)
+ PointerMustAliases.insert(ASI.getValue());
+
+ Promoted |= promoteLoopAccessesToScalars(PointerMustAliases, ExitBlocks,
+ InsertPts, PIC, LI, DT, TLI, L,
+ CurAST, &SafetyInfo, ORE);
+ }
// Once we have promoted values across the loop body we have to
// recursively reform LCSSA as any nested loop may now have values defined
@@ -335,7 +363,8 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
/// definitions, allowing us to sink a loop body in one pass without iteration.
///
bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
- DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+ DominatorTree *DT, TargetLibraryInfo *TLI,
+ TargetTransformInfo *TTI, Loop *CurLoop,
AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE) {
@@ -344,46 +373,50 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr &&
"Unexpected input to sinkRegion");
- BasicBlock *BB = N->getBlock();
- // If this subregion is not in the top level loop at all, exit.
- if (!CurLoop->contains(BB))
- return false;
+  // We want to visit children before parents. We will enqueue all the parents
+ // before their children in the worklist and process the worklist in reverse
+ // order.
+ SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop);
- // We are processing blocks in reverse dfo, so process children first.
bool Changed = false;
- const std::vector<DomTreeNode *> &Children = N->getChildren();
- for (DomTreeNode *Child : Children)
- Changed |=
- sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE);
-
- // Only need to process the contents of this block if it is not part of a
- // subloop (which would already have been processed).
- if (inSubLoop(BB, CurLoop, LI))
- return Changed;
+ for (DomTreeNode *DTN : reverse(Worklist)) {
+ BasicBlock *BB = DTN->getBlock();
+ // Only need to process the contents of this block if it is not part of a
+ // subloop (which would already have been processed).
+ if (inSubLoop(BB, CurLoop, LI))
+ continue;
- for (BasicBlock::iterator II = BB->end(); II != BB->begin();) {
- Instruction &I = *--II;
+ for (BasicBlock::iterator II = BB->end(); II != BB->begin();) {
+ Instruction &I = *--II;
- // If the instruction is dead, we would try to sink it because it isn't used
- // in the loop, instead, just delete it.
- if (isInstructionTriviallyDead(&I, TLI)) {
- DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
- ++II;
- CurAST->deleteValue(&I);
- I.eraseFromParent();
- Changed = true;
- continue;
- }
+      // If the instruction is dead, we would try to sink it because it isn't
+      // used in the loop; instead, just delete it.
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
+ ++II;
+ CurAST->deleteValue(&I);
+ I.eraseFromParent();
+ Changed = true;
+ continue;
+ }
- // Check to see if we can sink this instruction to the exit blocks
- // of the loop. We can do this if the all users of the instruction are
- // outside of the loop. In this case, it doesn't even matter if the
- // operands of the instruction are loop invariant.
- //
- if (isNotUsedInLoop(I, CurLoop, SafetyInfo) &&
- canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) {
- ++II;
- Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo, ORE);
+ // Check to see if we can sink this instruction to the exit blocks
+      // of the loop. We can do this if all the users of the instruction are
+ // outside of the loop. In this case, it doesn't even matter if the
+ // operands of the instruction are loop invariant.
+ //
+ bool FreeInLoop = false;
+ if (isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) {
+ if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE, FreeInLoop)) {
+ if (!FreeInLoop) {
+ ++II;
+ CurAST->deleteValue(&I);
+ I.eraseFromParent();
+ }
+ Changed = true;
+ }
+ }
}
}
return Changed;
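
A minimal sketch of the new traversal order in sinkRegion (visit() is a hypothetical placeholder; collectChildrenInLoop is the LoopUtils helper used above): parents are enqueued before their children, so iterating in reverse visits children first, which is the order sinking requires.

    SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop);
    for (DomTreeNode *DTN : reverse(Worklist)) // children before parents
      visit(DTN->getBlock());                  // hypothetical visit()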
@@ -403,73 +436,70 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr &&
"Unexpected input to hoistRegion");
- BasicBlock *BB = N->getBlock();
-
- // If this subregion is not in the top level loop at all, exit.
- if (!CurLoop->contains(BB))
- return false;
+  // We want to visit parents before children. We will enqueue all the parents
+ // before their children in the worklist and process the worklist in order.
+ SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop);
- // Only need to process the contents of this block if it is not part of a
- // subloop (which would already have been processed).
bool Changed = false;
- if (!inSubLoop(BB, CurLoop, LI))
- for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
- Instruction &I = *II++;
- // Try constant folding this instruction. If all the operands are
- // constants, it is technically hoistable, but it would be better to just
- // fold it.
- if (Constant *C = ConstantFoldInstruction(
- &I, I.getModule()->getDataLayout(), TLI)) {
- DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n');
- CurAST->copyValue(&I, C);
- I.replaceAllUsesWith(C);
- if (isInstructionTriviallyDead(&I, TLI)) {
- CurAST->deleteValue(&I);
- I.eraseFromParent();
+ for (DomTreeNode *DTN : Worklist) {
+ BasicBlock *BB = DTN->getBlock();
+ // Only need to process the contents of this block if it is not part of a
+ // subloop (which would already have been processed).
+ if (!inSubLoop(BB, CurLoop, LI))
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
+ Instruction &I = *II++;
+ // Try constant folding this instruction. If all the operands are
+ // constants, it is technically hoistable, but it would be better to
+ // just fold it.
+ if (Constant *C = ConstantFoldInstruction(
+ &I, I.getModule()->getDataLayout(), TLI)) {
+ DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n');
+ CurAST->copyValue(&I, C);
+ I.replaceAllUsesWith(C);
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ CurAST->deleteValue(&I);
+ I.eraseFromParent();
+ }
+ Changed = true;
+ continue;
}
- Changed = true;
- continue;
- }
- // Attempt to remove floating point division out of the loop by converting
- // it to a reciprocal multiplication.
- if (I.getOpcode() == Instruction::FDiv &&
- CurLoop->isLoopInvariant(I.getOperand(1)) &&
- I.hasAllowReciprocal()) {
- auto Divisor = I.getOperand(1);
- auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
- auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
- ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
- ReciprocalDivisor->insertBefore(&I);
+ // Attempt to remove floating point division out of the loop by
+ // converting it to a reciprocal multiplication.
+ if (I.getOpcode() == Instruction::FDiv &&
+ CurLoop->isLoopInvariant(I.getOperand(1)) &&
+ I.hasAllowReciprocal()) {
+ auto Divisor = I.getOperand(1);
+ auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
+ auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
+ ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+ ReciprocalDivisor->insertBefore(&I);
- auto Product = BinaryOperator::CreateFMul(I.getOperand(0),
- ReciprocalDivisor);
- Product->setFastMathFlags(I.getFastMathFlags());
- Product->insertAfter(&I);
- I.replaceAllUsesWith(Product);
- I.eraseFromParent();
+ auto Product =
+ BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
+ Product->setFastMathFlags(I.getFastMathFlags());
+ Product->insertAfter(&I);
+ I.replaceAllUsesWith(Product);
+ I.eraseFromParent();
- hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
- Changed = true;
- continue;
- }
+ hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
+ Changed = true;
+ continue;
+ }
- // Try hoisting the instruction out to the preheader. We can only do this
- // if all of the operands of the instruction are loop invariant and if it
- // is safe to hoist the instruction.
- //
- if (CurLoop->hasLoopInvariantOperands(&I) &&
- canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) &&
- isSafeToExecuteUnconditionally(
- I, DT, CurLoop, SafetyInfo, ORE,
- CurLoop->getLoopPreheader()->getTerminator()))
- Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE);
- }
+ // Try hoisting the instruction out to the preheader. We can only do
+ // this if all of the operands of the instruction are loop invariant and
+ // if it is safe to hoist the instruction.
+ //
+ if (CurLoop->hasLoopInvariantOperands(&I) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) &&
+ isSafeToExecuteUnconditionally(
+ I, DT, CurLoop, SafetyInfo, ORE,
+ CurLoop->getLoopPreheader()->getTerminator()))
+ Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE);
+ }
+ }
- const std::vector<DomTreeNode *> &Children = N->getChildren();
- for (DomTreeNode *Child : Children)
- Changed |=
- hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE);
return Changed;
}
@@ -492,7 +522,8 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
// Iterate over loop instructions and compute safety info.
// Skip header as it has been computed and stored in HeaderMayThrow.
// The first block in loopinfo.Blocks is guaranteed to be the header.
- assert(Header == *CurLoop->getBlocks().begin() && "First block must be header");
+ assert(Header == *CurLoop->getBlocks().begin() &&
+ "First block must be header");
for (Loop::block_iterator BB = std::next(CurLoop->block_begin()),
BBE = CurLoop->block_end();
(BB != BBE) && !SafetyInfo->MayThrow; ++BB)
@@ -510,9 +541,9 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
}
// Return true if LI is invariant within scope of the loop. LI is invariant if
-// CurLoop is dominated by an invariant.start representing the same memory location
-// and size as the memory location LI loads from, and also the invariant.start
-// has no uses.
+// CurLoop is dominated by an invariant.start representing the same memory
+// location and size as the memory location LI loads from, and also the
+// invariant.start has no uses.
static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
Loop *CurLoop) {
Value *Addr = LI->getOperand(0);
@@ -566,10 +597,13 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
Loop *CurLoop, AliasSetTracker *CurAST,
LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE) {
+ // SafetyInfo is nullptr if we are checking for sinking from preheader to
+ // loop body.
+ const bool SinkingToLoopBody = !SafetyInfo;
// Loads have extra constraints we have to verify before we can hoist them.
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
if (!LI->isUnordered())
- return false; // Don't hoist volatile/atomic loads!
+ return false; // Don't sink/hoist volatile or ordered atomic loads!
// Loads from constant memory are always safe to move, even if they end up
// in the same alias set as something that ends up being modified.
@@ -578,6 +612,9 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
if (LI->getMetadata(LLVMContext::MD_invariant_load))
return true;
+ if (LI->isAtomic() && SinkingToLoopBody)
+ return false; // Don't sink unordered atomic loads to loop body.
+
// This checks for an invariant.start dominating the load.
if (isLoadInvariantInLoop(LI, DT, CurLoop))
return true;
@@ -595,10 +632,12 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
// Check loop-invariant address because this may also be a sinkable load
// whose address is not necessarily loop-invariant.
if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand()))
- ORE->emit(OptimizationRemarkMissed(
- DEBUG_TYPE, "LoadWithLoopInvariantAddressInvalidated", LI)
- << "failed to move load with loop-invariant address "
- "because the loop may invalidate its value");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "LoadWithLoopInvariantAddressInvalidated", LI)
+ << "failed to move load with loop-invariant address "
+ "because the loop may invalidate its value";
+ });
return !Invalidated;
} else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
@@ -653,9 +692,9 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
!isa<InsertValueInst>(I))
return false;
- // SafetyInfo is nullptr if we are checking for sinking from preheader to
- // loop body. It will be always safe as there is no speculative execution.
- if (!SafetyInfo)
+ // If we are checking for sinking from preheader to loop body it will be
+ // always safe as there is no speculative execution.
+ if (SinkingToLoopBody)
return true;
// TODO: Plumb the context instruction through to make hoisting and sinking
@@ -677,13 +716,40 @@ static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) {
return true;
}
+/// Return true if the instruction is free in the loop.
+static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
+ const TargetTransformInfo *TTI) {
+
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (TTI->getUserCost(GEP) != TargetTransformInfo::TCC_Free)
+ return false;
+    // For a GEP, we cannot simply use getUserCost because currently it
+    // optimistically assumes that a GEP will fold into the addressing mode
+    // regardless of its users.
+ const BasicBlock *BB = GEP->getParent();
+ for (const User *U : GEP->users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if (CurLoop->contains(UI) &&
+ (BB != UI->getParent() ||
+ (!isa<StoreInst>(UI) && !isa<LoadInst>(UI))))
+ return false;
+ }
+ return true;
+ } else
+ return TTI->getUserCost(&I) == TargetTransformInfo::TCC_Free;
+}
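
An illustration of the GEP special case (hypothetical IR): a GEP whose only in-loop users are loads or stores in its own block is expected to fold into their addressing modes, so sinking it is modeled as free.

    // loop:                                        ; block inside CurLoop
    //   %gep = getelementptr i32, i32* %base, i64 %iv
    //   %v   = load i32, i32* %gep                 ; same-block load -> free
    // A user in another block, or one that is not a load/store, makes %gep
    // non-free and blocks the FreeInLoop path.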
+
/// Return true if the only users of this instruction are outside of
/// the loop. If this is true, we can sink the instruction to the exit
/// blocks of the loop.
///
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo) {
+/// We also return true if the instruction could be folded away in lowering
+/// (e.g., a GEP can be folded into a load as an addressing mode in the loop).
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ TargetTransformInfo *TTI, bool &FreeInLoop) {
const auto &BlockColors = SafetyInfo->BlockColors;
+ bool IsFree = isFreeInLoop(I, CurLoop, TTI);
for (const User *U : I.users()) {
const Instruction *UI = cast<Instruction>(U);
if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
@@ -698,30 +764,15 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
if (!BlockColors.empty() &&
BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
return false;
-
- // A PHI node where all of the incoming values are this instruction are
- // special -- they can just be RAUW'ed with the instruction and thus
- // don't require a use in the predecessor. This is a particular important
- // special case because it is the pattern found in LCSSA form.
- if (isTriviallyReplacablePHI(*PN, I)) {
- if (CurLoop->contains(PN))
- return false;
- else
- continue;
- }
-
- // Otherwise, PHI node uses occur in predecessor blocks if the incoming
- // values. Check for such a use being inside the loop.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingValue(i) == &I)
- if (CurLoop->contains(PN->getIncomingBlock(i)))
- return false;
-
- continue;
}
- if (CurLoop->contains(UI))
+ if (CurLoop->contains(UI)) {
+ if (IsFree) {
+ FreeInLoop = true;
+ continue;
+ }
return false;
+ }
}
return true;
}
@@ -787,77 +838,189 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
return New;
}
+static Instruction *sinkThroughTriviallyReplacablePHI(
+ PHINode *TPN, Instruction *I, LoopInfo *LI,
+ SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
+ const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop) {
+ assert(isTriviallyReplacablePHI(*TPN, *I) &&
+ "Expect only trivially replacalbe PHI");
+ BasicBlock *ExitBlock = TPN->getParent();
+ Instruction *New;
+ auto It = SunkCopies.find(ExitBlock);
+ if (It != SunkCopies.end())
+ New = It->second;
+ else
+ New = SunkCopies[ExitBlock] =
+ CloneInstructionInExitBlock(*I, *ExitBlock, *TPN, LI, SafetyInfo);
+ return New;
+}
+
+static bool canSplitPredecessors(PHINode *PN) {
+ BasicBlock *BB = PN->getParent();
+ if (!BB->canSplitPredecessors())
+ return false;
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *BBPred = *PI;
+ if (isa<IndirectBrInst>(BBPred->getTerminator()))
+ return false;
+ }
+ return true;
+}
+
+static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
+ LoopInfo *LI, const Loop *CurLoop) {
+#ifndef NDEBUG
+ SmallVector<BasicBlock *, 32> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+#endif
+ BasicBlock *ExitBB = PN->getParent();
+ assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block.");
+
+  // Split predecessors of the loop exit so that instructions in the loop are
+  // exposed to exit blocks through trivially replaceable PHIs while keeping
+  // the loop in the canonical form where each predecessor of each exit block
+  // is contained within the loop. For example, this will convert the loop below
+ // from
+ //
+ // LB1:
+ // %v1 =
+ // br %LE, %LB2
+ // LB2:
+ // %v2 =
+ // br %LE, %LB1
+ // LE:
+  //   %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replaceable
+ //
+ // to
+ //
+ // LB1:
+ // %v1 =
+ // br %LE.split, %LB2
+ // LB2:
+ // %v2 =
+ // br %LE.split2, %LB1
+ // LE.split:
+  //   %p1 = phi [%v1, %LB1] <-- trivially replaceable
+ // br %LE
+ // LE.split2:
+  //   %p2 = phi [%v2, %LB2] <-- trivially replaceable
+ // br %LE
+ // LE:
+ // %p = phi [%p1, %LE.split], [%p2, %LE.split2]
+ //
+ SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB));
+ while (!PredBBs.empty()) {
+ BasicBlock *PredBB = *PredBBs.begin();
+ assert(CurLoop->contains(PredBB) &&
+ "Expect all predecessors are in the loop");
+ if (PN->getBasicBlockIndex(PredBB) >= 0)
+ SplitBlockPredecessors(ExitBB, PredBB, ".split.loop.exit", DT, LI, true);
+ PredBBs.remove(PredBB);
+ }
+}
+
/// When an instruction is found to only be used outside of the loop, this
/// function moves it to the exit blocks and patches up SSA form as needed.
/// This method is guaranteed to remove the original instruction from its
/// position, and may either delete it or move it to outside of the loop.
///
-static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
- const Loop *CurLoop, AliasSetTracker *CurAST,
- const LoopSafetyInfo *SafetyInfo,
- OptimizationRemarkEmitter *ORE) {
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+ const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE, bool FreeInLoop) {
DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
- ORE->emit(OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
- << "sinking " << ore::NV("Inst", &I));
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
+ << "sinking " << ore::NV("Inst", &I);
+ });
bool Changed = false;
if (isa<LoadInst>(I))
++NumMovedLoads;
else if (isa<CallInst>(I))
++NumMovedCalls;
++NumSunk;
- Changed = true;
-#ifndef NDEBUG
- SmallVector<BasicBlock *, 32> ExitBlocks;
- CurLoop->getUniqueExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
-#endif
+  // Iterate over users to be ready for actual sinking. Replace uses in
+  // unreachable blocks with undef and make all user PHIs trivially
+  // replaceable.
+ SmallPtrSet<Instruction *, 8> VisitedUsers;
+ for (Value::user_iterator UI = I.user_begin(), UE = I.user_end(); UI != UE;) {
+ auto *User = cast<Instruction>(*UI);
+ Use &U = UI.getUse();
+ ++UI;
- // Clones of this instruction. Don't create more than one per exit block!
- SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+ if (VisitedUsers.count(User) || CurLoop->contains(User))
+ continue;
- // If this instruction is only used outside of the loop, then all users are
- // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
- // the instruction.
- while (!I.use_empty()) {
- Value::user_iterator UI = I.user_begin();
- auto *User = cast<Instruction>(*UI);
if (!DT->isReachableFromEntry(User->getParent())) {
- User->replaceUsesOfWith(&I, UndefValue::get(I.getType()));
+ U = UndefValue::get(I.getType());
+ Changed = true;
continue;
}
+
// The user must be a PHI node.
PHINode *PN = cast<PHINode>(User);
// Surprisingly, instructions can be used outside of loops without any
// exits. This can only happen in PHI nodes if the incoming block is
// unreachable.
- Use &U = UI.getUse();
BasicBlock *BB = PN->getIncomingBlock(U);
if (!DT->isReachableFromEntry(BB)) {
U = UndefValue::get(I.getType());
+ Changed = true;
continue;
}
- BasicBlock *ExitBlock = PN->getParent();
- assert(ExitBlockSet.count(ExitBlock) &&
- "The LCSSA PHI is not in an exit block!");
+ VisitedUsers.insert(PN);
+ if (isTriviallyReplacablePHI(*PN, I))
+ continue;
- Instruction *New;
- auto It = SunkCopies.find(ExitBlock);
- if (It != SunkCopies.end())
- New = It->second;
- else
- New = SunkCopies[ExitBlock] =
- CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI, SafetyInfo);
+ if (!canSplitPredecessors(PN))
+ return Changed;
+
+ // Split predecessors of the PHI so that we can make users trivially
+    // replaceable.
+ splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop);
+
+ // Should rebuild the iterators, as they may be invalidated by
+ // splitPredecessorsOfLoopExit().
+ UI = I.user_begin();
+ UE = I.user_end();
+ }
+
+ if (VisitedUsers.empty())
+ return Changed;
+
+#ifndef NDEBUG
+ SmallVector<BasicBlock *, 32> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+#endif
+
+ // Clones of this instruction. Don't create more than one per exit block!
+ SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+
+ // If this instruction is only used outside of the loop, then all users are
+ // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
+ // the instruction.
+  SmallSetVector<User *, 8> Users(I.user_begin(), I.user_end());
+ for (auto *UI : Users) {
+ auto *User = cast<Instruction>(UI);
+
+ if (CurLoop->contains(User))
+ continue;
+ PHINode *PN = cast<PHINode>(User);
+ assert(ExitBlockSet.count(PN->getParent()) &&
+ "The LCSSA PHI is not in an exit block!");
+    // The PHI must be trivially replaceable.
+ Instruction *New = sinkThroughTriviallyReplacablePHI(PN, &I, LI, SunkCopies,
+ SafetyInfo, CurLoop);
PN->replaceAllUsesWith(New);
PN->eraseFromParent();
+ Changed = true;
}
-
- CurAST->deleteValue(&I);
- I.eraseFromParent();
return Changed;
}
@@ -870,8 +1033,10 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
auto *Preheader = CurLoop->getLoopPreheader();
DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
<< "\n");
- ORE->emit(OptimizationRemark(DEBUG_TYPE, "Hoisted", &I)
- << "hoisting " << ore::NV("Inst", &I));
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) << "hoisting "
+ << ore::NV("Inst", &I);
+ });
// Metadata can be dependent on conditions we are hoisting above.
// Conservatively strip all metadata on the instruction unless we were
@@ -921,10 +1086,12 @@ static bool isSafeToExecuteUnconditionally(Instruction &Inst,
if (!GuaranteedToExecute) {
auto *LI = dyn_cast<LoadInst>(&Inst);
if (LI && CurLoop->isLoopInvariant(LI->getPointerOperand()))
- ORE->emit(OptimizationRemarkMissed(
- DEBUG_TYPE, "LoadWithLoopInvariantAddressCondExecuted", LI)
- << "failed to hoist load with loop-invariant address "
- "because load is conditionally executed");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "LoadWithLoopInvariantAddressCondExecuted", LI)
+ << "failed to hoist load with loop-invariant address "
+ "because load is conditionally executed";
+ });
}
return GuaranteedToExecute;
@@ -933,7 +1100,7 @@ static bool isSafeToExecuteUnconditionally(Instruction &Inst,
namespace {
class LoopPromoter : public LoadAndStorePromoter {
Value *SomePtr; // Designated pointer to store to.
- SmallPtrSetImpl<Value *> &PointerMustAliases;
+ const SmallSetVector<Value *, 8> &PointerMustAliases;
SmallVectorImpl<BasicBlock *> &LoopExitBlocks;
SmallVectorImpl<Instruction *> &LoopInsertPts;
PredIteratorCache &PredCache;
@@ -961,7 +1128,7 @@ class LoopPromoter : public LoadAndStorePromoter {
public:
LoopPromoter(Value *SP, ArrayRef<const Instruction *> Insts, SSAUpdater &S,
- SmallPtrSetImpl<Value *> &PMA,
+ const SmallSetVector<Value *, 8> &PMA,
SmallVectorImpl<BasicBlock *> &LEB,
SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
@@ -969,7 +1136,7 @@ public:
: LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
LI(li), DL(std::move(dl)), Alignment(alignment),
- UnorderedAtomic(UnorderedAtomic),AATags(AATags) {}
+ UnorderedAtomic(UnorderedAtomic), AATags(AATags) {}
bool isInstInList(Instruction *I,
const SmallVectorImpl<Instruction *> &) const override {
@@ -1008,7 +1175,31 @@ public:
}
void instructionDeleted(Instruction *I) const override { AST.deleteValue(I); }
};
-} // end anon namespace
+
+/// Return true iff we can prove that a caller of this function can not inspect
+/// the contents of the provided object in a well defined program.
+bool isKnownNonEscaping(Value *Object, const TargetLibraryInfo *TLI) {
+ if (isa<AllocaInst>(Object))
+ // Since the alloca goes out of scope, we know the caller can't retain a
+ // reference to it and be well defined. Thus, we don't need to check for
+ // capture.
+ return true;
+
+ // For all other objects we need to know that the caller can't possibly
+ // have gotten a reference to the object. There are two components of
+ // that:
+ // 1) Object can't be escaped by this function. This is what
+ // PointerMayBeCaptured checks.
+ // 2) Object can't have been captured at definition site. For this, we
+ // need to know the return value is noalias. At the moment, we use a
+ // weaker condition and handle only AllocLikeFunctions (which are
+  //      known to be noalias). TODO: extend this to other noalias returns.
+ return isAllocLikeFn(Object, TLI) &&
+ !PointerMayBeCaptured(Object, true, true);
+}
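
A minimal usage sketch, mirroring the call site later in this patch (SomePtr and MDL as in promoteLoopAccessesToScalars):

    Value *Object = GetUnderlyingObject(SomePtr, MDL);
    if (!isKnownNonEscaping(Object, TLI))
      return false; // a caller could still observe the memory after unwinding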
+
+} // namespace
/// Try to promote memory values to scalars by sinking stores out of the
/// loop and moving loads to before the loop. We do this by looping over
@@ -1016,7 +1207,8 @@ public:
/// loop invariant.
///
bool llvm::promoteLoopAccessesToScalars(
- AliasSet &AS, SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ const SmallSetVector<Value *, 8> &PointerMustAliases,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks,
SmallVectorImpl<Instruction *> &InsertPts, PredIteratorCache &PIC,
LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
@@ -1026,17 +1218,7 @@ bool llvm::promoteLoopAccessesToScalars(
CurAST != nullptr && SafetyInfo != nullptr &&
"Unexpected Input to promoteLoopAccessesToScalars");
- // We can promote this alias set if it has a store, if it is a "Must" alias
- // set, if the pointer is loop invariant, and if we are not eliminating any
- // volatile loads or stores.
- if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
- AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue()))
- return false;
-
- assert(!AS.empty() &&
- "Must alias set should have at least one pointer element in it!");
-
- Value *SomePtr = AS.begin()->getValue();
+ Value *SomePtr = *PointerMustAliases.begin();
BasicBlock *Preheader = CurLoop->getLoopPreheader();
// It isn't safe to promote a load/store from the loop if the load/store is
@@ -1065,8 +1247,8 @@ bool llvm::promoteLoopAccessesToScalars(
// is safe (i.e. proving dereferenceability on all paths through the loop). We
// can use any access within the alias set to prove dereferenceability,
// since they're all must alias.
- //
- // There are two ways establish (p2):
+ //
+  // There are two ways to establish (p2):
// a) Prove the location is thread-local. In this case the memory model
// requirement does not apply, and stores are safe to insert.
// b) Prove a store dominates every exit block. In this case, if an exit
@@ -1080,55 +1262,36 @@ bool llvm::promoteLoopAccessesToScalars(
bool SafeToInsertStore = false;
SmallVector<Instruction *, 64> LoopUses;
- SmallPtrSet<Value *, 4> PointerMustAliases;
// We start with an alignment of one and try to find instructions that allow
// us to prove better alignment.
unsigned Alignment = 1;
// Keep track of which types of access we see
- bool SawUnorderedAtomic = false;
+ bool SawUnorderedAtomic = false;
bool SawNotAtomic = false;
AAMDNodes AATags;
const DataLayout &MDL = Preheader->getModule()->getDataLayout();
- // Do we know this object does not escape ?
- bool IsKnownNonEscapingObject = false;
+ bool IsKnownThreadLocalObject = false;
if (SafetyInfo->MayThrow) {
// If a loop can throw, we have to insert a store along each unwind edge.
// That said, we can't actually make the unwind edge explicit. Therefore,
- // we have to prove that the store is dead along the unwind edge.
- //
- // If the underlying object is not an alloca, nor a pointer that does not
- // escape, then we can not effectively prove that the store is dead along
- // the unwind edge. i.e. the caller of this function could have ways to
- // access the pointed object.
+ // we have to prove that the store is dead along the unwind edge. We do
+ // this by proving that the caller can't have a reference to the object
+ // after return and thus can't possibly load from the object.
Value *Object = GetUnderlyingObject(SomePtr, MDL);
- // If this is a base pointer we do not understand, simply bail.
- // We only handle alloca and return value from alloc-like fn right now.
- if (!isa<AllocaInst>(Object)) {
- if (!isAllocLikeFn(Object, TLI))
- return false;
- // If this is an alloc like fn. There are more constraints we need to verify.
- // More specifically, we must make sure that the pointer can not escape.
- //
- // NOTE: PointerMayBeCaptured is not enough as the pointer may have escaped
- // even though its not captured by the enclosing function. Standard allocation
- // functions like malloc, calloc, and operator new return values which can
- // be assumed not to have previously escaped.
- if (PointerMayBeCaptured(Object, true, true))
- return false;
- IsKnownNonEscapingObject = true;
- }
+ if (!isKnownNonEscaping(Object, TLI))
+ return false;
+    // Subtlety: Allocas aren't visible to callers, but *are* potentially
+ // visible to other threads if captured and used during their lifetimes.
+ IsKnownThreadLocalObject = !isa<AllocaInst>(Object);
}
// Check that all of the pointers in the alias set have the same type. We
// cannot (yet) promote a memory location that is loaded and stored in
// different sizes. While we are at it, collect alignment and AA info.
- for (const auto &ASI : AS) {
- Value *ASIV = ASI.getValue();
- PointerMustAliases.insert(ASIV);
-
+ for (Value *ASIV : PointerMustAliases) {
// Check that all of the pointers in the alias set have the same type. We
// cannot (yet) promote a memory location that is loaded and stored in
// different sizes.
@@ -1147,7 +1310,7 @@ bool llvm::promoteLoopAccessesToScalars(
assert(!Load->isVolatile() && "AST broken");
if (!Load->isUnordered())
return false;
-
+
SawUnorderedAtomic |= Load->isAtomic();
SawNotAtomic |= !Load->isAtomic();
@@ -1234,14 +1397,13 @@ bool llvm::promoteLoopAccessesToScalars(
// stores along paths which originally didn't have them without violating the
// memory model.
if (!SafeToInsertStore) {
- // If this is a known non-escaping object, it is safe to insert the stores.
- if (IsKnownNonEscapingObject)
+ if (IsKnownThreadLocalObject)
SafeToInsertStore = true;
else {
Value *Object = GetUnderlyingObject(SomePtr, MDL);
SafeToInsertStore =
- (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
- !PointerMayBeCaptured(Object, true, true);
+ (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
+ !PointerMayBeCaptured(Object, true, true);
}
}
@@ -1252,9 +1414,11 @@ bool llvm::promoteLoopAccessesToScalars(
// Otherwise, this is safe to promote, lets do it!
DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
<< '\n');
- ORE->emit(
- OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar", LoopUses[0])
- << "Moving accesses to memory location out of the loop");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar",
+ LoopUses[0])
+ << "Moving accesses to memory location out of the loop";
+ });
++NumPromoted;
// Grab a debug location for the inserted loads/stores; given that the
@@ -1333,7 +1497,7 @@ LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
auto mergeLoop = [&](Loop *L) {
// Loop over the body of this loop, looking for calls, invokes, and stores.
for (BasicBlock *BB : L->blocks())
- CurAST->add(*BB); // Incorporate the specified basic block
+ CurAST->add(*BB); // Incorporate the specified basic block
};
// Add everything from the sub loops that are no longer directly available.
diff --git a/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index d09af32a99fd..7f7c6de76450 100644
--- a/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -18,25 +18,20 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;
@@ -120,9 +115,7 @@ public:
AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
- // FIXME: For some reason, preserving SE here breaks LSR (even if
- // this pass changes nothing).
- // AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
}
@@ -329,8 +322,10 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
++NumPrefetches;
DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
<< "\n");
- ORE->emit(OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI)
- << "prefetched memory access");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI)
+ << "prefetched memory access";
+ });
MadeChange = true;
}
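
Aside: this hunk and many like it in this import switch ORE->emit from an eager remark argument to a callable. The closure is only invoked once the emitter has confirmed that remarks are requested, so the OptimizationRemark object (and any stream formatting into it) is skipped entirely on the common path. The shape of the pattern, copied from the hunk:

    ORE->emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI)
             << "prefetched memory access";
    });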
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index ac4dd44a0e90..82604a8842bf 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -30,20 +30,12 @@ using namespace llvm;
STATISTIC(NumDeleted, "Number of loops deleted");
-/// This function deletes dead loops. The caller of this function needs to
-/// guarantee that the loop is infact dead. Here we handle two kinds of dead
-/// loop. The first kind (\p isLoopDead) is where only invariant values from
-/// within the loop are used outside of it. The second kind (\p
-/// isLoopNeverExecuted) is where the loop is provably never executed. We can
-/// always remove never executed loops since they will not cause any difference
-/// to program behaviour.
-///
-/// This also updates the relevant analysis information in \p DT, \p SE, and \p
-/// LI. It also updates the loop PM if an updater struct is provided.
-// TODO: This function will be used by loop-simplifyCFG as well. So, move this
-// to LoopUtils.cpp
-static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
- LoopInfo &LI, LPMUpdater *Updater = nullptr);
+enum class LoopDeletionResult {
+ Unmodified,
+ Modified,
+ Deleted,
+};
+
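
Aside: widening the old bool return to this tri-state lets callers distinguish "loop mutated but kept" from "loop deleted", which the two pass managers must report differently. How the legacy-PM caller later in this file consumes it:

    LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI);
    if (Result == LoopDeletionResult::Deleted)
      LPM.markLoopAsDeleted(*L);          // legacy PM only needs the loop
    return Result != LoopDeletionResult::Unmodified;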
/// Determines if a loop is dead.
///
/// This assumes that we've already checked for unique exit and exiting blocks,
@@ -144,8 +136,8 @@ static bool isLoopNeverExecuted(Loop *L) {
/// \returns true if any changes were made. This may mutate the loop even if it
/// is unable to delete it due to hoisting trivially loop invariant
/// instructions out of the loop.
-static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
- LoopInfo &LI, LPMUpdater *Updater = nullptr) {
+static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
+ ScalarEvolution &SE, LoopInfo &LI) {
assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
// We can only remove the loop if there is a preheader that we can branch from
@@ -155,13 +147,13 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
if (!Preheader || !L->hasDedicatedExits()) {
DEBUG(dbgs()
<< "Deletion requires Loop with preheader and dedicated exits.\n");
- return false;
+ return LoopDeletionResult::Unmodified;
}
// We can't remove loops that contain subloops. If the subloops were dead,
// they would already have been removed in earlier executions of this pass.
if (L->begin() != L->end()) {
DEBUG(dbgs() << "Loop contains subloops.\n");
- return false;
+ return LoopDeletionResult::Unmodified;
}
@@ -176,9 +168,9 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
P->setIncomingValue(i, UndefValue::get(P->getType()));
BI++;
}
- deleteDeadLoop(L, DT, SE, LI, Updater);
+ deleteDeadLoop(L, &DT, &SE, &LI);
++NumDeleted;
- return true;
+ return LoopDeletionResult::Deleted;
}
// The remaining checks below are for a loop being dead because all statements
@@ -192,13 +184,14 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
// a loop invariant manner.
if (!ExitBlock) {
DEBUG(dbgs() << "Deletion requires single exit block\n");
- return false;
+ return LoopDeletionResult::Unmodified;
}
// Finally, we have to check that the loop really is dead.
bool Changed = false;
if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) {
DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n");
- return Changed;
+ return Changed ? LoopDeletionResult::Modified
+ : LoopDeletionResult::Unmodified;
}
// Don't remove loops for which we can't solve the trip count.
@@ -206,114 +199,15 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
const SCEV *S = SE.getMaxBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(S)) {
DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n");
- return Changed;
+ return Changed ? LoopDeletionResult::Modified
+ : LoopDeletionResult::Unmodified;
}
DEBUG(dbgs() << "Loop is invariant, delete it!");
- deleteDeadLoop(L, DT, SE, LI, Updater);
+ deleteDeadLoop(L, &DT, &SE, &LI);
++NumDeleted;
- return true;
-}
-
-static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
- LoopInfo &LI, LPMUpdater *Updater) {
- assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
- auto *Preheader = L->getLoopPreheader();
- assert(Preheader && "Preheader should exist!");
-
- // Now that we know the removal is safe, remove the loop by changing the
- // branch from the preheader to go to the single exit block.
- //
- // Because we're deleting a large chunk of code at once, the sequence in which
- // we remove things is very important to avoid invalidation issues.
-
- // If we have an LPM updater, tell it about the loop being removed.
- if (Updater)
- Updater->markLoopAsDeleted(*L);
-
- // Tell ScalarEvolution that the loop is deleted. Do this before
- // deleting the loop so that ScalarEvolution can look at the loop
- // to determine what it needs to clean up.
- SE.forgetLoop(L);
-
- auto *ExitBlock = L->getUniqueExitBlock();
- assert(ExitBlock && "Should have a unique exit block!");
-
- assert(L->hasDedicatedExits() && "Loop should have dedicated exits!");
-
- // Connect the preheader directly to the exit block.
- // Even when the loop is never executed, we cannot remove the edge from the
- // source block to the exit block. Consider the case where the unexecuted loop
- // branches back to an outer loop. If we deleted the loop and removed the edge
- // coming to this inner loop, this will break the outer loop structure (by
- // deleting the backedge of the outer loop). If the outer loop is indeed a
- // non-loop, it will be deleted in a future iteration of loop deletion pass.
- Preheader->getTerminator()->replaceUsesOfWith(L->getHeader(), ExitBlock);
-
- // Rewrite phis in the exit block to get their inputs from the Preheader
- // instead of the exiting block.
- BasicBlock::iterator BI = ExitBlock->begin();
- while (PHINode *P = dyn_cast<PHINode>(BI)) {
- // Set the zero'th element of Phi to be from the preheader and remove all
- // other incoming values. Given the loop has dedicated exits, all other
- // incoming values must be from the exiting blocks.
- int PredIndex = 0;
- P->setIncomingBlock(PredIndex, Preheader);
- // Removes all incoming values from all other exiting blocks (including
- // duplicate values from an exiting block).
- // Nuke all entries except the zero'th entry which is the preheader entry.
- // NOTE! We need to remove Incoming Values in the reverse order as done
- // below, to keep the indices valid for deletion (removeIncomingValues
- // updates getNumIncomingValues and shifts all values down into the operand
- // being deleted).
- for (unsigned i = 0, e = P->getNumIncomingValues() - 1; i != e; ++i)
- P->removeIncomingValue(e-i, false);
-
- assert((P->getNumIncomingValues() == 1 &&
- P->getIncomingBlock(PredIndex) == Preheader) &&
- "Should have exactly one value and that's from the preheader!");
- ++BI;
- }
-
- // Update the dominator tree and remove the instructions and blocks that will
- // be deleted from the reference counting scheme.
- SmallVector<DomTreeNode*, 8> ChildNodes;
- for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
- LI != LE; ++LI) {
- // Move all of the block's children to be children of the Preheader, which
- // allows us to remove the domtree entry for the block.
- ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end());
- for (DomTreeNode *ChildNode : ChildNodes) {
- DT.changeImmediateDominator(ChildNode, DT[Preheader]);
- }
-
- ChildNodes.clear();
- DT.eraseNode(*LI);
-
- // Remove the block from the reference counting scheme, so that we can
- // delete it freely later.
- (*LI)->dropAllReferences();
- }
-
- // Erase the instructions and the blocks without having to worry
- // about ordering because we already dropped the references.
- // NOTE: This iteration is safe because erasing the block does not remove its
- // entry from the loop's block list. We do that in the next section.
- for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
- LI != LE; ++LI)
- (*LI)->eraseFromParent();
-
- // Finally, the blocks from loopinfo. This has to happen late because
- // otherwise our loop iterators won't work.
-
- SmallPtrSet<BasicBlock *, 8> blocks;
- blocks.insert(L->block_begin(), L->block_end());
- for (BasicBlock *BB : blocks)
- LI.removeBlock(BB);
-
- // The last step is to update LoopInfo now that we've eliminated this loop.
- LI.markAsRemoved(L);
+ return LoopDeletionResult::Deleted;
}
PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
@@ -322,9 +216,14 @@ PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
DEBUG(dbgs() << "Analyzing Loop for deletion: ");
DEBUG(L.dump());
- if (!deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, &Updater))
+ std::string LoopName = L.getName();
+ auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI);
+ if (Result == LoopDeletionResult::Unmodified)
return PreservedAnalyses::all();
+ if (Result == LoopDeletionResult::Deleted)
+ Updater.markLoopAsDeleted(L, LoopName);
+
return getLoopPassPreservedAnalyses();
}
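
Aside: note the std::string copy of the name before deleteLoopIfDead runs. markLoopAsDeleted is now called after the Loop may have been torn down, and getName() hands back a StringRef into IR that deletion can free, so the deep copy is what keeps the updater call valid. A sketch of the hazard (the updater signature is per the hunk; the comments are an editorial reading):

    std::string LoopName = L.getName();     // deep copy while L is alive
    auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI);
    // From here on, L.getName() may point at freed IR; LoopName does not.
    if (Result == LoopDeletionResult::Deleted)
      Updater.markLoopAsDeleted(L, LoopName);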
@@ -354,7 +253,7 @@ INITIALIZE_PASS_END(LoopDeletionLegacyPass, "loop-deletion",
Pass *llvm::createLoopDeletionPass() { return new LoopDeletionLegacyPass(); }
-bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &) {
+bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
if (skipLoop(L))
return false;
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -363,5 +262,11 @@ bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &) {
DEBUG(dbgs() << "Analyzing Loop for deletion: ");
DEBUG(L->dump());
- return deleteLoopIfDead(L, DT, SE, LI);
+
+ LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI);
+
+ if (Result == LoopDeletionResult::Deleted)
+ LPM.markLoopAsDeleted(*L);
+
+ return Result != LoopDeletionResult::Unmodified;
}
diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp
index 3624bba10345..0d7e3db901cb 100644
--- a/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -23,32 +23,61 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/LoopDistribute.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <cassert>
+#include <functional>
#include <list>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME
-using namespace llvm;
-
static cl::opt<bool>
LDistVerify("loop-distribute-verify", cl::Hidden,
cl::desc("Turn on DominatorTree and LoopInfo verification "
@@ -81,14 +110,15 @@ static cl::opt<bool> EnableLoopDistribute(
STATISTIC(NumLoopsDistributed, "Number of loops distributed");
namespace {
+
/// \brief Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
class InstPartition {
- typedef SmallPtrSet<Instruction *, 8> InstructionSet;
+ using InstructionSet = SmallPtrSet<Instruction *, 8>;
public:
InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
- : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
+ : DepCycle(DepCycle), OrigLoop(L) {
Set.insert(I);
}
@@ -220,7 +250,7 @@ private:
/// \brief The cloned loop. If this partition is mapped to the original loop,
/// this is null.
- Loop *ClonedLoop;
+ Loop *ClonedLoop = nullptr;
/// \brief The blocks of ClonedLoop including the preheader. If this
/// partition is mapped to the original loop, this is empty.
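
Aside: a recurring cleanup in this import replaces constructor init-list zeroing with in-class default member initializers; every constructor then inherits the null default without restating it. A minimal self-contained sketch:

    struct Example {
      int *Cache = nullptr;               // in-class default initializer
      Example() = default;                // Cache is nullptr here too
      explicit Example(int *C) : Cache(C) {}
    };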
@@ -235,7 +265,7 @@ private:
/// \brief Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
- typedef DenseMap<Instruction *, int> InstToPartitionIdT;
+ using InstToPartitionIdT = DenseMap<Instruction *, int>;
public:
InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
@@ -308,8 +338,8 @@ public:
///
/// Return if any partitions were merged.
bool mergeToAvoidDuplicatedLoads() {
- typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
- typedef EquivalenceClasses<InstPartition *> ToBeMergedT;
+ using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>;
+ using ToBeMergedT = EquivalenceClasses<InstPartition *>;
LoadToPartitionT LoadToPartition;
ToBeMergedT ToBeMerged;
@@ -511,7 +541,7 @@ public:
}
private:
- typedef std::list<InstPartition> PartitionContainerT;
+ using PartitionContainerT = std::list<InstPartition>;
/// \brief List of partitions.
PartitionContainerT PartitionContainer;
@@ -552,17 +582,17 @@ private:
/// By traversing the memory instructions in program order and accumulating this
/// number, we know whether any unsafe dependence crosses over a program point.
class MemoryInstructionDependences {
- typedef MemoryDepChecker::Dependence Dependence;
+ using Dependence = MemoryDepChecker::Dependence;
public:
struct Entry {
Instruction *Inst;
- unsigned NumUnsafeDependencesStartOrEnd;
+ unsigned NumUnsafeDependencesStartOrEnd = 0;
- Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
+ Entry(Instruction *Inst) : Inst(Inst) {}
};
- typedef SmallVector<Entry, 8> AccessesType;
+ using AccessesType = SmallVector<Entry, 8>;
AccessesType::const_iterator begin() const { return Accesses.begin(); }
AccessesType::const_iterator end() const { return Accesses.end(); }
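
Aside: the typedef-to-using rewrites here are behavior-neutral; the alias-declaration reads name-first and, unlike typedef, can itself be templated. Sketch:

    // Equivalent declarations:
    typedef DenseMap<Instruction *, int> OldStyle;
    using NewStyle = DenseMap<Instruction *, int>;
    // Only the alias-declaration form can be templated:
    template <typename V> using InstMapOf = DenseMap<Instruction *, V>;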
@@ -594,7 +624,7 @@ class LoopDistributeForLoop {
public:
LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
- : L(L), F(F), LI(LI), LAI(nullptr), DT(DT), SE(SE), ORE(ORE) {
+ : L(L), F(F), LI(LI), DT(DT), SE(SE), ORE(ORE) {
setForced();
}
@@ -755,9 +785,11 @@ public:
++NumLoopsDistributed;
// Report the success.
- ORE->emit(OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
- L->getHeader())
- << "distributed loop");
+ ORE->emit([&]() {
+ return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
+ L->getHeader())
+ << "distributed loop";
+ });
return true;
}
@@ -769,11 +801,13 @@ public:
DEBUG(dbgs() << "Skipping; " << Message << "\n");
// With Rpass-missed report that distribution failed.
- ORE->emit(
- OptimizationRemarkMissed(LDIST_NAME, "NotDistributed", L->getStartLoc(),
- L->getHeader())
- << "loop not distributed: use -Rpass-analysis=loop-distribute for more "
- "info");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
+ L->getStartLoc(), L->getHeader())
+ << "loop not distributed: use -Rpass-analysis=loop-distribute for "
+                   "more info";
+ });
// With Rpass-analysis report why. This is on by default if distribution
// was requested explicitly.
@@ -857,7 +891,7 @@ private:
// Analyses used.
LoopInfo *LI;
- const LoopAccessInfo *LAI;
+ const LoopAccessInfo *LAI = nullptr;
DominatorTree *DT;
ScalarEvolution *SE;
OptimizationRemarkEmitter *ORE;
@@ -871,6 +905,8 @@ private:
Optional<bool> IsForced;
};
+} // end anonymous namespace
+
/// Shared implementation between new and old PMs.
static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
@@ -901,9 +937,13 @@ static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
return Changed;
}
+namespace {
+
/// \brief The pass class.
class LoopDistributeLegacy : public FunctionPass {
public:
+ static char ID;
+
LoopDistributeLegacy() : FunctionPass(ID) {
// The default is set by the caller.
initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
@@ -934,10 +974,9 @@ public:
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
-
- static char ID;
};
-} // anonymous namespace
+
+} // end anonymous namespace
PreservedAnalyses LoopDistributePass::run(Function &F,
FunctionAnalysisManager &AM) {
@@ -956,7 +995,7 @@ PreservedAnalyses LoopDistributePass::run(Function &F,
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & {
- LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
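
Aside: LoopStandardAnalysisResults grew a trailing pointer field in this import, so the aggregate initializer gains a nullptr. Judging from contemporaneous upstream work this slot is the optional MemorySSA analysis (an assumption on the editor's part); LoopDistribute does not compute it, and nullptr advertises "not available":

    // Trailing nullptr = "optional analysis not provided" (assumed MemorySSA).
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr};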
@@ -971,6 +1010,7 @@ PreservedAnalyses LoopDistributePass::run(Function &F,
}
char LoopDistributeLegacy::ID;
+
static const char ldist_name[] = "Loop Distribution";
INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
@@ -982,6 +1022,4 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)
-namespace llvm {
-FunctionPass *createLoopDistributePass() { return new LoopDistributeLegacy(); }
-}
+FunctionPass *llvm::createLoopDistributePass() {
+  return new LoopDistributeLegacy();
+}
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 4a6a35c0ab1b..21551f0a0825 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1,4 +1,4 @@
-//===-- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===//
+//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -38,32 +38,64 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "loop-idiom"
@@ -80,7 +112,7 @@ static cl::opt<bool> UseLIRCodeSizeHeurs(
namespace {
class LoopIdiomRecognize {
- Loop *CurLoop;
+ Loop *CurLoop = nullptr;
AliasAnalysis *AA;
DominatorTree *DT;
LoopInfo *LI;
@@ -96,20 +128,21 @@ public:
TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI,
const DataLayout *DL)
- : CurLoop(nullptr), AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI),
- DL(DL) {}
+ : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL) {}
bool runOnLoop(Loop *L);
private:
- typedef SmallVector<StoreInst *, 8> StoreList;
- typedef MapVector<Value *, StoreList> StoreListMap;
+ using StoreList = SmallVector<StoreInst *, 8>;
+ using StoreListMap = MapVector<Value *, StoreList>;
+
StoreListMap StoreRefsForMemset;
StoreListMap StoreRefsForMemsetPattern;
StoreList StoreRefsForMemcpy;
bool HasMemset;
bool HasMemsetPattern;
bool HasMemcpy;
+
/// Return code for isLegalStore()
enum LegalStoreKind {
None = 0,
@@ -164,6 +197,7 @@ private:
class LoopIdiomRecognizeLegacyPass : public LoopPass {
public:
static char ID;
+
explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
initializeLoopIdiomRecognizeLegacyPassPass(
*PassRegistry::getPassRegistry());
@@ -190,14 +224,16 @@ public:
/// This transformation requires natural loop information & requires that
/// loop preheaders be inserted into the CFG.
- ///
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
getLoopAnalysisUsage(AU);
}
};
-} // End anonymous namespace.
+
+} // end anonymous namespace
+
+char LoopIdiomRecognizeLegacyPass::ID = 0;
PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
@@ -211,7 +247,6 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
return getLoopPassPreservedAnalyses();
}
-char LoopIdiomRecognizeLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
"Recognize loop idioms", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
@@ -299,13 +334,6 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
return MadeChange;
}
-static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) {
- uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
- assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) &&
- "Don't overflow unsigned.");
- return (unsigned)SizeInBits >> 3;
-}
-
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
return ConstStride->getAPInt();
@@ -354,7 +382,6 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
-
// Don't touch volatile stores.
if (SI->isVolatile())
return LegalStoreKind::None;
@@ -424,7 +451,7 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
// Check to see if the stride matches the size of the store. If so, then we
// know that every byte is touched in the loop.
APInt Stride = getStoreStride(StoreEv);
- unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+ unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
if (StoreSize != Stride && StoreSize != -Stride)
return LegalStoreKind::None;
@@ -563,7 +590,7 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
const SCEVAddRecExpr *FirstStoreEv =
cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
APInt FirstStride = getStoreStride(FirstStoreEv);
- unsigned FirstStoreSize = getStoreSizeInBytes(SL[i], DL);
+    unsigned FirstStoreSize =
+        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
// See if we can optimize just this store in isolation.
if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
@@ -656,7 +683,7 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
break;
AdjacentStores.insert(I);
- StoreSize += getStoreSizeInBytes(I, DL);
+ StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
// Move to the next value in the chain.
I = ConsecutiveChain[I];
}
@@ -761,7 +788,8 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
++BI)
for (Instruction &I : **BI)
if (IgnoredStores.count(&I) == 0 &&
- (AA.getModRefInfo(&I, StoreLoc) & Access))
+ isModOrRefSet(
+ intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
return true;
return false;
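
Aside: ModRefInfo became a scoped enum in this import, so the old implicit bitmask test no longer compiles and is replaced by named helpers. Old versus new idiom, per the hunk:

    // Before: relies on implicit conversion of the enum to int.
    //   if (AA.getModRefInfo(&I, StoreLoc) & Access) return true;
    // After: intersect the two masks, then test with a named predicate.
    if (isModOrRefSet(intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
      return true;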
@@ -780,6 +808,41 @@ static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
return SE->getMinusSCEV(Start, Index);
}
+/// Compute the number of bytes as a SCEV from the backedge taken count.
+///
+/// This also maps the SCEV into the provided type and tries to handle the
+/// computation in a way that will fold cleanly.
+static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
+ unsigned StoreSize, Loop *CurLoop,
+ const DataLayout *DL, ScalarEvolution *SE) {
+ const SCEV *NumBytesS;
+ // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
+ // pointer size if it isn't already.
+ //
+ // If we're going to need to zero extend the BE count, check if we can add
+ // one to it prior to zero extending without overflow. Provided this is safe,
+ // it allows better simplification of the +1.
+ if (DL->getTypeSizeInBits(BECount->getType()) <
+ DL->getTypeSizeInBits(IntPtr) &&
+ SE->isLoopEntryGuardedByCond(
+ CurLoop, ICmpInst::ICMP_NE, BECount,
+ SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
+ NumBytesS = SE->getZeroExtendExpr(
+ SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
+ IntPtr);
+ } else {
+ NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
+ SE->getOne(IntPtr), SCEV::FlagNUW);
+ }
+
+ // And scale it based on the store size.
+ if (StoreSize != 1) {
+ NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+ SCEV::FlagNUW);
+ }
+ return NumBytesS;
+}
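
Aside: the two branches of getNumBytes compute the same value; the difference is where the +1 happens. Adding one before the zero-extend folds better, but it can wrap in the narrow type exactly when BECount is the all-ones value, which the loop-entry guard (BECount != -1) rules out. A worked instance, assuming a 32-bit trip count and a 64-bit IntPtr:

    // BECount : i32, IntPtr : i64, StoreSize = 4
    // Guarded path:  NumBytes = (zext i64 (BECount + 1)) * 4
    //                +1 in i32 is NUW-safe because BECount != 0xFFFFFFFF.
    // Fallback path: NumBytes = ((zext i64 BECount) + 1) * 4
    //                always safe, but the i64 add simplifies less.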
+
/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
@@ -824,8 +887,8 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// base pointer and checking the region.
Value *BasePtr =
Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
- if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize,
- *AA, Stores)) {
+ if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
+ StoreSize, *AA, Stores)) {
Expander.clear();
// If we generated new code for the base pointer, clean up.
RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
@@ -837,16 +900,8 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// Okay, everything looks good, insert the memset.
- // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
- // pointer size if it isn't already.
- BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
-
const SCEV *NumBytesS =
- SE->getAddExpr(BECount, SE->getOne(IntPtr), SCEV::FlagNUW);
- if (StoreSize != 1) {
- NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
- SCEV::FlagNUW);
- }
+ getNumBytes(BECount, IntPtr, StoreSize, CurLoop, DL, SE);
// TODO: ideally we should still be able to generate memset if SCEV expander
// is taught to generate the dependencies at the latest point.
@@ -903,7 +958,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
Value *StorePtr = SI->getPointerOperand();
const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
APInt Stride = getStoreStride(StoreEv);
- unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+ unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
bool NegStride = StoreSize == -Stride;
// The store must be feeding a non-volatile load.
@@ -942,7 +997,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
SmallPtrSet<Instruction *, 1> Stores;
Stores.insert(SI);
- if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
+ if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
StoreSize, *AA, Stores)) {
Expander.clear();
// If we generated new code for the base pointer, clean up.
@@ -962,8 +1017,8 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
Value *LoadBasePtr = Expander.expandCodeFor(
LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
- if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
- *AA, Stores)) {
+ if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
+ StoreSize, *AA, Stores)) {
Expander.clear();
// If we generated new code for the base pointer, clean up.
RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
@@ -976,16 +1031,8 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
// Okay, everything is safe, we can transform this!
- // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
- // pointer size if it isn't already.
- BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
-
const SCEV *NumBytesS =
- SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
-
- if (StoreSize != 1)
- NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
- SCEV::FlagNUW);
+ getNumBytes(BECount, IntPtrTy, StoreSize, CurLoop, DL, SE);
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
@@ -1010,16 +1057,12 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
return false;
+ // Create the call.
+ // Note that unordered atomic loads/stores are *required* by the spec to
+ // have an alignment but non-atomic loads/stores may not.
NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
- StoreBasePtr, LoadBasePtr, NumBytes, StoreSize);
-
- // Propagate alignment info onto the pointer args. Note that unordered
- // atomic loads/stores are *required* by the spec to have an alignment
- // but non-atomic loads/stores may not.
- NewCall->addParamAttr(0, Attribute::getWithAlignment(NewCall->getContext(),
- SI->getAlignment()));
- NewCall->addParamAttr(1, Attribute::getWithAlignment(NewCall->getContext(),
- LI->getAlignment()));
+ StoreBasePtr, SI->getAlignment(), LoadBasePtr, LI->getAlignment(),
+ NumBytes, StoreSize);
}
NewCall->setDebugLoc(SI->getDebugLoc());
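
Aside: the element-unordered-atomic memcpy builder now takes the alignments directly, replacing the two addParamAttr fix-ups; since the spec requires unordered atomic transfers to carry alignment, threading it through the create call keeps that invariant in one place. The call shape, per the hunk:

    NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
        StoreBasePtr, SI->getAlignment(),   // destination and its alignment
        LoadBasePtr, LI->getAlignment(),    // source and its alignment
        NumBytes, StoreSize);               // byte count and element size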
@@ -1273,9 +1316,9 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
// step 2: detect instructions corresponding to "x.next = x >> 1"
if (!DefX || DefX->getOpcode() != Instruction::AShr)
return false;
- if (ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)))
- if (!Shft || !Shft->isOne())
- return false;
+ ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+ if (!Shft || !Shft->isOne())
+ return false;
VarX = DefX->getOperand(0);
// step 3: Check the recurrence of variable X
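
Aside: the rewrite above fixes a latent bug, not just style. In the old shape the null test sat inside the dyn_cast-guarded if, so when the shift amount was not a ConstantInt the body was skipped and the function fell through without rejecting the candidate. Side by side:

    // Old (buggy): non-constant shift amounts were silently accepted.
    if (ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)))
      if (!Shft || !Shft->isOne())    // !Shft can never be true here
        return false;

    // New: rejects both non-constant and non-one shift amounts.
    ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
    if (!Shft || !Shft->isOne())
      return false;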
@@ -1469,7 +1512,7 @@ static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
/// PhiX = PHI [InitX, DefX]
/// CntInst = CntPhi + 1
/// DefX = PhiX >> 1
-// LOOP_BODY
+/// LOOP_BODY
/// Br: loop if (DefX != 0)
/// Use(CntPhi) or Use(CntInst)
///
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index af095560cc02..40d468a084d4 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -12,22 +12,33 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
+#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/User.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <algorithm>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "loop-instsimplify"
@@ -45,7 +56,7 @@ static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI,
// The bit we are stealing from the pointer represents whether this basic
// block is the header of a subloop, in which case we only process its phis.
- typedef PointerIntPair<BasicBlock *, 1> WorklistItem;
+ using WorklistItem = PointerIntPair<BasicBlock *, 1>;
SmallVector<WorklistItem, 16> VisitStack;
SmallPtrSet<BasicBlock *, 32> Visited;
@@ -151,9 +162,11 @@ static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI,
}
namespace {
+
class LoopInstSimplifyLegacyPass : public LoopPass {
public:
static char ID; // Pass ID, replacement for typeid
+
LoopInstSimplifyLegacyPass() : LoopPass(ID) {
initializeLoopInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
}
@@ -181,7 +194,8 @@ public:
getLoopAnalysisUsage(AU);
}
};
-}
+
+} // end anonymous namespace
PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
@@ -195,6 +209,7 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
}
char LoopInstSimplifyLegacyPass::ID = 0;
+
INITIALIZE_PASS_BEGIN(LoopInstSimplifyLegacyPass, "loop-instsimplify",
"Simplify instructions in loops", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp
index 2e0d8e0374c0..4f8dafef230a 100644
--- a/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1,4 +1,4 @@
-//===- LoopInterchange.cpp - Loop interchange pass------------------------===//
+//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,33 +13,38 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <cassert>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -51,10 +56,12 @@ static cl::opt<int> LoopInterchangeCostThreshold(
namespace {
-typedef SmallVector<Loop *, 8> LoopVector;
+using LoopVector = SmallVector<Loop *, 8>;
// TODO: Check if we can use a sparse matrix here.
-typedef std::vector<std::vector<char>> CharMatrix;
+using CharMatrix = std::vector<std::vector<char>>;
+
+} // end anonymous namespace
// Maximum number of dependencies that can be handled in the dependency matrix.
static const unsigned MaxMemInstrCount = 100;
@@ -62,14 +69,11 @@ static const unsigned MaxMemInstrCount = 100;
// Maximum loop depth supported.
static const unsigned MaxLoopNestDepth = 10;
-struct LoopInterchange;
-
#ifdef DUMP_DEP_MATRICIES
-void printDepMatrix(CharMatrix &DepMatrix) {
- for (auto I = DepMatrix.begin(), E = DepMatrix.end(); I != E; ++I) {
- std::vector<char> Vec = *I;
- for (auto II = Vec.begin(), EE = Vec.end(); II != EE; ++II)
- DEBUG(dbgs() << *II << " ");
+static void printDepMatrix(CharMatrix &DepMatrix) {
+ for (auto &Row : DepMatrix) {
+ for (auto D : Row)
+ DEBUG(dbgs() << D << " ");
DEBUG(dbgs() << "\n");
}
}
@@ -77,25 +81,24 @@ void printDepMatrix(CharMatrix &DepMatrix) {
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
Loop *L, DependenceInfo *DI) {
- typedef SmallVector<Value *, 16> ValueVector;
+ using ValueVector = SmallVector<Value *, 16>;
+
ValueVector MemInstr;
// For each block.
- for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end();
- BB != BE; ++BB) {
+ for (BasicBlock *BB : L->blocks()) {
// Scan the BB and collect legal loads and stores.
- for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E;
- ++I) {
+ for (Instruction &I : *BB) {
if (!isa<Instruction>(I))
return false;
- if (LoadInst *Ld = dyn_cast<LoadInst>(I)) {
+ if (auto *Ld = dyn_cast<LoadInst>(&I)) {
if (!Ld->isSimple())
return false;
- MemInstr.push_back(&*I);
- } else if (StoreInst *St = dyn_cast<StoreInst>(I)) {
+ MemInstr.push_back(&I);
+ } else if (auto *St = dyn_cast<StoreInst>(&I)) {
if (!St->isSimple())
return false;
- MemInstr.push_back(&*I);
+ MemInstr.push_back(&I);
}
}
}
@@ -171,7 +174,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
}
// We don't have a DepMatrix to check legality return false.
- if (DepMatrix.size() == 0)
+ if (DepMatrix.empty())
return false;
return true;
}
@@ -216,7 +219,6 @@ static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
unsigned OuterLoopId, char InnerDep,
char OuterDep) {
-
if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
return false;
@@ -255,7 +257,6 @@ static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
unsigned InnerLoopId,
unsigned OuterLoopId) {
-
unsigned NumRows = DepMatrix.size();
// For each row check if it is valid to interchange.
for (unsigned Row = 0; Row < NumRows; ++Row) {
@@ -270,7 +271,6 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
}
static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
-
DEBUG(dbgs() << "Calling populateWorklist on Func: "
<< L.getHeader()->getParent()->getName() << " Loop: %"
<< L.getHeader()->getName() << '\n');
@@ -320,6 +320,8 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
return nullptr;
}
+namespace {
+
/// LoopInterchangeLegality checks if it is legal to interchange the loop.
class LoopInterchangeLegality {
public:
@@ -327,11 +329,12 @@ public:
LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA,
OptimizationRemarkEmitter *ORE)
: OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
- PreserveLCSSA(PreserveLCSSA), ORE(ORE), InnerLoopHasReduction(false) {}
+ PreserveLCSSA(PreserveLCSSA), ORE(ORE) {}
/// Check if the loops can be interchanged.
bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
CharMatrix &DepMatrix);
+
/// Check if the loop structure is understood. We do not handle triangular
/// loops for now.
bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
@@ -348,6 +351,7 @@ private:
bool findInductionAndReductions(Loop *L,
SmallVector<PHINode *, 8> &Inductions,
SmallVector<PHINode *, 8> &Reductions);
+
Loop *OuterLoop;
Loop *InnerLoop;
@@ -355,10 +359,11 @@ private:
LoopInfo *LI;
DominatorTree *DT;
bool PreserveLCSSA;
+
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
- bool InnerLoopHasReduction;
+ bool InnerLoopHasReduction = false;
};
/// LoopInterchangeProfitability checks if it is profitable to interchange the
@@ -381,6 +386,7 @@ private:
/// Scev analysis.
ScalarEvolution *SE;
+
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
};
@@ -415,6 +421,7 @@ private:
/// Scev analysis.
ScalarEvolution *SE;
+
LoopInfo *LI;
DominatorTree *DT;
BasicBlock *LoopExit;
@@ -424,16 +431,16 @@ private:
// Main LoopInterchange Pass.
struct LoopInterchange : public FunctionPass {
static char ID;
- ScalarEvolution *SE;
- LoopInfo *LI;
- DependenceInfo *DI;
- DominatorTree *DT;
+ ScalarEvolution *SE = nullptr;
+ LoopInfo *LI = nullptr;
+ DependenceInfo *DI = nullptr;
+ DominatorTree *DT = nullptr;
bool PreserveLCSSA;
+
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
- LoopInterchange()
- : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) {
+ LoopInterchange() : FunctionPass(ID) {
initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
}
@@ -501,7 +508,6 @@ struct LoopInterchange : public FunctionPass {
}
bool processLoopList(LoopVector LoopList, Function &F) {
-
bool Changed = false;
unsigned LoopNestDepth = LoopList.size();
if (LoopNestDepth < 2) {
@@ -580,7 +586,6 @@ struct LoopInterchange : public FunctionPass {
bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
unsigned OuterLoopId, BasicBlock *LoopNestExit,
std::vector<std::vector<char>> &DependencyMatrix) {
-
DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
<< " and OuterLoopId = " << OuterLoopId << "\n");
Loop *InnerLoop = LoopList[InnerLoopId];
@@ -599,10 +604,12 @@ struct LoopInterchange : public FunctionPass {
return false;
}
- ORE->emit(OptimizationRemark(DEBUG_TYPE, "Interchanged",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Loop interchanged with enclosing loop.");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Interchanged",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Loop interchanged with enclosing loop.";
+ });
LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
LoopNestExit, LIL.hasInnerLoopReduction());
@@ -612,9 +619,10 @@ struct LoopInterchange : public FunctionPass {
}
};
-} // end of namespace
+} // end anonymous namespace
+
bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
- return none_of(Ins->users(), [=](User *U) -> bool {
+ return llvm::none_of(Ins->users(), [=](User *U) -> bool {
auto *UserIns = dyn_cast<PHINode>(U);
RecurrenceDescriptor RD;
return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD);
@@ -664,11 +672,9 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
if (!OuterLoopHeaderBI)
return false;
- for (unsigned i = 0, e = OuterLoopHeaderBI->getNumSuccessors(); i < e; ++i) {
- if (OuterLoopHeaderBI->getSuccessor(i) != InnerLoopPreHeader &&
- OuterLoopHeaderBI->getSuccessor(i) != OuterLoopLatch)
+ for (BasicBlock *Succ : OuterLoopHeaderBI->successors())
+ if (Succ != InnerLoopPreHeader && Succ != OuterLoopLatch)
return false;
- }
DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
// We do not have any basic block in between now make sure the outer header
@@ -682,10 +688,8 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
return true;
}
-
bool LoopInterchangeLegality::isLoopStructureUnderstood(
PHINode *InnerInduction) {
-
unsigned Num = InnerInduction->getNumOperands();
BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
for (unsigned i = 0; i < Num; ++i) {
@@ -750,12 +754,12 @@ static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock,
BasicBlock *LoopHeader) {
if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) {
- unsigned Num = BI->getNumSuccessors();
- assert(Num == 2);
- for (unsigned i = 0; i < Num; ++i) {
- if (BI->getSuccessor(i) == LoopHeader)
+ assert(BI->getNumSuccessors() == 2 &&
+ "Branch leaving loop latch must have 2 successors");
+ for (BasicBlock *Succ : BI->successors()) {
+ if (Succ == LoopHeader)
continue;
- return BI->getSuccessor(i);
+ return Succ;
}
}
return nullptr;
@@ -764,7 +768,6 @@ static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock,
// This function indicates the current limitations in the transform as a result
// of which we do not proceed.
bool LoopInterchangeLegality::currentLimitations() {
-
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
@@ -777,12 +780,13 @@ bool LoopInterchangeLegality::currentLimitations() {
if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
<< "are supported currently.\n");
- ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
- "UnsupportedPHIInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Only inner loops with induction or reduction PHI nodes can be"
- " interchange currently.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Only inner loops with induction or reduction PHI nodes can be"
+                " interchanged currently.";
+ });
return true;
}
@@ -790,12 +794,13 @@ bool LoopInterchangeLegality::currentLimitations() {
if (Inductions.size() != 1) {
DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
<< "Failed to interchange due to current limitation\n");
- ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
- "MultiInductionInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Only inner loops with 1 induction variable can be "
- "interchanged currently.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Only inner loops with 1 induction variable can be "
+ "interchanged currently.";
+ });
return true;
}
if (Reductions.size() > 0)
@@ -806,12 +811,13 @@ bool LoopInterchangeLegality::currentLimitations() {
if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
<< "are supported currently.\n");
- ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
- "UnsupportedPHIOuter",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Only outer loops with induction or reduction PHI nodes can be"
- " interchanged currently.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Only outer loops with induction or reduction PHI nodes can be"
+ " interchanged currently.";
+ });
return true;
}
@@ -820,35 +826,38 @@ bool LoopInterchangeLegality::currentLimitations() {
if (!Reductions.empty()) {
DEBUG(dbgs() << "Outer loops with reductions are not supported "
<< "currently.\n");
- ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
- "ReductionsOuter",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Outer loops with reductions cannot be interchangeed "
- "currently.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+                << "Outer loops with reductions cannot be interchanged "
+ "currently.";
+ });
return true;
}
// TODO: Currently we handle only loops with 1 induction variable.
if (Inductions.size() != 1) {
DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
<< "supported currently.\n");
- ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
- "MultiIndutionOuter",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Only outer loops with 1 induction variable can be "
- "interchanged currently.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Only outer loops with 1 induction variable can be "
+ "interchanged currently.";
+ });
return true;
}
// TODO: Triangular loops are not handled for now.
if (!isLoopStructureUnderstood(InnerInductionVar)) {
DEBUG(dbgs() << "Loop structure not understood by pass\n");
- ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
- "UnsupportedStructureInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Inner loop structure not understood currently.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Inner loop structure not understood currently.";
+ });
return true;
}
@@ -857,24 +866,26 @@ bool LoopInterchangeLegality::currentLimitations() {
getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) {
DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
- ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
- "NoLCSSAPHIOuter",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Only outer loops with LCSSA PHIs can be interchange "
- "currently.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+                << "Only outer loops with LCSSA PHIs can be interchanged "
+ "currently.";
+ });
return true;
}
LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) {
DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
- ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
- "NoLCSSAPHIOuterInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Only inner loops with LCSSA PHIs can be interchange "
- "currently.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+                << "Only inner loops with LCSSA PHIs can be interchanged "
+ "currently.";
+ });
return true;
}
@@ -899,11 +910,12 @@ bool LoopInterchangeLegality::currentLimitations() {
if (!InnerIndexVarInc) {
DEBUG(dbgs() << "Did not find an instruction to increment the induction "
<< "variable.\n");
- ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
- "NoIncrementInInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "The inner loop does not increment the induction variable.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "The inner loop does not increment the induction variable.";
+ });
return true;
}
@@ -912,8 +924,9 @@ bool LoopInterchangeLegality::currentLimitations() {
// instruction.
bool FoundInduction = false;
- for (const Instruction &I : reverse(*InnerLoopLatch)) {
- if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I))
+ for (const Instruction &I : llvm::reverse(*InnerLoopLatch)) {
+ if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
+ isa<ZExtInst>(I))
continue;
// We found an instruction. If this is not induction variable then it is not
@@ -921,12 +934,13 @@ bool LoopInterchangeLegality::currentLimitations() {
if (!I.isIdenticalTo(InnerIndexVarInc)) {
DEBUG(dbgs() << "Found unsupported instructions between induction "
<< "variable increment and branch.\n");
- ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
- "UnsupportedInsBetweenInduction",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Found unsupported instruction between induction variable "
- "increment and branch.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "UnsupportedInsBetweenInduction",
+ InnerLoop->getStartLoc(), InnerLoop->getHeader())
+ << "Found unsupported instruction between induction variable "
+ "increment and branch.";
+ });
return true;
}
@@ -937,11 +951,12 @@ bool LoopInterchangeLegality::currentLimitations() {
// current limitation.
if (!FoundInduction) {
DEBUG(dbgs() << "Did not find the induction variable.\n");
- ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
- "NoIndutionVariable",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Did not find the induction variable.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Did not find the induction variable.";
+ });
return true;
}
return false;
@@ -950,19 +965,31 @@ bool LoopInterchangeLegality::currentLimitations() {
bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
unsigned OuterLoopId,
CharMatrix &DepMatrix) {
-
if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
<< " and OuterLoopId = " << OuterLoopId
<< " due to dependence\n");
- ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
- "Dependence",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Cannot interchange loops due to dependences.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Cannot interchange loops due to dependences.";
+ });
return false;
}
+ // Check if outer and inner loop contain legal instructions only.
+ for (auto *BB : OuterLoop->blocks())
+ for (Instruction &I : *BB)
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ // readnone functions do not prevent interchanging.
+ if (CI->doesNotReadMemory())
+ continue;
+ DEBUG(dbgs() << "Loops with call instructions cannot be interchanged "
+                     << "safely.\n");
+ return false;
+ }
+
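
Aside: the new legality scan conservatively bails on any call that may read memory, because interchange reorders iterations and a reading call could observe stores in a different order; readnone calls cannot tell. Note that OuterLoop->blocks() already includes the inner loop's blocks, so one walk covers both loops. Condensed:

    for (BasicBlock *BB : OuterLoop->blocks())   // includes InnerLoop's blocks
      for (Instruction &I : *BB)
        if (auto *CI = dyn_cast<CallInst>(&I))
          if (!CI->doesNotReadMemory())          // readnone is harmless
            return false;                        // anything else blocks it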
// Create unique Preheaders if we already do not have one.
BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
@@ -995,12 +1022,13 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
// Check if the loops are tightly nested.
if (!tightlyNested(OuterLoop, InnerLoop)) {
DEBUG(dbgs() << "Loops not tightly nested\n");
- ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
- "NotTightlyNested",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Cannot interchange loops because they are not tightly "
- "nested.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Cannot interchange loops because they are not tightly "
+ "nested.";
+ });
return false;
}
@@ -1010,9 +1038,8 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
int LoopInterchangeProfitability::getInstrOrderCost() {
unsigned GoodOrder, BadOrder;
BadOrder = GoodOrder = 0;
- for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end();
- BI != BE; ++BI) {
- for (Instruction &Ins : **BI) {
+ for (BasicBlock *BB : InnerLoop->blocks()) {
+ for (Instruction &Ins : *BB) {
if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
unsigned NumOp = GEP->getNumOperands();
bool FoundInnerInduction = false;
@@ -1064,12 +1091,11 @@ static bool isProfitableForVectorization(unsigned InnerLoopId,
// TODO: Improve this heuristic to catch more cases.
// If the inner loop is loop independent or doesn't carry any dependency it is
// profitable to move this to outer position.
- unsigned Row = DepMatrix.size();
- for (unsigned i = 0; i < Row; ++i) {
- if (DepMatrix[i][InnerLoopId] != 'S' && DepMatrix[i][InnerLoopId] != 'I')
+ for (auto &Row : DepMatrix) {
+ if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
return false;
// TODO: We need to improve this heuristic.
- if (DepMatrix[i][OuterLoopId] != '=')
+ if (Row[OuterLoopId] != '=')
return false;
}
// If outer loop has dependence and inner loop is loop independent then it is
@@ -1080,7 +1106,6 @@ static bool isProfitableForVectorization(unsigned InnerLoopId,
bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
unsigned OuterLoopId,
CharMatrix &DepMatrix) {
-
// TODO: Add better profitability checks.
// e.g
// 1) Construct dependency matrix and move the one with no loop carried dep
@@ -1099,14 +1124,15 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
return true;
- ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
- "InterchangeNotProfitable",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Interchanging loops is too costly (cost="
- << ore::NV("Cost", Cost) << ", threshold="
- << ore::NV("Threshold", LoopInterchangeCostThreshold) <<
- ") and it does not improve parallelism.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Interchanging loops is too costly (cost="
+ << ore::NV("Cost", Cost) << ", threshold="
+ << ore::NV("Threshold", LoopInterchangeCostThreshold)
+ << ") and it does not improve parallelism.";
+ });
return false;
}
@@ -1145,7 +1171,7 @@ bool LoopInterchangeTransform::transform() {
bool Transformed = false;
Instruction *InnerIndexVar;
- if (InnerLoop->getSubLoops().size() == 0) {
+ if (InnerLoop->getSubLoops().empty()) {
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
DEBUG(dbgs() << "Calling Split Inner Loop\n");
PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
@@ -1159,7 +1185,11 @@ bool LoopInterchangeTransform::transform() {
else
InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
- //
+ // Ensure that InductionPHI is the first Phi node as required by
+ // splitInnerLoopHeader
+ if (&InductionPHI->getParent()->front() != InductionPHI)
+ InductionPHI->moveBefore(&InductionPHI->getParent()->front());
+
// Split at the place where the induction variable is
// incremented/decremented.
// TODO: This splitting logic may not work always. Fix this.
@@ -1188,13 +1218,12 @@ void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
}
void LoopInterchangeTransform::splitInnerLoopHeader() {
-
// Split the inner loop header out. Here make sure that the reduction PHIs
// stay in the inner loop body.
BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
if (InnerLoopHasReduction) {
- // FIXME: Check if the induction PHI will always be the first PHI.
+ // Note: The induction PHI must be the first PHI for this to work
BasicBlock *New = InnerLoopHeader->splitBasicBlock(
++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split");
if (LI)
@@ -1244,7 +1273,6 @@ void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
}
bool LoopInterchangeTransform::adjustLoopBranches() {
-
DEBUG(dbgs() << "adjustLoopBranches called\n");
// Adjust the loop preheader
BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
@@ -1352,8 +1380,8 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
return true;
}
-void LoopInterchangeTransform::adjustLoopPreheaders() {
+void LoopInterchangeTransform::adjustLoopPreheaders() {
// We have interchanged the preheaders so we need to interchange the data in
// the preheader as well.
// This is because the content of the inner preheader was previously executed
@@ -1373,7 +1401,6 @@ void LoopInterchangeTransform::adjustLoopPreheaders() {
}
bool LoopInterchangeTransform::adjustLoopLinks() {
-
// Adjust all branches in the inner and outer loop.
bool Changed = adjustLoopBranches();
if (Changed)
@@ -1382,6 +1409,7 @@ bool LoopInterchangeTransform::adjustLoopLinks() {
}
char LoopInterchange::ID = 0;
+
INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange",
"Interchanges loops for cache reuse", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp
index 20b37c4b70e6..dfa5ec1f354d 100644
--- a/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -28,22 +28,29 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <algorithm>
@@ -53,11 +60,11 @@
#include <tuple>
#include <utility>
+using namespace llvm;
+
#define LLE_OPTION "loop-load-elim"
#define DEBUG_TYPE LLE_OPTION
-using namespace llvm;
-
static cl::opt<unsigned> CheckPerElim(
"runtime-check-per-loop-load-elim", cl::Hidden,
cl::desc("Max number of memchecks allowed per eliminated load on average"),
@@ -127,10 +134,12 @@ struct StoreToLoadForwardingCandidate {
#endif
};
+} // end anonymous namespace
+
/// \brief Check if the store dominates all latches, so that, as long as there
/// is no intervening store, this value will be loaded in the next iteration.
-bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
- DominatorTree *DT) {
+static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
+ DominatorTree *DT) {
SmallVector<BasicBlock *, 8> Latches;
L->getLoopLatches(Latches);
return llvm::all_of(Latches, [&](const BasicBlock *Latch) {
@@ -143,6 +152,8 @@ static bool isLoadConditional(LoadInst *Load, Loop *L) {
return Load->getParent() != L->getHeader();
}
+namespace {
+
/// \brief The per-loop class that does most of the work.
class LoadEliminationForLoop {
public:
@@ -241,8 +252,8 @@ public:
std::forward_list<StoreToLoadForwardingCandidate> &Candidates) {
// If Store is nullptr it means that we have multiple stores forwarding to
// this load.
- typedef DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *>
- LoadToSingleCandT;
+ using LoadToSingleCandT =
+ DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *>;
LoadToSingleCandT LoadToSingleCand;
for (const auto &Cand : Candidates) {
@@ -393,7 +404,6 @@ public:
void
propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
SCEVExpander &SEE) {
- //
// loop:
// %x = load %gep_i
// = ... %x
@@ -431,6 +441,7 @@ public:
bool processLoop() {
DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
<< "\" checking " << *L << "\n");
+
// Look for store-to-load forwarding cases across the
// backedge. E.g.:
//
@@ -558,6 +569,8 @@ private:
PredicatedScalarEvolution PSE;
};
+} // end anonymous namespace
+
static bool
eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
function_ref<const LoopAccessInfo &(Loop &)> GetLAI) {
@@ -584,10 +597,14 @@ eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
return Changed;
}
+namespace {
+
/// \brief The pass. Most of the work is delegated to the per-loop
/// LoadEliminationForLoop class.
class LoopLoadElimination : public FunctionPass {
public:
+ static char ID;
+
LoopLoadElimination() : FunctionPass(ID) {
initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry());
}
@@ -616,13 +633,12 @@ public:
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
-
- static char ID;
};
} // end anonymous namespace
char LoopLoadElimination::ID;
+
static const char LLE_name[] = "Loop Load Elimination";
INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
@@ -633,9 +649,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
-namespace llvm {
-
-FunctionPass *createLoopLoadEliminationPass() {
+FunctionPass *llvm::createLoopLoadEliminationPass() {
return new LoopLoadElimination();
}
@@ -652,7 +666,8 @@ PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
bool Changed = eliminateLoadsAcrossLoops(
F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & {
- LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI,
+ SE, TLI, TTI, nullptr};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
});
@@ -662,5 +677,3 @@ PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
PreservedAnalyses PA;
return PA;
}
-
-} // end namespace llvm
diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp
index 9b12ba180444..2e4c7b19e476 100644
--- a/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/lib/Transforms/Scalar/LoopPredication.cpp
@@ -34,6 +34,143 @@
// else
// deoptimize
//
+// It's tempting to rely on SCEV here, but it has proven to be problematic.
+// Generally the facts SCEV provides about the increment step of add
+// recurrences are true if the backedge of the loop is taken, which implicitly
+// assumes that the guard doesn't fail. Using these facts to optimize the
+// guard results in a circular logic where the guard is optimized under the
+// assumption that it never fails.
+//
+// For example, in the loop below the induction variable will be marked as nuw
+// based on the guard. Based on nuw, the guard predicate will be considered
+// monotonic. Given a monotonic condition it's tempting to replace the induction
+// variable in the condition with its value on the last iteration. But this
+// transformation is not correct: e.g. e = 4, b = 5 breaks the loop, since i
+// must wrap around the entire unsigned range before it can equal e.
+//
+// for (int i = b; i != e; i++)
+// guard(i u< len)
+//
+// One of the ways to reason about this problem is to use an inductive proof
+// approach. Given the loop:
+//
+// if (B(0)) {
+// do {
+// I = PHI(0, I.INC)
+// I.INC = I + Step
+// guard(G(I));
+// } while (B(I));
+// }
+//
+// where B(x) and G(x) are predicates that map integers to booleans, we want a
+// loop invariant expression M such that the following program has the same semantics
+// as the above:
+//
+// if (B(0)) {
+// do {
+// I = PHI(0, I.INC)
+// I.INC = I + Step
+// guard(G(0) && M);
+// } while (B(I));
+// }
+//
+// One solution for M is M = forall X . (G(X) && B(X)) => G(X + Step)
+//
+// Informal proof that the transformation above is correct:
+//
+// By the definition of guards we can rewrite the guard condition to:
+// G(I) && G(0) && M
+//
+// Let's prove that for each iteration of the loop:
+// G(0) && M => G(I)
+// Then the guard condition above simplifies to G(0) && M.
+//
+// Induction base.
+// G(0) && M => G(0)
+//
+// Induction step. Assuming G(0) && M => G(I), we show that it also implies
+// G(I + Step) on the subsequent iteration:
+//
+// B(I) is true because it's the backedge condition.
+// G(I) is true because the backedge is guarded by this condition.
+//
+// So M = forall X . (G(X) && B(X)) => G(X + Step) implies G(I + Step).
+//
+// Note that we can use anything stronger than M, i.e. any condition which
+// implies M.
+//
+// When S = 1 (i.e. forward iterating loop), the transformation is supported
+// when:
+// * The loop has a single latch with the condition of the form:
+// B(X) = latchStart + X <pred> latchLimit,
+// where <pred> is u<, u<=, s<, or s<=.
+// * The guard condition is of the form
+// G(X) = guardStart + X u< guardLimit
+//
+// For the ult latch comparison case M is:
+//   forall X . guardStart + X u< guardLimit && latchStart + X u< latchLimit =>
+// guardStart + X + 1 u< guardLimit
+//
+// The only way the antecedent can be true and the consequent can be false is
+// if
+// X == guardLimit - 1 - guardStart
+// (and guardLimit is non-zero, but we won't use this latter fact).
+// If X == guardLimit - 1 - guardStart then the second half of the antecedent is
+// latchStart + guardLimit - 1 - guardStart u< latchLimit
+// and its negation is
+// latchStart + guardLimit - 1 - guardStart u>= latchLimit
+//
+// In other words, if
+// latchLimit u<= latchStart + guardLimit - 1 - guardStart
+// then:
+// (the ranges below are written in ConstantRange notation, where [A, B) is the
+// set for (I = A; I != B; I++ /*maywrap*/) yield(I);)
+//
+// forall X . guardStart + X u< guardLimit &&
+// latchStart + X u< latchLimit =>
+// guardStart + X + 1 u< guardLimit
+// == forall X . guardStart + X u< guardLimit &&
+// latchStart + X u< latchStart + guardLimit - 1 - guardStart =>
+// guardStart + X + 1 u< guardLimit
+// == forall X . (guardStart + X) in [0, guardLimit) &&
+// (latchStart + X) in [0, latchStart + guardLimit - 1 - guardStart) =>
+// (guardStart + X + 1) in [0, guardLimit)
+// == forall X . X in [-guardStart, guardLimit - guardStart) &&
+// X in [-latchStart, guardLimit - 1 - guardStart) =>
+// X in [-guardStart - 1, guardLimit - guardStart - 1)
+// == true
+//
+// So the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart u>= latchLimit
+// Similarly for ule condition the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart u> latchLimit
+// For slt condition the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart s>= latchLimit
+// For sle condition the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart s> latchLimit
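+//
+// As a concrete instance (an illustrative example, not part of the original
+// comment), take the rotated loop
+//   for (i = 0; i u< n; i++)
+//     guard(i u< len)
+// whose latch compares i + 1 u< n, so guardStart = 0, guardLimit = len,
+// latchStart = 1, and latchLimit = n. The ult widened condition becomes
+//   0 u< len && 1 + len - 1 - 0 u>= n
+// i.e. len != 0 && len u>= n, which indeed implies i u< len for every
+// i in [0, n).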
+//
+// When S = -1 (i.e. reverse iterating loop), the transformation is supported
+// when:
+// * The loop has a single latch with the condition of the form:
+// B(X) = X <pred> latchLimit, where <pred> is u> or s>.
+// * The guard condition is of the form
+// G(X) = X - 1 u< guardLimit
+//
+// For the ugt latch comparison case M is:
+// forall X. X-1 u< guardLimit and X u> latchLimit => X-2 u< guardLimit
+//
+// The only way the antecedent can be true and the consequent can be false is if
+// X == 1.
+// If X == 1 then the second half of the antecedent is
+// 1 u> latchLimit, and its negation is latchLimit u>= 1.
+//
+// So the widened condition is:
+// guardStart u< guardLimit && latchLimit u>= 1.
+// Similarly for sgt condition the widened condition is:
+// guardStart u< guardLimit && latchLimit s>= 1.
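+//
+// As an illustrative example (again not part of the original comment), take
+// the count-down loop
+//   for (i = n; i u> 1; i--)
+//     guard(i - 1 u< len)
+// Here guardStart = n - 1 and latchLimit = 1, so the widened condition is
+//   n - 1 u< len && 1 u>= 1
+// The first-iteration check alone suffices because the guarded values only
+// decrease from n - 1 towards 0.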
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/LoopPredication.h"
@@ -56,6 +193,11 @@
using namespace llvm;
+static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation",
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> EnableCountDownLoop("loop-predication-enable-count-down-loop",
+ cl::Hidden, cl::init(true));
namespace {
class LoopPredication {
/// Represents an induction variable check:
@@ -68,6 +210,10 @@ class LoopPredication {
const SCEV *Limit)
: Pred(Pred), IV(IV), Limit(Limit) {}
LoopICmp() {}
+ void dump() {
+ dbgs() << "LoopICmp Pred = " << Pred << ", IV = " << *IV
+ << ", Limit = " << *Limit << "\n";
+ }
};
ScalarEvolution *SE;
@@ -75,17 +221,51 @@ class LoopPredication {
Loop *L;
const DataLayout *DL;
BasicBlock *Preheader;
+ LoopICmp LatchCheck;
- Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI);
+ bool isSupportedStep(const SCEV* Step);
+ Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI) {
+ return parseLoopICmp(ICI->getPredicate(), ICI->getOperand(0),
+ ICI->getOperand(1));
+ }
+ Optional<LoopICmp> parseLoopICmp(ICmpInst::Predicate Pred, Value *LHS,
+ Value *RHS);
+
+ Optional<LoopICmp> parseLoopLatchICmp();
+ bool CanExpand(const SCEV* S);
Value *expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder,
ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS,
Instruction *InsertAt);
Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
IRBuilder<> &Builder);
+ Optional<Value *> widenICmpRangeCheckIncrementingLoop(LoopICmp LatchCheck,
+ LoopICmp RangeCheck,
+ SCEVExpander &Expander,
+ IRBuilder<> &Builder);
+ Optional<Value *> widenICmpRangeCheckDecrementingLoop(LoopICmp LatchCheck,
+ LoopICmp RangeCheck,
+ SCEVExpander &Expander,
+ IRBuilder<> &Builder);
bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
+ // When the IV type is wider than the range operand type, we can still do loop
+ // predication, by generating SCEVs for the range and latch that are of the
+ // same type. We achieve this by generating a SCEV truncate expression for the
+ // latch IV. This is done iff truncation of the IV is a safe operation,
+ // without loss of information.
+ // Another way to achieve this is by generating a wider type SCEV for the
+ // range check operand, however, this needs a more involved check that
+ // operands do not overflow. This can lead to loss of information when the
+ // range operand is of the form: add i32 %offset, %iv. We need to prove that
+  // sext(x + y) is the same as sext(x) + sext(y).
+ // This function returns true if we can safely represent the IV type in
+ // the RangeCheckType without loss of information.
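+  // For instance (an illustrative case): a latch IV {0,+,1} of type i64 with
+  // a constant limit of 100 can be truncated to an i32 range check type,
+  // since both bounds need far fewer than 32 active bits and the latch
+  // predicate is monotonic.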
+ bool isSafeToTruncateWideIVType(Type *RangeCheckType);
+ // Return the loopLatchCheck corresponding to the RangeCheckType if safe to do
+ // so.
+ Optional<LoopICmp> generateLoopLatchCheck(Type *RangeCheckType);
public:
LoopPredication(ScalarEvolution *SE) : SE(SE){};
bool runOnLoop(Loop *L);
@@ -135,11 +315,8 @@ PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
}
Optional<LoopPredication::LoopICmp>
-LoopPredication::parseLoopICmp(ICmpInst *ICI) {
- ICmpInst::Predicate Pred = ICI->getPredicate();
-
- Value *LHS = ICI->getOperand(0);
- Value *RHS = ICI->getOperand(1);
+LoopPredication::parseLoopICmp(ICmpInst::Predicate Pred, Value *LHS,
+ Value *RHS) {
const SCEV *LHSS = SE->getSCEV(LHS);
if (isa<SCEVCouldNotCompute>(LHSS))
return None;
@@ -165,13 +342,146 @@ Value *LoopPredication::expandCheck(SCEVExpander &Expander,
IRBuilder<> &Builder,
ICmpInst::Predicate Pred, const SCEV *LHS,
const SCEV *RHS, Instruction *InsertAt) {
+ // TODO: we can check isLoopEntryGuardedByCond before emitting the check
+
Type *Ty = LHS->getType();
assert(Ty == RHS->getType() && "expandCheck operands have different types?");
+
+ if (SE->isLoopEntryGuardedByCond(L, Pred, LHS, RHS))
+ return Builder.getTrue();
+
Value *LHSV = Expander.expandCodeFor(LHS, Ty, InsertAt);
Value *RHSV = Expander.expandCodeFor(RHS, Ty, InsertAt);
return Builder.CreateICmp(Pred, LHSV, RHSV);
}
+Optional<LoopPredication::LoopICmp>
+LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) {
+
+ auto *LatchType = LatchCheck.IV->getType();
+ if (RangeCheckType == LatchType)
+ return LatchCheck;
+ // For now, bail out if latch type is narrower than range type.
+ if (DL->getTypeSizeInBits(LatchType) < DL->getTypeSizeInBits(RangeCheckType))
+ return None;
+ if (!isSafeToTruncateWideIVType(RangeCheckType))
+ return None;
+ // We can now safely identify the truncated version of the IV and limit for
+ // RangeCheckType.
+ LoopICmp NewLatchCheck;
+ NewLatchCheck.Pred = LatchCheck.Pred;
+ NewLatchCheck.IV = dyn_cast<SCEVAddRecExpr>(
+ SE->getTruncateExpr(LatchCheck.IV, RangeCheckType));
+ if (!NewLatchCheck.IV)
+ return None;
+ NewLatchCheck.Limit = SE->getTruncateExpr(LatchCheck.Limit, RangeCheckType);
+ DEBUG(dbgs() << "IV of type: " << *LatchType
+ << "can be represented as range check type:" << *RangeCheckType
+ << "\n");
+ DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
+ DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
+ return NewLatchCheck;
+}
+
+bool LoopPredication::isSupportedStep(const SCEV* Step) {
+ return Step->isOne() || (Step->isAllOnesValue() && EnableCountDownLoop);
+}
+
+bool LoopPredication::CanExpand(const SCEV* S) {
+ return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
+}
+
+Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
+ LoopPredication::LoopICmp LatchCheck, LoopPredication::LoopICmp RangeCheck,
+ SCEVExpander &Expander, IRBuilder<> &Builder) {
+ auto *Ty = RangeCheck.IV->getType();
+ // Generate the widened condition for the forward loop:
+ // guardStart u< guardLimit &&
+ // latchLimit <pred> guardLimit - 1 - guardStart + latchStart
+ // where <pred> depends on the latch condition predicate. See the file
+ // header comment for the reasoning.
+ const SCEV *GuardStart = RangeCheck.IV->getStart();
+ const SCEV *GuardLimit = RangeCheck.Limit;
+ const SCEV *LatchStart = LatchCheck.IV->getStart();
+ const SCEV *LatchLimit = LatchCheck.Limit;
+
+ // guardLimit - guardStart + latchStart - 1
+ const SCEV *RHS =
+ SE->getAddExpr(SE->getMinusSCEV(GuardLimit, GuardStart),
+ SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));
+ if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
+ !CanExpand(LatchLimit) || !CanExpand(RHS)) {
+ DEBUG(dbgs() << "Can't expand limit check!\n");
+ return None;
+ }
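+  // Pick the predicate for the limit check: the latch predicate with its
+  // strictness flipped, since the file header's conditions compare latchLimit
+  // against guardLimit - 1 - guardStart + latchStart from the other side.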
+ ICmpInst::Predicate LimitCheckPred;
+ switch (LatchCheck.Pred) {
+ case ICmpInst::ICMP_ULT:
+ LimitCheckPred = ICmpInst::ICMP_ULE;
+ break;
+ case ICmpInst::ICMP_ULE:
+ LimitCheckPred = ICmpInst::ICMP_ULT;
+ break;
+ case ICmpInst::ICMP_SLT:
+ LimitCheckPred = ICmpInst::ICMP_SLE;
+ break;
+ case ICmpInst::ICMP_SLE:
+ LimitCheckPred = ICmpInst::ICMP_SLT;
+ break;
+ default:
+ llvm_unreachable("Unsupported loop latch!");
+ }
+
+ DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
+ DEBUG(dbgs() << "RHS: " << *RHS << "\n");
+ DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
+
+ Instruction *InsertAt = Preheader->getTerminator();
+ auto *LimitCheck =
+ expandCheck(Expander, Builder, LimitCheckPred, LatchLimit, RHS, InsertAt);
+ auto *FirstIterationCheck = expandCheck(Expander, Builder, RangeCheck.Pred,
+ GuardStart, GuardLimit, InsertAt);
+ return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
+}
+
+Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
+ LoopPredication::LoopICmp LatchCheck, LoopPredication::LoopICmp RangeCheck,
+ SCEVExpander &Expander, IRBuilder<> &Builder) {
+ auto *Ty = RangeCheck.IV->getType();
+ const SCEV *GuardStart = RangeCheck.IV->getStart();
+ const SCEV *GuardLimit = RangeCheck.Limit;
+ const SCEV *LatchLimit = LatchCheck.Limit;
+ if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
+ !CanExpand(LatchLimit)) {
+ DEBUG(dbgs() << "Can't expand limit check!\n");
+ return None;
+ }
+ // The decrement of the latch check IV should be the same as the
+ // rangeCheckIV.
+ auto *PostDecLatchCheckIV = LatchCheck.IV->getPostIncExpr(*SE);
+ if (RangeCheck.IV != PostDecLatchCheckIV) {
+ DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: "
+ << *PostDecLatchCheckIV
+ << " and RangeCheckIV: " << *RangeCheck.IV << "\n");
+ return None;
+ }
+
+ // Generate the widened condition for CountDownLoop:
+ // guardStart u< guardLimit &&
+ // latchLimit <pred> 1.
+ // See the header comment for reasoning of the checks.
+ Instruction *InsertAt = Preheader->getTerminator();
+ auto LimitCheckPred = ICmpInst::isSigned(LatchCheck.Pred)
+ ? ICmpInst::ICMP_SGE
+ : ICmpInst::ICMP_UGE;
+ auto *FirstIterationCheck = expandCheck(Expander, Builder, ICmpInst::ICMP_ULT,
+ GuardStart, GuardLimit, InsertAt);
+ auto *LimitCheck = expandCheck(Expander, Builder, LimitCheckPred, LatchLimit,
+ SE->getOne(Ty), InsertAt);
+ return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
+}
+
/// If ICI can be widened to a loop invariant condition emits the loop
/// invariant condition in the loop preheader and return it, otherwise
/// returns None.
@@ -181,51 +491,62 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
DEBUG(ICI->dump());
+  // parseLoopLatchICmp guarantees that the latch condition is:
+ // ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=.
+ // We are looking for the range checks of the form:
+ // i u< guardLimit
auto RangeCheck = parseLoopICmp(ICI);
if (!RangeCheck) {
DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
return None;
}
-
- ICmpInst::Predicate Pred = RangeCheck->Pred;
- const SCEVAddRecExpr *IndexAR = RangeCheck->IV;
- const SCEV *RHSS = RangeCheck->Limit;
-
- auto CanExpand = [this](const SCEV *S) {
- return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
- };
- if (!CanExpand(RHSS))
+ DEBUG(dbgs() << "Guard check:\n");
+ DEBUG(RangeCheck->dump());
+ if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
+ DEBUG(dbgs() << "Unsupported range check predicate(" << RangeCheck->Pred
+ << ")!\n");
return None;
-
- DEBUG(dbgs() << "IndexAR: ");
- DEBUG(IndexAR->dump());
-
- bool IsIncreasing = false;
- if (!SE->isMonotonicPredicate(IndexAR, Pred, IsIncreasing))
+ }
+ auto *RangeCheckIV = RangeCheck->IV;
+ if (!RangeCheckIV->isAffine()) {
+ DEBUG(dbgs() << "Range check IV is not affine!\n");
return None;
-
- // If the predicate is increasing the condition can change from false to true
- // as the loop progresses, in this case take the value on the first iteration
- // for the widened check. Otherwise the condition can change from true to
- // false as the loop progresses, so take the value on the last iteration.
- const SCEV *NewLHSS = IsIncreasing
- ? IndexAR->getStart()
- : SE->getSCEVAtScope(IndexAR, L->getParentLoop());
- if (NewLHSS == IndexAR) {
- DEBUG(dbgs() << "Can't compute NewLHSS!\n");
+ }
+ auto *Step = RangeCheckIV->getStepRecurrence(*SE);
+ // We cannot just compare with latch IV step because the latch and range IVs
+ // may have different types.
+ if (!isSupportedStep(Step)) {
+ DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
return None;
}
-
- DEBUG(dbgs() << "NewLHSS: ");
- DEBUG(NewLHSS->dump());
-
- if (!CanExpand(NewLHSS))
+ auto *Ty = RangeCheckIV->getType();
+ auto CurrLatchCheckOpt = generateLoopLatchCheck(Ty);
+ if (!CurrLatchCheckOpt) {
+ DEBUG(dbgs() << "Failed to generate a loop latch check "
+ "corresponding to range type: "
+ << *Ty << "\n");
return None;
+ }
- DEBUG(dbgs() << "NewLHSS is loop invariant and safe to expand. Expand!\n");
+ LoopICmp CurrLatchCheck = *CurrLatchCheckOpt;
+ // At this point, the range and latch step should have the same type, but need
+ // not have the same value (we support both 1 and -1 steps).
+ assert(Step->getType() ==
+ CurrLatchCheck.IV->getStepRecurrence(*SE)->getType() &&
+ "Range and latch steps should be of same type!");
+ if (Step != CurrLatchCheck.IV->getStepRecurrence(*SE)) {
+ DEBUG(dbgs() << "Range and latch have different step values!\n");
+ return None;
+ }
- Instruction *InsertAt = Preheader->getTerminator();
- return expandCheck(Expander, Builder, Pred, NewLHSS, RHSS, InsertAt);
+ if (Step->isOne())
+ return widenICmpRangeCheckIncrementingLoop(CurrLatchCheck, *RangeCheck,
+ Expander, Builder);
+ else {
+ assert(Step->isAllOnesValue() && "Step should be -1!");
+ return widenICmpRangeCheckDecrementingLoop(CurrLatchCheck, *RangeCheck,
+ Expander, Builder);
+ }
}
bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
@@ -288,6 +609,97 @@ bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
return true;
}
+Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
+ using namespace PatternMatch;
+
+ BasicBlock *LoopLatch = L->getLoopLatch();
+ if (!LoopLatch) {
+ DEBUG(dbgs() << "The loop doesn't have a single latch!\n");
+ return None;
+ }
+
+ ICmpInst::Predicate Pred;
+ Value *LHS, *RHS;
+ BasicBlock *TrueDest, *FalseDest;
+
+ if (!match(LoopLatch->getTerminator(),
+ m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), TrueDest,
+ FalseDest))) {
+ DEBUG(dbgs() << "Failed to match the latch terminator!\n");
+ return None;
+ }
+ assert((TrueDest == L->getHeader() || FalseDest == L->getHeader()) &&
+ "One of the latch's destinations must be the header");
+ if (TrueDest != L->getHeader())
+ Pred = ICmpInst::getInversePredicate(Pred);
+
+ auto Result = parseLoopICmp(Pred, LHS, RHS);
+ if (!Result) {
+ DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ return None;
+ }
+
+ // Check affine first, so if it's not we don't try to compute the step
+ // recurrence.
+ if (!Result->IV->isAffine()) {
+ DEBUG(dbgs() << "The induction variable is not affine!\n");
+ return None;
+ }
+
+ auto *Step = Result->IV->getStepRecurrence(*SE);
+ if (!isSupportedStep(Step)) {
+ DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
+ return None;
+ }
+
+ auto IsUnsupportedPredicate = [](const SCEV *Step, ICmpInst::Predicate Pred) {
+ if (Step->isOne()) {
+ return Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_SLT &&
+ Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE;
+ } else {
+ assert(Step->isAllOnesValue() && "Step should be -1!");
+ return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT;
+ }
+ };
+
+ if (IsUnsupportedPredicate(Step, Result->Pred)) {
+ DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
+ << ")!\n");
+ return None;
+ }
+ return Result;
+}
+
+// Returns true if it's safe to truncate the IV to RangeCheckType.
+bool LoopPredication::isSafeToTruncateWideIVType(Type *RangeCheckType) {
+ if (!EnableIVTruncation)
+ return false;
+ assert(DL->getTypeSizeInBits(LatchCheck.IV->getType()) >
+ DL->getTypeSizeInBits(RangeCheckType) &&
+ "Expected latch check IV type to be larger than range check operand "
+ "type!");
+ // The start and end values of the IV should be known. This is to guarantee
+ // that truncating the wide type will not lose information.
+ auto *Limit = dyn_cast<SCEVConstant>(LatchCheck.Limit);
+ auto *Start = dyn_cast<SCEVConstant>(LatchCheck.IV->getStart());
+ if (!Limit || !Start)
+ return false;
+ // This check makes sure that the IV does not change sign during loop
+ // iterations. Consider latchType = i64, LatchStart = 5, Pred = ICMP_SGE,
+ // LatchEnd = 2, rangeCheckType = i32. If it's not a monotonic predicate, the
+ // IV wraps around, and the truncation of the IV would lose the range of
+ // iterations between 2^32 and 2^64.
+ bool Increasing;
+ if (!SE->isMonotonicPredicate(LatchCheck.IV, LatchCheck.Pred, Increasing))
+ return false;
+ // The active bits should be less than the bits in the RangeCheckType. This
+ // guarantees that truncating the latch check to RangeCheckType is a safe
+ // operation.
+ auto RangeCheckTypeBitSize = DL->getTypeSizeInBits(RangeCheckType);
+ return Start->getAPInt().getActiveBits() < RangeCheckTypeBitSize &&
+ Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize;
+}
+
bool LoopPredication::runOnLoop(Loop *Loop) {
L = Loop;
@@ -308,6 +720,14 @@ bool LoopPredication::runOnLoop(Loop *Loop) {
if (!Preheader)
return false;
+ auto LatchCheckOpt = parseLoopLatchICmp();
+ if (!LatchCheckOpt)
+ return false;
+ LatchCheck = *LatchCheckOpt;
+
+ DEBUG(dbgs() << "Latch check:\n");
+ DEBUG(LatchCheck.dump());
+
// Collect all the guards into a vector and process later, so as not
// to invalidate the instruction iterator.
SmallVector<IntrinsicInst *, 4> Guards;
diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp
index fc0216e76a5b..d1a54b877950 100644
--- a/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -1,4 +1,4 @@
-//===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===//
+//===- LoopReroll.cpp - Loop rerolling pass -------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,22 +11,42 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -34,6 +54,13 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <map>
+#include <utility>
using namespace llvm;
@@ -127,6 +154,7 @@ NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
// br %cmp, header, exit
namespace {
+
enum IterationLimits {
/// The maximum number of iterations that we'll try and reroll.
IL_MaxRerollIterations = 32,
@@ -139,6 +167,7 @@ namespace {
class LoopReroll : public LoopPass {
public:
static char ID; // Pass ID, replacement for typeid
+
LoopReroll() : LoopPass(ID) {
initializeLoopRerollPass(*PassRegistry::getPassRegistry());
}
@@ -158,11 +187,12 @@ namespace {
DominatorTree *DT;
bool PreserveLCSSA;
- typedef SmallVector<Instruction *, 16> SmallInstructionVector;
- typedef SmallSet<Instruction *, 16> SmallInstructionSet;
+ using SmallInstructionVector = SmallVector<Instruction *, 16>;
+ using SmallInstructionSet = SmallSet<Instruction *, 16>;
// Map between induction variable and its increment
DenseMap<Instruction *, int64_t> IVToIncMap;
+
// For loop with multiple induction variable, remember the one used only to
// control the loop.
Instruction *LoopControlIV;
@@ -171,8 +201,7 @@ namespace {
// representing a reduction. Only the last value may be used outside the
// loop.
struct SimpleLoopReduction {
- SimpleLoopReduction(Instruction *P, Loop *L)
- : Valid(false), Instructions(1, P) {
+ SimpleLoopReduction(Instruction *P, Loop *L) : Instructions(1, P) {
assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
add(L);
}
@@ -204,8 +233,8 @@ namespace {
return Instructions.size()-1;
}
- typedef SmallInstructionVector::iterator iterator;
- typedef SmallInstructionVector::const_iterator const_iterator;
+ using iterator = SmallInstructionVector::iterator;
+ using const_iterator = SmallInstructionVector::const_iterator;
iterator begin() {
assert(Valid && "Using invalid reduction");
@@ -221,7 +250,7 @@ namespace {
const_iterator end() const { return Instructions.end(); }
protected:
- bool Valid;
+ bool Valid = false;
SmallInstructionVector Instructions;
void add(Loop *L);
@@ -230,7 +259,7 @@ namespace {
// The set of all reductions, and state tracking of possible reductions
// during loop instruction processing.
struct ReductionTracker {
- typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector;
+ using SmallReductionVector = SmallVector<SimpleLoopReduction, 16>;
// Add a new possible reduction.
void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); }
@@ -342,6 +371,7 @@ namespace {
struct DAGRootSet {
Instruction *BaseInst;
SmallInstructionVector Roots;
+
// The instructions between IV and BaseInst (but not including BaseInst).
SmallInstructionSet SubsumedInsts;
};
@@ -361,15 +391,17 @@ namespace {
/// Stage 1: Find all the DAG roots for the induction variable.
bool findRoots();
+
/// Stage 2: Validate if the found roots are valid.
bool validate(ReductionTracker &Reductions);
+
/// Stage 3: Assuming validate() returned true, perform the
/// replacement.
/// @param IterCount The maximum iteration count of L.
void replace(const SCEV *IterCount);
protected:
- typedef MapVector<Instruction*, BitVector> UsesTy;
+ using UsesTy = MapVector<Instruction *, BitVector>;
void findRootsRecursive(Instruction *IVU,
SmallInstructionSet SubsumedInsts);
@@ -412,22 +444,29 @@ namespace {
// The loop induction variable.
Instruction *IV;
+
// Loop step amount.
int64_t Inc;
+
// Loop reroll count; if Inc == 1, this records the scaling applied
// to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;
// If Inc is not 1, Scale = Inc.
uint64_t Scale;
+
// The roots themselves.
SmallVector<DAGRootSet,16> RootSets;
+
// All increment instructions for IV.
SmallInstructionVector LoopIncs;
+
// Map of all instructions in the loop (in order) to the iterations
// they are used in (or specially, IL_All for instructions
// used in the loop increment mechanism).
UsesTy Uses;
+
// Map between induction variable and its increment
DenseMap<Instruction *, int64_t> &IVToIncMap;
+
Instruction *LoopControlIV;
};
@@ -446,9 +485,11 @@ namespace {
bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
ReductionTracker &Reductions);
};
-}
+
+} // end anonymous namespace
char LoopReroll::ID = 0;
+
INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
@@ -1069,7 +1110,6 @@ bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &Po
}
return true;
-
}
/// Get the next instruction in "In" that is a member of set Val.
@@ -1124,7 +1164,7 @@ static bool isIgnorableInst(const Instruction *I) {
switch (II->getIntrinsicID()) {
default:
return false;
- case llvm::Intrinsic::annotation:
+ case Intrinsic::annotation:
case Intrinsic::ptr_annotation:
case Intrinsic::var_annotation:
// TODO: the following intrinsics may also be whitelisted:
@@ -1407,8 +1447,8 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
BaseIt = nextInstr(0, Uses, Visited);
RootIt = nextInstr(Iter, Uses, Visited);
}
- assert (BaseIt == Uses.end() && RootIt == Uses.end() &&
- "Mismatched set sizes!");
+ assert(BaseIt == Uses.end() && RootIt == Uses.end() &&
+ "Mismatched set sizes!");
}
DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 3506ac343d59..a91f53ba663f 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -25,6 +25,7 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -141,37 +142,29 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
// Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
// intrinsics.
- LLVMContext &C = OrigHeader->getContext();
- if (auto *VAM = ValueAsMetadata::getIfExists(OrigHeaderVal)) {
- if (auto *MAV = MetadataAsValue::getIfExists(C, VAM)) {
- for (auto UI = MAV->use_begin(), E = MAV->use_end(); UI != E;) {
- // Grab the use before incrementing the iterator. Otherwise, altering
- // the Use will invalidate the iterator.
- Use &U = *UI++;
- DbgInfoIntrinsic *UserInst = dyn_cast<DbgInfoIntrinsic>(U.getUser());
- if (!UserInst)
- continue;
-
- // The original users in the OrigHeader are already using the original
- // definitions.
- BasicBlock *UserBB = UserInst->getParent();
- if (UserBB == OrigHeader)
- continue;
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ llvm::findDbgValues(DbgValues, OrigHeaderVal);
+ for (auto &DbgValue : DbgValues) {
+ // The original users in the OrigHeader are already using the original
+ // definitions.
+ BasicBlock *UserBB = DbgValue->getParent();
+ if (UserBB == OrigHeader)
+ continue;
- // Users in the OrigPreHeader need to use the value to which the
- // original definitions are mapped and anything else can be handled by
- // the SSAUpdater. To avoid adding PHINodes, check if the value is
- // available in UserBB, if not substitute undef.
- Value *NewVal;
- if (UserBB == OrigPreheader)
- NewVal = OrigPreHeaderVal;
- else if (SSA.HasValueForBlock(UserBB))
- NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
- else
- NewVal = UndefValue::get(OrigHeaderVal->getType());
- U = MetadataAsValue::get(C, ValueAsMetadata::get(NewVal));
- }
- }
+ // Users in the OrigPreHeader need to use the value to which the
+ // original definitions are mapped and anything else can be handled by
+ // the SSAUpdater. To avoid adding PHINodes, check if the value is
+ // available in UserBB, if not substitute undef.
+ Value *NewVal;
+ if (UserBB == OrigPreheader)
+ NewVal = OrigPreHeaderVal;
+ else if (SSA.HasValueForBlock(UserBB))
+ NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
+ else
+ NewVal = UndefValue::get(OrigHeaderVal->getType());
+ DbgValue->setOperand(0,
+ MetadataAsValue::get(OrigHeaderVal->getContext(),
+ ValueAsMetadata::get(NewVal)));
}
}
}
@@ -315,6 +308,22 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// For the rest of the instructions, either hoist to the OrigPreheader if
// possible or create a clone in the OldPreHeader if not.
TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
+
+ // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
+ using DbgIntrinsicHash =
+ std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>;
+ auto makeHash = [](DbgInfoIntrinsic *D) -> DbgIntrinsicHash {
+ return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()};
+ };
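+  // Two debug intrinsics hash equally iff they describe the same location
+  // value, variable, and expression; cloned intrinsics matching an entry in
+  // this set are dropped below rather than inserted twice.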
+ SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
+ for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
+ I != E; ++I) {
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&*I))
+ DbgIntrinsics.insert(makeHash(DII));
+ else
+ break;
+ }
+
while (I != E) {
Instruction *Inst = &*I++;
@@ -338,6 +347,13 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
RemapInstruction(C, ValueMap,
RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ // Avoid inserting the same intrinsic twice.
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(C))
+ if (DbgIntrinsics.count(makeHash(DII))) {
+ C->deleteValue();
+ continue;
+ }
+
// With the operands remapped, see if the instruction constant folds or is
// otherwise simplifyable. This commonly occurs because the entry from PHI
// nodes allows icmps and other instructions to fold.
@@ -395,6 +411,17 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
L->moveToHeader(NewHeader);
assert(L->getHeader() == NewHeader && "Latch block is our new header");
+ // Inform DT about changes to the CFG.
+ if (DT) {
+    // The OrigPreheader branches to the NewHeader and Exit now. Inform the
+    // DT that the edge from the OrigPreheader to the OrigHeader was removed.
+ SmallVector<DominatorTree::UpdateType, 3> Updates;
+ Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
+ Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
+ Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
+ DT->applyUpdates(Updates);
+ }
+
// At this point, we've finished our major CFG changes. As part of cloning
// the loop into the preheader we've simplified instructions and the
// duplicated conditional branch may now be branching on a constant. If it is
@@ -408,26 +435,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
NewHeader) {
// The conditional branch can't be folded, handle the general case.
- // Update DominatorTree to reflect the CFG change we just made. Then split
- // edges as necessary to preserve LoopSimplify form.
- if (DT) {
- // Everything that was dominated by the old loop header is now dominated
- // by the original loop preheader. Conceptually the header was merged
- // into the preheader, even though we reuse the actual block as a new
- // loop latch.
- DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
- SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
- OrigHeaderNode->end());
- DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
- for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
- DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
-
- assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
- assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
-
- // Update OrigHeader to be dominated by the new header block.
- DT->changeImmediateDominator(OrigHeader, OrigLatch);
- }
+ // Split edges as necessary to preserve LoopSimplify form.
// Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
// thus is not a preheader anymore.
@@ -467,52 +475,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
PHBI->eraseFromParent();
// With our CFG finalized, update DomTree if it is available.
- if (DT) {
- // Update OrigHeader to be dominated by the new header block.
- DT->changeImmediateDominator(NewHeader, OrigPreheader);
- DT->changeImmediateDominator(OrigHeader, OrigLatch);
-
- // Brute force incremental dominator tree update. Call
- // findNearestCommonDominator on all CFG predecessors of each child of the
- // original header.
- DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
- SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
- OrigHeaderNode->end());
- bool Changed;
- do {
- Changed = false;
- for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) {
- DomTreeNode *Node = HeaderChildren[I];
- BasicBlock *BB = Node->getBlock();
-
- BasicBlock *NearestDom = nullptr;
- for (BasicBlock *Pred : predecessors(BB)) {
- // Consider only reachable basic blocks.
- if (!DT->getNode(Pred))
- continue;
-
- if (!NearestDom) {
- NearestDom = Pred;
- continue;
- }
-
- NearestDom = DT->findNearestCommonDominator(NearestDom, Pred);
- assert(NearestDom && "No NearestCommonDominator found");
- }
-
- assert(NearestDom && "Nearest dominator not found");
-
- // Remember if this changes the DomTree.
- if (Node->getIDom()->getBlock() != NearestDom) {
- DT->changeImmediateDominator(BB, NearestDom);
- Changed = true;
- }
- }
-
- // If the dominator changed, this may have an effect on other
- // predecessors, continue until we reach a fixpoint.
- } while (Changed);
- }
+ if (DT) DT->deleteEdge(OrigPreheader, Exit);
}
assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
@@ -671,7 +634,7 @@ bool LoopRotate::processLoop(Loop *L) {
if ((MadeChange || SimplifiedLatch) && LoopMD)
L->setLoopID(LoopMD);
- return MadeChange;
+ return MadeChange || SimplifiedLatch;
}
LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication)
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 3638da118cb7..953854c8b7b7 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -65,7 +65,9 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -80,13 +82,18 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
@@ -98,7 +105,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
@@ -107,8 +113,8 @@
#include <cstdint>
#include <cstdlib>
#include <iterator>
+#include <limits>
#include <map>
-#include <tuple>
#include <utility>
using namespace llvm;
@@ -131,7 +137,7 @@ static cl::opt<bool> EnablePhiElim(
// This flag adds the instruction count to the solution cost comparison.
static cl::opt<bool> InsnsCost(
- "lsr-insns-cost", cl::Hidden, cl::init(false),
+ "lsr-insns-cost", cl::Hidden, cl::init(true),
cl::desc("Add instruction count to a LSR cost model"));
// Flag to choose how to narrow complex lsr solution
@@ -160,15 +166,14 @@ namespace {
struct MemAccessTy {
/// Used in situations where the accessed memory type is unknown.
- static const unsigned UnknownAddressSpace = ~0u;
+ static const unsigned UnknownAddressSpace =
+ std::numeric_limits<unsigned>::max();
- Type *MemTy;
- unsigned AddrSpace;
+ Type *MemTy = nullptr;
+ unsigned AddrSpace = UnknownAddressSpace;
- MemAccessTy() : MemTy(nullptr), AddrSpace(UnknownAddressSpace) {}
-
- MemAccessTy(Type *Ty, unsigned AS) :
- MemTy(Ty), AddrSpace(AS) {}
+ MemAccessTy() = default;
+ MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
bool operator==(MemAccessTy Other) const {
return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
@@ -195,11 +200,11 @@ public:
} // end anonymous namespace
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void RegSortData::print(raw_ostream &OS) const {
OS << "[NumUses=" << UsedByIndices.count() << ']';
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RegSortData::dump() const {
print(errs()); errs() << '\n';
}
@@ -209,7 +214,7 @@ namespace {
/// Map register candidates to information about how they are used.
class RegUseTracker {
- typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
+ using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
RegUsesTy RegUsesMap;
SmallVector<const SCEV *, 16> RegSequence;
@@ -225,8 +230,9 @@ public:
void clear();
- typedef SmallVectorImpl<const SCEV *>::iterator iterator;
- typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
+ using iterator = SmallVectorImpl<const SCEV *>::iterator;
+ using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
+
iterator begin() { return RegSequence.begin(); }
iterator end() { return RegSequence.end(); }
const_iterator begin() const { return RegSequence.begin(); }
@@ -299,16 +305,16 @@ namespace {
/// satisfying a use. It may include broken-out immediates and scaled registers.
struct Formula {
/// Global base address used for complex addressing.
- GlobalValue *BaseGV;
+ GlobalValue *BaseGV = nullptr;
/// Base offset for complex addressing.
- int64_t BaseOffset;
+ int64_t BaseOffset = 0;
/// Whether any complex addressing has a base register.
- bool HasBaseReg;
+ bool HasBaseReg = false;
/// The scale of any complex addressing.
- int64_t Scale;
+ int64_t Scale = 0;
/// The list of "base" registers for this use. When this is non-empty. The
/// canonical representation of a formula is
@@ -328,16 +334,14 @@ struct Formula {
/// The 'scaled' register for this use. This should be non-null when Scale is
/// not zero.
- const SCEV *ScaledReg;
+ const SCEV *ScaledReg = nullptr;
  /// An additional constant offset which is added near the use. This requires a
/// temporary register, but the offset itself can live in an add immediate
/// field rather than a register.
- int64_t UnfoldedOffset;
+ int64_t UnfoldedOffset = 0;
- Formula()
- : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0),
- ScaledReg(nullptr), UnfoldedOffset(0) {}
+ Formula() = default;
void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
@@ -562,6 +566,7 @@ bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
return false;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void Formula::print(raw_ostream &OS) const {
bool First = true;
if (BaseGV) {
@@ -598,7 +603,6 @@ void Formula::print(raw_ostream &OS) const {
}
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void Formula::dump() const {
print(errs()); errs() << '\n';
}
@@ -773,7 +777,8 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
/// Returns true if the specified instruction is using the specified value as an
/// address.
-static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
+static bool isAddressUse(const TargetTransformInfo &TTI,
+ Instruction *Inst, Value *OperandVal) {
bool isAddress = isa<LoadInst>(Inst);
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
if (SI->getPointerOperand() == OperandVal)
@@ -782,11 +787,24 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
// Addressing modes can also be folded into prefetches and a variety
// of intrinsics.
switch (II->getIntrinsicID()) {
- default: break;
- case Intrinsic::prefetch:
- if (II->getArgOperand(0) == OperandVal)
+ case Intrinsic::memset:
+ case Intrinsic::prefetch:
+ if (II->getArgOperand(0) == OperandVal)
+ isAddress = true;
+ break;
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ if (II->getArgOperand(0) == OperandVal ||
+ II->getArgOperand(1) == OperandVal)
+ isAddress = true;
+ break;
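+    // For target-specific memory intrinsics, defer to TTI to identify the
+    // pointer operand, if any.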
+ default: {
+ MemIntrinsicInfo IntrInfo;
+ if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
+ if (IntrInfo.PtrVal == OperandVal)
isAddress = true;
- break;
+ }
+ }
}
} else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
if (RMW->getPointerOperand() == OperandVal)
@@ -799,7 +817,8 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
}
/// Return the type of the memory being accessed.
-static MemAccessTy getAccessType(const Instruction *Inst) {
+static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
+ Instruction *Inst) {
MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
AccessTy.MemTy = SI->getOperand(0)->getType();
@@ -810,6 +829,21 @@ static MemAccessTy getAccessType(const Instruction *Inst) {
AccessTy.AddrSpace = RMW->getPointerAddressSpace();
} else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::prefetch:
+ AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
+ break;
+ default: {
+ MemIntrinsicInfo IntrInfo;
+ if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
+ AccessTy.AddrSpace =
+ IntrInfo.PtrVal->getType()->getPointerAddressSpace();
+ }
+
+ break;
+ }
+ }
}
// All pointers have the same requirements, so canonicalize them to an
@@ -948,6 +982,7 @@ class LSRUse;
/// accurate cost model.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
const LSRUse &LU, const Formula &F);
+
// Get the cost of the scaling factor used in F for LU.
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
const LSRUse &LU, const Formula &F,
@@ -1013,30 +1048,30 @@ private:
ScalarEvolution &SE, DominatorTree &DT,
SmallPtrSetImpl<const SCEV *> *LoserRegs);
};
-
+
/// An operand value in an instruction which is to be replaced with some
/// equivalent, possibly strength-reduced, replacement.
struct LSRFixup {
/// The instruction which will be updated.
- Instruction *UserInst;
+ Instruction *UserInst = nullptr;
/// The operand of the instruction which will be replaced. The operand may be
/// used more than once; every instance will be replaced.
- Value *OperandValToReplace;
+ Value *OperandValToReplace = nullptr;
/// If this user is to use the post-incremented value of an induction
- /// variable, this variable is non-null and holds the loop associated with the
+ /// variable, this set is non-empty and holds the loops associated with the
/// induction variable.
PostIncLoopSet PostIncLoops;
/// A constant offset to be added to the LSRUse expression. This allows
/// multiple fixups to share the same LSRUse with different offsets, for
/// example in an unrolled loop.
- int64_t Offset;
+ int64_t Offset = 0;
- bool isUseFullyOutsideLoop(const Loop *L) const;
+ LSRFixup() = default;
- LSRFixup();
+ bool isUseFullyOutsideLoop(const Loop *L) const;
void print(raw_ostream &OS) const;
void dump() const;
@@ -1086,7 +1121,7 @@ public:
// TODO: Add a generic icmp too?
};
- typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
+ using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
KindType Kind;
MemAccessTy AccessTy;
@@ -1095,25 +1130,25 @@ public:
SmallVector<LSRFixup, 8> Fixups;
/// Keep track of the min and max offsets of the fixups.
- int64_t MinOffset;
- int64_t MaxOffset;
+ int64_t MinOffset = std::numeric_limits<int64_t>::max();
+ int64_t MaxOffset = std::numeric_limits<int64_t>::min();
/// This records whether all of the fixups using this LSRUse are outside of
/// the loop, in which case some special-case heuristics may be used.
- bool AllFixupsOutsideLoop;
+ bool AllFixupsOutsideLoop = true;
/// RigidFormula is set to true to guarantee that this use will be associated
/// with a single formula--the one that initially matched. Some SCEV
/// expressions cannot be expanded. This allows LSR to consider the registers
/// used by those expressions without the need to expand them later after
/// changing the formula.
- bool RigidFormula;
+ bool RigidFormula = false;
/// This records the widest use type for any fixup using this
/// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
/// fixup widths to be equivalent, because the narrower one may be relying on
/// the implicit truncation to truncate away bogus bits.
- Type *WidestFixupType;
+ Type *WidestFixupType = nullptr;
/// A list of ways to build a value that can satisfy this user. After the
/// list is populated, one of these is selected heuristically and used to
@@ -1123,10 +1158,7 @@ public:
/// The set of register candidates used by all formulae in this LSRUse.
SmallPtrSet<const SCEV *, 4> Regs;
- LSRUse(KindType K, MemAccessTy AT)
- : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
- AllFixupsOutsideLoop(true), RigidFormula(false),
- WidestFixupType(nullptr) {}
+ LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
LSRFixup &getNewFixup() {
Fixups.push_back(LSRFixup());
@@ -1140,7 +1172,7 @@ public:
if (f.Offset < MinOffset)
MinOffset = f.Offset;
}
-
+
bool HasFormulaWithSameRegs(const Formula &F) const;
float getNotSelectedProbability(const SCEV *Reg) const;
bool InsertFormula(const Formula &F, const Loop &L);
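MinOffset and MaxOffset above start at the extreme sentinels so that the first fixup pushed initializes both bounds. A standalone sketch of that running min/max idiom, assuming nothing beyond the standard library:

#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  // +max/-min sentinels: any real offset replaces them immediately.
  int64_t MinOffset = std::numeric_limits<int64_t>::max();
  int64_t MaxOffset = std::numeric_limits<int64_t>::min();
  for (int64_t Off : {4, -8, 16}) {
    if (Off < MinOffset) MinOffset = Off;
    if (Off > MaxOffset) MaxOffset = Off;
  }
  assert(MinOffset == -8 && MaxOffset == 16);
  return 0;
}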
@@ -1153,6 +1185,12 @@ public:
} // end anonymous namespace
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg, int64_t Scale,
+ Instruction *Fixup = nullptr);
+
/// Tally up interesting quantities from the given register.
void Cost::RateRegister(const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
@@ -1280,8 +1318,9 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
// Check with target if this offset with this instruction is
// specifically not supported.
- if ((isa<LoadInst>(Fixup.UserInst) || isa<StoreInst>(Fixup.UserInst)) &&
- !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
+ if (LU.Kind == LSRUse::Address && Offset != 0 &&
+ !isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
+ Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
C.NumBaseAdds++;
}
@@ -1325,14 +1364,14 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
/// Set this cost to a losing value.
void Cost::Lose() {
- C.Insns = ~0u;
- C.NumRegs = ~0u;
- C.AddRecCost = ~0u;
- C.NumIVMuls = ~0u;
- C.NumBaseAdds = ~0u;
- C.ImmCost = ~0u;
- C.SetupCost = ~0u;
- C.ScaleCost = ~0u;
+ C.Insns = std::numeric_limits<unsigned>::max();
+ C.NumRegs = std::numeric_limits<unsigned>::max();
+ C.AddRecCost = std::numeric_limits<unsigned>::max();
+ C.NumIVMuls = std::numeric_limits<unsigned>::max();
+ C.NumBaseAdds = std::numeric_limits<unsigned>::max();
+ C.ImmCost = std::numeric_limits<unsigned>::max();
+ C.SetupCost = std::numeric_limits<unsigned>::max();
+ C.ScaleCost = std::numeric_limits<unsigned>::max();
}
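The rewrite of Cost::Lose is purely a readability change: ~0u and the numeric_limits form denote the same all-ones value, which a compile-time check can confirm:

#include <limits>

static_assert(~0u == std::numeric_limits<unsigned>::max(),
              "all-ones unsigned equals numeric_limits<unsigned>::max()");

int main() { return 0; }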
/// Choose the lower cost.
@@ -1343,6 +1382,7 @@ bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) {
return TTI.isLSRCostLess(C, Other.C);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void Cost::print(raw_ostream &OS) const {
if (InsnsCost)
OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
@@ -1363,16 +1403,11 @@ void Cost::print(raw_ostream &OS) const {
OS << ", plus " << C.SetupCost << " setup cost";
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void Cost::dump() const {
print(errs()); errs() << '\n';
}
#endif
-LSRFixup::LSRFixup()
- : UserInst(nullptr), OperandValToReplace(nullptr),
- Offset(0) {}
-
/// Test whether this fixup always uses its value outside of the given loop.
bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
// PHI nodes use their value in their incoming blocks.
@@ -1387,6 +1422,7 @@ bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
return !L->contains(UserInst);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRFixup::print(raw_ostream &OS) const {
OS << "UserInst=";
// Store is common and interesting enough to be worth special-casing.
@@ -1410,7 +1446,6 @@ void LSRFixup::print(raw_ostream &OS) const {
OS << ", Offset=" << Offset;
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void LSRFixup::dump() const {
print(errs()); errs() << '\n';
}
@@ -1493,6 +1528,7 @@ void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
RegUses.dropRegister(S, LUIdx);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRUse::print(raw_ostream &OS) const {
OS << "LSR Use: Kind=";
switch (Kind) {
@@ -1526,7 +1562,6 @@ void LSRUse::print(raw_ostream &OS) const {
OS << ", widest fixup type: " << *WidestFixupType;
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void LSRUse::dump() const {
print(errs()); errs() << '\n';
}
@@ -1535,11 +1570,12 @@ LLVM_DUMP_METHOD void LSRUse::dump() const {
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
LSRUse::KindType Kind, MemAccessTy AccessTy,
GlobalValue *BaseGV, int64_t BaseOffset,
- bool HasBaseReg, int64_t Scale) {
+ bool HasBaseReg, int64_t Scale,
+ Instruction *Fixup/*= nullptr*/) {
switch (Kind) {
case LSRUse::Address:
return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
- HasBaseReg, Scale, AccessTy.AddrSpace);
+ HasBaseReg, Scale, AccessTy.AddrSpace, Fixup);
case LSRUse::ICmpZero:
// There's not even a target hook for querying whether it would be legal to
@@ -1564,7 +1600,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
// ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
// Offs is the ICmp immediate.
if (Scale == 0)
- // The cast does the right thing with INT64_MIN.
+ // The cast does the right thing with
+ // std::numeric_limits<int64_t>::min().
BaseOffset = -(uint64_t)BaseOffset;
return TTI.isLegalICmpImmediate(BaseOffset);
}
@@ -1645,6 +1682,16 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
const LSRUse &LU, const Formula &F) {
+ // Target may want to look at the user instructions.
+ if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
+ for (const LSRFixup &Fixup : LU.Fixups)
+ if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
+ (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
+ F.Scale, Fixup.UserInst))
+ return false;
+ return true;
+ }
+
return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
F.Scale);
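The new per-fixup loop above is an "every fixup must fold" query. A sketch of the same shape using std::all_of over stand-in types; the +/-4096 window is an arbitrary illustrative immediate range, not any real target's rule:

#include <algorithm>
#include <cassert>
#include <vector>

struct Fixup { long Offset; };

// True only if BaseOffset plus every fixup offset stays in the window.
static bool allFixupsFold(long BaseOffset, const std::vector<Fixup> &Fixups) {
  return std::all_of(Fixups.begin(), Fixups.end(), [&](const Fixup &F) {
    long Sum = BaseOffset + F.Offset;
    return Sum >= -4096 && Sum < 4096;
  });
}

int main() {
  assert(allFixupsFold(100, {{0}, {8}, {-16}}));
  assert(!allFixupsFold(4000, {{0}, {500}}));
  return 0;
}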
@@ -1752,22 +1799,21 @@ struct IVInc {
Value* IVOperand;
const SCEV *IncExpr;
- IVInc(Instruction *U, Value *O, const SCEV *E):
- UserInst(U), IVOperand(O), IncExpr(E) {}
+ IVInc(Instruction *U, Value *O, const SCEV *E)
+ : UserInst(U), IVOperand(O), IncExpr(E) {}
};
// The list of IV increments in program order. We typically add the head of a
// chain without finding subsequent links.
struct IVChain {
- SmallVector<IVInc,1> Incs;
- const SCEV *ExprBase;
-
- IVChain() : ExprBase(nullptr) {}
+ SmallVector<IVInc, 1> Incs;
+ const SCEV *ExprBase = nullptr;
+ IVChain() = default;
IVChain(const IVInc &Head, const SCEV *Base)
- : Incs(1, Head), ExprBase(Base) {}
+ : Incs(1, Head), ExprBase(Base) {}
- typedef SmallVectorImpl<IVInc>::const_iterator const_iterator;
+ using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
// Return the first increment in the chain.
const_iterator begin() const {
@@ -1809,13 +1855,13 @@ class LSRInstance {
LoopInfo &LI;
const TargetTransformInfo &TTI;
Loop *const L;
- bool Changed;
+ bool Changed = false;
/// This is the insert position that the current loop's induction variable
/// increment should be placed. In simple loops, this is the latch block's
/// terminator. But in more complicated cases, this is a position which will
/// dominate all the in-loop post-increment users.
- Instruction *IVIncInsertPos;
+ Instruction *IVIncInsertPos = nullptr;
/// Interesting factors between use strides.
///
@@ -1861,7 +1907,7 @@ class LSRInstance {
void CollectFixupsAndInitialFormulae();
// Support for sharing of LSRUses between LSRFixups.
- typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy;
+ using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
UseMapTy UseMap;
bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
@@ -2002,6 +2048,14 @@ void LSRInstance::OptimizeShadowIV() {
if (!PH) continue;
if (PH->getNumIncomingValues() != 2) continue;
+ // If the integer calculation overflows, the result in the FP type will
+ // differ, so we can only do this transformation if we are guaranteed not
+ // to encounter overflowing values.
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
+ if (!AR) continue;
+ if (IsSigned && !AR->hasNoSignedWrap()) continue;
+ if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
+
Type *SrcTy = PH->getType();
int Mantissa = DestTy->getFPMantissaWidth();
if (Mantissa == -1) continue;
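The new no-wrap guard exists because a wrapped integer IV and its floating-point shadow diverge. A tiny demonstration of the failure mode being ruled out:

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t I = 250;    // 8-bit induction variable
  double FP = 250.0;  // its floating-point "shadow"
  for (int Step = 0; Step < 10; ++Step) {
    I = (uint8_t)(I + 1); // wraps past 255 back to 0
    FP += 1.0;            // keeps counting: 258, 259, 260
  }
  std::printf("int=%u fp=%.0f\n", (unsigned)I, FP); // int=4 fp=260
  return 0;
}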
@@ -2094,7 +2148,7 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
/// unfortunately this can come up even for loops where the user didn't use
/// a C do-while loop. For example, seemingly well-behaved top-test loops
/// will commonly be lowered like this:
-//
+///
/// if (n > 0) {
/// i = 0;
/// do {
@@ -2128,7 +2182,6 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
/// This function solves this problem by detecting this type of loop and
/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
/// the instructions for the maximum computation.
-///
ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
// Check that the loop matches the pattern we're looking for.
if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
@@ -2268,7 +2321,6 @@ LSRInstance::OptimizeLoopTermCond() {
// Otherwise treat this as a rotated loop.
for (BasicBlock *ExitingBlock : ExitingBlocks) {
-
// Get the terminating condition for the loop if possible. If we
// can, we want to change it to use a post-incremented version of its
// induction variable, to allow coalescing the live ranges for the IV into
@@ -2333,7 +2385,7 @@ LSRInstance::OptimizeLoopTermCond() {
C->getValue().isMinSignedValue())
goto decline_post_inc;
// Check for possible scaled-address reuse.
- MemAccessTy AccessTy = getAccessType(UI->getUser());
+ MemAccessTy AccessTy = getAccessType(TTI, UI->getUser());
int64_t Scale = C->getSExtValue();
if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
/*BaseOffset=*/0,
@@ -2941,7 +2993,7 @@ void LSRInstance::CollectChains() {
// consider leaf IV Users. This effectively rediscovers a portion of
// IVUsers analysis but in program order this time.
if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
- continue;
+ continue;
// Remove this instruction from any NearUsers set it may be in.
for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
@@ -3003,13 +3055,13 @@ void LSRInstance::FinalizeChain(IVChain &Chain) {
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
Value *Operand, const TargetTransformInfo &TTI) {
const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
- if (!IncConst || !isAddressUse(UserInst, Operand))
+ if (!IncConst || !isAddressUse(TTI, UserInst, Operand))
return false;
if (IncConst->getAPInt().getMinSignedBits() > 64)
return false;
- MemAccessTy AccessTy = getAccessType(UserInst);
+ MemAccessTy AccessTy = getAccessType(TTI, UserInst);
int64_t IncOffset = IncConst->getValue()->getSExtValue();
if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
IncOffset, /*HasBaseReg=*/false))
@@ -3136,14 +3188,14 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
LSRUse::KindType Kind = LSRUse::Basic;
MemAccessTy AccessTy;
- if (isAddressUse(UserInst, U.getOperandValToReplace())) {
+ if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
Kind = LSRUse::Address;
- AccessTy = getAccessType(UserInst);
+ AccessTy = getAccessType(TTI, UserInst);
}
const SCEV *S = IU.getExpr(U);
PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
-
+
// Equality (== and !=) ICmps are special. We can rewrite (i == N) as
// (N - i == 0), and this allows (N - i) to be the expression that we work
// with rather than just N or i, so we can consider the register
@@ -3432,7 +3484,6 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
JE = AddOps.end();
J != JE; ++J) {
-
// Loop-variant "unknown" values are uninteresting; we won't be able to
// do anything meaningful with them.
if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
@@ -3654,12 +3705,18 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
// Don't do this if there is more than one offset.
if (LU.MinOffset != LU.MaxOffset) return;
+ // Check if the transformation is valid. It is illegal to multiply a pointer.
+ if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
+ return;
+ for (const SCEV *BaseReg : Base.BaseRegs)
+ if (BaseReg->getType()->isPointerTy())
+ return;
assert(!Base.BaseGV && "ICmpZero use is not legal!");
// Check each interesting stride.
for (int64_t Factor : Factors) {
// Check that the multiplication doesn't overflow.
- if (Base.BaseOffset == INT64_MIN && Factor == -1)
+ if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1)
continue;
int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
if (NewBaseOffset / Factor != Base.BaseOffset)
@@ -3671,7 +3728,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
// Check that multiplying with the use offset doesn't overflow.
int64_t Offset = LU.MinOffset;
- if (Offset == INT64_MIN && Factor == -1)
+ if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
continue;
Offset = (uint64_t)Offset * Factor;
if (Offset / Factor != LU.MinOffset)
@@ -3709,7 +3766,8 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
// Check that multiplying with the unfolded offset doesn't overflow.
if (F.UnfoldedOffset != 0) {
- if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
+ if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() &&
+ Factor == -1)
continue;
F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
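The overflow checks above all follow one pattern: rule out the single case that division cannot verify (the int64 minimum times -1), do the multiply in unsigned arithmetic where wraparound is well defined, then confirm by dividing back. A standalone sketch of that pattern:

#include <cassert>
#include <cstdint>
#include <limits>

// Returns false instead of overflowing when computing Offset * Factor.
static bool mulFits(int64_t Offset, int64_t Factor, int64_t &Out) {
  if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
    return false; // would overflow, and INT64_MIN / -1 is itself UB
  int64_t Res = (int64_t)((uint64_t)Offset * (uint64_t)Factor);
  if (Factor != 0 && Res / Factor != Offset)
    return false; // wrapped: dividing back does not recover Offset
  Out = Res;
  return true;
}

int main() {
  int64_t R;
  assert(mulFits(1000, 3, R) && R == 3000);
  assert(!mulFits(std::numeric_limits<int64_t>::min(), -1, R));
  return 0;
}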
@@ -3833,7 +3891,7 @@ struct WorkItem {
const SCEV *OrigReg;
WorkItem(size_t LI, int64_t I, const SCEV *R)
- : LUIdx(LI), Imm(I), OrigReg(R) {}
+ : LUIdx(LI), Imm(I), OrigReg(R) {}
void print(raw_ostream &OS) const;
void dump() const;
@@ -3841,12 +3899,12 @@ struct WorkItem {
} // end anonymous namespace
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void WorkItem::print(raw_ostream &OS) const {
OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
<< " , add offset " << Imm;
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void WorkItem::dump() const {
print(errs()); errs() << '\n';
}
@@ -3856,7 +3914,8 @@ LLVM_DUMP_METHOD void WorkItem::dump() const {
/// opportunities between them.
void LSRInstance::GenerateCrossUseConstantOffsets() {
// Group the registers by their value without any added constant offset.
- typedef std::map<int64_t, const SCEV *> ImmMapTy;
+ using ImmMapTy = std::map<int64_t, const SCEV *>;
+
DenseMap<const SCEV *, ImmMapTy> Map;
DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
SmallVector<const SCEV *, 8> Sequence;
@@ -4060,8 +4119,9 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
// Collect the best formula for each unique set of shared registers. This
// is reset for each use.
- typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>
- BestFormulaeTy;
+ using BestFormulaeTy =
+ DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
+
BestFormulaeTy BestFormulae;
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
@@ -4148,7 +4208,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
}
// This is a rough guess that seems to work fairly well.
-static const size_t ComplexityLimit = UINT16_MAX;
+static const size_t ComplexityLimit = std::numeric_limits<uint16_t>::max();
/// Estimate the worst-case number of solutions the solver might have to
/// consider. It almost never considers this many solutions because it prunes the
@@ -4267,7 +4327,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
LUThatHas->pushFixup(Fixup);
DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
}
-
+
// Delete formulae from the new use which are no longer legal.
bool Any = false;
for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
@@ -4332,7 +4392,8 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
"from the Formulae with the same Scale and ScaledReg.\n");
// Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
- typedef DenseMap<std::pair<const SCEV *, int64_t>, size_t> BestFormulaeTy;
+ using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
+
BestFormulaeTy BestFormulae;
#ifndef NDEBUG
bool ChangedFormulae = false;
@@ -4454,7 +4515,6 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
/// Use3:
/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
/// reg(c) + reg({b,+,1}) 1 + 2/3
-
void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
if (EstimateSearchSpaceComplexity() < ComplexityLimit)
return;
@@ -4549,7 +4609,6 @@ void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
print_uses(dbgs()));
}
-
/// Pick a register which seems likely to be profitable, and then in any use
/// which has any reference to that register, delete all formulae which do not
/// reference that register.
@@ -5196,8 +5255,7 @@ void LSRInstance::ImplementSolution(
LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
DominatorTree &DT, LoopInfo &LI,
const TargetTransformInfo &TTI)
- : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L), Changed(false),
- IVIncInsertPos(nullptr) {
+ : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L) {
// If LoopSimplify form is not available, stay out of trouble.
if (!L->isLoopSimplifyForm())
return;
@@ -5302,6 +5360,7 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
ImplementSolution(Solution);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
if (Factors.empty() && Types.empty()) return;
@@ -5352,7 +5411,6 @@ void LSRInstance::print(raw_ostream &OS) const {
print_uses(OS);
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void LSRInstance::dump() const {
print(errs()); errs() << '\n';
}
@@ -5448,6 +5506,7 @@ PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
}
char LoopStrengthReduce::ID = 0;
+
INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
"Loop Strength Reduction", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 530a68424d5c..7b1d6446a24a 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1,4 +1,4 @@
-//===-- LoopUnroll.cpp - Loop unroller pass -------------------------------===//
+//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,29 +13,55 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/LoopUnrollAnalyzer.h"
-#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
-#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include <climits>
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <tuple>
#include <utility>
using namespace llvm;
@@ -79,6 +105,10 @@ static cl::opt<unsigned> UnrollFullMaxCount(
cl::desc(
"Set the max unroll count for full unrolling, for testing purposes"));
+static cl::opt<unsigned> UnrollPeelCount(
+ "unroll-peel-count", cl::Hidden,
+ cl::desc("Set the unroll peeling count, for testing purposes"));
+
static cl::opt<bool>
UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
cl::desc("Allows loops to be partially unrolled until "
@@ -114,6 +144,10 @@ static cl::opt<bool>
cl::desc("Allows loops to be peeled when the dynamic "
"trip count is known to be low."));
+static cl::opt<bool> UnrollUnrollRemainder(
+ "unroll-remainder", cl::Hidden,
+ cl::desc("Allow the loop remainder to be unrolled."));
+
// This option isn't ever intended to be enabled, it serves to allow
// experiments to check the assumptions about when this kind of revisit is
// necessary.
@@ -126,7 +160,7 @@ static cl::opt<bool> UnrollRevisitChildLoops(
/// A magic value for use with the Threshold parameter to indicate
/// that the loop unroll should be performed regardless of how much
/// code expansion would result.
-static const unsigned NoThreshold = UINT_MAX;
+static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
@@ -134,7 +168,7 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
- Optional<bool> UserUpperBound) {
+ Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling) {
TargetTransformInfo::UnrollingPreferences UP;
// Set up the defaults
@@ -146,12 +180,13 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.Count = 0;
UP.PeelCount = 0;
UP.DefaultUnrollRuntimeCount = 8;
- UP.MaxCount = UINT_MAX;
- UP.FullUnrollMaxCount = UINT_MAX;
+ UP.MaxCount = std::numeric_limits<unsigned>::max();
+ UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
UP.BEInsns = 2;
UP.Partial = false;
UP.Runtime = false;
UP.AllowRemainder = true;
+ UP.UnrollRemainder = false;
UP.AllowExpensiveTripCount = false;
UP.Force = false;
UP.UpperBound = false;
@@ -177,6 +212,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.MaxCount = UnrollMaxCount;
if (UnrollFullMaxCount.getNumOccurrences() > 0)
UP.FullUnrollMaxCount = UnrollFullMaxCount;
+ if (UnrollPeelCount.getNumOccurrences() > 0)
+ UP.PeelCount = UnrollPeelCount;
if (UnrollAllowPartial.getNumOccurrences() > 0)
UP.Partial = UnrollAllowPartial;
if (UnrollAllowRemainder.getNumOccurrences() > 0)
@@ -187,6 +224,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.UpperBound = false;
if (UnrollAllowPeeling.getNumOccurrences() > 0)
UP.AllowPeeling = UnrollAllowPeeling;
+ if (UnrollUnrollRemainder.getNumOccurrences() > 0)
+ UP.UnrollRemainder = UnrollUnrollRemainder;
// Apply user values provided by argument
if (UserThreshold.hasValue()) {
@@ -201,11 +240,14 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.Runtime = *UserRuntime;
if (UserUpperBound.hasValue())
UP.UpperBound = *UserUpperBound;
+ if (UserAllowPeeling.hasValue())
+ UP.AllowPeeling = *UserAllowPeeling;
return UP;
}
namespace {
+
/// A struct to densely store the state of an instruction after unrolling at
/// each iteration.
///
@@ -221,25 +263,27 @@ struct UnrolledInstState {
/// Hashing and equality testing for a set of the instruction states.
struct UnrolledInstStateKeyInfo {
- typedef DenseMapInfo<Instruction *> PtrInfo;
- typedef DenseMapInfo<std::pair<Instruction *, int>> PairInfo;
+ using PtrInfo = DenseMapInfo<Instruction *>;
+ using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;
+
static inline UnrolledInstState getEmptyKey() {
return {PtrInfo::getEmptyKey(), 0, 0, 0};
}
+
static inline UnrolledInstState getTombstoneKey() {
return {PtrInfo::getTombstoneKey(), 0, 0, 0};
}
+
static inline unsigned getHashValue(const UnrolledInstState &S) {
return PairInfo::getHashValue({S.I, S.Iteration});
}
+
static inline bool isEqual(const UnrolledInstState &LHS,
const UnrolledInstState &RHS) {
return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
}
};
-}
-namespace {
struct EstimatedUnrollCost {
/// \brief The estimated cost after unrolling.
unsigned UnrolledCost;
@@ -248,7 +292,8 @@ struct EstimatedUnrollCost {
/// rolled form.
unsigned RolledDynamicCost;
};
-}
+
+} // end anonymous namespace
/// \brief Figure out if the loop is worth full unrolling.
///
@@ -270,7 +315,8 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// We want to be able to scale offsets by the trip count and add more offsets
// to them without checking for overflows, and we already don't want to
// analyze *massive* trip counts, so we force the max to be reasonably small.
- assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
+ assert(UnrollMaxIterationsCountToAnalyze <
+ (unsigned)(std::numeric_limits<int>::max() / 2) &&
"The unroll iterations max is too large!");
// Only analyze inner loops. We can't properly estimate cost of nested loops
@@ -633,43 +679,6 @@ static unsigned UnrollCountPragmaValue(const Loop *L) {
return 0;
}
-// Remove existing unroll metadata and add unroll disable metadata to
-// indicate the loop has already been unrolled. This prevents a loop
-// from being unrolled more than is directed by a pragma if the loop
-// unrolling pass is run more than once (which it generally is).
-static void SetLoopAlreadyUnrolled(Loop *L) {
- MDNode *LoopID = L->getLoopID();
- // First remove any existing loop unrolling metadata.
- SmallVector<Metadata *, 4> MDs;
- // Reserve first location for self reference to the LoopID metadata node.
- MDs.push_back(nullptr);
-
- if (LoopID) {
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- bool IsUnrollMetadata = false;
- MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
- if (MD) {
- const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
- IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
- }
- if (!IsUnrollMetadata)
- MDs.push_back(LoopID->getOperand(i));
- }
- }
-
- // Add unroll(disable) metadata to disable future unrolling.
- LLVMContext &Context = L->getHeader()->getContext();
- SmallVector<Metadata *, 1> DisableOperands;
- DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
- MDNode *DisableNode = MDNode::get(Context, DisableOperands);
- MDs.push_back(DisableNode);
-
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- L->setLoopID(NewLoopID);
-}
-
// Computes the boosting factor for complete unrolling.
// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
// be beneficial to fully unroll the loop even if UnrolledCost is large. We
@@ -677,7 +686,7 @@ static void SetLoopAlreadyUnrolled(Loop *L) {
// the unroll threshold.
static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
unsigned MaxPercentThresholdBoost) {
- if (Cost.RolledDynamicCost >= UINT_MAX / 100)
+ if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
return 100;
else if (Cost.UnrolledCost != 0)
// The boosting factor is RolledDynamicCost / UnrolledCost
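Based on the comment above, the boosting factor is 100 * RolledDynamicCost / UnrolledCost with an overflow guard; a sketch in that shape, leaving out the MaxPercentThresholdBoost clamp the real code also applies:

#include <cassert>
#include <limits>

static unsigned boostingFactor(unsigned Rolled, unsigned Unrolled) {
  if (Rolled >= std::numeric_limits<unsigned>::max() / 100)
    return 100; // 100 * Rolled would overflow; no boost
  if (Unrolled == 0)
    return 100;
  return 100 * Rolled / Unrolled;
}

int main() {
  assert(boostingFactor(900, 300) == 300);        // 3x dynamic saving
  assert(boostingFactor(4000000000u, 10) == 100); // overflow guard hit
  return 0;
}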
@@ -826,11 +835,14 @@ static bool computeUnrollCount(
}
if (UP.Count < 2) {
if (PragmaEnableUnroll)
- ORE->emit(
- OptimizationRemarkMissed(DEBUG_TYPE, "UnrollAsDirectedTooLarge",
- L->getStartLoc(), L->getHeader())
- << "Unable to unroll loop as directed by unroll(enable) pragma "
- "because unrolled size is too large.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "UnrollAsDirectedTooLarge",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to unroll loop as directed by unroll(enable) "
+ "pragma "
+ "because unrolled size is too large.";
+ });
UP.Count = 0;
}
} else {
@@ -840,22 +852,27 @@ static bool computeUnrollCount(
UP.Count = UP.MaxCount;
if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
UP.Count != TripCount)
- ORE->emit(
- OptimizationRemarkMissed(DEBUG_TYPE, "FullUnrollAsDirectedTooLarge",
- L->getStartLoc(), L->getHeader())
- << "Unable to fully unroll loop as directed by unroll pragma because "
- "unrolled size is too large.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "FullUnrollAsDirectedTooLarge",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to fully unroll loop as directed by unroll pragma "
+ "because "
+ "unrolled size is too large.";
+ });
return ExplicitUnroll;
}
assert(TripCount == 0 &&
"All cases when TripCount is constant should be covered here.");
if (PragmaFullUnroll)
- ORE->emit(
- OptimizationRemarkMissed(DEBUG_TYPE,
- "CantFullUnrollAsDirectedRuntimeTripCount",
- L->getStartLoc(), L->getHeader())
- << "Unable to fully unroll loop as directed by unroll(full) pragma "
- "because loop has a runtime trip count.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to fully unroll loop as directed by unroll(full) "
+ "pragma "
+ "because loop has a runtime trip count.";
+ });
// 6th priority is runtime unrolling.
// Don't unroll a runtime trip count loop when it is disabled.
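The converted ORE->emit calls above hand over a callable so the remark is only constructed when something will actually consume it. A minimal sketch of that deferred-construction pattern; RemarkEmitter here is a stand-in, not the LLVM class:

#include <functional>
#include <iostream>
#include <string>

struct RemarkEmitter {
  bool Enabled = false;
  void emit(const std::function<std::string()> &MakeRemark) {
    if (Enabled)
      std::cout << MakeRemark() << '\n'; // built only on this path
  }
};

int main() {
  RemarkEmitter ORE;
  ORE.emit([&] { return std::string("expensive remark"); }); // skipped
  ORE.Enabled = true;
  ORE.emit([&] { return std::string("emitted remark"); });   // printed
  return 0;
}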
@@ -904,19 +921,23 @@ static bool computeUnrollCount(
"multiple, "
<< TripMultiple << ". Reducing unroll count from "
<< OrigCount << " to " << UP.Count << ".\n");
+
using namespace ore;
+
if (PragmaCount > 0 && !UP.AllowRemainder)
- ORE->emit(
- OptimizationRemarkMissed(DEBUG_TYPE,
- "DifferentUnrollCountFromDirected",
- L->getStartLoc(), L->getHeader())
- << "Unable to unroll loop the number of times directed by "
- "unroll_count pragma because remainder loop is restricted "
- "(that could architecture specific or because the loop "
- "contains a convergent instruction) and so must have an unroll "
- "count that divides the loop trip multiple of "
- << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
- << NV("UnrollCount", UP.Count) << " time(s).");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "DifferentUnrollCountFromDirected",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to unroll loop the number of times directed by "
+ "unroll_count pragma because remainder loop is restricted "
+ "(that could be architecture specific or because the loop "
+ "contains a convergent instruction) and so must have an "
+ "unroll "
+ "count that divides the loop trip multiple of "
+ << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
+ << NV("UnrollCount", UP.Count) << " time(s).";
+ });
}
if (UP.Count > UP.MaxCount)
@@ -927,23 +948,21 @@ static bool computeUnrollCount(
return ExplicitUnroll;
}
-static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution &SE, const TargetTransformInfo &TTI,
- AssumptionCache &AC, OptimizationRemarkEmitter &ORE,
- bool PreserveLCSSA, int OptLevel,
- Optional<unsigned> ProvidedCount,
- Optional<unsigned> ProvidedThreshold,
- Optional<bool> ProvidedAllowPartial,
- Optional<bool> ProvidedRuntime,
- Optional<bool> ProvidedUpperBound) {
+static LoopUnrollResult tryToUnrollLoop(
+ Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
+ const TargetTransformInfo &TTI, AssumptionCache &AC,
+ OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel,
+ Optional<unsigned> ProvidedCount, Optional<unsigned> ProvidedThreshold,
+ Optional<bool> ProvidedAllowPartial, Optional<bool> ProvidedRuntime,
+ Optional<bool> ProvidedUpperBound, Optional<bool> ProvidedAllowPeeling) {
DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName()
<< "] Loop %" << L->getHeader()->getName() << "\n");
- if (HasUnrollDisablePragma(L))
- return false;
- if (!L->isLoopSimplifyForm()) {
+ if (HasUnrollDisablePragma(L))
+ return LoopUnrollResult::Unmodified;
+ if (!L->isLoopSimplifyForm()) {
DEBUG(
dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
- return false;
+ return LoopUnrollResult::Unmodified;
}
unsigned NumInlineCandidates;
@@ -951,21 +970,22 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
bool Convergent;
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount,
- ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound);
+ ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
+ ProvidedAllowPeeling);
// Exit early if unrolling is disabled.
if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
- return false;
+ return LoopUnrollResult::Unmodified;
unsigned LoopSize = ApproximateLoopSize(
L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC, UP.BEInsns);
DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
if (NotDuplicatable) {
DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
<< " instructions.\n");
- return false;
+ return LoopUnrollResult::Unmodified;
}
if (NumInlineCandidates != 0) {
DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
- return false;
+ return LoopUnrollResult::Unmodified;
}
// Find trip count and trip multiple if count is not available
@@ -1024,41 +1044,35 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount,
TripMultiple, LoopSize, UP, UseUpperBound);
if (!UP.Count)
- return false;
+ return LoopUnrollResult::Unmodified;
// Unroll factor (Count) must be less or equal to TripCount.
if (TripCount && UP.Count > TripCount)
UP.Count = TripCount;
// Unroll the loop.
- if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime,
- UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero,
- TripMultiple, UP.PeelCount, LI, &SE, &DT, &AC, &ORE,
- PreserveLCSSA))
- return false;
+ LoopUnrollResult UnrollResult = UnrollLoop(
+ L, UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
+ UseUpperBound, MaxOrZero, TripMultiple, UP.PeelCount, UP.UnrollRemainder,
+ LI, &SE, &DT, &AC, &ORE, PreserveLCSSA);
+ if (UnrollResult == LoopUnrollResult::Unmodified)
+ return LoopUnrollResult::Unmodified;
// If loop has an unroll count pragma or unrolled by explicitly set count
// mark loop as unrolled to prevent unrolling beyond that requested.
// If the loop was peeled, we already "used up" the profile information
// we had, so we don't want to unroll or peel again.
- if (IsCountSetExplicitly || UP.PeelCount)
- SetLoopAlreadyUnrolled(L);
+ if (UnrollResult != LoopUnrollResult::FullyUnrolled &&
+ (IsCountSetExplicitly || UP.PeelCount))
+ L->setLoopAlreadyUnrolled();
- return true;
+ return UnrollResult;
}
namespace {
+
class LoopUnroll : public LoopPass {
public:
static char ID; // Pass ID, replacement for typeid
- LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None,
- Optional<unsigned> Count = None,
- Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
- Optional<bool> UpperBound = None)
- : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)),
- ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
- ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) {
- initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
- }
int OptLevel;
Optional<unsigned> ProvidedCount;
@@ -1066,8 +1080,21 @@ public:
Optional<bool> ProvidedAllowPartial;
Optional<bool> ProvidedRuntime;
Optional<bool> ProvidedUpperBound;
+ Optional<bool> ProvidedAllowPeeling;
- bool runOnLoop(Loop *L, LPPassManager &) override {
+ LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None,
+ Optional<unsigned> Count = None,
+ Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
+ Optional<bool> UpperBound = None,
+ Optional<bool> AllowPeeling = None)
+ : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)),
+ ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
+ ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
+ ProvidedAllowPeeling(AllowPeeling) {
+ initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
if (skipLoop(L))
return false;
@@ -1085,15 +1112,19 @@ public:
OptimizationRemarkEmitter ORE(&F);
bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
- return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel,
- ProvidedCount, ProvidedThreshold,
- ProvidedAllowPartial, ProvidedRuntime,
- ProvidedUpperBound);
+ LoopUnrollResult Result = tryToUnrollLoop(
+ L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, ProvidedCount,
+ ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime,
+ ProvidedUpperBound, ProvidedAllowPeeling);
+
+ if (Result == LoopUnrollResult::FullyUnrolled)
+ LPM.markLoopAsDeleted(*L);
+
+ return Result != LoopUnrollResult::Unmodified;
}
/// This transformation requires natural loop information & requires that
/// loop preheaders be inserted into the CFG...
- ///
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
@@ -1102,9 +1133,11 @@ public:
getLoopAnalysisUsage(AU);
}
};
-}
+
+} // end anonymous namespace
char LoopUnroll::ID = 0;
+
INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
@@ -1112,8 +1145,8 @@ INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count,
- int AllowPartial, int Runtime,
- int UpperBound) {
+ int AllowPartial, int Runtime, int UpperBound,
+ int AllowPeeling) {
// TODO: It would make more sense for this function to take the optionals
// directly, but that's dangerous since it would silently break out of tree
// callers.
@@ -1122,16 +1155,17 @@ Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count,
Count == -1 ? None : Optional<unsigned>(Count),
AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
Runtime == -1 ? None : Optional<bool>(Runtime),
- UpperBound == -1 ? None : Optional<bool>(UpperBound));
+ UpperBound == -1 ? None : Optional<bool>(UpperBound),
+ AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling));
}
Pass *llvm::createSimpleLoopUnrollPass(int OptLevel) {
- return llvm::createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0);
+ return createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0, 0);
}
-PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &Updater) {
+PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &Updater) {
const auto &FAM =
AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
Function *F = L.getHeader()->getParent();
@@ -1139,8 +1173,9 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
// FIXME: This should probably be optional rather than required.
if (!ORE)
- report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not "
- "cached at a higher level");
+ report_fatal_error(
+ "LoopFullUnrollPass: OptimizationRemarkEmitterAnalysis not "
+ "cached at a higher level");
// Keep track of the previous loop structure so we can identify new loops
// created by unrolling.
@@ -1151,17 +1186,14 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
else
OldLoops.insert(AR.LI.begin(), AR.LI.end());
- // The API here is quite complex to call, but there are only two interesting
- // states we support: partial and full (or "simple") unrolling. However, to
- // enable these things we actually pass "None" in for the optional to avoid
- // providing an explicit choice.
- Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam;
- if (!AllowPartialUnrolling)
- AllowPartialParam = RuntimeParam = UpperBoundParam = false;
- bool Changed = tryToUnrollLoop(
- &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
- /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
- /*Threshold*/ None, AllowPartialParam, RuntimeParam, UpperBoundParam);
+ std::string LoopName = L.getName();
+
+ bool Changed =
+ tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
+ /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
+ /*Threshold*/ None, /*AllowPartial*/ false,
+ /*Runtime*/ false, /*UpperBound*/ false,
+ /*AllowPeeling*/ false) != LoopUnrollResult::Unmodified;
if (!Changed)
return PreservedAnalyses::all();
@@ -1172,17 +1204,13 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
#endif
// Unrolling can do several things to introduce new loops into a loop nest:
- // - Partial unrolling clones child loops within the current loop. If it
- // uses a remainder, then it can also create any number of sibling loops.
// - Full unrolling clones child loops within the current loop but then
// removes the current loop making all of the children appear to be new
// sibling loops.
- // - Loop peeling can directly introduce new sibling loops by peeling one
- // iteration.
//
- // When a new loop appears as a sibling loop, either from peeling an
- // iteration or fully unrolling, its nesting structure has fundamentally
- // changed and we want to revisit it to reflect that.
+ // When a new loop appears as a sibling loop after fully unrolling,
+ // its nesting structure has fundamentally changed and we want to revisit
+ // it to reflect that.
//
// When unrolling has removed the current loop, we need to tell the
// infrastructure that it is gone.
@@ -1209,13 +1237,11 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
Updater.addSiblingLoops(SibLoops);
if (!IsCurrentLoopValid) {
- Updater.markLoopAsDeleted(L);
+ Updater.markLoopAsDeleted(L, LoopName);
} else {
// We can only walk child loops if the current loop remained valid.
if (UnrollRevisitChildLoops) {
- // Walk *all* of the child loops. This is a highly speculative mode
- // anyways so look for any simplifications that arose from partial
- // unrolling or peeling off of iterations.
+ // Walk *all* of the child loops.
SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
Updater.addChildLoops(ChildLoops);
}
@@ -1223,3 +1249,105 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
return getLoopPassPreservedAnalyses();
}
+
+template <typename RangeT>
+static SmallVector<Loop *, 8> appendLoopsToWorklist(RangeT &&Loops) {
+ SmallVector<Loop *, 8> Worklist;
+ // We use an internal worklist to build up the preorder traversal without
+ // recursion.
+ SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist;
+
+ for (Loop *RootL : Loops) {
+ assert(PreOrderLoops.empty() && "Must start with an empty preorder walk.");
+ assert(PreOrderWorklist.empty() &&
+ "Must start with an empty preorder walk worklist.");
+ PreOrderWorklist.push_back(RootL);
+ do {
+ Loop *L = PreOrderWorklist.pop_back_val();
+ PreOrderWorklist.append(L->begin(), L->end());
+ PreOrderLoops.push_back(L);
+ } while (!PreOrderWorklist.empty());
+
+ Worklist.append(PreOrderLoops.begin(), PreOrderLoops.end());
+ PreOrderLoops.clear();
+ }
+ return Worklist;
+}
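appendLoopsToWorklist builds a preorder sequence with an explicit stack instead of recursion. The same walk on a toy tree, with illustrative types in place of Loop:

#include <cassert>
#include <vector>

struct Node { std::vector<Node *> Kids; };

static void preorder(Node *Root, std::vector<Node *> &Out) {
  std::vector<Node *> Stack{Root};
  while (!Stack.empty()) {
    Node *N = Stack.back();
    Stack.pop_back();
    // Children go on the stack, so every node is visited before its
    // subtree (LIFO order means later children are visited first).
    Stack.insert(Stack.end(), N->Kids.begin(), N->Kids.end());
    Out.push_back(N);
  }
}

int main() {
  Node A, B, C;
  A.Kids = {&B, &C};
  std::vector<Node *> Out;
  preorder(&A, Out);
  assert(Out.size() == 3 && Out[0] == &A); // root first: A, C, B
  return 0;
}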
+
+PreservedAnalyses LoopUnrollPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ LoopAnalysisManager *LAM = nullptr;
+ if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
+ LAM = &LAMProxy->getManager();
+
+ const ModuleAnalysisManager &MAM =
+ AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
+ ProfileSummaryInfo *PSI =
+ MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+
+ bool Changed = false;
+
+ // The unroller requires loops to be in simplified form, and also needs LCSSA.
+ // Since simplification may add new inner loops, it has to run before the
+ // legality and profitability checks. This means running the loop unroller
+ // will simplify all loops, regardless of whether anything ends up being
+ // unrolled.
+ for (auto &L : LI) {
+ Changed |= simplifyLoop(L, &DT, &LI, &SE, &AC, false /* PreserveLCSSA */);
+ Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
+ }
+
+ SmallVector<Loop *, 8> Worklist = appendLoopsToWorklist(LI);
+
+ while (!Worklist.empty()) {
+ // Because the LoopInfo stores the loops in RPO, we walk the worklist
+ // from back to front so that we work forward across the CFG, which
+ // for unrolling is only needed to get optimization remarks emitted in
+ // a forward order.
+ Loop &L = *Worklist.pop_back_val();
+#ifndef NDEBUG
+ Loop *ParentL = L.getParentLoop();
+#endif
+
+ // The API here is quite complex to call, but there are only two interesting
+ // states we support: partial and full (or "simple") unrolling. However, to
+ // enable these things we actually pass "None" in for the optional to avoid
+ // providing an explicit choice.
+ Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam,
+ AllowPeeling;
+ // Check if the profile summary indicates that the profiled application
+ // has a huge working set size, in which case we disable peeling to avoid
+ // bloating it further.
+ if (PSI && PSI->hasHugeWorkingSetSize())
+ AllowPeeling = false;
+ std::string LoopName = L.getName();
+ LoopUnrollResult Result =
+ tryToUnrollLoop(&L, DT, &LI, SE, TTI, AC, ORE,
+ /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
+ /*Threshold*/ None, AllowPartialParam, RuntimeParam,
+ UpperBoundParam, AllowPeeling);
+ Changed |= Result != LoopUnrollResult::Unmodified;
+
+ // The parent must not be damaged by unrolling!
+#ifndef NDEBUG
+ if (Result != LoopUnrollResult::Unmodified && ParentL)
+ ParentL->verifyLoop();
+#endif
+
+ // Clear any cached analysis results for L if we removed it completely.
+ if (LAM && Result == LoopUnrollResult::FullyUnrolled)
+ LAM->clear(L, LoopName);
+ }
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index d0c96fa627a4..bd468338a1d0 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -1,4 +1,4 @@
-//===-- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop ------===//
+//===- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop -------===//
//
// The LLVM Compiler Infrastructure
//
@@ -26,30 +26,40 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
-#include "llvm/Support/BranchProbability.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -58,9 +68,15 @@
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
+#include <cassert>
#include <map>
#include <set>
+#include <tuple>
+#include <utility>
+#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "loop-unswitch"
@@ -82,11 +98,9 @@ Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
namespace {
class LUAnalysisCache {
-
- typedef DenseMap<const SwitchInst*, SmallPtrSet<const Value *, 8> >
- UnswitchedValsMap;
-
- typedef UnswitchedValsMap::iterator UnswitchedValsIt;
+ using UnswitchedValsMap =
+ DenseMap<const SwitchInst *, SmallPtrSet<const Value *, 8>>;
+ using UnswitchedValsIt = UnswitchedValsMap::iterator;
struct LoopProperties {
unsigned CanBeUnswitchedCount;
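The typedef-to-using conversions in this hunk are purely syntactic; the alias reads left-to-right and names exactly the same type, which is checkable at compile time:

#include <map>
#include <type_traits>

typedef std::map<const void *, int> PropsMapTypedef;
using PropsMapUsing = std::map<const void *, int>;

static_assert(std::is_same<PropsMapTypedef, PropsMapUsing>::value,
              "typedef and using declare identical types");

int main() { return 0; }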
@@ -97,12 +111,12 @@ namespace {
// Here we use std::map instead of DenseMap, since we need to keep valid
// LoopProperties pointer for current loop for better performance.
- typedef std::map<const Loop*, LoopProperties> LoopPropsMap;
- typedef LoopPropsMap::iterator LoopPropsMapIt;
+ using LoopPropsMap = std::map<const Loop *, LoopProperties>;
+ using LoopPropsMapIt = LoopPropsMap::iterator;
LoopPropsMap LoopsProperties;
- UnswitchedValsMap *CurLoopInstructions;
- LoopProperties *CurrentLoopProperties;
+ UnswitchedValsMap *CurLoopInstructions = nullptr;
+ LoopProperties *CurrentLoopProperties = nullptr;
// A loop unswitching with an estimated cost above this threshold
// is not performed. MaxSize is turned into unswitching quota for
@@ -121,9 +135,7 @@ namespace {
unsigned MaxSize;
public:
- LUAnalysisCache()
- : CurLoopInstructions(nullptr), CurrentLoopProperties(nullptr),
- MaxSize(Threshold) {}
+ LUAnalysisCache() : MaxSize(Threshold) {}
// Analyze the loop. Check its size and determine whether it is possible to
// unswitch it. Returns true if we can unswitch this loop.
@@ -164,12 +176,12 @@ namespace {
LUAnalysisCache BranchesInfo;
bool OptimizeForSize;
- bool redoLoop;
+ bool redoLoop = false;
- Loop *currentLoop;
- DominatorTree *DT;
- BasicBlock *loopHeader;
- BasicBlock *loopPreheader;
+ Loop *currentLoop = nullptr;
+ DominatorTree *DT = nullptr;
+ BasicBlock *loopHeader = nullptr;
+ BasicBlock *loopPreheader = nullptr;
bool SanitizeMemory;
LoopSafetyInfo SafetyInfo;
@@ -185,16 +197,17 @@ namespace {
public:
static char ID; // Pass ID, replacement for typeid
- explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false) :
- LoopPass(ID), OptimizeForSize(Os), redoLoop(false),
- currentLoop(nullptr), DT(nullptr), loopHeader(nullptr),
- loopPreheader(nullptr), hasBranchDivergence(hasBranchDivergence) {
+
+ explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false)
+ : LoopPass(ID), OptimizeForSize(Os),
+ hasBranchDivergence(hasBranchDivergence) {
initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
- }
+ }
bool runOnLoop(Loop *L, LPPassManager &LPM) override;
bool processCurrentLoop();
bool isUnreachableDueToPreviousUnswitching(BasicBlock *);
+
/// This transformation requires natural loop information & requires that
/// loop preheaders be inserted into the CFG.
///
@@ -207,7 +220,6 @@ namespace {
}
private:
-
void releaseMemory() override {
BranchesInfo.forgetLoop(currentLoop);
}
@@ -237,7 +249,7 @@ namespace {
void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
BasicBlock *TrueDest,
BasicBlock *FalseDest,
- Instruction *InsertPt,
+ BranchInst *OldBranch,
TerminatorInst *TI);
void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
@@ -247,13 +259,13 @@ namespace {
Value *SimplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant,
Constant *Val);
};
-}
+
+} // end anonymous namespace
// Analyze loop. Check its size, calculate is it possible to unswitch
// it. Returns true if we can unswitch this loop.
bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
AssumptionCache *AC) {
-
LoopPropsMapIt PropsIt;
bool Inserted;
std::tie(PropsIt, Inserted) =
@@ -302,7 +314,6 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
// Clean all data related to given loop.
void LUAnalysisCache::forgetLoop(const Loop *L) {
-
LoopPropsMapIt LIt = LoopsProperties.find(L);
if (LIt != LoopsProperties.end()) {
@@ -337,7 +348,6 @@ bool LUAnalysisCache::CostAllowsUnswitching() {
// Note, that new loop data is stored inside the VMap.
void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
const ValueToValueMapTy &VMap) {
-
LoopProperties &NewLoopProps = LoopsProperties[NewLoop];
LoopProperties &OldLoopProps = *CurrentLoopProperties;
UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals;
@@ -367,6 +377,7 @@ void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
}
char LoopUnswitch::ID = 0;
+
INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
@@ -518,9 +529,6 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
Changed |= processCurrentLoop();
} while(redoLoop);
- // FIXME: Reconstruct dom info, because it is not preserved properly.
- if (Changed)
- DT->recalculate(*F);
return Changed;
}
@@ -553,6 +561,48 @@ bool LoopUnswitch::isUnreachableDueToPreviousUnswitching(BasicBlock *BB) {
return false;
}
+/// FIXME: Remove this workaround when freeze related patches are done.
+/// LoopUnswitch and the equality propagation in GVN disagree about whether
+/// branching on undef/poison has undefined behavior. This rules out some
+/// common cases where we have found that discrepancy to already cause
+/// problems; details can be found in PR31652. Note that if this function
+/// returns true, unswitching is unsafe, but a false result does not mean
+/// it is necessarily safe.
+static bool EqualityPropUnSafe(Value &LoopCond) {
+ ICmpInst *CI = dyn_cast<ICmpInst>(&LoopCond);
+ if (!CI || !CI->isEquality())
+ return false;
+
+ Value *LHS = CI->getOperand(0);
+ Value *RHS = CI->getOperand(1);
+ if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS))
+ return true;
+
+ auto hasUndefInPHI = [](PHINode &PN) {
+ for (Value *Opd : PN.incoming_values()) {
+ if (isa<UndefValue>(Opd))
+ return true;
+ }
+ return false;
+ };
+ PHINode *LPHI = dyn_cast<PHINode>(LHS);
+ PHINode *RPHI = dyn_cast<PHINode>(RHS);
+ if ((LPHI && hasUndefInPHI(*LPHI)) || (RPHI && hasUndefInPHI(*RPHI)))
+ return true;
+
+ auto hasUndefInSelect = [](SelectInst &SI) {
+ if (isa<UndefValue>(SI.getTrueValue()) ||
+ isa<UndefValue>(SI.getFalseValue()))
+ return true;
+ return false;
+ };
+ SelectInst *LSI = dyn_cast<SelectInst>(LHS);
+ SelectInst *RSI = dyn_cast<SelectInst>(RHS);
+ if ((LSI && hasUndefInSelect(*LSI)) || (RSI && hasUndefInSelect(*RSI)))
+ return true;
+ return false;
+}
+
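+// Illustrative example (hypothetical IR) of a condition EqualityPropUnSafe
+// rejects:
+//   %p = phi i32 [ undef, %entry ], [ %v, %latch ]
+//   %cond = icmp eq i32 %p, %y
+// Unswitching on %cond while GVN propagates %p == %y could disagree about
+// the behavior of branching on undef/poison, so such conditions are
+// conservatively not unswitched.
+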
/// Do actual work and unswitch loop if possible and profitable.
bool LoopUnswitch::processCurrentLoop() {
bool Changed = false;
@@ -666,7 +716,7 @@ bool LoopUnswitch::processCurrentLoop() {
// unswitch on it if we desire.
Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
currentLoop, Changed).first;
- if (LoopCond &&
+ if (LoopCond && !EqualityPropUnSafe(*LoopCond) &&
UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
++NumBranches;
return true;
@@ -831,7 +881,7 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
/// mapping the blocks with the specified map.
static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
LoopInfo *LI, LPPassManager *LPM) {
- Loop &New = *new Loop();
+ Loop &New = *LI->AllocateLoop();
if (PL)
PL->addChildLoop(&New);
else
@@ -852,31 +902,59 @@ static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
}
/// Emit a conditional branch on two values if LIC == Val, branch to TrueDst,
-/// otherwise branch to FalseDest. Insert the code immediately before InsertPt.
+/// otherwise branch to FalseDest. Insert the code immediately before OldBranch
+/// and remove (but not erase!) it from the function.
void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
BasicBlock *TrueDest,
BasicBlock *FalseDest,
- Instruction *InsertPt,
+ BranchInst *OldBranch,
TerminatorInst *TI) {
+ assert(OldBranch->isUnconditional() && "Preheader is not split correctly");
// Insert a conditional branch on LIC to the two preheaders. The original
// code is the true version and the new code is the false version.
Value *BranchVal = LIC;
bool Swapped = false;
if (!isa<ConstantInt>(Val) ||
Val->getType() != Type::getInt1Ty(LIC->getContext()))
- BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val);
+ BranchVal = new ICmpInst(OldBranch, ICmpInst::ICMP_EQ, LIC, Val);
else if (Val != ConstantInt::getTrue(Val->getContext())) {
// We want to enter the new loop when the condition is true.
std::swap(TrueDest, FalseDest);
Swapped = true;
}
+ // Old branch will be removed, so save its parent and successor to update the
+ // DomTree.
+ auto *OldBranchSucc = OldBranch->getSuccessor(0);
+ auto *OldBranchParent = OldBranch->getParent();
+
// Insert the new branch.
BranchInst *BI =
- IRBuilder<>(InsertPt).CreateCondBr(BranchVal, TrueDest, FalseDest, TI);
+ IRBuilder<>(OldBranch).CreateCondBr(BranchVal, TrueDest, FalseDest, TI);
if (Swapped)
BI->swapProfMetadata();
+ // Remove the old branch so there is only one branch at the end. This is
+ // needed to perform DomTree's internal DFS walk on the function's CFG.
+ OldBranch->removeFromParent();
+
+ // Inform the DT about the new branch.
+ if (DT) {
+ // First, add both successors.
+ SmallVector<DominatorTree::UpdateType, 3> Updates;
+ if (TrueDest != OldBranchParent)
+ Updates.push_back({DominatorTree::Insert, OldBranchParent, TrueDest});
+ if (FalseDest != OldBranchParent)
+ Updates.push_back({DominatorTree::Insert, OldBranchParent, FalseDest});
+ // If both of the new successors are different from the old one, inform the
+ // DT that the edge was deleted.
+ if (OldBranchSucc != TrueDest && OldBranchSucc != FalseDest) {
+ Updates.push_back({DominatorTree::Delete, OldBranchParent, OldBranchSucc});
+ }
+
+ DT->applyUpdates(Updates);
+ }
+
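+  // For illustration (hypothetical CFG): if the preheader P previously
+  // branched unconditionally to the header H and the new conditional branch
+  // targets H and a fresh exit E, the update set is just {Insert, P, E};
+  // the P->H edge survives, so no Delete entry is required in that case.
+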
// If either edge is critical, split it. This helps preserve LoopSimplify
// form for enclosing loops.
auto Options = CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA();
@@ -916,10 +994,14 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
// Okay, now we have a position to branch from and a position to branch to,
// insert the new conditional branch.
- EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH,
- loopPreheader->getTerminator(), TI);
- LPM->deleteSimpleAnalysisValue(loopPreheader->getTerminator(), L);
- loopPreheader->getTerminator()->eraseFromParent();
+ auto *OldBranch = dyn_cast<BranchInst>(loopPreheader->getTerminator());
+ assert(OldBranch && "Failed to split the preheader");
+ EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, OldBranch, TI);
+ LPM->deleteSimpleAnalysisValue(OldBranch, L);
+
+ // EmitPreheaderBranchOnCondition removed the OldBranch from the function.
+ // Delete it, as it is no longer needed.
+ delete OldBranch;
// We need to reprocess this loop, it could be unswitched again.
redoLoop = true;
@@ -1035,6 +1117,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
return false; // Can't handle this.
+ if (EqualityPropUnSafe(*LoopCond))
+ return false;
+
UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
CurrentTerm);
++NumBranches;
@@ -1231,7 +1316,10 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
EmitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR,
TI);
LPM->deleteSimpleAnalysisValue(OldBR, L);
- OldBR->eraseFromParent();
+
+ // The OldBr was replaced by a new one and removed (but not erased) by
+ // EmitPreheaderBranchOnCondition. It is no longer needed, so delete it.
+ delete OldBR;
LoopProcessWorklist.push_back(NewLoop);
redoLoop = true;
diff --git a/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index c23d891b6504..53b25e688e82 100644
--- a/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -1,4 +1,4 @@
-//===----------- LoopVersioningLICM.cpp - LICM Loop Versioning ------------===//
+//===- LoopVersioningLICM.cpp - LICM Loop Versioning ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -60,41 +60,41 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <cassert>
+#include <memory>
+
+using namespace llvm;
#define DEBUG_TYPE "loop-versioning-licm"
-static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable";
-using namespace llvm;
+static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable";
/// Threshold minimum allowed percentage for possible
/// invariant instructions in a loop.
@@ -143,9 +143,16 @@ void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
}
namespace {
+
struct LoopVersioningLICM : public LoopPass {
static char ID;
+ LoopVersioningLICM()
+ : LoopPass(ID), LoopDepthThreshold(LVLoopDepthThreshold),
+ InvariantThreshold(LVInvarThreshold) {
+ initializeLoopVersioningLICMPass(*PassRegistry::getPassRegistry());
+ }
+
bool runOnLoop(Loop *L, LPPassManager &LPM) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -161,13 +168,6 @@ struct LoopVersioningLICM : public LoopPass {
AU.addPreserved<GlobalsAAWrapperPass>();
}
- LoopVersioningLICM()
- : LoopPass(ID), AA(nullptr), SE(nullptr), LAA(nullptr), LAI(nullptr),
- CurLoop(nullptr), LoopDepthThreshold(LVLoopDepthThreshold),
- InvariantThreshold(LVInvarThreshold), LoadAndStoreCounter(0),
- InvariantCounter(0), IsReadOnlyLoop(true) {
- initializeLoopVersioningLICMPass(*PassRegistry::getPassRegistry());
- }
StringRef getPassName() const override { return "Loop Versioning for LICM"; }
void reset() {
@@ -191,30 +191,49 @@ struct LoopVersioningLICM : public LoopPass {
};
private:
- AliasAnalysis *AA; // Current AliasAnalysis information
- ScalarEvolution *SE; // Current ScalarEvolution
- LoopAccessLegacyAnalysis *LAA; // Current LoopAccessAnalysis
- const LoopAccessInfo *LAI; // Current Loop's LoopAccessInfo
+ // Current AliasAnalysis information
+ AliasAnalysis *AA = nullptr;
+
+ // Current ScalarEvolution
+ ScalarEvolution *SE = nullptr;
+
+ // Current LoopAccessAnalysis
+ LoopAccessLegacyAnalysis *LAA = nullptr;
+
+ // Current Loop's LoopAccessInfo
+ const LoopAccessInfo *LAI = nullptr;
+
+ // The current loop we are working on.
+ Loop *CurLoop = nullptr;
+
+ // AliasSet information for the current loop.
+ std::unique_ptr<AliasSetTracker> CurAST;
- Loop *CurLoop; // The current loop we are working on.
- std::unique_ptr<AliasSetTracker>
- CurAST; // AliasSet information for the current loop.
+ // Maximum loop nest threshold
+ unsigned LoopDepthThreshold;
- unsigned LoopDepthThreshold; // Maximum loop nest threshold
- float InvariantThreshold; // Minimum invariant threshold
- unsigned LoadAndStoreCounter; // Counter to track num of load & store
- unsigned InvariantCounter; // Counter to track num of invariant
- bool IsReadOnlyLoop; // Read only loop marker.
+ // Minimum invariant threshold
+ float InvariantThreshold;
+
+ // Counter to track num of load & store
+ unsigned LoadAndStoreCounter = 0;
+
+ // Counter to track num of invariant
+ unsigned InvariantCounter = 0;
+
+ // Read only loop marker.
+ bool IsReadOnlyLoop = true;
bool isLegalForVersioning();
bool legalLoopStructure();
bool legalLoopInstructions();
bool legalLoopMemoryAccesses();
bool isLoopAlreadyVisited();
- void setNoAliasToLoop(Loop *);
- bool instructionSafeForVersioning(Instruction *);
+ void setNoAliasToLoop(Loop *VerLoop);
+ bool instructionSafeForVersioning(Instruction *I);
};
-}
+
+} // end anonymous namespace
/// \brief Check loop structure and confirms it's good for LoopVersioningLICM.
bool LoopVersioningLICM::legalLoopStructure() {
@@ -225,7 +244,7 @@ bool LoopVersioningLICM::legalLoopStructure() {
return false;
}
// Loop should be innermost loop, if not return false.
- if (CurLoop->getSubLoops().size()) {
+ if (!CurLoop->getSubLoops().empty()) {
DEBUG(dbgs() << " loop is not innermost\n");
return false;
}
@@ -562,6 +581,7 @@ bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) {
}
char LoopVersioningLICM::ID = 0;
+
INITIALIZE_PASS_BEGIN(LoopVersioningLICM, "loop-versioning-licm",
"Loop Versioning For LICM", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp
index 6f77c5bd0d07..c165c5ece95c 100644
--- a/lib/Transforms/Scalar/LowerAtomic.cpp
+++ b/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -15,7 +15,6 @@
#include "llvm/Transforms/Scalar/LowerAtomic.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 7896396f0898..9c870b42a747 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -14,10 +14,12 @@
#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
@@ -25,6 +27,8 @@
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -41,6 +45,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -54,6 +59,7 @@
#include <algorithm>
#include <cassert>
#include <cstdint>
+#include <utility>
using namespace llvm;
@@ -225,15 +231,18 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
namespace {
class MemsetRanges {
+ using range_iterator = SmallVectorImpl<MemsetRange>::iterator;
+
/// A sorted list of the memset ranges.
SmallVector<MemsetRange, 8> Ranges;
- typedef SmallVectorImpl<MemsetRange>::iterator range_iterator;
+
const DataLayout &DL;
public:
MemsetRanges(const DataLayout &DL) : DL(DL) {}
- typedef SmallVectorImpl<MemsetRange>::const_iterator const_iterator;
+ using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator;
+
const_iterator begin() const { return Ranges.begin(); }
const_iterator end() const { return Ranges.end(); }
bool empty() const { return Ranges.empty(); }
@@ -259,7 +268,6 @@ public:
void addRange(int64_t Start, int64_t Size, Value *Ptr,
unsigned Alignment, Instruction *Inst);
-
};
} // end anonymous namespace
@@ -356,10 +364,10 @@ private:
}
};
-char MemCpyOptLegacyPass::ID = 0;
-
} // end anonymous namespace
+char MemCpyOptLegacyPass::ID = 0;
+
/// The public interface to this file...
FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }
@@ -450,7 +458,6 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
// emit memset's for anything big enough to be worthwhile.
Instruction *AMemSet = nullptr;
for (const MemsetRange &Range : Ranges) {
-
if (Range.TheStores.size() == 1) continue;
// If it is profitable to lower this range to memset, do so now.
@@ -511,7 +518,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
const LoadInst *LI) {
// If the store alias this position, early bail out.
MemoryLocation StoreLoc = MemoryLocation::get(SI);
- if (AA.getModRefInfo(P, StoreLoc) != MRI_NoModRef)
+ if (isModOrRefSet(AA.getModRefInfo(P, StoreLoc)))
return false;
// Keep track of the arguments of all instruction we plan to lift
@@ -535,20 +542,20 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
auto *C = &*I;
- bool MayAlias = AA.getModRefInfo(C) != MRI_NoModRef;
+ bool MayAlias = isModOrRefSet(AA.getModRefInfo(C, None));
bool NeedLift = false;
if (Args.erase(C))
NeedLift = true;
else if (MayAlias) {
NeedLift = llvm::any_of(MemLocs, [C, &AA](const MemoryLocation &ML) {
- return AA.getModRefInfo(C, ML);
+ return isModOrRefSet(AA.getModRefInfo(C, ML));
});
if (!NeedLift)
NeedLift =
llvm::any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) {
- return AA.getModRefInfo(C, CS);
+ return isModOrRefSet(AA.getModRefInfo(C, CS));
});
}
@@ -558,18 +565,18 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
if (MayAlias) {
// Since LI is implicitly moved downwards past the lifted instructions,
// none of them may modify its source.
- if (AA.getModRefInfo(C, LoadLoc) & MRI_Mod)
+ if (isModSet(AA.getModRefInfo(C, LoadLoc)))
return false;
else if (auto CS = ImmutableCallSite(C)) {
// If we can't lift this before P, it's game over.
- if (AA.getModRefInfo(P, CS) != MRI_NoModRef)
+ if (isModOrRefSet(AA.getModRefInfo(P, CS)))
return false;
CallSites.push_back(CS);
} else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) {
// If we can't lift this before P, it's game over.
auto ML = MemoryLocation::get(C);
- if (AA.getModRefInfo(P, ML) != MRI_NoModRef)
+ if (isModOrRefSet(AA.getModRefInfo(P, ML)))
return false;
MemLocs.push_back(ML);
@@ -624,7 +631,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
// of at the store position.
Instruction *P = SI;
for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
- if (AA.getModRefInfo(&I, LoadLoc) & MRI_Mod) {
+ if (isModSet(AA.getModRefInfo(&I, LoadLoc))) {
P = &I;
break;
}
@@ -695,7 +702,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
MemoryLocation StoreLoc = MemoryLocation::get(SI);
for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator();
I != E; --I) {
- if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) {
+ if (isModOrRefSet(AA.getModRefInfo(&*I, StoreLoc))) {
C = nullptr;
break;
}
@@ -927,9 +934,9 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
AliasAnalysis &AA = LookupAliasAnalysis();
ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize);
// If necessary, perform additional analysis.
- if (MR != MRI_NoModRef)
+ if (isModOrRefSet(MR))
MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT);
- if (MR != MRI_NoModRef)
+ if (isModOrRefSet(MR))
return false;
// We can't create address space casts here because we don't know if they're
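// Illustrative sketch (not part of this patch) of the ModRefInfo predicate
// API that the hunks above migrate to:
//   ModRefInfo MRI = AA.getModRefInfo(I, Loc);
//   if (isModSet(MRI))      { /* I may modify Loc */ }
//   if (isRefSet(MRI))      { /* I may read Loc */ }
//   if (isModOrRefSet(MRI)) { /* I may read or write Loc */ }
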
diff --git a/lib/Transforms/Scalar/MergeICmps.cpp b/lib/Transforms/Scalar/MergeICmps.cpp
new file mode 100644
index 000000000000..9869a3fb96fa
--- /dev/null
+++ b/lib/Transforms/Scalar/MergeICmps.cpp
@@ -0,0 +1,650 @@
+//===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass turns chains of integer comparisons into memcmp (the memcmp is
+// later typically inlined as a chain of efficient hardware comparisons). This
+// typically benefits C++ member or non-member operator==().
+//
+// The basic idea is to replace a larger chain of integer comparisons loaded
+// from contiguous memory locations with a smaller chain of such integer
+// comparisons. The benefits are twofold:
+//  - There are fewer jumps, and therefore fewer opportunities for
+//    mispredictions and I-cache misses.
+// - Code size is smaller, both because jumps are removed and because the
+// encoding of a 2*n byte compare is smaller than that of two n-byte
+// compares.
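+//
+// For example (illustrative only, with a hypothetical struct S of two
+// contiguous i32 fields x and y), a comparison chain such as
+//   return a.x == b.x && a.y == b.y;
+// is lowered to two icmp/branch blocks, which this pass can replace with a
+// single comparison
+//   memcmp(&a, &b, 8) == 0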
+
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <numeric>
+#include <utility>
+#include <vector>
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+
+using namespace llvm;
+
+namespace {
+
+#define DEBUG_TYPE "mergeicmps"
+
+// A BCE atom.
+struct BCEAtom {
+ BCEAtom() : GEP(nullptr), LoadI(nullptr), Offset() {}
+
+ const Value *Base() const { return GEP ? GEP->getPointerOperand() : nullptr; }
+
+ bool operator<(const BCEAtom &O) const {
+ assert(Base() && "invalid atom");
+ assert(O.Base() && "invalid atom");
+    // Ordering by (Base(), Offset) alone would be sufficient, but it would
+    // make the ordering depend on the addresses of the base values, which
+    // are not reproducible from run to run. To guarantee
+ // stability, we use the names of the values if they exist; we sort by:
+ // (Base.getName(), Base(), Offset).
+ const int NameCmp = Base()->getName().compare(O.Base()->getName());
+ if (NameCmp == 0) {
+ if (Base() == O.Base()) {
+ return Offset.slt(O.Offset);
+ }
+ return Base() < O.Base();
+ }
+ return NameCmp < 0;
+ }
+
+ GetElementPtrInst *GEP;
+ LoadInst *LoadI;
+ APInt Offset;
+};
+
+// If this value is a load from a constant offset w.r.t. a base address, and
+// there are no other users of the load or address, returns the base address
+// the offset.
+BCEAtom visitICmpLoadOperand(Value *const Val) {
+ BCEAtom Result;
+ if (auto *const LoadI = dyn_cast<LoadInst>(Val)) {
+ DEBUG(dbgs() << "load\n");
+ if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
+ DEBUG(dbgs() << "used outside of block\n");
+ return {};
+ }
+ if (LoadI->isVolatile()) {
+ DEBUG(dbgs() << "volatile\n");
+ return {};
+ }
+ Value *const Addr = LoadI->getOperand(0);
+ if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) {
+ DEBUG(dbgs() << "GEP\n");
+ if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
+ DEBUG(dbgs() << "used outside of block\n");
+ return {};
+ }
+ const auto &DL = GEP->getModule()->getDataLayout();
+ if (!isDereferenceablePointer(GEP, DL)) {
+ DEBUG(dbgs() << "not dereferenceable\n");
+      // We need to make sure that we can do the comparisons in any order, so
+      // we require the memory to be unconditionally dereferenceable.
+ return {};
+ }
+ Result.Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
+ if (GEP->accumulateConstantOffset(DL, Result.Offset)) {
+ Result.GEP = GEP;
+ Result.LoadI = LoadI;
+ }
+ }
+ }
+ return Result;
+}
+
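+// Illustrative atom (hypothetical IR): given
+//   %g = getelementptr inbounds %struct.S, %struct.S* %a, i64 0, i32 1
+//   %v = load i32, i32* %g
+// the resulting BCEAtom is roughly {GEP = %g, LoadI = %v, Offset = 4},
+// assuming the second field of %struct.S lives at byte offset 4.
+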
+// A basic block with a comparison between two BCE atoms.
+// Note: the terminology is misleading: the comparison is symmetric, so there
+// is no real {l/r}hs. What we want, though, is to have the same base on the
+// left (resp. right), so that we can detect consecutive loads. To ensure this
+// we put the smallest atom on the left.
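+// E.g. (illustrative): a block comparing (%b + 8) with (%a + 8) is stored
+// with Lhs = (%a, 8) and Rhs = (%b, 8), so consecutive blocks over the same
+// bases all present %a on the left, which makes contiguity easy to check.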
+class BCECmpBlock {
+ public:
+ BCECmpBlock() {}
+
+ BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits)
+ : Lhs_(L), Rhs_(R), SizeBits_(SizeBits) {
+ if (Rhs_ < Lhs_) std::swap(Rhs_, Lhs_);
+ }
+
+ bool IsValid() const {
+ return Lhs_.Base() != nullptr && Rhs_.Base() != nullptr;
+ }
+
+  // Assert that the block is consistent: if valid, it should also have
+ // non-null members besides Lhs_ and Rhs_.
+ void AssertConsistent() const {
+ if (IsValid()) {
+ assert(BB);
+ assert(CmpI);
+ assert(BranchI);
+ }
+ }
+
+ const BCEAtom &Lhs() const { return Lhs_; }
+ const BCEAtom &Rhs() const { return Rhs_; }
+ int SizeBits() const { return SizeBits_; }
+
+  // Returns true if the block does other work besides the comparison.
+ bool doesOtherWork() const;
+
+ // The basic block where this comparison happens.
+ BasicBlock *BB = nullptr;
+ // The ICMP for this comparison.
+ ICmpInst *CmpI = nullptr;
+ // The terminating branch.
+ BranchInst *BranchI = nullptr;
+
+ private:
+ BCEAtom Lhs_;
+ BCEAtom Rhs_;
+ int SizeBits_ = 0;
+};
+
+bool BCECmpBlock::doesOtherWork() const {
+ AssertConsistent();
+  // TODO(courbet): Can we allow some other things? This is very conservative.
+  // We might be able to get away with anything that does not have any side
+ // effects outside of the basic block.
+ // Note: The GEPs and/or loads are not necessarily in the same block.
+ for (const Instruction &Inst : *BB) {
+ if (const auto *const GEP = dyn_cast<GetElementPtrInst>(&Inst)) {
+ if (!(Lhs_.GEP == GEP || Rhs_.GEP == GEP)) return true;
+ } else if (const auto *const L = dyn_cast<LoadInst>(&Inst)) {
+ if (!(Lhs_.LoadI == L || Rhs_.LoadI == L)) return true;
+ } else if (const auto *const C = dyn_cast<ICmpInst>(&Inst)) {
+ if (C != CmpI) return true;
+ } else if (const auto *const Br = dyn_cast<BranchInst>(&Inst)) {
+ if (Br != BranchI) return true;
+ } else {
+ return true;
+ }
+ }
+ return false;
+}
+
+// Visit the given comparison. If this is a comparison between two valid
+// BCE atoms, returns the comparison.
+BCECmpBlock visitICmp(const ICmpInst *const CmpI,
+ const ICmpInst::Predicate ExpectedPredicate) {
+ if (CmpI->getPredicate() == ExpectedPredicate) {
+ DEBUG(dbgs() << "cmp "
+ << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
+ << "\n");
+ auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0));
+ if (!Lhs.Base()) return {};
+ auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1));
+ if (!Rhs.Base()) return {};
+ return BCECmpBlock(std::move(Lhs), std::move(Rhs),
+ CmpI->getOperand(0)->getType()->getScalarSizeInBits());
+ }
+ return {};
+}
+
+// Visit the given comparison block. If this is a comparison between two valid
+// BCE atoms, returns the comparison.
+BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
+ const BasicBlock *const PhiBlock) {
+ if (Block->empty()) return {};
+ auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
+ if (!BranchI) return {};
+ DEBUG(dbgs() << "branch\n");
+ if (BranchI->isUnconditional()) {
+ // In this case, we expect an incoming value which is the result of the
+ // comparison. This is the last link in the chain of comparisons (note
+ // that this does not mean that this is the last incoming value, blocks
+ // can be reordered).
+ auto *const CmpI = dyn_cast<ICmpInst>(Val);
+ if (!CmpI) return {};
+ DEBUG(dbgs() << "icmp\n");
+ auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ);
+ Result.CmpI = CmpI;
+ Result.BranchI = BranchI;
+ return Result;
+ } else {
+ // In this case, we expect a constant incoming value (the comparison is
+ // chained).
+ const auto *const Const = dyn_cast<ConstantInt>(Val);
+ DEBUG(dbgs() << "const\n");
+    if (!Const || !Const->isZero()) return {};
+ DEBUG(dbgs() << "false\n");
+ auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
+ if (!CmpI) return {};
+ DEBUG(dbgs() << "icmp\n");
+ assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
+ BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
+ auto Result = visitICmp(
+ CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE);
+ Result.CmpI = CmpI;
+ Result.BranchI = BranchI;
+ return Result;
+ }
+ return {};
+}
+
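+// Illustrative shapes (hypothetical IR) of the two cases handled above: the
+// last block of a chain feeds its icmp result straight into the phi,
+//   %cmp = icmp eq i64 %x, %y
+//   br label %phi.block
+// while an intermediate block contributes a constant false to the phi and
+// branches on its icmp,
+//   %cmp = icmp eq i64 %x, %y
+//   br i1 %cmp, label %next.block, label %phi.block
+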
+// A chain of comparisons.
+class BCECmpChain {
+ public:
+ BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi);
+
+ int size() const { return Comparisons_.size(); }
+
+#ifdef MERGEICMPS_DOT_ON
+ void dump() const;
+#endif // MERGEICMPS_DOT_ON
+
+ bool simplify(const TargetLibraryInfo *const TLI);
+
+ private:
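+  // Two comparisons are contiguous when the second starts, on both sides,
+  // exactly SizeBits()/8 bytes past the first. E.g. (illustrative): a 64-bit
+  // compare of (%a, 0) vs (%b, 0) followed by a 64-bit compare of (%a, 8) vs
+  // (%b, 8) is contiguous and can be merged into a single 16-byte memcmp.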
+ static bool IsContiguous(const BCECmpBlock &First,
+ const BCECmpBlock &Second) {
+ return First.Lhs().Base() == Second.Lhs().Base() &&
+ First.Rhs().Base() == Second.Rhs().Base() &&
+ First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
+ First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
+ }
+
+  // Merges the given comparison blocks into one memcmp block and updates the
+  // branches. Comparisons are assumed to be contiguous. If NextBBInChain is
+ // null, the merged block will link to the phi block.
+ static void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+ BasicBlock *const NextBBInChain, PHINode &Phi,
+ const TargetLibraryInfo *const TLI);
+
+ PHINode &Phi_;
+ std::vector<BCECmpBlock> Comparisons_;
+  // The original entry block (before sorting).
+ BasicBlock *EntryBlock_;
+};
+
+BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
+ : Phi_(Phi) {
+ // Now look inside blocks to check for BCE comparisons.
+ std::vector<BCECmpBlock> Comparisons;
+ for (BasicBlock *Block : Blocks) {
+ BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
+ Block, Phi.getParent());
+ Comparison.BB = Block;
+ if (!Comparison.IsValid()) {
+ DEBUG(dbgs() << "skip: not a valid BCECmpBlock\n");
+ return;
+ }
+ if (Comparison.doesOtherWork()) {
+ DEBUG(dbgs() << "block does extra work besides compare\n");
+ if (Comparisons.empty()) { // First block.
+ // TODO(courbet): The first block can do other things, and we should
+ // split them apart in a separate block before the comparison chain.
+ // Right now we just discard it and make the chain shorter.
+ DEBUG(dbgs()
+ << "ignoring first block that does extra work besides compare\n");
+ continue;
+ }
+ // TODO(courbet): Right now we abort the whole chain. We could be
+ // merging only the blocks that don't do other work and resume the
+ // chain from there. For example:
+ // if (a[0] == b[0]) { // bb1
+ // if (a[1] == b[1]) { // bb2
+ // some_value = 3; //bb3
+ // if (a[2] == b[2]) { //bb3
+ // do a ton of stuff //bb4
+ // }
+ // }
+ // }
+ //
+ // This is:
+ //
+ // bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+
+ // \ \ \ \
+ // ne ne ne \
+ // \ \ \ v
+ // +------------+-----------+----------> bb_phi
+ //
+ // We can only merge the first two comparisons, because bb3* does
+ // "other work" (setting some_value to 3).
+ // We could still merge bb1 and bb2 though.
+ return;
+ }
+ DEBUG(dbgs() << "*Found cmp of " << Comparison.SizeBits()
+ << " bits between " << Comparison.Lhs().Base() << " + "
+ << Comparison.Lhs().Offset << " and "
+ << Comparison.Rhs().Base() << " + " << Comparison.Rhs().Offset
+ << "\n");
+ DEBUG(dbgs() << "\n");
+ Comparisons.push_back(Comparison);
+ }
+  if (Comparisons.empty()) // All blocks were skipped; there is no chain.
+    return;
+  EntryBlock_ = Comparisons[0].BB;
+ Comparisons_ = std::move(Comparisons);
+#ifdef MERGEICMPS_DOT_ON
+ errs() << "BEFORE REORDERING:\n\n";
+ dump();
+#endif // MERGEICMPS_DOT_ON
+ // Reorder blocks by LHS. We can do that without changing the
+  // semantics because we are only accessing dereferenceable memory.
+ std::sort(Comparisons_.begin(), Comparisons_.end(),
+ [](const BCECmpBlock &a, const BCECmpBlock &b) {
+ return a.Lhs() < b.Lhs();
+ });
+#ifdef MERGEICMPS_DOT_ON
+ errs() << "AFTER REORDERING:\n\n";
+ dump();
+#endif // MERGEICMPS_DOT_ON
+}
+
+#ifdef MERGEICMPS_DOT_ON
+void BCECmpChain::dump() const {
+ errs() << "digraph dag {\n";
+ errs() << " graph [bgcolor=transparent];\n";
+ errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n";
+ errs() << " edge [color=black];\n";
+ for (size_t I = 0; I < Comparisons_.size(); ++I) {
+ const auto &Comparison = Comparisons_[I];
+ errs() << " \"" << I << "\" [label=\"%"
+ << Comparison.Lhs().Base()->getName() << " + "
+ << Comparison.Lhs().Offset << " == %"
+ << Comparison.Rhs().Base()->getName() << " + "
+ << Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8)
+ << " bytes)\"];\n";
+ const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB);
+ if (I > 0) errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n";
+ errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n";
+ }
+ errs() << " \"Phi\" [label=\"Phi\"];\n";
+ errs() << "}\n\n";
+}
+#endif // MERGEICMPS_DOT_ON
+
+bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) {
+ // First pass to check if there is at least one merge. If not, we don't do
+ // anything and we keep analysis passes intact.
+ {
+ bool AtLeastOneMerged = false;
+ for (size_t I = 1; I < Comparisons_.size(); ++I) {
+ if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) {
+ AtLeastOneMerged = true;
+ break;
+ }
+ }
+ if (!AtLeastOneMerged) return false;
+ }
+
+ // Remove phi references to comparison blocks, they will be rebuilt as we
+ // merge the blocks.
+ for (const auto &Comparison : Comparisons_) {
+ Phi_.removeIncomingValue(Comparison.BB, false);
+ }
+
+ // Point the predecessors of the chain to the first comparison block (which is
+ // the new entry point).
+ if (EntryBlock_ != Comparisons_[0].BB)
+ EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB);
+
+ // Effectively merge blocks.
+ int NumMerged = 1;
+ for (size_t I = 1; I < Comparisons_.size(); ++I) {
+ if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) {
+ ++NumMerged;
+ } else {
+ // Merge all previous comparisons and start a new merge block.
+ mergeComparisons(
+ makeArrayRef(Comparisons_).slice(I - NumMerged, NumMerged),
+ Comparisons_[I].BB, Phi_, TLI);
+ NumMerged = 1;
+ }
+ }
+ mergeComparisons(makeArrayRef(Comparisons_)
+ .slice(Comparisons_.size() - NumMerged, NumMerged),
+ nullptr, Phi_, TLI);
+
+ return true;
+}
+
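+// Illustrative trace (hypothetical): given four comparison blocks where
+// C0/C1 and C2/C3 are contiguous but C1/C2 are not, the loop above calls
+// mergeComparisons({C0, C1}, C2.BB, ...) and then, after the loop,
+// mergeComparisons({C2, C3}, nullptr, ...), producing two memcmp blocks.
+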
+void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+ BasicBlock *const NextBBInChain,
+ PHINode &Phi,
+ const TargetLibraryInfo *const TLI) {
+ assert(!Comparisons.empty());
+ const auto &FirstComparison = *Comparisons.begin();
+ BasicBlock *const BB = FirstComparison.BB;
+ LLVMContext &Context = BB->getContext();
+
+ if (Comparisons.size() >= 2) {
+ DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
+ const auto TotalSize =
+ std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
+ [](int Size, const BCECmpBlock &C) {
+ return Size + C.SizeBits();
+ }) /
+ 8;
+
+ // Incoming edges do not need to be updated, and both GEPs are already
+    // computing the right address; we just need to:
+ // - replace the two loads and the icmp with the memcmp
+ // - update the branch
+ // - update the incoming values in the phi.
+ FirstComparison.BranchI->eraseFromParent();
+ FirstComparison.CmpI->eraseFromParent();
+ FirstComparison.Lhs().LoadI->eraseFromParent();
+ FirstComparison.Rhs().LoadI->eraseFromParent();
+
+ IRBuilder<> Builder(BB);
+ const auto &DL = Phi.getModule()->getDataLayout();
+    Value *const MemCmpCall = emitMemCmp(
+        FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP,
+        ConstantInt::get(DL.getIntPtrType(Context), TotalSize), Builder, DL,
+        TLI);
+ Value *const MemCmpIsZero = Builder.CreateICmpEQ(
+ MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));
+
+ // Add a branch to the next basic block in the chain.
+ if (NextBBInChain) {
+ Builder.CreateCondBr(MemCmpIsZero, NextBBInChain, Phi.getParent());
+ Phi.addIncoming(ConstantInt::getFalse(Context), BB);
+ } else {
+ Builder.CreateBr(Phi.getParent());
+ Phi.addIncoming(MemCmpIsZero, BB);
+ }
+
+ // Delete merged blocks.
+ for (size_t I = 1; I < Comparisons.size(); ++I) {
+ BasicBlock *CBB = Comparisons[I].BB;
+ CBB->replaceAllUsesWith(BB);
+ CBB->eraseFromParent();
+ }
+ } else {
+ assert(Comparisons.size() == 1);
+ // There are no blocks to merge, but we still need to update the branches.
+ DEBUG(dbgs() << "Only one comparison, updating branches\n");
+ if (NextBBInChain) {
+ if (FirstComparison.BranchI->isConditional()) {
+ DEBUG(dbgs() << "conditional -> conditional\n");
+ // Just update the "true" target, the "false" target should already be
+ // the phi block.
+ assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent());
+ FirstComparison.BranchI->setSuccessor(0, NextBBInChain);
+ Phi.addIncoming(ConstantInt::getFalse(Context), BB);
+ } else {
+ DEBUG(dbgs() << "unconditional -> conditional\n");
+ // Replace the unconditional branch by a conditional one.
+ FirstComparison.BranchI->eraseFromParent();
+ IRBuilder<> Builder(BB);
+ Builder.CreateCondBr(FirstComparison.CmpI, NextBBInChain,
+ Phi.getParent());
+ Phi.addIncoming(FirstComparison.CmpI, BB);
+ }
+ } else {
+ if (FirstComparison.BranchI->isConditional()) {
+ DEBUG(dbgs() << "conditional -> unconditional\n");
+ // Replace the conditional branch by an unconditional one.
+ FirstComparison.BranchI->eraseFromParent();
+ IRBuilder<> Builder(BB);
+ Builder.CreateBr(Phi.getParent());
+ Phi.addIncoming(FirstComparison.CmpI, BB);
+ } else {
+ DEBUG(dbgs() << "unconditional -> unconditional\n");
+ Phi.addIncoming(FirstComparison.CmpI, BB);
+ }
+ }
+ }
+}
+
+std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
+ BasicBlock *const LastBlock,
+ int NumBlocks) {
+ // Walk up from the last block to find other blocks.
+ std::vector<BasicBlock *> Blocks(NumBlocks);
+ BasicBlock *CurBlock = LastBlock;
+ for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
+ if (CurBlock->hasAddressTaken()) {
+      // Somebody is jumping to the block through an address; all bets are
+ // off.
+ DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " has its address taken\n");
+ return {};
+ }
+ Blocks[BlockIndex] = CurBlock;
+ auto *SinglePredecessor = CurBlock->getSinglePredecessor();
+ if (!SinglePredecessor) {
+ // The block has two or more predecessors.
+ DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " has two or more predecessors\n");
+ return {};
+ }
+ if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
+ // The block does not link back to the phi.
+ DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " does not link back to the phi\n");
+ return {};
+ }
+ CurBlock = SinglePredecessor;
+ }
+ Blocks[0] = CurBlock;
+ return Blocks;
+}
+
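+// Illustrative (hypothetical) walk: for the chain bb1 -> bb2 -> bb3 feeding
+// bb_phi, getOrderedBlocks(Phi, bb3, 3) follows single predecessors upwards
+// from bb3 and returns {bb1, bb2, bb3}.
+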
+bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
+ DEBUG(dbgs() << "processPhi()\n");
+ if (Phi.getNumIncomingValues() <= 1) {
+ DEBUG(dbgs() << "skip: only one incoming value in phi\n");
+ return false;
+ }
+ // We are looking for something that has the following structure:
+ // bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
+ // \ \ \ \
+ // ne ne ne \
+ // \ \ \ v
+ // +------------+-----------+----------> bb_phi
+ //
+ // - The last basic block (bb4 here) must branch unconditionally to bb_phi.
+ // It's the only block that contributes a non-constant value to the Phi.
+  // - All other blocks (bb1, bb2, bb3) must have exactly two successors, one
+  //   of them being the phi block.
+  // - All intermediate blocks (bb2, bb3) must have only one predecessor.
+  // - Blocks cannot do other work besides the comparison; see
+  //   doesOtherWork().
+
+ // The blocks are not necessarily ordered in the phi, so we start from the
+ // last block and reconstruct the order.
+ BasicBlock *LastBlock = nullptr;
+ for (unsigned I = 0; I < Phi.getNumIncomingValues(); ++I) {
+ if (isa<ConstantInt>(Phi.getIncomingValue(I))) continue;
+ if (LastBlock) {
+ // There are several non-constant values.
+ DEBUG(dbgs() << "skip: several non-constant values\n");
+ return false;
+ }
+ LastBlock = Phi.getIncomingBlock(I);
+ }
+ if (!LastBlock) {
+ // There is no non-constant block.
+ DEBUG(dbgs() << "skip: no non-constant block\n");
+ return false;
+ }
+ if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
+ DEBUG(dbgs() << "skip: last block non-phi successor\n");
+ return false;
+ }
+
+ const auto Blocks =
+ getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues());
+ if (Blocks.empty()) return false;
+ BCECmpChain CmpChain(Blocks, Phi);
+
+ if (CmpChain.size() < 2) {
+ DEBUG(dbgs() << "skip: only one compare block\n");
+ return false;
+ }
+
+ return CmpChain.simplify(TLI);
+}
+
+class MergeICmps : public FunctionPass {
+ public:
+ static char ID;
+
+ MergeICmps() : FunctionPass(ID) {
+ initializeMergeICmpsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F)) return false;
+ const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto PA = runImpl(F, &TLI, &TTI);
+ return !PA.areAllPreserved();
+ }
+
+ private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI);
+};
+
+PreservedAnalyses MergeICmps::runImpl(Function &F, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI) {
+ DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");
+
+ // We only try merging comparisons if the target wants to expand memcmp later.
+ // The rationale is to avoid turning small chains into memcmp calls.
+ if (!TTI->enableMemCmpExpansion(true)) return PreservedAnalyses::all();
+
+ bool MadeChange = false;
+
+ for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
+ // A Phi operation is always first in a basic block.
+ if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))
+ MadeChange |= processPhi(*Phi, TLI);
+ }
+
+ if (MadeChange) return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+} // namespace
+
+char MergeICmps::ID = 0;
+INITIALIZE_PASS_BEGIN(MergeICmps, "mergeicmps",
+ "Merge contiguous icmps into a memcmp", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(MergeICmps, "mergeicmps",
+ "Merge contiguous icmps into a memcmp", false, false)
+
+Pass *llvm::createMergeICmpsPass() { return new MergeICmps(); }
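+
+// Usage sketch (illustrative, not part of this patch): under the legacy pass
+// manager the pass can be exercised in isolation with
+//   opt -mergeicmps -S input.ll
+// and it only fires for targets whose TTI reports enableMemCmpExpansion().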
diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 6727cf0179c1..f2f615cb9b0f 100644
--- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -80,11 +80,9 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Metadata.h"
-#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
@@ -195,7 +193,7 @@ bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start,
make_range(Start.getIterator(), End.getIterator()))
if (Inst.mayThrow())
return true;
- return AA->canInstructionRangeModRef(Start, End, Loc, MRI_ModRef);
+ return AA->canInstructionRangeModRef(Start, End, Loc, ModRefInfo::ModRef);
}
///
diff --git a/lib/Transforms/Scalar/NaryReassociate.cpp b/lib/Transforms/Scalar/NaryReassociate.cpp
index d0bfe3603897..b026c8d692c3 100644
--- a/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -77,19 +77,45 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/NaryReassociate.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
using namespace PatternMatch;
#define DEBUG_TYPE "nary-reassociate"
namespace {
+
class NaryReassociateLegacyPass : public FunctionPass {
public:
static char ID;
@@ -101,6 +127,7 @@ public:
bool doInitialization(Module &M) override {
return false;
}
+
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -118,9 +145,11 @@ public:
private:
NaryReassociatePass Impl;
};
-} // anonymous namespace
+
+} // end anonymous namespace
char NaryReassociateLegacyPass::ID = 0;
+
INITIALIZE_PASS_BEGIN(NaryReassociateLegacyPass, "nary-reassociate",
"Nary reassociation", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 8ac10348eb77..9ebf2d769356 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -1,4 +1,4 @@
-//===---- NewGVN.cpp - Global Value Numbering Pass --------------*- C++ -*-===//
+//===- NewGVN.cpp - Global Value Numbering Pass ---------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,6 +6,7 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
+//
/// \file
/// This file implements the new LLVM's Global Value Numbering pass.
/// GVN partitions values computed by a function into congruence classes.
@@ -48,59 +49,81 @@
/// published algorithms are O(Instructions). Instead, we use a technique that
/// is O(number of operations with the same value number), enabling us to skip
/// trying to eliminate things that have unique value numbers.
+//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/NewGVN.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CFGPrinter.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Allocator.h"
+#include "llvm/Support/ArrayRecycler.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/PointerLikeTypeTraits.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVNExpression.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PredicateInfo.h"
#include "llvm/Transforms/Utils/VNCoercion.h"
-#include <numeric>
-#include <unordered_map>
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <tuple>
#include <utility>
#include <vector>
+
using namespace llvm;
-using namespace PatternMatch;
using namespace llvm::GVNExpression;
using namespace llvm::VNCoercion;
+
#define DEBUG_TYPE "newgvn"
STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
@@ -118,15 +141,19 @@ STATISTIC(NumGVNPHIOfOpsCreated, "Number of PHI of ops created");
STATISTIC(NumGVNPHIOfOpsEliminations,
"Number of things eliminated using PHI of ops");
DEBUG_COUNTER(VNCounter, "newgvn-vn",
- "Controls which instructions are value numbered")
+ "Controls which instructions are value numbered");
DEBUG_COUNTER(PHIOfOpsCounter, "newgvn-phi",
- "Controls which instructions we create phi of ops for")
+ "Controls which instructions we create phi of ops for");
// Currently store defining access refinement is too slow due to basicaa being
// egregiously slow. This flag lets us keep it working while we work on this
// issue.
static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
cl::init(false), cl::Hidden);
+/// Currently, the generation of "phi of ops" can result in correctness
+/// issues.
+static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true),
+ cl::Hidden);
+
//===----------------------------------------------------------------------===//
// GVN Pass
//===----------------------------------------------------------------------===//
@@ -134,6 +161,7 @@ static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
// Anchor methods.
namespace llvm {
namespace GVNExpression {
+
Expression::~Expression() = default;
BasicExpression::~BasicExpression() = default;
CallExpression::~CallExpression() = default;
@@ -141,8 +169,11 @@ LoadExpression::~LoadExpression() = default;
StoreExpression::~StoreExpression() = default;
AggregateValueExpression::~AggregateValueExpression() = default;
PHIExpression::~PHIExpression() = default;
-}
-}
+
+} // end namespace GVNExpression
+} // end namespace llvm
+
+namespace {
// Tarjan's SCC finding algorithm with Nuutila's improvements
// SCCIterator is actually fairly complex for the simple thing we want.
@@ -153,7 +184,6 @@ PHIExpression::~PHIExpression() = default;
// instructions,
// not generic values (arguments, etc).
struct TarjanSCC {
-
TarjanSCC() : Components(1) {}
void Start(const Instruction *Start) {
@@ -208,15 +238,19 @@ private:
Stack.push_back(I);
}
}
+
unsigned int DFSNum = 1;
SmallPtrSet<const Value *, 8> InComponent;
DenseMap<const Value *, unsigned int> Root;
SmallVector<const Value *, 8> Stack;
+
// Store the components as vector of ptr sets, because we need the topo order
// of SCC's, but not individual member order
SmallVector<SmallPtrSet<const Value *, 8>, 8> Components;
+
DenseMap<const Value *, unsigned> ValueToComponent;
};
+
// Congruence classes represent the set of expressions/instructions
// that are all the same *during some scope in the function*.
// That is, because of the way we perform equality propagation, and
@@ -265,7 +299,9 @@ public:
explicit CongruenceClass(unsigned ID) : ID(ID) {}
CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
: ID(ID), RepLeader(Leader), DefiningExpr(E) {}
+
unsigned getID() const { return ID; }
+
// True if this class has no members left. This is mainly used for assertion
// purposes, and for skipping empty classes.
bool isDead() const {
@@ -273,6 +309,7 @@ public:
// perspective, it's really dead.
return empty() && memory_empty();
}
+
// Leader functions
Value *getLeader() const { return RepLeader; }
void setLeader(Value *Leader) { RepLeader = Leader; }
@@ -280,7 +317,6 @@ public:
return NextLeader;
}
void resetNextLeader() { NextLeader = {nullptr, ~0}; }
-
void addPossibleNextLeader(std::pair<Value *, unsigned int> LeaderPair) {
if (LeaderPair.second < NextLeader.second)
NextLeader = LeaderPair;
@@ -315,6 +351,7 @@ public:
iterator_range<MemoryMemberSet::const_iterator> memory() const {
return make_range(memory_begin(), memory_end());
}
+
void memory_insert(const MemoryMemberType *M) { MemoryMembers.insert(M); }
void memory_erase(const MemoryMemberType *M) { MemoryMembers.erase(M); }
@@ -354,34 +391,48 @@ public:
private:
unsigned ID;
+
// Representative leader.
Value *RepLeader = nullptr;
+
// The most dominating leader after our current leader, because the member set
// is not sorted and is expensive to keep sorted all the time.
std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
+
// If this is represented by a store, the value of the store.
Value *RepStoredValue = nullptr;
+
// If this class contains MemoryDefs or MemoryPhis, this is the leading memory
// access.
const MemoryAccess *RepMemoryAccess = nullptr;
+
// Defining Expression.
const Expression *DefiningExpr = nullptr;
+
// Actual members of this class.
MemberSet Members;
+
// This is the set of MemoryPhis that exist in the class. MemoryDefs and
// MemoryUses have real instructions representing them, so we only need to
// track MemoryPhis here.
MemoryMemberSet MemoryMembers;
+
// Number of stores in this congruence class.
// This is used so we can detect store equivalence changes properly.
int StoreCount = 0;
};
+} // end anonymous namespace
+
namespace llvm {
+
struct ExactEqualsExpression {
const Expression &E;
+
explicit ExactEqualsExpression(const Expression &E) : E(E) {}
+
hash_code getComputedHash() const { return E.getComputedHash(); }
+
bool operator==(const Expression &Other) const {
return E.exactlyEquals(Other);
}
@@ -393,17 +444,21 @@ template <> struct DenseMapInfo<const Expression *> {
Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
return reinterpret_cast<const Expression *>(Val);
}
+
static const Expression *getTombstoneKey() {
auto Val = static_cast<uintptr_t>(~1U);
Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
return reinterpret_cast<const Expression *>(Val);
}
+
static unsigned getHashValue(const Expression *E) {
return E->getComputedHash();
}
+
static unsigned getHashValue(const ExactEqualsExpression &E) {
return E.getComputedHash();
}
+
static bool isEqual(const ExactEqualsExpression &LHS, const Expression *RHS) {
if (RHS == getTombstoneKey() || RHS == getEmptyKey())
return false;
@@ -425,9 +480,11 @@ template <> struct DenseMapInfo<const Expression *> {
return *LHS == *RHS;
}
};
+
} // end namespace llvm
namespace {
+
class NewGVN {
Function &F;
DominatorTree *DT;
@@ -464,16 +521,22 @@ class NewGVN {
// Value Mappings.
DenseMap<Value *, CongruenceClass *> ValueToClass;
DenseMap<Value *, const Expression *> ValueToExpression;
+
// Value PHI handling, used to make equivalence between phi(op, op) and
// op(phi, phi).
// These mappings just store various data that would normally be part of the
// IR.
- DenseSet<const Instruction *> PHINodeUses;
+ SmallPtrSet<const Instruction *, 8> PHINodeUses;
+
+ DenseMap<const Value *, bool> OpSafeForPHIOfOps;
+
// Map a temporary instruction we created to a parent block.
DenseMap<const Value *, BasicBlock *> TempToBlock;
- // Map between the temporary phis we created and the real instructions they
- // are known equivalent to.
+
+  // Map between instructions already in the program and the temporary phis
+  // we created that they are known equivalent to.
DenseMap<const Value *, PHINode *> RealToTemp;
+
// In order to know when we should re-process instructions that have
// phi-of-ops, we track the set of expressions that they needed as
// leaders. When we discover new leaders for those expressions, we process the
@@ -485,19 +548,32 @@ class NewGVN {
mutable DenseMap<const Value *, SmallPtrSet<Value *, 2>> AdditionalUsers;
DenseMap<const Expression *, SmallPtrSet<Instruction *, 2>>
ExpressionToPhiOfOps;
- // Map from basic block to the temporary operations we created
- DenseMap<const BasicBlock *, SmallVector<PHINode *, 8>> PHIOfOpsPHIs;
+
// Map from temporary operation to MemoryAccess.
DenseMap<const Instruction *, MemoryUseOrDef *> TempToMemory;
+
// Set of all temporary instructions we created.
+ // Note: This will include instructions that were just created during value
+ // numbering. The way to test if something is using them is to check
+ // RealToTemp.
DenseSet<Instruction *> AllTempInstructions;
+ // This is the set of instructions to revisit on a reachability change. At
+ // the end of the main iteration loop it will contain at least all the phi of
+ // ops instructions that will be changed to phis, as well as regular phis.
+ // During the iteration loop, it may contain other things, such as phi of ops
+ // instructions that used edge reachability to reach a result, and so need to
+ // be revisited when the edge changes, independent of whether the phi they
+ // depended on changes.
+ DenseMap<BasicBlock *, SparseBitVector<>> RevisitOnReachabilityChange;
+
// Mapping from predicate info we used to the instructions we used it with.
// In order to correctly ensure propagation, we must keep track of what
// comparisons we used, so that when the values of the comparisons change, we
// propagate the information to the places we used the comparison.
mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>>
PredicateToUsers;
+
// Mapping from the memory accesses we used to the accesses that used them,
// kept for the same reasoning as PredicateToUsers. When we skip
// MemoryAccesses for stores, we can no longer rely solely on the def-use
// chains of MemorySSA.
mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>>
@@ -525,6 +601,7 @@ class NewGVN {
enum InstCycleState { ICS_Unknown, ICS_CycleFree, ICS_Cycle };
mutable DenseMap<const Instruction *, InstCycleState> InstCycleState;
+
// Expression to class mapping.
using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
ExpressionClassMap ExpressionToClass;
@@ -581,6 +658,7 @@ public:
: F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL),
PredInfo(make_unique<PredicateInfo>(F, *DT, *AC)), SQ(DL, TLI, DT, AC) {
}
+
bool runGVN();
private:
@@ -588,7 +666,13 @@ private:
const Expression *createExpression(Instruction *) const;
const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *,
Instruction *) const;
- PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge,
+
+ // Our canonical form for phi arguments is a pair of incoming value, incoming
+ // basic block.
+ using ValPair = std::pair<Value *, BasicBlock *>;
+
+ PHIExpression *createPHIExpression(ArrayRef<ValPair>, const Instruction *,
+ BasicBlock *, bool &HasBackEdge,
bool &OriginalOpsConstant) const;
const DeadExpression *createDeadExpression() const;
const VariableExpression *createVariableExpression(Value *) const;
@@ -617,6 +701,7 @@ private:
CC->setMemoryLeader(MA);
return CC;
}
+
CongruenceClass *ensureLeaderOfMemoryClass(MemoryAccess *MA) {
auto *CC = getMemoryClass(MA);
if (CC->getMemoryLeader() != MA)
@@ -630,10 +715,21 @@ private:
ValueToClass[Member] = CClass;
return CClass;
}
+
void initializeCongruenceClasses(Function &F);
- const Expression *makePossiblePhiOfOps(Instruction *,
+ const Expression *makePossiblePHIOfOps(Instruction *,
SmallPtrSetImpl<Value *> &);
+ Value *findLeaderForInst(Instruction *ValueOp,
+ SmallPtrSetImpl<Value *> &Visited,
+ MemoryAccess *MemAccess, Instruction *OrigInst,
+ BasicBlock *PredBB);
+ bool OpIsSafeForPHIOfOpsHelper(Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited,
+ SmallVectorImpl<Instruction *> &Worklist);
+ bool OpIsSafeForPHIOfOps(Value *Op, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &);
void addPhiOfOps(PHINode *Op, BasicBlock *BB, Instruction *ExistingValue);
+ void removePhiOfOps(Instruction *I, PHINode *PHITemp);
// Value number an Instruction or MemoryPhi.
void valueNumberMemoryPhi(MemoryPhi *);
@@ -650,7 +746,10 @@ private:
const Expression *performSymbolicLoadEvaluation(Instruction *) const;
const Expression *performSymbolicStoreEvaluation(Instruction *) const;
const Expression *performSymbolicCallEvaluation(Instruction *) const;
- const Expression *performSymbolicPHIEvaluation(Instruction *) const;
+ void sortPHIOps(MutableArrayRef<ValPair> Ops) const;
+ const Expression *performSymbolicPHIEvaluation(ArrayRef<ValPair>,
+ Instruction *I,
+ BasicBlock *PHIBlock) const;
const Expression *performSymbolicAggrValueEvaluation(Instruction *) const;
const Expression *performSymbolicCmpEvaluation(Instruction *) const;
const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const;
@@ -658,6 +757,7 @@ private:
// Congruence finding.
bool someEquivalentDominates(const Instruction *, const Instruction *) const;
Value *lookupOperandLeader(Value *) const;
+ CongruenceClass *getClassForExpression(const Expression *E) const;
void performCongruenceFinding(Instruction *, const Expression *);
void moveValueToNewCongruenceClass(Instruction *, const Expression *,
CongruenceClass *, CongruenceClass *);
@@ -692,10 +792,11 @@ private:
void replaceInstruction(Instruction *, Value *);
void markInstructionForDeletion(Instruction *);
void deleteInstructionsInBlock(BasicBlock *);
- Value *findPhiOfOpsLeader(const Expression *E, const BasicBlock *BB) const;
+ Value *findPHIOfOpsLeader(const Expression *, const Instruction *,
+ const BasicBlock *) const;
// New instruction creation.
- void handleNewInstruction(Instruction *){};
+ void handleNewInstruction(Instruction *) {}
// Various instruction touch utilities
template <typename Map, typename KeyType, typename Func>
@@ -731,6 +832,7 @@ private:
MemoryAccess *getDefiningAccess(const MemoryAccess *) const;
MemoryPhi *getMemoryAccess(const BasicBlock *) const;
template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
+
unsigned InstrToDFSNum(const Value *V) const {
assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
return InstrDFS.lookup(V);
@@ -739,7 +841,9 @@ private:
unsigned InstrToDFSNum(const MemoryAccess *MA) const {
return MemoryToDFSNum(MA);
}
+
Value *InstrFromDFSNum(unsigned DFSNum) { return DFSToInstr[DFSNum]; }
+
// Given a MemoryAccess, return the relevant instruction DFS number. Note:
// This deliberately takes a value so it can be used with Use's, which will
// auto-convert to Value's but not to MemoryAccess's.
@@ -750,12 +854,15 @@ private:
? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
: InstrDFS.lookup(MA);
}
+
bool isCycleFree(const Instruction *) const;
bool isBackedge(BasicBlock *From, BasicBlock *To) const;
+
// Debug counter info. When verifying, we have to reset the value numbering
// debug counter to the same state it started in to get the same results.
std::pair<int, int> StartingVNCounter;
};
+
} // end anonymous namespace
template <typename T>
@@ -781,11 +888,9 @@ bool StoreExpression::equals(const Expression &Other) const {
// Determine if the edge From->To is a backedge
bool NewGVN::isBackedge(BasicBlock *From, BasicBlock *To) const {
- if (From == To)
- return true;
- auto *FromDTN = DT->getNode(From);
- auto *ToDTN = DT->getNode(To);
- return RPOOrdering.lookup(FromDTN) >= RPOOrdering.lookup(ToDTN);
+ return From == To ||
+ RPOOrdering.lookup(DT->getNode(From)) >=
+ RPOOrdering.lookup(DT->getNode(To));
}
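The rewritten isBackedge reduces edge classification to one comparison of
cached RPO indices. A toy version over a plain map (hypothetical block names
and numbering standing in for RPOOrdering) shows the idea:

#include <cassert>
#include <map>
#include <string>

// An edge From->To is a backedge iff it is a self-loop or To does not come
// strictly later in reverse post-order.
static bool isBackedge(const std::map<std::string, unsigned> &RPO,
                       const std::string &From, const std::string &To) {
  return From == To || RPO.at(From) >= RPO.at(To);
}

int main() {
  // entry -> header -> body -> header (loop); body -> exit.
  std::map<std::string, unsigned> RPO{
      {"entry", 0}, {"header", 1}, {"body", 2}, {"exit", 3}};
  assert(!isBackedge(RPO, "entry", "header")); // forward edge
  assert(isBackedge(RPO, "body", "header"));   // loop backedge
  assert(isBackedge(RPO, "body", "body"));     // self-loop
}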
#ifndef NDEBUG
@@ -830,51 +935,77 @@ void NewGVN::deleteExpression(const Expression *E) const {
const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
ExpressionAllocator.Deallocate(E);
}
-PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
+
+// If V is a predicateinfo copy, get the thing it is a copy of.
+static Value *getCopyOf(const Value *V) {
+ if (auto *II = dyn_cast<IntrinsicInst>(V))
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy)
+ return II->getOperand(0);
+ return nullptr;
+}
+
+// Return true if V is really PN, even accounting for predicateinfo copies.
+static bool isCopyOfPHI(const Value *V, const PHINode *PN) {
+ return V == PN || getCopyOf(V) == PN;
+}
+
+static bool isCopyOfAPHI(const Value *V) {
+ auto *CO = getCopyOf(V);
+ return CO && isa<PHINode>(CO);
+}
+
+// Sort PHI Operands into a canonical order. What we use here is an RPO
+// order. The BlockInstRange numbers are generated in an RPO walk of the basic
+// blocks.
+void NewGVN::sortPHIOps(MutableArrayRef<ValPair> Ops) const {
+ std::sort(Ops.begin(), Ops.end(), [&](const ValPair &P1, const ValPair &P2) {
+ return BlockInstRange.lookup(P1.second).first <
+ BlockInstRange.lookup(P2.second).first;
+ });
+}
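The canonical order is simply whatever order an RPO walk assigned to the
incoming blocks, so any two operand lists over the same predecessors compare
equal position by position. A stand-alone sketch (hypothetical names; the
real sort key is the first instruction number in BlockInstRange):

#include <algorithm>
#include <cassert>
#include <map>
#include <string>
#include <utility>
#include <vector>

using ValPair = std::pair<int, std::string>; // {incoming value, block name}

static void sortPHIOps(std::vector<ValPair> &Ops,
                       const std::map<std::string, unsigned> &BlockStart) {
  std::sort(Ops.begin(), Ops.end(), [&](const ValPair &A, const ValPair &B) {
    return BlockStart.at(A.second) < BlockStart.at(B.second);
  });
}

int main() {
  std::map<std::string, unsigned> BlockStart{{"bb1", 10}, {"bb2", 20}};
  std::vector<ValPair> X{{5, "bb2"}, {3, "bb1"}};
  std::vector<ValPair> Y{{3, "bb1"}, {5, "bb2"}};
  sortPHIOps(X, BlockStart);
  sortPHIOps(Y, BlockStart);
  assert(X == Y); // both now read {3, bb1}, {5, bb2}
}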
+
+// Return true if V is a value that will always be available (IE can
+// be placed anywhere) in the function. We don't do globals here
+// because they are often worse to put in place.
+static bool alwaysAvailable(Value *V) {
+ return isa<Constant>(V) || isa<Argument>(V);
+}
+
+// Create a PHIExpression from an array of {incoming value, incoming block} pairs. I is
+// the original instruction we are creating a PHIExpression for (but may not be
+// a phi node). We require, as an invariant, that all the PHIOperands in the
+// same block are sorted the same way. sortPHIOps will sort them into a
+// canonical order.
+PHIExpression *NewGVN::createPHIExpression(ArrayRef<ValPair> PHIOperands,
+ const Instruction *I,
+ BasicBlock *PHIBlock,
+ bool &HasBackedge,
bool &OriginalOpsConstant) const {
- BasicBlock *PHIBlock = getBlockForValue(I);
- auto *PN = cast<PHINode>(I);
- auto *E =
- new (ExpressionAllocator) PHIExpression(PN->getNumOperands(), PHIBlock);
+ unsigned NumOps = PHIOperands.size();
+ auto *E = new (ExpressionAllocator) PHIExpression(NumOps, PHIBlock);
E->allocateOperands(ArgRecycler, ExpressionAllocator);
- E->setType(I->getType());
- E->setOpcode(I->getOpcode());
-
- // NewGVN assumes the operands of a PHI node are in a consistent order across
- // PHIs. LLVM doesn't seem to always guarantee this. While we need to fix
- // this in LLVM at some point we don't want GVN to find wrong congruences.
- // Therefore, here we sort uses in predecessor order.
- // We're sorting the values by pointer. In theory this might be cause of
- // non-determinism, but here we don't rely on the ordering for anything
- // significant, e.g. we don't create new instructions based on it so we're
- // fine.
- SmallVector<const Use *, 4> PHIOperands;
- for (const Use &U : PN->operands())
- PHIOperands.push_back(&U);
- std::sort(PHIOperands.begin(), PHIOperands.end(),
- [&](const Use *U1, const Use *U2) {
- return PN->getIncomingBlock(*U1) < PN->getIncomingBlock(*U2);
- });
+ E->setType(PHIOperands.begin()->first->getType());
+ E->setOpcode(Instruction::PHI);
// Filter out unreachable phi operands.
- auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) {
- if (*U == PN)
- return false;
- if (!ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock}))
+ auto Filtered = make_filter_range(PHIOperands, [&](const ValPair &P) {
+ auto *BB = P.second;
+ if (auto *PHIOp = dyn_cast<PHINode>(I))
+ if (isCopyOfPHI(P.first, PHIOp))
+ return false;
+ if (!ReachableEdges.count({BB, PHIBlock}))
return false;
// Things in TOPClass are equivalent to everything.
- if (ValueToClass.lookup(*U) == TOPClass)
+ if (ValueToClass.lookup(P.first) == TOPClass)
return false;
- return lookupOperandLeader(*U) != PN;
+ OriginalOpsConstant = OriginalOpsConstant && isa<Constant>(P.first);
+ HasBackedge = HasBackedge || isBackedge(BB, PHIBlock);
+ return lookupOperandLeader(P.first) != I;
});
std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
- [&](const Use *U) -> Value * {
- auto *BB = PN->getIncomingBlock(*U);
- HasBackedge = HasBackedge || isBackedge(BB, PHIBlock);
- OriginalOpsConstant =
- OriginalOpsConstant && isa<Constant>(*U);
- return lookupOperandLeader(*U);
+ [&](const ValPair &P) -> Value * {
+ return lookupOperandLeader(P.first);
});
return E;
}
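Structurally, createPHIExpression is a filter-then-map pipeline: drop the
operands that cannot contribute (phi self-copies, unreachable edges, TOP-class
values), then rewrite the survivors to their leaders. A plain-C++ analogue of
that shape, with std algorithms standing in for make_filter_range and
hypothetical stand-ins for the predicates:

#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

int main() {
  std::vector<int> Ops{1, -2, 3, -4}; // pretend negatives are self/unreachable
  std::vector<int> Kept;
  std::copy_if(Ops.begin(), Ops.end(), std::back_inserter(Kept),
               [](int V) { return V > 0; });    // the "Filtered" range
  std::transform(Kept.begin(), Kept.end(), Kept.begin(),
                 [](int V) { return V * 10; }); // lookupOperandLeader stand-in
  assert((Kept == std::vector<int>{10, 30}));
}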
@@ -929,8 +1060,6 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
// Take a Value returned by simplification of Expression E/Instruction
// I, and see if it resulted in a simpler expression. If so, return
// that expression.
-// TODO: Once finished, this should not take an Instruction, we only
-// use it for printing.
const Expression *NewGVN::checkSimplificationResults(Expression *E,
Instruction *I,
Value *V) const {
@@ -954,25 +1083,37 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
}
CongruenceClass *CC = ValueToClass.lookup(V);
- if (CC && CC->getDefiningExpr()) {
- // If we simplified to something else, we need to communicate
- // that we're users of the value we simplified to.
- if (I != V) {
+ if (CC) {
+ if (CC->getLeader() && CC->getLeader() != I) {
// Don't add temporary instructions to the user lists.
if (!AllTempInstructions.count(I))
addAdditionalUsers(V, I);
+ return createVariableOrConstant(CC->getLeader());
}
+ if (CC->getDefiningExpr()) {
+ // If we simplified to something else, we need to communicate
+ // that we're users of the value we simplified to.
+ if (I != V) {
+ // Don't add temporary instructions to the user lists.
+ if (!AllTempInstructions.count(I))
+ addAdditionalUsers(V, I);
+ }
- if (I)
- DEBUG(dbgs() << "Simplified " << *I << " to "
- << " expression " << *CC->getDefiningExpr() << "\n");
- NumGVNOpsSimplified++;
- deleteExpression(E);
- return CC->getDefiningExpr();
+ if (I)
+ DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " expression " << *CC->getDefiningExpr() << "\n");
+ NumGVNOpsSimplified++;
+ deleteExpression(E);
+ return CC->getDefiningExpr();
+ }
}
+
return nullptr;
}
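The restructured lookup above prefers an existing class leader over a cached
defining expression, and keeps the new expression only as a last resort. A
compact sketch of that preference chain (toy types, not the pass's
CongruenceClass; the real code wraps the leader in a variable or constant
expression):

#include <cassert>

struct Expr { int Id; };
struct CongClass {
  const Expr *Leader = nullptr;   // stands in for getLeader()
  const Expr *Defining = nullptr; // stands in for getDefiningExpr()
};

static const Expr *choose(const CongClass *CC, const Expr *E,
                          const Expr *Self) {
  if (CC && CC->Leader && CC->Leader != Self)
    return CC->Leader;   // an existing leader wins
  if (CC && CC->Defining)
    return CC->Defining; // then the class's defining expression
  return E;              // else keep the freshly built expression
}

int main() {
  Expr L{1}, D{2}, E{3}, Self{4};
  CongClass CC{&L, &D};
  assert(choose(&CC, &E, &Self) == &L);
  CC.Leader = nullptr;
  assert(choose(&CC, &E, &Self) == &D);
  assert(choose(nullptr, &E, &Self) == &E);
}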
+// Create a value expression from the instruction I, replacing operands with
+// their leaders.
+
const Expression *NewGVN::createExpression(Instruction *I) const {
auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
@@ -987,15 +1128,7 @@ const Expression *NewGVN::createExpression(Instruction *I) const {
if (shouldSwapOperands(E->getOperand(0), E->getOperand(1)))
E->swapOperands(0, 1);
}
-
- // Perform simplificaiton
- // TODO: Right now we only check to see if we get a constant result.
- // We may get a less than constant, but still better, result for
- // some operations.
- // IE
- // add 0, x -> x
- // and x, x -> x
- // We should handle this by simply rewriting the expression.
+ // Perform simplification.
if (auto *CI = dyn_cast<CmpInst>(I)) {
// Sort the operand value numbers so x<y and y>x get the same value
// number.
@@ -1016,7 +1149,7 @@ const Expression *NewGVN::createExpression(Instruction *I) const {
return SimplifiedE;
} else if (isa<SelectInst>(I)) {
if (isa<Constant>(E->getOperand(0)) ||
- E->getOperand(0) == E->getOperand(1)) {
+ E->getOperand(1) == E->getOperand(2)) {
assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
E->getOperand(2)->getType() == I->getOperand(2)->getType());
Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1),
@@ -1121,7 +1254,7 @@ NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const {
bool NewGVN::someEquivalentDominates(const Instruction *Inst,
const Instruction *U) const {
auto *CC = ValueToClass.lookup(Inst);
- // This must be an instruction because we are only called from phi nodes
+ // This must be an instruction because we are only called from phi nodes
// in the case that the value it needs to check against is an instruction.
// The most likely candidates for dominance are the leader and the next leader.
@@ -1139,6 +1272,8 @@ bool NewGVN::someEquivalentDominates(const Instruction *Inst,
// any of these siblings.
if (!CC)
return false;
+ if (alwaysAvailable(CC->getLeader()))
+ return true;
if (DT->dominates(cast<Instruction>(CC->getLeader()), U))
return true;
if (CC->getNextLeader().first &&
@@ -1229,9 +1364,9 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
if (EnableStoreRefinement)
StoreRHS = MSSAWalker->getClobberingMemoryAccess(StoreAccess);
// If we bypassed the use-def chains, make sure we add a use.
+ StoreRHS = lookupMemoryLeader(StoreRHS);
if (StoreRHS != StoreAccess->getDefiningAccess())
addMemoryUsers(StoreRHS, StoreAccess);
- StoreRHS = lookupMemoryLeader(StoreRHS);
// If we are defined by ourselves, use the live on entry def.
if (StoreRHS == StoreAccess)
StoreRHS = MSSA->getLiveOnEntryDef();
@@ -1278,7 +1413,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
// Can't forward from non-atomic to atomic without violating memory model.
// Also don't need to coerce if they are the same type, we will just
- // propogate..
+ // propagate.
if (LI->isAtomic() > DepSI->isAtomic() ||
LoadType == DepSI->getValueOperand()->getType())
return nullptr;
@@ -1292,14 +1427,13 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
getConstantStoreValueForLoad(C, Offset, LoadType, DL));
}
}
-
- } else if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
+ } else if (auto *DepLI = dyn_cast<LoadInst>(DepInst)) {
// Can't forward from non-atomic to atomic without violating memory model.
if (LI->isAtomic() > DepLI->isAtomic())
return nullptr;
int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL);
if (Offset >= 0) {
- // We can coerce a constant load into a load
+ // We can coerce a constant load into a load.
if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
if (auto *PossibleConstant =
getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
@@ -1308,8 +1442,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
return createConstantExpression(PossibleConstant);
}
}
-
- } else if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
+ } else if (auto *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL);
if (Offset >= 0) {
if (auto *PossibleConstant =
@@ -1381,9 +1514,13 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
}
}
- const Expression *E = createLoadExpression(LI->getType(), LoadAddressLeader,
- LI, DefiningAccess);
- return E;
+ const auto *LE = createLoadExpression(LI->getType(), LoadAddressLeader, LI,
+ DefiningAccess);
+ // If our MemoryLeader is not our defining access, add a use to the
+ // MemoryLeader, so that we get reprocessed when it changes.
+ if (LE->getMemoryLeader() != DefiningAccess)
+ addMemoryUsers(LE->getMemoryLeader(), OriginalAccess);
+ return LE;
}
const Expression *
@@ -1402,7 +1539,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
auto *Cond = PWC->Condition;
// If this a copy of the condition, it must be either true or false depending
- // on the predicate info type and edge
+ // on the predicate info type and edge.
if (CopyOf == Cond) {
// We should not need to add predicate users because the predicate info is
// already a use of this operand.
@@ -1438,7 +1575,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
Value *SecondOp = lookupOperandLeader(Cmp->getOperand(1));
bool SwappedOps = false;
- // Sort the ops
+ // Sort the ops.
if (shouldSwapOperands(FirstOp, SecondOp)) {
std::swap(FirstOp, SecondOp);
SwappedOps = true;
@@ -1464,7 +1601,8 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
if ((PBranch->TrueEdge && Predicate == CmpInst::ICMP_EQ) ||
(!PBranch->TrueEdge && Predicate == CmpInst::ICMP_NE)) {
addPredicateUsers(PI, I);
- addAdditionalUsers(Cmp->getOperand(0), I);
+ addAdditionalUsers(SwappedOps ? Cmp->getOperand(1) : Cmp->getOperand(0),
+ I);
return createVariableOrConstant(FirstOp);
}
// Handle the special case of floating point.
@@ -1472,7 +1610,8 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
(!PBranch->TrueEdge && Predicate == CmpInst::FCMP_UNE)) &&
isa<ConstantFP>(FirstOp) && !cast<ConstantFP>(FirstOp)->isZero()) {
addPredicateUsers(PI, I);
- addAdditionalUsers(Cmp->getOperand(0), I);
+ addAdditionalUsers(SwappedOps ? Cmp->getOperand(1) : Cmp->getOperand(0),
+ I);
return createConstantExpression(cast<Constant>(FirstOp));
}
}
@@ -1502,7 +1641,6 @@ const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
// Retrieve the memory class for a given MemoryAccess.
CongruenceClass *NewGVN::getMemoryClass(const MemoryAccess *MA) const {
-
auto *Result = MemoryAccessToClass.lookup(MA);
assert(Result && "Should have found memory class");
return Result;
@@ -1571,8 +1709,9 @@ bool NewGVN::isCycleFree(const Instruction *I) const {
if (SCC.size() == 1)
InstCycleState.insert({I, ICS_CycleFree});
else {
- bool AllPhis =
- llvm::all_of(SCC, [](const Value *V) { return isa<PHINode>(V); });
+ bool AllPhis = llvm::all_of(SCC, [](const Value *V) {
+ return isa<PHINode>(V) || isCopyOfAPHI(V);
+ });
ICS = AllPhis ? ICS_CycleFree : ICS_Cycle;
for (auto *Member : SCC)
if (auto *MemberPhi = dyn_cast<PHINode>(Member))
@@ -1584,17 +1723,20 @@ bool NewGVN::isCycleFree(const Instruction *I) const {
return true;
}
-// Evaluate PHI nodes symbolically, and create an expression result.
-const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
+// Evaluate PHI nodes symbolically and create an expression result.
+const Expression *
+NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
+ Instruction *I,
+ BasicBlock *PHIBlock) const {
// True if one of the incoming phi edges is a backedge.
bool HasBackedge = false;
// All constant tracks the state of whether all the *original* phi operands
// were constants. This is really shorthand for "this phi cannot cycle due
// to forward propagation", as any change in the value of the phi is
// guaranteed not to later change the value of the phi.
// IE it can't be v = phi(undef, v+1).
- bool AllConstant = true;
- auto *E =
- cast<PHIExpression>(createPHIExpression(I, HasBackedge, AllConstant));
+ bool OriginalOpsConstant = true;
+ auto *E = cast<PHIExpression>(createPHIExpression(
+ PHIOps, I, PHIBlock, HasBackedge, OriginalOpsConstant));
// We match the semantics of SimplifyPhiNode from InstructionSimplify here.
// See if all arguments are the same.
// We track if any were undef because they need special handling.
@@ -1620,14 +1762,10 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
deleteExpression(E);
return createDeadExpression();
}
- unsigned NumOps = 0;
Value *AllSameValue = *(Filtered.begin());
++Filtered.begin();
// Can't use std::equal here, sadly, because filter.begin moves.
- if (llvm::all_of(Filtered, [&](Value *Arg) {
- ++NumOps;
- return Arg == AllSameValue;
- })) {
+ if (llvm::all_of(Filtered, [&](Value *Arg) { return Arg == AllSameValue; })) {
// In LLVM's non-standard representation of phi nodes, it's possible to have
// phi nodes with cycles (IE dependent on other phis that are ... dependent
// on the original phi node), especially in weird CFGs where some arguments
@@ -1642,9 +1780,8 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
// multivalued phi, and we need to know if it's cycle free in order to
// evaluate whether we can ignore the undef. The other parts of this are
// just shortcuts. If there is no backedge, or all operands are
- // constants, or all operands are ignored but the undef, it also must be
- // cycle free.
- if (!AllConstant && HasBackedge && NumOps > 0 &&
+ // constants, it also must be cycle free.
+ if (HasBackedge && !OriginalOpsConstant &&
!isa<UndefValue>(AllSameValue) && !isCycleFree(I))
return E;
@@ -1708,8 +1845,11 @@ NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const {
return createAggregateValueExpression(I);
}
+
const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
- auto *CI = dyn_cast<CmpInst>(I);
+ assert(isa<CmpInst>(I) && "Expected a cmp instruction.");
+
+ auto *CI = cast<CmpInst>(I);
// See if our operands are equal to those of a previous predicate, and if so,
// if it implies true or false.
auto Op0 = lookupOperandLeader(CI->getOperand(0));
@@ -1720,7 +1860,7 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
OurPredicate = CI->getSwappedPredicate();
}
- // Avoid processing the same info twice
+ // Avoid processing the same info twice.
const PredicateBase *LastPredInfo = nullptr;
// See if we know something about the comparison itself, like it is the target
// of an assume.
@@ -1754,7 +1894,7 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
// %operands are considered users of the icmp.
// *Currently* we only check one level of comparisons back, and only mark one
- // level back as touched when changes appen . If you modify this code to look
+ // level back as touched when changes happen. If you modify this code to look
// back farther through comparisons, you *must* mark the appropriate
// comparisons as users in PredicateInfo.cpp, or you will cause bugs. See if
// we know something just from the operands themselves
@@ -1767,10 +1907,15 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
if (PI == LastPredInfo)
continue;
LastPredInfo = PI;
-
- // TODO: Along the false edge, we may know more things too, like icmp of
+ // In phi of ops cases, we may have predicate info that we are evaluating
+ // in a different context.
+ if (!DT->dominates(PBranch->To, getBlockForValue(I)))
+ continue;
+  // TODO: Along the false edge, we may know more things too, like icmp of
  // same operands is false.
  // TODO: We only handle actual comparison conditions below, not and/or.
auto *BranchCond = dyn_cast<CmpInst>(PBranch->Condition);
if (!BranchCond)
continue;
@@ -1798,7 +1943,6 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
return createConstantExpression(
ConstantInt::getFalse(CI->getType()));
}
-
} else {
// Just handle the ne and eq cases, where if we have the same
// operands, we may know something.
@@ -1822,14 +1966,6 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
return createExpression(I);
}
-// Return true if V is a value that will always be available (IE can
-// be placed anywhere) in the function. We don't do globals here
-// because they are often worse to put in place.
-// TODO: Separate cost from availability
-static bool alwaysAvailable(Value *V) {
- return isa<Constant>(V) || isa<Argument>(V);
-}
-
// Substitute and symbolize the value before value numbering.
const Expression *
NewGVN::performSymbolicEvaluation(Value *V,
@@ -1849,9 +1985,15 @@ NewGVN::performSymbolicEvaluation(Value *V,
case Instruction::InsertValue:
E = performSymbolicAggrValueEvaluation(I);
break;
- case Instruction::PHI:
- E = performSymbolicPHIEvaluation(I);
- break;
+ case Instruction::PHI: {
+ SmallVector<ValPair, 3> Ops;
+ auto *PN = cast<PHINode>(I);
+ for (unsigned i = 0; i < PN->getNumOperands(); ++i)
+ Ops.push_back({PN->getIncomingValue(i), PN->getIncomingBlock(i)});
+ // Sort to ensure the invariant createPHIExpression requires is met.
+ sortPHIOps(Ops);
+ E = performSymbolicPHIEvaluation(Ops, I, getBlockForValue(I));
+ } break;
case Instruction::Call:
E = performSymbolicCallEvaluation(I);
break;
@@ -1861,13 +2003,13 @@ NewGVN::performSymbolicEvaluation(Value *V,
case Instruction::Load:
E = performSymbolicLoadEvaluation(I);
break;
- case Instruction::BitCast: {
+ case Instruction::BitCast:
E = createExpression(I);
- } break;
+ break;
case Instruction::ICmp:
- case Instruction::FCmp: {
+ case Instruction::FCmp:
E = performSymbolicCmpEvaluation(I);
- } break;
+ break;
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -2017,7 +2159,7 @@ T *NewGVN::getMinDFSOfRange(const Range &R) const {
const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
// TODO: If this ends up to slow, we can maintain a next memory leader like we
// do for regular leaders.
- // Make sure there will be a leader to find
+ // Make sure there will be a leader to find.
assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
if (CC->getStoreCount() > 0) {
if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
@@ -2194,7 +2336,7 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// For a given expression, mark the phi of ops instructions that could have
// changed as a result.
void NewGVN::markPhiOfOpsChanged(const Expression *E) {
- touchAndErase(ExpressionToPhiOfOps, ExactEqualsExpression(*E));
+ touchAndErase(ExpressionToPhiOfOps, E);
}
// Perform congruence finding on a given value numbering expression.
@@ -2315,14 +2457,11 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
if (MemoryAccess *MemPhi = getMemoryAccess(To))
TouchedInstructions.set(InstrToDFSNum(MemPhi));
- auto BI = To->begin();
- while (isa<PHINode>(BI)) {
- TouchedInstructions.set(InstrToDFSNum(&*BI));
- ++BI;
- }
- for_each_found(PHIOfOpsPHIs, To, [&](const PHINode *I) {
- TouchedInstructions.set(InstrToDFSNum(I));
- });
+ // FIXME: We should just add a union op on a Bitvector and
+ // SparseBitVector. We can do it word by word faster than we are doing it
+ // here.
+ for (auto InstNum : RevisitOnReachabilityChange[To])
+ TouchedInstructions.set(InstNum);
}
}
}
@@ -2419,24 +2558,146 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
}
}
+// Remove the PHI of Ops PHI for I
+void NewGVN::removePhiOfOps(Instruction *I, PHINode *PHITemp) {
+ InstrDFS.erase(PHITemp);
+ // It's still a temp instruction. We keep it in the array so it gets erased.
+  // However, it's no longer used by I, or mapped to a block.
+ TempToBlock.erase(PHITemp);
+ RealToTemp.erase(I);
+ // We don't remove the users from the phi node uses. This wastes a little
+ // time, but such is life. We could use two sets to track which were there
+  // at the start of NewGVN, and which were added, but right now the cost of
+ // tracking is more than the cost of checking for more phi of ops.
+}
+
+// Add PHI Op in BB as a PHI of operations version of ExistingValue.
void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB,
Instruction *ExistingValue) {
InstrDFS[Op] = InstrToDFSNum(ExistingValue);
AllTempInstructions.insert(Op);
- PHIOfOpsPHIs[BB].push_back(Op);
TempToBlock[Op] = BB;
RealToTemp[ExistingValue] = Op;
+  // Add all users to PHINodeUses, as they are now uses of the phi of ops
+  // phis and may themselves be phi of ops.
+ for (auto *U : ExistingValue->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ PHINodeUses.insert(UI);
}
static bool okayForPHIOfOps(const Instruction *I) {
+ if (!EnablePhiOfOps)
+ return false;
return isa<BinaryOperator>(I) || isa<SelectInst>(I) || isa<CmpInst>(I) ||
isa<LoadInst>(I);
}
+bool NewGVN::OpIsSafeForPHIOfOpsHelper(
+ Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited,
+ SmallVectorImpl<Instruction *> &Worklist) {
+
+ if (!isa<Instruction>(V))
+ return true;
+ auto OISIt = OpSafeForPHIOfOps.find(V);
+ if (OISIt != OpSafeForPHIOfOps.end())
+ return OISIt->second;
+
+ // Keep walking until we either dominate the phi block, or hit a phi, or run
+ // out of things to check.
+ if (DT->properlyDominates(getBlockForValue(V), PHIBlock)) {
+ OpSafeForPHIOfOps.insert({V, true});
+ return true;
+ }
+ // PHI in the same block.
+ if (isa<PHINode>(V) && getBlockForValue(V) == PHIBlock) {
+ OpSafeForPHIOfOps.insert({V, false});
+ return false;
+ }
+
+ auto *OrigI = cast<Instruction>(V);
+ for (auto *Op : OrigI->operand_values()) {
+ if (!isa<Instruction>(Op))
+ continue;
+ // Stop now if we find an unsafe operand.
+ auto OISIt = OpSafeForPHIOfOps.find(OrigI);
+ if (OISIt != OpSafeForPHIOfOps.end()) {
+ if (!OISIt->second) {
+ OpSafeForPHIOfOps.insert({V, false});
+ return false;
+ }
+ continue;
+ }
+ if (!Visited.insert(Op).second)
+ continue;
+ Worklist.push_back(cast<Instruction>(Op));
+ }
+ return true;
+}
+
+// Return true if this operand will be safe to use for phi of ops.
+//
+// The reason some operands are unsafe is that we are not trying to recursively
+// translate everything back through phi nodes. We actually expect some lookups
+// of expressions to fail. In particular, a lookup where the expression cannot
+// exist in the predecessor. This is true even if the expression, as shown, can
+// be determined to be constant.
+bool NewGVN::OpIsSafeForPHIOfOps(Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited) {
+ SmallVector<Instruction *, 4> Worklist;
+ if (!OpIsSafeForPHIOfOpsHelper(V, PHIBlock, Visited, Worklist))
+ return false;
+ while (!Worklist.empty()) {
+ auto *I = Worklist.pop_back_val();
+ if (!OpIsSafeForPHIOfOpsHelper(I, PHIBlock, Visited, Worklist))
+ return false;
+ }
+ OpSafeForPHIOfOps.insert({V, true});
+ return true;
+}
+
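The two functions above form the classic worklist-plus-visited-set traversal,
with OpSafeForPHIOfOps acting as a memo table across queries. The bare
skeleton, stripped of the GVN specifics (hypothetical Node type):

#include <cassert>
#include <set>
#include <vector>

struct Node {
  bool Unsafe = false;
  std::vector<Node *> Operands;
};

// Expand one node: fail on an unsafe hit, otherwise queue unvisited operands.
// Mirrors the helper/driver split used above.
static bool expand(Node *N, std::set<Node *> &Visited,
                   std::vector<Node *> &Worklist) {
  if (N->Unsafe)
    return false;
  for (Node *Op : N->Operands)
    if (Visited.insert(Op).second)
      Worklist.push_back(Op);
  return true;
}

static bool isSafe(Node *Root) {
  std::set<Node *> Visited{Root};
  std::vector<Node *> Worklist;
  if (!expand(Root, Visited, Worklist))
    return false;
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    if (!expand(N, Visited, Worklist))
      return false;
  }
  return true;
}

int main() {
  Node A, B, C;
  A.Operands = {&B, &C};
  C.Operands = {&A}; // cycle: the visited set prevents an infinite loop
  assert(isSafe(&A));
  B.Unsafe = true;
  assert(!isSafe(&A));
}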
+// Try to find a leader for instruction TransInst, which is a phi translated
+// version of something in our original program. Visited is used to ensure we
+// don't infinite loop during translations of cycles. OrigInst is the
+// instruction in the original program, and PredBB is the predecessor we
+// translated it through.
+Value *NewGVN::findLeaderForInst(Instruction *TransInst,
+ SmallPtrSetImpl<Value *> &Visited,
+ MemoryAccess *MemAccess, Instruction *OrigInst,
+ BasicBlock *PredBB) {
+ unsigned IDFSNum = InstrToDFSNum(OrigInst);
+ // Make sure it's marked as a temporary instruction.
+ AllTempInstructions.insert(TransInst);
+  // and make sure anything that tries to add its DFS number is
+ // redirected to the instruction we are making a phi of ops
+ // for.
+ TempToBlock.insert({TransInst, PredBB});
+ InstrDFS.insert({TransInst, IDFSNum});
+
+ const Expression *E = performSymbolicEvaluation(TransInst, Visited);
+ InstrDFS.erase(TransInst);
+ AllTempInstructions.erase(TransInst);
+ TempToBlock.erase(TransInst);
+ if (MemAccess)
+ TempToMemory.erase(TransInst);
+ if (!E)
+ return nullptr;
+ auto *FoundVal = findPHIOfOpsLeader(E, OrigInst, PredBB);
+ if (!FoundVal) {
+ ExpressionToPhiOfOps[E].insert(OrigInst);
+ DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst
+ << " in block " << getBlockName(PredBB) << "\n");
+ return nullptr;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(FoundVal))
+ FoundVal = SI->getValueOperand();
+ return FoundVal;
+}
+
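Before the driver that uses this, it helps to see the rewrite itself in
source-level terms: phi of ops computes the operation once per predecessor so
each copy can be value-numbered against what is already available along that
edge. A toy C++ rendering of both shapes (purely illustrative, not the pass's
output):

#include <cassert>

// Op of phis: the add happens after the merge, hiding per-edge facts.
static int opOfPhis(bool FromPred1, int A, int B) {
  int P = FromPred1 ? A : B; // p = phi(a [pred1], b [pred2])
  return P + 1;              // r = p + 1
}

// Phi of ops: translate the add into each predecessor, then merge. If A is a
// known constant along pred1, A1 now folds where P + 1 could not.
static int phiOfOps(bool FromPred1, int A, int B) {
  int A1 = A + 1;             // the add as seen in pred1
  int B1 = B + 1;             // the add as seen in pred2
  return FromPred1 ? A1 : B1; // r = phi(a1 [pred1], b1 [pred2])
}

int main() {
  for (bool Edge : {true, false})
    assert(opOfPhis(Edge, 10, 20) == phiOfOps(Edge, 10, 20));
}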
// When we see an instruction that is an op of phis, generate the equivalent phi
// of ops form.
const Expression *
-NewGVN::makePossiblePhiOfOps(Instruction *I,
+NewGVN::makePossiblePHIOfOps(Instruction *I,
SmallPtrSetImpl<Value *> &Visited) {
if (!okayForPHIOfOps(I))
return nullptr;
@@ -2450,7 +2711,6 @@ NewGVN::makePossiblePhiOfOps(Instruction *I,
if (!isCycleFree(I))
return nullptr;
- unsigned IDFSNum = InstrToDFSNum(I);
SmallPtrSet<const Value *, 8> ProcessedPHIs;
// TODO: We don't do phi translation on memory accesses because it's
// complicated. For a load, we'd need to be able to simulate a new memoryuse,
@@ -2463,81 +2723,94 @@ NewGVN::makePossiblePhiOfOps(Instruction *I,
MemAccess->getDefiningAccess()->getBlock() == I->getParent())
return nullptr;
+ SmallPtrSet<const Value *, 10> VisitedOps;
// Convert op of phis to phi of ops
- for (auto &Op : I->operands()) {
- // TODO: We can't handle expressions that must be recursively translated
- // IE
- // a = phi (b, c)
- // f = use a
- // g = f + phi of something
- // To properly make a phi of ops for g, we'd have to properly translate and
- // use the instruction for f. We should add this by splitting out the
- // instruction creation we do below.
- if (isa<Instruction>(Op) && PHINodeUses.count(cast<Instruction>(Op)))
- return nullptr;
- if (!isa<PHINode>(Op))
- continue;
+ for (auto *Op : I->operand_values()) {
+ if (!isa<PHINode>(Op)) {
+ auto *ValuePHI = RealToTemp.lookup(Op);
+ if (!ValuePHI)
+ continue;
+ DEBUG(dbgs() << "Found possible dependent phi of ops\n");
+ Op = ValuePHI;
+ }
auto *OpPHI = cast<PHINode>(Op);
// No point in doing this for one-operand phis.
if (OpPHI->getNumOperands() == 1)
continue;
if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
return nullptr;
- SmallVector<std::pair<Value *, BasicBlock *>, 4> Ops;
+ SmallVector<ValPair, 4> Ops;
+ SmallPtrSet<Value *, 4> Deps;
auto *PHIBlock = getBlockForValue(OpPHI);
- for (auto PredBB : OpPHI->blocks()) {
+ RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I));
+ for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) {
+ auto *PredBB = OpPHI->getIncomingBlock(PredNum);
Value *FoundVal = nullptr;
// We could just skip unreachable edges entirely but it's tricky to do
// with rewriting existing phi nodes.
if (ReachableEdges.count({PredBB, PHIBlock})) {
- // Clone the instruction, create an expression from it, and see if we
- // have a leader.
+ // Clone the instruction, create an expression from it that is
+ // translated back into the predecessor, and see if we have a leader.
Instruction *ValueOp = I->clone();
if (MemAccess)
TempToMemory.insert({ValueOp, MemAccess});
-
+ bool SafeForPHIOfOps = true;
+ VisitedOps.clear();
for (auto &Op : ValueOp->operands()) {
- Op = Op->DoPHITranslation(PHIBlock, PredBB);
- // When this operand changes, it could change whether there is a
- // leader for us or not.
- addAdditionalUsers(Op, I);
+ auto *OrigOp = &*Op;
+          // When these operands change, it could change whether there is a
+ // leader for us or not, so we have to add additional users.
+ if (isa<PHINode>(Op)) {
+ Op = Op->DoPHITranslation(PHIBlock, PredBB);
+ if (Op != OrigOp && Op != I)
+ Deps.insert(Op);
+ } else if (auto *ValuePHI = RealToTemp.lookup(Op)) {
+ if (getBlockForValue(ValuePHI) == PHIBlock)
+ Op = ValuePHI->getIncomingValueForBlock(PredBB);
+ }
+ // If we phi-translated the op, it must be safe.
+ SafeForPHIOfOps =
+ SafeForPHIOfOps &&
+ (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps));
}
- // Make sure it's marked as a temporary instruction.
- AllTempInstructions.insert(ValueOp);
- // and make sure anything that tries to add it's DFS number is
- // redirected to the instruction we are making a phi of ops
- // for.
- InstrDFS.insert({ValueOp, IDFSNum});
- const Expression *E = performSymbolicEvaluation(ValueOp, Visited);
- InstrDFS.erase(ValueOp);
- AllTempInstructions.erase(ValueOp);
+ // FIXME: For those things that are not safe we could generate
+ // expressions all the way down, and see if this comes out to a
+ // constant. For anything where that is true, and unsafe, we should
+ // have made a phi-of-ops (or value numbered it equivalent to something)
+ // for the pieces already.
+ FoundVal = !SafeForPHIOfOps ? nullptr
+ : findLeaderForInst(ValueOp, Visited,
+ MemAccess, I, PredBB);
ValueOp->deleteValue();
- if (MemAccess)
- TempToMemory.erase(ValueOp);
- if (!E)
+ if (!FoundVal)
return nullptr;
- FoundVal = findPhiOfOpsLeader(E, PredBB);
- if (!FoundVal) {
- ExpressionToPhiOfOps[E].insert(I);
- return nullptr;
- }
- if (auto *SI = dyn_cast<StoreInst>(FoundVal))
- FoundVal = SI->getValueOperand();
} else {
DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
<< getBlockName(PredBB)
<< " because the block is unreachable\n");
FoundVal = UndefValue::get(I->getType());
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
}
Ops.push_back({FoundVal, PredBB});
DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
<< getBlockName(PredBB) << "\n");
}
+ for (auto Dep : Deps)
+ addAdditionalUsers(Dep, I);
+ sortPHIOps(Ops);
+ auto *E = performSymbolicPHIEvaluation(Ops, I, PHIBlock);
+ if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) {
+ DEBUG(dbgs()
+ << "Not creating real PHI of ops because it simplified to existing "
+ "value or constant\n");
+ return E;
+ }
auto *ValuePHI = RealToTemp.lookup(I);
bool NewPHI = false;
if (!ValuePHI) {
- ValuePHI = PHINode::Create(I->getType(), OpPHI->getNumOperands());
+ ValuePHI =
+ PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops");
addPhiOfOps(ValuePHI, PHIBlock, I);
NewPHI = true;
NumGVNPHIOfOpsCreated++;
@@ -2553,10 +2826,11 @@ NewGVN::makePossiblePhiOfOps(Instruction *I,
++i;
}
}
-
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
<< "\n");
- return performSymbolicEvaluation(ValuePHI, Visited);
+
+ return E;
}
return nullptr;
}
@@ -2602,8 +2876,11 @@ void NewGVN::initializeCongruenceClasses(Function &F) {
if (MD && isa<StoreInst>(MD->getMemoryInst()))
TOPClass->incStoreCount();
}
+
+ // FIXME: This is trying to discover which instructions are uses of phi
+ // nodes. We should move this into one of the myriad of places that walk
+ // all the operands already.
for (auto &I : *BB) {
- // TODO: Move to helper
if (isa<PHINode>(&I))
for (auto *U : I.users())
if (auto *UInst = dyn_cast<Instruction>(U))
@@ -2661,7 +2938,8 @@ void NewGVN::cleanupTables() {
ExpressionToPhiOfOps.clear();
TempToBlock.clear();
TempToMemory.clear();
- PHIOfOpsPHIs.clear();
+ PHINodeUses.clear();
+ OpSafeForPHIOfOps.clear();
ReachableBlocks.clear();
ReachableEdges.clear();
#ifndef NDEBUG
@@ -2675,6 +2953,7 @@ void NewGVN::cleanupTables() {
MemoryAccessToClass.clear();
PredicateToUsers.clear();
MemoryToUsers.clear();
+ RevisitOnReachabilityChange.clear();
}
// Assign local DFS number mapping to instructions, and leave space for Value
@@ -2698,6 +2977,8 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
markInstructionForDeletion(&I);
continue;
}
+ if (isa<PHINode>(&I))
+ RevisitOnReachabilityChange[B].set(End);
InstrDFS[&I] = End++;
DFSToInstr.emplace_back(&I);
}
@@ -2719,6 +3000,7 @@ void NewGVN::updateProcessedCount(const Value *V) {
}
#endif
}
+
// Evaluate MemoryPhi nodes symbolically, just like PHI nodes
void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
// If all the arguments are the same, the MemoryPhi has the same value as the
@@ -2787,11 +3069,15 @@ void NewGVN::valueNumberInstruction(Instruction *I) {
// Make a phi of ops if necessary
if (Symbolized && !isa<ConstantExpression>(Symbolized) &&
!isa<VariableExpression>(Symbolized) && PHINodeUses.count(I)) {
- auto *PHIE = makePossiblePhiOfOps(I, Visited);
- if (PHIE)
+ auto *PHIE = makePossiblePHIOfOps(I, Visited);
+ // If we created a phi of ops, use it.
+    // If we couldn't create one, make sure we don't leave one lying around.
+ if (PHIE) {
Symbolized = PHIE;
+ } else if (auto *Op = RealToTemp.lookup(I)) {
+ removePhiOfOps(I, Op);
+ }
}
-
} else {
// Mark the instruction as unused so we don't value number it again.
InstrDFS[I] = 0;
@@ -2905,7 +3191,7 @@ void NewGVN::verifyMemoryCongruency() const {
// so we don't process them.
if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) {
for (auto &U : MemPHI->incoming_values()) {
- if (Instruction *I = dyn_cast<Instruction>(U.get())) {
+ if (auto *I = dyn_cast<Instruction>(&*U)) {
if (!isInstructionTriviallyDead(I))
return true;
}
@@ -3200,11 +3486,13 @@ struct NewGVN::ValueDFS {
int DFSIn = 0;
int DFSOut = 0;
int LocalNum = 0;
+
// Only one of Def and U will be set.
// The bool in the Def tells us whether the Def is the stored value of a
// store.
PointerIntPair<Value *, 1, bool> Def;
Use *U = nullptr;
+
bool operator<(const ValueDFS &Other) const {
// It's not enough that any given field be less than - we have sets
// of fields that need to be evaluated together to give a proper ordering.
@@ -3439,7 +3727,6 @@ void NewGVN::markInstructionForDeletion(Instruction *I) {
}
void NewGVN::replaceInstruction(Instruction *I, Value *V) {
-
DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n");
patchAndReplaceAllUsesWith(I, V);
// We save the actual erasing to avoid invalidating memory
@@ -3460,7 +3747,9 @@ public:
ValueStack.emplace_back(V);
DFSStack.emplace_back(DFSIn, DFSOut);
}
+
bool empty() const { return DFSStack.empty(); }
+
bool isInScope(int DFSIn, int DFSOut) const {
if (empty())
return false;
@@ -3484,19 +3773,33 @@ private:
SmallVector<Value *, 8> ValueStack;
SmallVector<std::pair<int, int>, 8> DFSStack;
};
+
+} // end anonymous namespace
+
+// Given an expression, get the congruence class for it.
+CongruenceClass *NewGVN::getClassForExpression(const Expression *E) const {
+ if (auto *VE = dyn_cast<VariableExpression>(E))
+ return ValueToClass.lookup(VE->getVariableValue());
+ else if (isa<DeadExpression>(E))
+ return TOPClass;
+ return ExpressionToClass.lookup(E);
}
// Given a value and a basic block we are trying to see if it is available in,
// see if the value has a leader available in that block.
-Value *NewGVN::findPhiOfOpsLeader(const Expression *E,
+Value *NewGVN::findPHIOfOpsLeader(const Expression *E,
+ const Instruction *OrigInst,
const BasicBlock *BB) const {
// It would already be constant if we could make it constant
if (auto *CE = dyn_cast<ConstantExpression>(E))
return CE->getConstantValue();
- if (auto *VE = dyn_cast<VariableExpression>(E))
- return VE->getVariableValue();
+ if (auto *VE = dyn_cast<VariableExpression>(E)) {
+ auto *V = VE->getVariableValue();
+ if (alwaysAvailable(V) || DT->dominates(getBlockForValue(V), BB))
+ return VE->getVariableValue();
+ }
- auto *CC = ExpressionToClass.lookup(E);
+ auto *CC = getClassForExpression(E);
if (!CC)
return nullptr;
if (alwaysAvailable(CC->getLeader()))
@@ -3504,15 +3807,13 @@ Value *NewGVN::findPhiOfOpsLeader(const Expression *E,
for (auto Member : *CC) {
auto *MemberInst = dyn_cast<Instruction>(Member);
+ if (MemberInst == OrigInst)
+ continue;
// Anything that isn't an instruction is always available.
if (!MemberInst)
return Member;
- // If we are looking for something in the same block as the member, it must
- // be a leader because this function is looking for operands for a phi node.
- if (MemberInst->getParent() == BB ||
- DT->dominates(MemberInst->getParent(), BB)) {
+ if (DT->dominates(getBlockForValue(MemberInst), BB))
return Member;
- }
}
return nullptr;
}
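The scan above walks a congruence class looking for any member that dominates
the block needing the operand, refusing the instruction the phi of ops is
being built for. The same loop in miniature, with a hypothetical precomputed
dominance table standing in for DominatorTree:

#include <cassert>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct Member {
  std::string Block;   // empty means always available (constant/argument)
  bool IsOrig = false; // the instruction being replaced by the phi of ops
};

using DomTable = std::map<std::pair<std::string, std::string>, bool>;

static const Member *findLeader(const std::vector<Member> &Class,
                                const std::string &BB, const DomTable &Dom) {
  for (const Member &M : Class) {
    if (M.IsOrig)
      continue; // never pick the instruction we are replacing
    if (M.Block.empty())
      return &M; // constants/arguments are available anywhere
    auto It = Dom.find({M.Block, BB});
    if (It != Dom.end() && It->second)
      return &M; // a dominating definition is a usable leader
  }
  return nullptr; // caller records a dependency and retries later
}

int main() {
  DomTable Dom{{{"entry", "bb2"}, true}, {{"bb3", "bb2"}, false}};
  std::vector<Member> Class{{"bb3"}, {"entry"}};
  const Member *L = findLeader(Class, "bb2", Dom);
  assert(L && L->Block == "entry");
}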
@@ -3549,36 +3850,39 @@ bool NewGVN::eliminateInstructions(Function &F) {
// Go through all of our phi nodes, and kill the arguments associated with
// unreachable edges.
- auto ReplaceUnreachablePHIArgs = [&](PHINode &PHI, BasicBlock *BB) {
- for (auto &Operand : PHI.incoming_values())
- if (!ReachableEdges.count({PHI.getIncomingBlock(Operand), BB})) {
+ auto ReplaceUnreachablePHIArgs = [&](PHINode *PHI, BasicBlock *BB) {
+ for (auto &Operand : PHI->incoming_values())
+ if (!ReachableEdges.count({PHI->getIncomingBlock(Operand), BB})) {
DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block "
- << getBlockName(PHI.getIncomingBlock(Operand))
+ << getBlockName(PHI->getIncomingBlock(Operand))
<< " with undef due to it being unreachable\n");
- Operand.set(UndefValue::get(PHI.getType()));
+ Operand.set(UndefValue::get(PHI->getType()));
}
};
- SmallPtrSet<BasicBlock *, 8> BlocksWithPhis;
- for (auto &B : F)
- if ((!B.empty() && isa<PHINode>(*B.begin())) ||
- (PHIOfOpsPHIs.find(&B) != PHIOfOpsPHIs.end()))
- BlocksWithPhis.insert(&B);
+ // Replace unreachable phi arguments.
+ // At this point, RevisitOnReachabilityChange only contains:
+ //
+ // 1. PHIs
+ // 2. Temporaries that will convert to PHIs
+ // 3. Operations that are affected by an unreachable edge but do not fit into
+ // 1 or 2 (rare).
+ // So it is a slight overshoot of what we want. We could make it exact by
+ // using two SparseBitVectors per block.
DenseMap<const BasicBlock *, unsigned> ReachablePredCount;
- for (auto KV : ReachableEdges)
+ for (auto &KV : ReachableEdges)
ReachablePredCount[KV.getEnd()]++;
- for (auto *BB : BlocksWithPhis)
- // TODO: It would be faster to use getNumIncomingBlocks() on a phi node in
- // the block and subtract the pred count, but it's more complicated.
- if (ReachablePredCount.lookup(BB) !=
- unsigned(std::distance(pred_begin(BB), pred_end(BB)))) {
- for (auto II = BB->begin(); isa<PHINode>(II); ++II) {
- auto &PHI = cast<PHINode>(*II);
+ for (auto &BBPair : RevisitOnReachabilityChange) {
+ for (auto InstNum : BBPair.second) {
+ auto *Inst = InstrFromDFSNum(InstNum);
+ auto *PHI = dyn_cast<PHINode>(Inst);
+ PHI = PHI ? PHI : dyn_cast_or_null<PHINode>(RealToTemp.lookup(Inst));
+ if (!PHI)
+ continue;
+ auto *BB = BBPair.first;
+ if (ReachablePredCount.lookup(BB) != PHI->getNumIncomingValues())
ReplaceUnreachablePHIArgs(PHI, BB);
- }
- for_each_found(PHIOfOpsPHIs, BB, [&](PHINode *PHI) {
- ReplaceUnreachablePHIArgs(*PHI, BB);
- });
}
+ }
// Map to store the use counts
DenseMap<const Value *, unsigned int> UseCounts;
@@ -3631,7 +3935,7 @@ bool NewGVN::eliminateInstructions(Function &F) {
CC->swap(MembersLeft);
} else {
// If this is a singleton, we can skip it.
- if (CC->size() != 1 || RealToTemp.lookup(Leader)) {
+ if (CC->size() != 1 || RealToTemp.count(Leader)) {
// This is a stack because equality replacement/etc may place
// constants in the middle of the member list, and we want to use
// those constant values in preference to the current leader, over
@@ -3873,12 +4177,16 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
}
namespace {
+
class NewGVNLegacyPass : public FunctionPass {
public:
- static char ID; // Pass identification, replacement for typeid.
+ // Pass identification, replacement for typeid.
+ static char ID;
+
NewGVNLegacyPass() : FunctionPass(ID) {
initializeNewGVNLegacyPassPass(*PassRegistry::getPassRegistry());
}
+
bool runOnFunction(Function &F) override;
private:
@@ -3892,7 +4200,8 @@ private:
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
-} // namespace
+
+} // end anonymous namespace
bool NewGVNLegacyPass::runOnFunction(Function &F) {
if (skipFunction(F))
@@ -3906,6 +4215,8 @@ bool NewGVNLegacyPass::runOnFunction(Function &F) {
.runGVN();
}
+char NewGVNLegacyPass::ID = 0;
+
INITIALIZE_PASS_BEGIN(NewGVNLegacyPass, "newgvn", "Global Value Numbering",
false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
@@ -3917,8 +4228,6 @@ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_END(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false,
false)
-char NewGVNLegacyPass::ID = 0;
-
// createGVNPass - The public interface to this file.
FunctionPass *llvm::createNewGVNPass() { return new NewGVNLegacyPass(); }
diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 1bfecea2f61e..1748815c5941 100644
--- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -26,16 +26,13 @@ using namespace llvm;
static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
- BasicBlock &CurrBB, Function::iterator &BB) {
+ BasicBlock &CurrBB, Function::iterator &BB,
+ const TargetTransformInfo *TTI) {
// There is no need to change the IR, since backend will emit sqrt
// instruction if the call has already been marked read-only.
if (Call->onlyReadsMemory())
return false;
- // The call must have the expected result type.
- if (!Call->getType()->isFloatingPointTy())
- return false;
-
// Do the following transformation:
//
// (before)
@@ -43,7 +40,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
//
// (after)
// v0 = sqrt_noreadmem(src) # native sqrt instruction.
- // if (v0 is a NaN)
+ // [if (v0 is a NaN) || if (src < 0)]
// v1 = sqrt(src) # library call.
// dst = phi(v0, v1)
//
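In source terms, the transformation keeps libm's errno behavior on the slow
path only. A hand-written C++ sketch of the shape being generated (assuming a
GCC/Clang-style __builtin_sqrt stands in for the read-none call; the OGE
variant would instead test src >= 0.0 before the fast sqrt's result is used):

#include <cassert>
#include <cmath>

static double partiallyInlinedSqrt(double Src) {
  double V0 = __builtin_sqrt(Src); // native sqrt, never touches errno
  if (V0 != V0)                    // NaN check, i.e. "v0 is unordered"
    V0 = std::sqrt(Src);           // library call, sets errno as required
  return V0;                       // dst = phi(v0, v1)
}

int main() {
  assert(partiallyInlinedSqrt(4.0) == 2.0);
  assert(std::isnan(partiallyInlinedSqrt(-1.0)));
}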
@@ -52,7 +49,8 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
// Create phi and replace all uses.
BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode());
IRBuilder<> Builder(JoinBB, JoinBB->begin());
- PHINode *Phi = Builder.CreatePHI(Call->getType(), 2);
+ Type *Ty = Call->getType();
+ PHINode *Phi = Builder.CreatePHI(Ty, 2);
Call->replaceAllUsesWith(Phi);
// Create basic block LibCallBB and insert a call to library function sqrt.
@@ -69,7 +67,10 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
CurrBB.getTerminator()->eraseFromParent();
Builder.SetInsertPoint(&CurrBB);
- Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
+ Value *FCmp = TTI->isFCmpOrdCheaperThanFCmpZero(Ty)
+ ? Builder.CreateFCmpORD(Call, Call)
+ : Builder.CreateFCmpOGE(Call->getOperand(0),
+ ConstantFP::get(Ty, 0.0));
Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
// Add phi operands.
@@ -96,18 +97,21 @@ static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI,
if (!Call || !(CalledFunc = Call->getCalledFunction()))
continue;
+ if (Call->isNoBuiltin())
+ continue;
+
// Skip if function either has local linkage or is not a known library
// function.
LibFunc LF;
- if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
- !TLI->getLibFunc(CalledFunc->getName(), LF))
+ if (CalledFunc->hasLocalLinkage() ||
+ !TLI->getLibFunc(*CalledFunc, LF) || !TLI->has(LF))
continue;
switch (LF) {
case LibFunc_sqrtf:
case LibFunc_sqrt:
if (TTI->haveFastSqrt(Call->getType()) &&
- optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
+ optimizeSQRT(Call, CalledFunc, *CurrBB, BB, TTI))
break;
continue;
default:
diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp
index e47b636348e3..2d0cb6fbf211 100644
--- a/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -54,6 +54,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -113,6 +114,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass {
ScalarEvolution *SE = nullptr;
DominatorTree *DT = nullptr;
LoopInfo *LI = nullptr;
+ TargetLibraryInfo *TLI = nullptr;
PlaceBackedgeSafepointsImpl(bool CallSafepoints = false)
: FunctionPass(ID), CallSafepointsEnabled(CallSafepoints) {
@@ -131,6 +133,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass {
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
for (Loop *I : *LI) {
runOnLoopAndSubLoops(I);
}
@@ -141,6 +144,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
// We no longer modify the IR at all in this pass. Thus all
// analysis are preserved.
AU.setPreservesAll();
@@ -165,6 +169,7 @@ struct PlaceSafepoints : public FunctionPass {
// We modify the graph wholesale (inlining, block insertion, etc). We
// preserve nothing at the moment. We could potentially preserve dom tree
// if that was worth doing
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
};
}
@@ -174,10 +179,11 @@ struct PlaceSafepoints : public FunctionPass {
// caller's job.
static void
InsertSafepointPoll(Instruction *InsertBefore,
- std::vector<CallSite> &ParsePointsNeeded /*rval*/);
+ std::vector<CallSite> &ParsePointsNeeded /*rval*/,
+ const TargetLibraryInfo &TLI);
-static bool needsStatepoint(const CallSite &CS) {
- if (callsGCLeafFunction(CS))
+static bool needsStatepoint(const CallSite &CS, const TargetLibraryInfo &TLI) {
+ if (callsGCLeafFunction(CS, TLI))
return false;
if (CS.isCall()) {
CallInst *call = cast<CallInst>(CS.getInstruction());
@@ -194,7 +200,8 @@ static bool needsStatepoint(const CallSite &CS) {
/// answer; i.e. false is always valid.
static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,
BasicBlock *Pred,
- DominatorTree &DT) {
+ DominatorTree &DT,
+ const TargetLibraryInfo &TLI) {
// In general, we're looking for any cut of the graph which ensures
// there's a call safepoint along every edge between Header and Pred.
// For the moment, we look only for the 'cuts' that consist of a single call
@@ -217,7 +224,7 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,
// unconditional poll. In practice, this is only a theoretical concern
// since we don't have any methods with conditional-only safepoint
// polls.
- if (needsStatepoint(CS))
+ if (needsStatepoint(CS, TLI))
return true;
}
@@ -321,7 +328,7 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
continue;
}
if (CallSafepointsEnabled &&
- containsUnconditionalCallSafepoint(L, Header, Pred, *DT)) {
+ containsUnconditionalCallSafepoint(L, Header, Pred, *DT, *TLI)) {
// Note: This is only semantically legal since we won't do any further
// IPO or inlining before the actual call insertion. If we hadn't, we
// might later lose this call safepoint.
@@ -472,6 +479,9 @@ bool PlaceSafepoints::runOnFunction(Function &F) {
if (!shouldRewriteFunction(F))
return false;
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
bool Modified = false;
// In various bits below, we rely on the fact that uses are reachable from
@@ -578,7 +588,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) {
// safepoint polls themselves.
for (Instruction *PollLocation : PollsNeeded) {
std::vector<CallSite> RuntimeCalls;
- InsertSafepointPoll(PollLocation, RuntimeCalls);
+ InsertSafepointPoll(PollLocation, RuntimeCalls, TLI);
ParsePointNeeded.insert(ParsePointNeeded.end(), RuntimeCalls.begin(),
RuntimeCalls.end());
}
@@ -610,7 +620,8 @@ INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints",
static void
InsertSafepointPoll(Instruction *InsertBefore,
- std::vector<CallSite> &ParsePointsNeeded /*rval*/) {
+ std::vector<CallSite> &ParsePointsNeeded /*rval*/,
+ const TargetLibraryInfo &TLI) {
BasicBlock *OrigBB = InsertBefore->getParent();
Module *M = InsertBefore->getModule();
assert(M && "must be part of a module");
@@ -669,7 +680,7 @@ InsertSafepointPoll(Instruction *InsertBefore,
assert(ParsePointsNeeded.empty());
for (auto *CI : Calls) {
// No safepoint needed or wanted
- if (!needsStatepoint(CI))
+ if (!needsStatepoint(CI, TLI))
continue;
// These are likely runtime calls. Should we assert that via calling
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index e235e5eb1a06..88dcaf0f8a36 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -21,28 +21,45 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/Reassociate.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
+#include <cassert>
+#include <utility>
+
using namespace llvm;
using namespace reassociate;
@@ -54,7 +71,6 @@ STATISTIC(NumFactor , "Number of multiplies factored");
#ifndef NDEBUG
/// Print out the expression identified in the Ops list.
-///
static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
Module *M = I->getModule();
dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " "
@@ -128,38 +144,37 @@ XorOpnd::XorOpnd(Value *V) {
/// Return true if V is an instruction of the specified opcode and if it
/// only has one use.
static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
- if (V->hasOneUse() && isa<Instruction>(V) &&
- cast<Instruction>(V)->getOpcode() == Opcode &&
- (!isa<FPMathOperator>(V) ||
- cast<Instruction>(V)->hasUnsafeAlgebra()))
- return cast<BinaryOperator>(V);
+ auto *I = dyn_cast<Instruction>(V);
+ if (I && I->hasOneUse() && I->getOpcode() == Opcode)
+ if (!isa<FPMathOperator>(I) || I->isFast())
+ return cast<BinaryOperator>(I);
return nullptr;
}
static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
unsigned Opcode2) {
- if (V->hasOneUse() && isa<Instruction>(V) &&
- (cast<Instruction>(V)->getOpcode() == Opcode1 ||
- cast<Instruction>(V)->getOpcode() == Opcode2) &&
- (!isa<FPMathOperator>(V) ||
- cast<Instruction>(V)->hasUnsafeAlgebra()))
- return cast<BinaryOperator>(V);
+ auto *I = dyn_cast<Instruction>(V);
+ if (I && I->hasOneUse() &&
+ (I->getOpcode() == Opcode1 || I->getOpcode() == Opcode2))
+ if (!isa<FPMathOperator>(I) || I->isFast())
+ return cast<BinaryOperator>(I);
return nullptr;
}
void ReassociatePass::BuildRankMap(Function &F,
ReversePostOrderTraversal<Function*> &RPOT) {
- unsigned i = 2;
+ unsigned Rank = 2;
// Assign distinct ranks to function arguments.
- for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
- ValueRankMap[&*I] = ++i;
- DEBUG(dbgs() << "Calculated Rank[" << I->getName() << "] = " << i << "\n");
+ for (auto &Arg : F.args()) {
+ ValueRankMap[&Arg] = ++Rank;
+ DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank
+ << "\n");
}
// Traverse basic blocks in ReversePostOrder
for (BasicBlock *BB : RPOT) {
- unsigned BBRank = RankMap[BB] = ++i << 16;
+ unsigned BBRank = RankMap[BB] = ++Rank << 16;
// Walk the basic block, adding precomputed ranks for any instructions that
// we cannot move. This ensures that the ranks for these instructions are
@@ -207,13 +222,9 @@ void ReassociatePass::canonicalizeOperands(Instruction *I) {
Value *LHS = I->getOperand(0);
Value *RHS = I->getOperand(1);
- unsigned LHSRank = getRank(LHS);
- unsigned RHSRank = getRank(RHS);
-
- if (isa<Constant>(RHS))
+ if (LHS == RHS || isa<Constant>(RHS))
return;
-
- if (isa<Constant>(LHS) || RHSRank < LHSRank)
+ if (isa<Constant>(LHS) || getRank(RHS) < getRank(LHS))
cast<BinaryOperator>(I)->swapOperands();
}
@@ -357,7 +368,7 @@ static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
}
}
-typedef std::pair<Value*, APInt> RepeatedValue;
+using RepeatedValue = std::pair<Value*, APInt>;
/// Given an associative binary expression, return the leaf
/// nodes in Ops along with their weights (how many times the leaf occurs). The
@@ -432,7 +443,6 @@ typedef std::pair<Value*, APInt> RepeatedValue;
/// that have all uses inside the expression (i.e. only used by non-leaf nodes
/// of the expression) if it can turn them into binary operators of the right
/// type and thus make the expression bigger.
-
static bool LinearizeExprTree(BinaryOperator *I,
SmallVectorImpl<RepeatedValue> &Ops) {
DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
@@ -470,12 +480,12 @@ static bool LinearizeExprTree(BinaryOperator *I,
// Leaves - Keeps track of the set of putative leaves as well as the number of
// paths to each leaf seen so far.
- typedef DenseMap<Value*, APInt> LeafMap;
+ using LeafMap = DenseMap<Value *, APInt>;
LeafMap Leaves; // Leaf -> Total weight so far.
- SmallVector<Value*, 8> LeafOrder; // Ensure deterministic leaf output order.
+ SmallVector<Value *, 8> LeafOrder; // Ensure deterministic leaf output order.
#ifndef NDEBUG
- SmallPtrSet<Value*, 8> Visited; // For sanity checking the iteration scheme.
+ SmallPtrSet<Value *, 8> Visited; // For sanity checking the iteration scheme.
#endif
while (!Worklist.empty()) {
std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val();
@@ -554,7 +564,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
assert((!isa<Instruction>(Op) ||
cast<Instruction>(Op)->getOpcode() != Opcode
|| (isa<FPMathOperator>(Op) &&
- !cast<Instruction>(Op)->hasUnsafeAlgebra())) &&
+ !cast<Instruction>(Op)->isFast())) &&
"Should have been handled above!");
assert(Op->hasOneUse() && "Has uses outside the expression tree!");
@@ -773,7 +783,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
break;
ExpressionChanged->moveBefore(I);
ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
- } while (1);
+ } while (true);
// Throw away any left over nodes from the original expression.
for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i)
@@ -789,13 +799,9 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
/// additional opportunities have been exposed.
static Value *NegateValue(Value *V, Instruction *BI,
SetVector<AssertingVH<Instruction>> &ToRedo) {
- if (Constant *C = dyn_cast<Constant>(V)) {
- if (C->getType()->isFPOrFPVectorTy()) {
- return ConstantExpr::getFNeg(C);
- }
- return ConstantExpr::getNeg(C);
- }
-
+ if (auto *C = dyn_cast<Constant>(V))
+ return C->getType()->isFPOrFPVectorTy() ? ConstantExpr::getFNeg(C) :
+ ConstantExpr::getNeg(C);
// We are trying to expose opportunity for reassociation. One of the things
// that we want to do to achieve this is to push a negation as deep into an
@@ -913,7 +919,6 @@ BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {
//
// Calculate the negative value of Operand 1 of the sub instruction,
// and set it as the RHS of the add instruction we just made.
- //
Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo);
BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub);
Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.
@@ -990,7 +995,7 @@ static Value *EmitAddTreeOfValues(Instruction *I,
Value *V1 = Ops.back();
Ops.pop_back();
Value *V2 = EmitAddTreeOfValues(I, Ops);
- return CreateAdd(V2, V1, "tmp", I, I);
+ return CreateAdd(V2, V1, "reass.add", I, I);
}
/// If V is an expression tree that is a multiplication sequence,
@@ -1157,7 +1162,6 @@ static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
// If it was successful, true is returned, and the "R" and "C" are returned
// via "Res" and "ConstOpnd", respectively; otherwise, false is returned,
// and both "Res" and "ConstOpnd" remain unchanged.
-//
bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
APInt &ConstOpnd, Value *&Res) {
// Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2
@@ -1183,7 +1187,6 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
RedoInsts.insert(T);
return true;
}
-
// Helper function of OptimizeXor(). It tries to simplify
// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a
@@ -1230,7 +1233,6 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
Res = createAndInstr(I, X, C3);
ConstOpnd ^= C1;
-
} else if (Opnd1->isOrExpr()) {
// Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2
//
@@ -1349,7 +1351,6 @@ Value *ReassociatePass::OptimizeXor(Instruction *I,
// step 3.2: When previous and current operands share the same symbolic
// value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd"
- //
if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) {
// Remove previous operand
PrevOpnd->Invalidate();
@@ -1601,7 +1602,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
RedoInsts.insert(VI);
// Create the multiply.
- Instruction *V2 = CreateMul(V, MaxOccVal, "tmp", I, I);
+ Instruction *V2 = CreateMul(V, MaxOccVal, "reass.mul", I, I);
// Rerun associate on the multiply in case the inner expression turned into
// a multiply. We want to make sure that we keep things in canonical form.
@@ -2012,8 +2013,8 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
if (I->isCommutative())
canonicalizeOperands(I);
- // Don't optimize floating point instructions that don't have unsafe algebra.
- if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra())
+ // Don't optimize floating-point instructions unless they are 'fast'.
+ if (I->getType()->isFPOrFPVectorTy() && !I->isFast())
return;
// Do not reassociate boolean (i1) expressions. We want to preserve the
@@ -2140,7 +2141,8 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
I->replaceAllUsesWith(V);
if (Instruction *VI = dyn_cast<Instruction>(V))
- VI->setDebugLoc(I->getDebugLoc());
+ if (I->getDebugLoc())
+ VI->setDebugLoc(I->getDebugLoc());
RedoInsts.insert(I);
++NumAnnihil;
return;
@@ -2183,11 +2185,104 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
return;
}
+ if (Ops.size() > 2 && Ops.size() <= GlobalReassociateLimit) {
+ // Find the pair with the highest count in the pairmap and move it to the
+ // back of the list so that it can later be CSE'd.
+ // example:
+ // a*b*c*d*e
+ // if c*e is the most "popular" pair, we can express this as
+ // (((c*e)*d)*b)*a
+ unsigned Max = 1;
+ unsigned BestRank = 0;
+ std::pair<unsigned, unsigned> BestPair;
+ unsigned Idx = I->getOpcode() - Instruction::BinaryOpsBegin;
+ for (unsigned i = 0; i < Ops.size() - 1; ++i)
+ for (unsigned j = i + 1; j < Ops.size(); ++j) {
+ unsigned Score = 0;
+ Value *Op0 = Ops[i].Op;
+ Value *Op1 = Ops[j].Op;
+ if (std::less<Value *>()(Op1, Op0))
+ std::swap(Op0, Op1);
+ auto it = PairMap[Idx].find({Op0, Op1});
+ if (it != PairMap[Idx].end())
+ Score += it->second;
+
+ unsigned MaxRank = std::max(Ops[i].Rank, Ops[j].Rank);
+ if (Score > Max || (Score == Max && MaxRank < BestRank)) {
+ BestPair = {i, j};
+ Max = Score;
+ BestRank = MaxRank;
+ }
+ }
+ if (Max > 1) {
+ auto Op0 = Ops[BestPair.first];
+ auto Op1 = Ops[BestPair.second];
+ Ops.erase(&Ops[BestPair.second]);
+ Ops.erase(&Ops[BestPair.first]);
+ Ops.push_back(Op0);
+ Ops.push_back(Op1);
+ }
+ }
// Now that we ordered and optimized the expressions, splat them back into
// the expression tree, removing any unneeded nodes.
RewriteExprTree(I, Ops);
}
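
The reordering above can be illustrated outside of LLVM's types. A hedged sketch with plain std containers (hoistBestPairToBack is a hypothetical name; the pass works on ValueEntry lists and also applies a rank tie-break, both omitted here):

#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include <vector>

using Operand = std::string;
using Pair = std::pair<Operand, Operand>;

static void hoistBestPairToBack(std::vector<Operand> &Ops,
                                const std::map<Pair, unsigned> &PairMap) {
  unsigned Max = 1;
  std::pair<size_t, size_t> Best{0, 0};
  for (size_t i = 0; i + 1 < Ops.size(); ++i)
    for (size_t j = i + 1; j < Ops.size(); ++j) {
      Pair Key = std::minmax(Ops[i], Ops[j]); // canonical operand order
      auto It = PairMap.find(Key);
      if (It != PairMap.end() && It->second > Max) {
        Max = It->second;
        Best = {i, j};
      }
    }
  if (Max > 1) { // some pair also occurs elsewhere; keep it adjacent
    Operand A = Ops[Best.first], B = Ops[Best.second];
    Ops.erase(Ops.begin() + Best.second); // erase higher index first
    Ops.erase(Ops.begin() + Best.first);
    Ops.push_back(A);
    Ops.push_back(B);
  }
}

With Ops = {a, b, c, d, e} and PairMap[{c, e}] = 3, the result is {a, b, d, c, e}; RewriteExprTree then emits (((c*e)*d)*b)*a, exposing c*e to later CSE.
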
+void
+ReassociatePass::BuildPairMap(ReversePostOrderTraversal<Function *> &RPOT) {
+ // Make a "pairmap" of how often each operand pair occurs.
+ for (BasicBlock *BI : RPOT) {
+ for (Instruction &I : *BI) {
+ if (!I.isAssociative())
+ continue;
+
+ // Ignore nodes that aren't at the root of trees.
+ if (I.hasOneUse() && I.user_back()->getOpcode() == I.getOpcode())
+ continue;
+
+ // Collect all operands in a single reassociable expression.
+ // Since Reassociate has already been run once, we can assume things
+ // are already canonical according to Reassociation's regime.
+ SmallVector<Value *, 8> Worklist = { I.getOperand(0), I.getOperand(1) };
+ SmallVector<Value *, 8> Ops;
+ while (!Worklist.empty() && Ops.size() <= GlobalReassociateLimit) {
+ Value *Op = Worklist.pop_back_val();
+ Instruction *OpI = dyn_cast<Instruction>(Op);
+ if (!OpI || OpI->getOpcode() != I.getOpcode() || !OpI->hasOneUse()) {
+ Ops.push_back(Op);
+ continue;
+ }
+ // Be paranoid about self-referencing expressions in unreachable code.
+ if (OpI->getOperand(0) != OpI)
+ Worklist.push_back(OpI->getOperand(0));
+ if (OpI->getOperand(1) != OpI)
+ Worklist.push_back(OpI->getOperand(1));
+ }
+ // Skip extremely long expressions.
+ if (Ops.size() > GlobalReassociateLimit)
+ continue;
+
+ // Add all pairwise combinations of operands to the pair map.
+ unsigned BinaryIdx = I.getOpcode() - Instruction::BinaryOpsBegin;
+ SmallSet<std::pair<Value *, Value*>, 32> Visited;
+ for (unsigned i = 0; i < Ops.size() - 1; ++i) {
+ for (unsigned j = i + 1; j < Ops.size(); ++j) {
+ // Canonicalize operand orderings.
+ Value *Op0 = Ops[i];
+ Value *Op1 = Ops[j];
+ if (std::less<Value *>()(Op1, Op0))
+ std::swap(Op0, Op1);
+ if (!Visited.insert({Op0, Op1}).second)
+ continue;
+ auto res = PairMap[BinaryIdx].insert({{Op0, Op1}, 1});
+ if (!res.second)
+ ++res.first->second;
+ }
+ }
+ }
+ }
+}
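
The counting side can be sketched the same way (countPairs is a hypothetical name; within one expression each unordered pair is counted once, mirroring the Visited set above):

#include <algorithm>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

using Operand = std::string;
using Pair = std::pair<Operand, Operand>;

static void countPairs(const std::vector<Operand> &Ops,
                       std::map<Pair, unsigned> &PairMap) {
  std::set<Pair> Visited; // dedupe pairs within this one expression
  for (size_t i = 0; i + 1 < Ops.size(); ++i)
    for (size_t j = i + 1; j < Ops.size(); ++j) {
      Pair P = std::minmax(Ops[i], Ops[j]); // canonical operand order
      if (Visited.insert(P).second)
        ++PairMap[P];
    }
}

Counting the leaf sets {a, b, c, e} and {c, d, e} yields PairMap[{c, e}] == 2, the signal ReassociateExpression uses to keep c*e adjacent.
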
+
PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
// Get the function's basic blocks in Reverse Post Order. This order is used by
// BuildRankMap to precalculate ranks correctly. It also excludes dead basic
@@ -2198,8 +2293,20 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
// Calculate the rank map for F.
BuildRankMap(F, RPOT);
+ // Build the pair map before running reassociate.
+ // Technically this would be more accurate if we did it after one round
+ // of reassociation, but in practice it doesn't seem to help much on
+ // real-world code, so don't waste the compile time running reassociate
+ // twice.
+ // If a user wants, they could explicitly run reassociate twice in their
+ // pass pipeline for further potential gains.
+ // It might also be possible to update the pair map at runtime, but the
+ // overhead of that may be large if there are many reassociable chains.
+ BuildPairMap(RPOT);
+
MadeChange = false;
- // Traverse the same blocks that was analysed by BuildRankMap.
+
+ // Traverse the same blocks that were analysed by BuildRankMap.
for (BasicBlock *BI : RPOT) {
assert(RankMap.count(&*BI) && "BB should be ranked.");
// Optimize every instruction in the basic block.
@@ -2238,9 +2345,11 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
}
}
- // We are done with the rank map.
+ // We are done with the rank map and pair map.
RankMap.clear();
ValueRankMap.clear();
+ for (auto &Entry : PairMap)
+ Entry.clear();
if (MadeChange) {
PreservedAnalyses PA;
@@ -2253,10 +2362,13 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
}
namespace {
+
class ReassociateLegacyPass : public FunctionPass {
ReassociatePass Impl;
+
public:
static char ID; // Pass identification, replacement for typeid
+
ReassociateLegacyPass() : FunctionPass(ID) {
initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
}
@@ -2275,9 +2387,11 @@ namespace {
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
-}
+
+} // end anonymous namespace
char ReassociateLegacyPass::ID = 0;
+
INITIALIZE_PASS(ReassociateLegacyPass, "reassociate",
"Reassociate expressions", false, false)
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index f19d45329d23..3b45cfa482e6 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -12,36 +12,69 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/RewriteStatepointsForGC.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/CFG.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
-#include "llvm/IR/Verifier.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
#define DEBUG_TYPE "rewrite-statepoints-for-gc"
@@ -52,6 +85,7 @@ static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden,
cl::init(false));
static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size", cl::Hidden,
cl::init(false));
+
// Print out the base pointers for debugging
static cl::opt<bool> PrintBasePointers("spp-print-base-pointers", cl::Hidden,
cl::init(false));
@@ -67,6 +101,7 @@ static bool ClobberNonLive = true;
#else
static bool ClobberNonLive = false;
#endif
+
static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live",
cl::location(ClobberNonLive),
cl::Hidden);
@@ -75,27 +110,96 @@ static cl::opt<bool>
AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info",
cl::Hidden, cl::init(true));
+/// The IR fed into RewriteStatepointsForGC may have had attributes and
+/// metadata implying dereferenceability that are no longer valid/correct after
+/// RewriteStatepointsForGC has run. This is because semantically, after
+/// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
+/// heap. stripNonValidData (conservatively) restores
+/// correctness by erasing all attributes in the module that externally imply
+/// dereferenceability. Similar reasoning also applies to the noalias
+/// attributes and metadata. gc.statepoint can touch the entire heap including
+/// noalias objects.
+/// Apart from attributes and metadata, we also remove instructions that imply
+/// constant physical memory: llvm.invariant.start.
+static void stripNonValidData(Module &M);
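
A minimal sketch of the attribute side of that stripping, assuming the LLVM-6-era AttributeList API (stripDerefLikeAttrs is a hypothetical helper; the real pass additionally restricts itself to arguments whose types may contain GC pointers):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"

static void stripDerefLikeAttrs(llvm::Function &F) {
  llvm::AttrBuilder R;
  R.addAttribute(llvm::Attribute::Dereferenceable);
  R.addAttribute(llvm::Attribute::DereferenceableOrNull);
  R.addAttribute(llvm::Attribute::NoAlias);
  llvm::LLVMContext &Ctx = F.getContext();
  // Drop the attributes from every argument and from the return value.
  for (llvm::Argument &A : F.args())
    F.setAttributes(F.getAttributes().removeAttributes(
        Ctx, A.getArgNo() + llvm::AttributeList::FirstArgIndex, R));
  F.setAttributes(F.getAttributes().removeAttributes(
      Ctx, llvm::AttributeList::ReturnIndex, R));
}
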
+
+static bool shouldRewriteStatepointsIn(Function &F);
+
+PreservedAnalyses RewriteStatepointsForGC::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ bool Changed = false;
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ for (Function &F : M) {
+ // Nothing to do for declarations.
+ if (F.isDeclaration() || F.empty())
+ continue;
+
+ // Policy choice says not to rewrite - the most common reason is that we're
+ // compiling code without a GCStrategy.
+ if (!shouldRewriteStatepointsIn(F))
+ continue;
+
+ auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+ Changed |= runOnFunction(F, DT, TTI, TLI);
+ }
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ // stripNonValidData asserts that shouldRewriteStatepointsIn
+ // returns true for at least one function in the module. Since at least
+ // one function changed, we know that the precondition is satisfied.
+ stripNonValidData(M);
+
+ PreservedAnalyses PA;
+ PA.preserve<TargetIRAnalysis>();
+ PA.preserve<TargetLibraryAnalysis>();
+ return PA;
+}
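
The driver above follows the standard new-pass-manager shape for a module pass that consumes function analyses. A distilled, hedged sketch (SketchPass is hypothetical):

#include "llvm/IR/Dominators.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"

struct SketchPass : llvm::PassInfoMixin<SketchPass> {
  llvm::PreservedAnalyses run(llvm::Module &M,
                              llvm::ModuleAnalysisManager &AM) {
    // Reach per-function analyses through the module-level proxy instead
    // of computing them locally.
    auto &FAM =
        AM.getResult<llvm::FunctionAnalysisManagerModuleProxy>(M)
            .getManager();
    bool Changed = false;
    for (llvm::Function &F : M) {
      if (F.isDeclaration())
        continue;
      auto &DT = FAM.getResult<llvm::DominatorTreeAnalysis>(F);
      (void)DT; // per-function rewriting would go here and set Changed
    }
    return Changed ? llvm::PreservedAnalyses::none()
                   : llvm::PreservedAnalyses::all();
  }
};
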
+
namespace {
-struct RewriteStatepointsForGC : public ModulePass {
+
+class RewriteStatepointsForGCLegacyPass : public ModulePass {
+ RewriteStatepointsForGC Impl;
+
+public:
static char ID; // Pass identification, replacement for typeid
- RewriteStatepointsForGC() : ModulePass(ID) {
- initializeRewriteStatepointsForGCPass(*PassRegistry::getPassRegistry());
+ RewriteStatepointsForGCLegacyPass() : ModulePass(ID), Impl() {
+ initializeRewriteStatepointsForGCLegacyPassPass(
+ *PassRegistry::getPassRegistry());
}
- bool runOnFunction(Function &F);
+
bool runOnModule(Module &M) override {
bool Changed = false;
- for (Function &F : M)
- Changed |= runOnFunction(F);
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ for (Function &F : M) {
+ // Nothing to do for declarations.
+ if (F.isDeclaration() || F.empty())
+ continue;
+
+ // Policy choice says not to rewrite - the most common reason is that
+ // we're compiling code without a GCStrategy.
+ if (!shouldRewriteStatepointsIn(F))
+ continue;
- if (Changed) {
- // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn
- // returns true for at least one function in the module. Since at least
- // one function changed, we know that the precondition is satisfied.
- stripNonValidAttributesAndMetadata(M);
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+
+ Changed |= Impl.runOnFunction(F, DT, TTI, TLI);
}
- return Changed;
+ if (!Changed)
+ return false;
+
+ // stripNonValidData asserts that shouldRewriteStatepointsIn
+ // returns true for at least one function in the module. Since at least
+ // one function changed, we know that the precondition is satisfied.
+ stripNonValidData(M);
+ return true;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -103,46 +207,33 @@ struct RewriteStatepointsForGC : public ModulePass {
// else. We could in theory preserve a lot more analyses here.
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
-
- /// The IR fed into RewriteStatepointsForGC may have had attributes and
- /// metadata implying dereferenceability that are no longer valid/correct after
- /// RewriteStatepointsForGC has run. This is because semantically, after
- /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
- /// heap. stripNonValidAttributesAndMetadata (conservatively) restores
- /// correctness by erasing all attributes in the module that externally imply
- /// dereferenceability. Similar reasoning also applies to the noalias
- /// attributes and metadata. gc.statepoint can touch the entire heap including
- /// noalias objects.
- void stripNonValidAttributesAndMetadata(Module &M);
-
- // Helpers for stripNonValidAttributesAndMetadata
- void stripNonValidAttributesAndMetadataFromBody(Function &F);
- void stripNonValidAttributesFromPrototype(Function &F);
- // Certain metadata on instructions are invalid after running RS4GC.
- // Optimizations that run after RS4GC can incorrectly use this metadata to
- // optimize functions. We drop such metadata on the instruction.
- void stripInvalidMetadataFromInstruction(Instruction &I);
};
-} // namespace
-char RewriteStatepointsForGC::ID = 0;
+} // end anonymous namespace
-ModulePass *llvm::createRewriteStatepointsForGCPass() {
- return new RewriteStatepointsForGC();
+char RewriteStatepointsForGCLegacyPass::ID = 0;
+
+ModulePass *llvm::createRewriteStatepointsForGCLegacyPass() {
+ return new RewriteStatepointsForGCLegacyPass();
}
-INITIALIZE_PASS_BEGIN(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+INITIALIZE_PASS_BEGIN(RewriteStatepointsForGCLegacyPass,
+ "rewrite-statepoints-for-gc",
"Make relocations explicit at statepoints", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+INITIALIZE_PASS_END(RewriteStatepointsForGCLegacyPass,
+ "rewrite-statepoints-for-gc",
"Make relocations explicit at statepoints", false, false)
namespace {
+
struct GCPtrLivenessData {
/// Values defined in this block.
MapVector<BasicBlock *, SetVector<Value *>> KillSet;
+
+ /// Values used in this block (and thus live); does not include values
/// killed within this block.
MapVector<BasicBlock *, SetVector<Value *>> LiveSet;
@@ -166,10 +257,10 @@ struct GCPtrLivenessData {
// Generally, after the execution of a full findBasePointer call, only the
// base relation will remain. Internally, we add a mixture of the two
// types, then update all the second type to the first type
-typedef MapVector<Value *, Value *> DefiningValueMapTy;
-typedef SetVector<Value *> StatepointLiveSetTy;
-typedef MapVector<AssertingVH<Instruction>, AssertingVH<Value>>
- RematerializedValueMapTy;
+using DefiningValueMapTy = MapVector<Value *, Value *>;
+using StatepointLiveSetTy = SetVector<Value *>;
+using RematerializedValueMapTy =
+ MapVector<AssertingVH<Instruction>, AssertingVH<Value>>;
struct PartiallyConstructedSafepointRecord {
/// The set of values known to be live across this safepoint
@@ -191,7 +282,8 @@ struct PartiallyConstructedSafepointRecord {
/// Maps a rematerialized copy to its original value.
RematerializedValueMapTy RematerializedValues;
};
-}
+
+} // end anonymous namespace
static ArrayRef<Use> GetDeoptBundleOperands(ImmutableCallSite CS) {
Optional<OperandBundleUse> DeoptBundle =
@@ -254,7 +346,7 @@ static bool containsGCPtrType(Type *Ty) {
if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
return containsGCPtrType(AT->getElementType());
if (StructType *ST = dyn_cast<StructType>(Ty))
- return any_of(ST->subtypes(), containsGCPtrType);
+ return llvm::any_of(ST->subtypes(), containsGCPtrType);
return false;
}
@@ -299,7 +391,9 @@ analyzeParsePointLiveness(DominatorTree &DT,
}
static bool isKnownBaseResult(Value *V);
+
namespace {
+
/// A single base defining value - An immediate base defining value for an
/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'.
/// For instructions which have multiple pointer [vector] inputs or that
@@ -311,9 +405,11 @@ namespace {
struct BaseDefiningValueResult {
/// Contains the value which is the base defining value.
Value * const BDV;
+
/// True if the base defining value is also known to be an actual base
/// pointer.
const bool IsKnownBase;
+
BaseDefiningValueResult(Value *BDV, bool IsKnownBase)
: BDV(BDV), IsKnownBase(IsKnownBase) {
#ifndef NDEBUG
@@ -324,7 +420,8 @@ struct BaseDefiningValueResult {
#endif
}
};
-}
+
+} // end anonymous namespace
static BaseDefiningValueResult findBaseDefiningValue(Value *I);
@@ -374,6 +471,11 @@ findBaseDefiningValueOfVector(Value *I) {
if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
return findBaseDefiningValue(GEP->getPointerOperand());
+ // If the pointer comes through a bitcast of a vector of pointers to
+ // a vector of another type of pointer, then look through the bitcast
+ if (auto *BC = dyn_cast<BitCastInst>(I))
+ return findBaseDefiningValue(BC->getOperand(0));
+
// A PHI or Select is a base defining value. The outer findBasePointer
// algorithm is responsible for constructing a base value for this BDV.
assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
@@ -429,7 +531,6 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) {
if (isa<LoadInst>(I))
// The value loaded is a gc base itself
return BaseDefiningValueResult(I, true);
-
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))
// The base of this GEP is the base
@@ -442,12 +543,11 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) {
break;
case Intrinsic::experimental_gc_statepoint:
llvm_unreachable("statepoints don't produce pointers");
- case Intrinsic::experimental_gc_relocate: {
+ case Intrinsic::experimental_gc_relocate:
// Rerunning safepoint insertion after safepoints are already
// inserted is not supported. It could probably be made to work,
// but why are you doing this? There's no good reason.
llvm_unreachable("repeat safepoint insertion is not supported");
- }
case Intrinsic::gcroot:
// Currently, this mechanism hasn't been extended to work with gcroot.
// There's no reason it couldn't be, but I haven't thought about the
@@ -551,6 +651,7 @@ static bool isKnownBaseResult(Value *V) {
}
namespace {
+
/// Models the state of a single base defining value in the findBasePointer
/// algorithm for determining where a new instruction is needed to propagate
/// the base of this BDV.
@@ -558,7 +659,7 @@ class BDVState {
public:
enum Status { Unknown, Base, Conflict };
- BDVState() : Status(Unknown), BaseValue(nullptr) {}
+ BDVState() : BaseValue(nullptr) {}
explicit BDVState(Status Status, Value *BaseValue = nullptr)
: Status(Status), BaseValue(BaseValue) {
@@ -597,16 +698,17 @@ public:
case Conflict:
OS << "C";
break;
- };
+ }
OS << " (" << getBaseValue() << " - "
<< (getBaseValue() ? getBaseValue()->getName() : "nullptr") << "): ";
}
private:
- Status Status;
+ Status Status = Unknown;
AssertingVH<Value> BaseValue; // Non-null only if Status == Base.
};
-}
+
+} // end anonymous namespace
#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) {
@@ -1169,7 +1271,7 @@ static void CreateGCRelocates(ArrayRef<Value *> LiveVariables,
return;
auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) {
- auto ValIt = find(LiveVec, Val);
+ auto ValIt = llvm::find(LiveVec, Val);
assert(ValIt != LiveVec.end() && "Val not found in LiveVec!");
size_t Index = std::distance(LiveVec.begin(), ValIt);
assert(Index < LiveVec.size() && "Bug in std::find?");
@@ -1229,7 +1331,7 @@ class DeferredReplacement {
AssertingVH<Instruction> New;
bool IsDeoptimize = false;
- DeferredReplacement() {}
+ DeferredReplacement() = default;
public:
static DeferredReplacement createRAUW(Instruction *Old, Instruction *New) {
@@ -1286,7 +1388,8 @@ public:
OldI->eraseFromParent();
}
};
-}
+
+} // end anonymous namespace
static StringRef getDeoptLowering(CallSite CS) {
const char *DeoptLowering = "deopt-lowering";
@@ -1304,7 +1407,6 @@ static StringRef getDeoptLowering(CallSite CS) {
return "live-through";
}
-
static void
makeStatepointExplicitImpl(const CallSite CS, /* to replace */
const SmallVectorImpl<Value *> &BasePtrs,
@@ -1528,7 +1630,6 @@ static void
insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
DenseMap<Value *, Value *> &AllocaMap,
DenseSet<Value *> &VisitedLiveValues) {
-
for (User *U : GCRelocs) {
GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U);
if (!Relocate)
@@ -1564,7 +1665,6 @@ static void insertRematerializationStores(
const RematerializedValueMapTy &RematerializedValues,
DenseMap<Value *, Value *> &AllocaMap,
DenseSet<Value *> &VisitedLiveValues) {
-
for (auto RematerializedValuePair: RematerializedValues) {
Instruction *RematerializedValue = RematerializedValuePair.first;
Value *OriginalValue = RematerializedValuePair.second;
@@ -1830,7 +1930,6 @@ static void findLiveReferences(
static Value* findRematerializableChainToBasePointer(
SmallVectorImpl<Instruction*> &ChainToBase,
Value *CurrentValue) {
-
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurrentValue)) {
ChainToBase.push_back(GEP);
return findRematerializableChainToBasePointer(ChainToBase,
@@ -1886,7 +1985,6 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
}
static bool AreEquivalentPhiNodes(PHINode &OrigRootPhi, PHINode &AlternateRootPhi) {
-
unsigned PhiNum = OrigRootPhi.getNumIncomingValues();
if (PhiNum != AlternateRootPhi.getNumIncomingValues() ||
OrigRootPhi.getParent() != AlternateRootPhi.getParent())
@@ -1910,7 +2008,6 @@ static bool AreEquivalentPhiNodes(PHINode &OrigRootPhi, PHINode &AlternateRootPh
return false;
}
return true;
-
}
// From the statepoint live set pick values that are cheaper to recompute than
@@ -2297,8 +2394,7 @@ static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
AH.setAttributes(AH.getAttributes().removeAttributes(Ctx, Index, R));
}
-void
-RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) {
+static void stripNonValidAttributesFromPrototype(Function &F) {
LLVMContext &Ctx = F.getContext();
for (Argument &A : F.args())
@@ -2310,8 +2406,10 @@ RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) {
RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex);
}
-void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I) {
-
+/// Certain metadata on instructions are invalid after running RS4GC.
+/// Optimizations that run after RS4GC can incorrectly use this metadata to
+/// optimize functions. We drop such metadata on the instruction.
+static void stripInvalidMetadataFromInstruction(Instruction &I) {
if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
return;
// These are the attributes that are still valid on loads and stores after
@@ -2337,18 +2435,32 @@ void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I
// Drops all metadata on the instruction other than ValidMetadataAfterRS4GC.
I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC);
-
}
-void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) {
+static void stripNonValidDataFromBody(Function &F) {
if (F.empty())
return;
LLVMContext &Ctx = F.getContext();
MDBuilder Builder(Ctx);
+ // Set of invariant.start instructions that we need to remove.
+ // Use this to avoid invalidating the instruction iterator.
+ SmallVector<IntrinsicInst*, 12> InvariantStartInstructions;
for (Instruction &I : instructions(F)) {
+ // invariant.start on a memory location implies that the referenced memory
+ // location is constant and unchanging. This is no longer true after
+ // RewriteStatepointsForGC runs because there can be calls to gc.statepoint
+ // which frees the entire heap and the presence of invariant.start allows
+ // the optimizer to sink the load of a memory location past a statepoint,
+ // which is incorrect.
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::invariant_start) {
+ InvariantStartInstructions.push_back(II);
+ continue;
+ }
+
if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
bool IsImmutableTBAA =
@@ -2378,6 +2490,12 @@ void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Functio
RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex);
}
}
+
+ // Delete the invariant.start instructions and RAUW undef.
+ for (auto *II : InvariantStartInstructions) {
+ II->replaceAllUsesWith(UndefValue::get(II->getType()));
+ II->eraseFromParent();
+ }
}
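
The collect-then-erase idiom above generalizes; a hedged sketch (removeInvariantStarts is a hypothetical name) that records matching intrinsics during the walk and mutates only afterwards, so the instruction iterator is never invalidated:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"

static void removeInvariantStarts(llvm::Function &F) {
  llvm::SmallVector<llvm::IntrinsicInst *, 12> ToErase;
  for (llvm::Instruction &I : llvm::instructions(F))
    if (auto *II = llvm::dyn_cast<llvm::IntrinsicInst>(&I))
      if (II->getIntrinsicID() == llvm::Intrinsic::invariant_start)
        ToErase.push_back(II);
  for (llvm::IntrinsicInst *II : ToErase) {
    // invariant.start produces a value consumed by invariant.end, so
    // replace its uses with undef before deleting, as the pass does.
    II->replaceAllUsesWith(llvm::UndefValue::get(II->getType()));
    II->eraseFromParent();
  }
}
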
/// Returns true if this function should be rewritten by this pass. The main
@@ -2394,35 +2512,28 @@ static bool shouldRewriteStatepointsIn(Function &F) {
return false;
}
-void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) {
+static void stripNonValidData(Module &M) {
#ifndef NDEBUG
- assert(any_of(M, shouldRewriteStatepointsIn) && "precondition!");
+ assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!");
#endif
for (Function &F : M)
stripNonValidAttributesFromPrototype(F);
for (Function &F : M)
- stripNonValidAttributesAndMetadataFromBody(F);
+ stripNonValidDataFromBody(F);
}
-bool RewriteStatepointsForGC::runOnFunction(Function &F) {
- // Nothing to do for declarations.
- if (F.isDeclaration() || F.empty())
- return false;
-
- // Policy choice says not to rewrite - the most common reason is that we're
- // compiling code without a GCStrategy.
- if (!shouldRewriteStatepointsIn(F))
- return false;
-
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
- TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
+ TargetTransformInfo &TTI,
+ const TargetLibraryInfo &TLI) {
+ assert(!F.isDeclaration() && !F.empty() &&
+ "need function body to rewrite statepoints in");
+ assert(shouldRewriteStatepointsIn(F) && "mismatch in rewrite decision");
- auto NeedsRewrite = [](Instruction &I) {
+ auto NeedsRewrite = [&TLI](Instruction &I) {
if (ImmutableCallSite CS = ImmutableCallSite(&I))
- return !callsGCLeafFunction(CS) && !isStatepoint(CS);
+ return !callsGCLeafFunction(CS, TLI) && !isStatepoint(CS);
return false;
};
@@ -2662,7 +2773,6 @@ static void computeLiveInValues(DominatorTree &DT, Function &F,
static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,
StatepointLiveSetTy &Out) {
-
BasicBlock *BB = Inst->getParent();
// Note: The copy is intentional and required
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 4822cf7cce0f..e5866b4718da 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -18,30 +18,49 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/SCCP.h"
+#include "llvm/Transforms/Scalar/SCCP.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueLattice.h"
+#include "llvm/Analysis/ValueLatticeUtils.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
+#include <cassert>
+#include <utility>
+#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "sccp"
@@ -52,8 +71,11 @@ STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
+STATISTIC(IPNumRangeInfoUsed, "Number of times constant range info was used by "
+ "IPSCCP");
namespace {
+
/// LatticeVal class - This class represents the different lattice values that
/// an LLVM value may occupy. It is a simple class with value semantics.
///
@@ -88,9 +110,11 @@ public:
LatticeVal() : Val(nullptr, unknown) {}
bool isUnknown() const { return getLatticeValue() == unknown; }
+
bool isConstant() const {
return getLatticeValue() == constant || getLatticeValue() == forcedconstant;
}
+
bool isOverdefined() const { return getLatticeValue() == overdefined; }
Constant *getConstant() const {
@@ -153,11 +177,15 @@ public:
Val.setInt(forcedconstant);
Val.setPointer(V);
}
-};
-} // end anonymous namespace.
-
-namespace {
+ ValueLatticeElement toValueLattice() const {
+ if (isOverdefined())
+ return ValueLatticeElement::getOverdefined();
+ if (isConstant())
+ return ValueLatticeElement::get(getConstant());
+ return ValueLatticeElement();
+ }
+};
//===----------------------------------------------------------------------===//
//
@@ -167,37 +195,38 @@ namespace {
class SCCPSolver : public InstVisitor<SCCPSolver> {
const DataLayout &DL;
const TargetLibraryInfo *TLI;
- SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable.
- DenseMap<Value*, LatticeVal> ValueState; // The state each value is in.
+ SmallPtrSet<BasicBlock *, 8> BBExecutable; // The BBs that are executable.
+ DenseMap<Value *, LatticeVal> ValueState; // The state each value is in.
+ // The state each parameter is in.
+ DenseMap<Value *, ValueLatticeElement> ParamState;
/// StructValueState - This maintains ValueState for values that have
/// StructType, for example for formal arguments, calls, insertelement, etc.
- ///
- DenseMap<std::pair<Value*, unsigned>, LatticeVal> StructValueState;
+ DenseMap<std::pair<Value *, unsigned>, LatticeVal> StructValueState;
/// GlobalValue - If we are tracking any values for the contents of a global
/// variable, we keep a mapping from the constant accessor to the element of
/// the global, to the currently known value. If the value becomes
/// overdefined, its entry is simply removed from this map.
- DenseMap<GlobalVariable*, LatticeVal> TrackedGlobals;
+ DenseMap<GlobalVariable *, LatticeVal> TrackedGlobals;
/// TrackedRetVals - If we are tracking arguments into and the return
/// value out of a function, it will have an entry in this map, indicating
/// what the known return value for the function is.
- DenseMap<Function*, LatticeVal> TrackedRetVals;
+ DenseMap<Function *, LatticeVal> TrackedRetVals;
/// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions
/// that return multiple values.
- DenseMap<std::pair<Function*, unsigned>, LatticeVal> TrackedMultipleRetVals;
+ DenseMap<std::pair<Function *, unsigned>, LatticeVal> TrackedMultipleRetVals;
/// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is
/// represented here for efficient lookup.
- SmallPtrSet<Function*, 16> MRVFunctionsTracked;
+ SmallPtrSet<Function *, 16> MRVFunctionsTracked;
/// TrackingIncomingArguments - This is the set of functions whose
/// arguments we make optimistic assumptions about and try to prove as
/// constants.
- SmallPtrSet<Function*, 16> TrackingIncomingArguments;
+ SmallPtrSet<Function *, 16> TrackingIncomingArguments;
/// The reason for two worklists is that overdefined is the lowest state
/// on the lattice, and moving things to overdefined as fast as possible
@@ -206,16 +235,17 @@ class SCCPSolver : public InstVisitor<SCCPSolver> {
/// By having a separate worklist, we accomplish this because everything
/// possibly overdefined will become overdefined at the soonest possible
/// point.
- SmallVector<Value*, 64> OverdefinedInstWorkList;
- SmallVector<Value*, 64> InstWorkList;
-
+ SmallVector<Value *, 64> OverdefinedInstWorkList;
+ SmallVector<Value *, 64> InstWorkList;
- SmallVector<BasicBlock*, 64> BBWorkList; // The BasicBlock work list
+ // The BasicBlock work list
+ SmallVector<BasicBlock *, 64> BBWorkList;
/// KnownFeasibleEdges - Entries in this set are edges which have already had
/// PHI nodes retriggered.
- typedef std::pair<BasicBlock*, BasicBlock*> Edge;
+ using Edge = std::pair<BasicBlock *, BasicBlock *>;
DenseSet<Edge> KnownFeasibleEdges;
+
public:
SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli)
: DL(DL), TLI(tli) {}
@@ -263,8 +293,13 @@ public:
TrackingIncomingArguments.insert(F);
}
+ /// Returns true if the given function is in the solver's set of
+ /// argument-tracked functions.
+ bool isArgumentTrackedFunction(Function *F) {
+ return TrackingIncomingArguments.count(F);
+ }
+
/// Solve - Solve for constants and executable blocks.
- ///
void Solve();
/// ResolvedUndefsIn - While solving the dataflow for a function, we assume
@@ -290,14 +325,23 @@ public:
return StructValues;
}
- LatticeVal getLatticeValueFor(Value *V) const {
- DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V);
- assert(I != ValueState.end() && "V is not in valuemap!");
- return I->second;
+ ValueLatticeElement getLatticeValueFor(Value *V) {
+ assert(!V->getType()->isStructTy() &&
+ "Should use getStructLatticeValueFor");
+ std::pair<DenseMap<Value*, ValueLatticeElement>::iterator, bool>
+ PI = ParamState.insert(std::make_pair(V, ValueLatticeElement()));
+ ValueLatticeElement &LV = PI.first->second;
+ if (PI.second) {
+ DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V);
+ assert(I != ValueState.end() &&
+ "V not found in ValueState nor Paramstate map!");
+ LV = I->second.toValueLattice();
+ }
+
+ return LV;
}
/// getTrackedRetVals - Get the inferred return value map.
- ///
const DenseMap<Function*, LatticeVal> &getTrackedRetVals() {
return TrackedRetVals;
}
@@ -349,7 +393,6 @@ private:
// markConstant - Make a value be marked as "constant". If the value
// is not already a constant, add it to the instruction work list so that
// the users of the instruction are updated later.
- //
void markConstant(LatticeVal &IV, Value *V, Constant *C) {
if (!IV.markConstant(C)) return;
DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
@@ -369,7 +412,6 @@ private:
pushToWorkList(IV, V);
}
-
// markOverdefined - Make a value be marked as "overdefined". If the
// value is not already overdefined, add it to the overdefined instruction
// work list so that the users of the instruction are updated later.
@@ -402,7 +444,6 @@ private:
mergeInValue(ValueState[V], V, MergeWithV);
}
-
/// getValueState - Return the LatticeVal object that corresponds to the
/// value. This function handles the case when the value hasn't been seen yet
/// by properly seeding constants etc.
@@ -426,6 +467,18 @@ private:
return LV;
}
+ ValueLatticeElement &getParamState(Value *V) {
+ assert(!V->getType()->isStructTy() && "Should use getStructValueState");
+
+ std::pair<DenseMap<Value*, ValueLatticeElement>::iterator, bool>
+ PI = ParamState.insert(std::make_pair(V, ValueLatticeElement()));
+ ValueLatticeElement &LV = PI.first->second;
+ if (PI.second)
+ LV = getValueState(V).toValueLattice();
+
+ return LV;
+ }
+
/// getStructValueState - Return the LatticeVal object that corresponds to the
/// value/field pair. This function handles the case when the value hasn't
/// been seen yet by properly seeding constants etc.
@@ -457,7 +510,6 @@ private:
return LV;
}
-
/// markEdgeExecutable - Mark a basic block as executable, adding it to the BB
/// work list if it is not already executable.
void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
@@ -480,18 +532,15 @@ private:
// getFeasibleSuccessors - Return a vector of booleans to indicate which
// successors are reachable from a given terminator instruction.
- //
void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs);
// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
// block to the 'To' basic block is currently feasible.
- //
bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
// OperandChangedState - This method is invoked on all of the users of an
// instruction that was just changed state somehow. Based on this
// information, we need to update the specified user of this instruction.
- //
void OperandChangedState(Instruction *I) {
if (BBExecutable.count(I->getParent())) // Inst is executable?
visit(*I);
@@ -506,6 +555,7 @@ private:
void visitPHINode(PHINode &I);
// Terminators
+
void visitReturnInst(ReturnInst &I);
void visitTerminatorInst(TerminatorInst &TI);
@@ -515,26 +565,32 @@ private:
void visitCmpInst(CmpInst &I);
void visitExtractValueInst(ExtractValueInst &EVI);
void visitInsertValueInst(InsertValueInst &IVI);
+
void visitCatchSwitchInst(CatchSwitchInst &CPI) {
markOverdefined(&CPI);
visitTerminatorInst(CPI);
}
// Instructions that cannot be folded away.
+
void visitStoreInst (StoreInst &I);
void visitLoadInst (LoadInst &I);
void visitGetElementPtrInst(GetElementPtrInst &I);
+
void visitCallInst (CallInst &I) {
visitCallSite(&I);
}
+
void visitInvokeInst (InvokeInst &II) {
visitCallSite(&II);
visitTerminatorInst(II);
}
+
void visitCallSite (CallSite CS);
void visitResumeInst (TerminatorInst &I) { /*returns void*/ }
void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
void visitFenceInst (FenceInst &I) { /*returns void*/ }
+
void visitInstruction(Instruction &I) {
// All the instructions we don't do any special handling for just
// go to overdefined.
@@ -545,10 +601,8 @@ private:
} // end anonymous namespace
-
// getFeasibleSuccessors - Return a vector of booleans to indicate which
// successors are reachable from a given terminator instruction.
-//
void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
SmallVectorImpl<bool> &Succs) {
Succs.resize(TI.getNumSuccessors());
@@ -631,10 +685,8 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
llvm_unreachable("SCCP: Don't know how to handle this terminator!");
}
-
// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
// block to the 'To' basic block is currently feasible.
-//
bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
assert(BBExecutable.count(To) && "Dest should always be alive!");
@@ -710,7 +762,6 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
// destination executable
// 7. If a conditional branch has a value that is overdefined, make all
// successors executable.
-//
void SCCPSolver::visitPHINode(PHINode &PN) {
// If this PN returns a struct, just mark the result overdefined.
// TODO: We could do a lot better than this if code actually uses this.
@@ -730,7 +781,6 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// constant, and they agree with each other, the PHI becomes the identical
// constant. If they are constant and don't agree, the PHI is overdefined.
// If there are no executable operands, the PHI remains unknown.
- //
Constant *OperandVal = nullptr;
for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
LatticeVal IV = getValueState(PN.getIncomingValue(i));
@@ -761,7 +811,6 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// arguments that agree with each other(and OperandVal is the constant) or
// OperandVal is null because there are no defined incoming arguments. If
// this is the case, the PHI remains unknown.
- //
if (OperandVal)
markConstant(&PN, OperandVal); // Acquire operand value
}
@@ -789,7 +838,6 @@ void SCCPSolver::visitReturnInst(ReturnInst &I) {
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F,
getStructValueState(ResultOp, i));
-
}
}
@@ -820,7 +868,6 @@ void SCCPSolver::visitCastInst(CastInst &I) {
}
}
-
void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
// If this returns a struct, mark all elements over defined, we don't track
// structs in structs.
@@ -969,7 +1016,6 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
}
}
-
markOverdefined(&I);
}
@@ -998,7 +1044,6 @@ void SCCPSolver::visitCmpInst(CmpInst &I) {
// Handle getelementptr instructions. If all operands are constants then we
// can turn this into a getelementptr ConstantExpr.
-//
void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
if (ValueState[&I].isOverdefined()) return;
@@ -1044,7 +1089,6 @@ void SCCPSolver::visitStoreInst(StoreInst &SI) {
TrackedGlobals.erase(I); // No need to keep tracking this!
}
-
// Handle load instructions. If the operand is a constant pointer to a constant
// global, we can replace the load with the loaded constant value!
void SCCPSolver::visitLoadInst(LoadInst &I) {
@@ -1108,7 +1152,6 @@ CallOverdefined:
// a declaration, maybe we can constant fold it.
if (F && F->isDeclaration() && !I->getType()->isStructTy() &&
canConstantFoldCallTo(CS, F)) {
-
SmallVector<Constant*, 8> Operands;
for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
AI != E; ++AI) {
@@ -1162,6 +1205,9 @@ CallOverdefined:
mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg);
}
} else {
+ // Most other parts of the Solver still only use the simpler value
+ // lattice, so we propagate changes for parameters to both lattices.
+ getParamState(&*AI).mergeIn(getValueState(*CAI).toValueLattice(), DL);
mergeInValue(&*AI, getValueState(*CAI));
}
}
@@ -1360,7 +1406,6 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// undef & X -> 0. X could be zero.
markForcedConstant(&I, Constant::getNullValue(ITy));
return true;
-
case Instruction::Or:
// Both operands undef -> undef
if (Op0LV.isUnknown() && Op1LV.isUnknown())
@@ -1368,7 +1413,6 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// undef | X -> -1. X could be -1.
markForcedConstant(&I, Constant::getAllOnesValue(ITy));
return true;
-
case Instruction::Xor:
// undef ^ undef -> 0; strictly speaking, this is not strictly
// necessary, but we try to be nice to people who expect this
@@ -1379,7 +1423,6 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
}
// undef ^ X -> undef
break;
-
case Instruction::SDiv:
case Instruction::UDiv:
case Instruction::SRem:
@@ -1397,7 +1440,6 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// undef % X -> 0. X could be 1.
markForcedConstant(&I, Constant::getNullValue(ITy));
return true;
-
case Instruction::AShr:
// X >>a undef -> undef.
if (Op1LV.isUnknown()) break;
@@ -1464,7 +1506,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
markOverdefined(&I);
return true;
case Instruction::Call:
- case Instruction::Invoke: {
+ case Instruction::Invoke:
    // There are two reasons a call can have an undef result:
// 1. It could be tracked.
// 2. It could be constant-foldable.
@@ -1478,7 +1520,6 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// we do not know what return values are valid.
markOverdefined(&I);
return true;
- }
default:
// If we don't know what should happen here, conservatively mark it
// overdefined.
@@ -1557,11 +1598,56 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
return false;
}
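
The forced constants chosen above can be checked outside the compiler; a standalone illustration in plain C++ (hypothetical helper names), where each choice of the undef operand makes the result correct for every possible value of the defined operand X:

#include <cassert>
#include <cstdint>

static uint32_t undefAnd(uint32_t X) { return 0u & X; }   // undef & X -> 0
static uint32_t undefOr(uint32_t X) { return ~0u | X; }   // undef | X -> -1
static uint32_t remByUndef(uint32_t X) { return X % 1u; } // X % undef -> 0

int main() {
  for (uint32_t X : {0u, 1u, 0xDEADBEEFu}) {
    assert(undefAnd(X) == 0u);   // choosing undef == 0 annihilates 'and'
    assert(undefOr(X) == ~0u);   // choosing undef == -1 annihilates 'or'
    assert(remByUndef(X) == 0u); // choosing undef == 1 forces a zero rem
  }
}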
+static bool tryToReplaceWithConstantRange(SCCPSolver &Solver, Value *V) {
+ bool Changed = false;
+
+ // Currently we only use range information for integer values.
+ if (!V->getType()->isIntegerTy())
+ return false;
+
+ const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
+ if (!IV.isConstantRange())
+ return false;
+
+ for (auto UI = V->uses().begin(), E = V->uses().end(); UI != E;) {
+ const Use &U = *UI++;
+ auto *Icmp = dyn_cast<ICmpInst>(U.getUser());
+ if (!Icmp || !Solver.isBlockExecutable(Icmp->getParent()))
+ continue;
+
+ auto getIcmpLatticeValue = [&](Value *Op) {
+ if (auto *C = dyn_cast<Constant>(Op))
+ return ValueLatticeElement::get(C);
+ return Solver.getLatticeValueFor(Op);
+ };
+
+ ValueLatticeElement A = getIcmpLatticeValue(Icmp->getOperand(0));
+ ValueLatticeElement B = getIcmpLatticeValue(Icmp->getOperand(1));
+
+ Constant *C = nullptr;
+ if (A.satisfiesPredicate(Icmp->getPredicate(), B))
+ C = ConstantInt::getTrue(Icmp->getType());
+ else if (A.satisfiesPredicate(Icmp->getInversePredicate(), B))
+ C = ConstantInt::getFalse(Icmp->getType());
+
+ if (C) {
+ Icmp->replaceAllUsesWith(C);
+ DEBUG(dbgs() << "Replacing " << *Icmp << " with " << *C
+ << ", because of range information " << A << " " << B
+ << "\n");
+ Icmp->eraseFromParent();
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
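A hypothetical source-level view of what this new helper enables (the function and call sites below are illustrative, not taken from the patch): once the solver proves an argument's constant range, any comparison decided by the whole range folds to a constant, which is what IPNumRangeInfoUsed counts further down.

#include <cassert>

static bool belowEight(int X) {  // the solver would track X in [0, 8)
  return X < 8;                  // so 'icmp slt X, 8' folds to 'i1 true'
}

int main() {
  // The only (hypothetical) call sites; both arguments lie inside [0, 8).
  assert(belowEight(3) && belowEight(7));
}
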
static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
Constant *Const = nullptr;
if (V->getType()->isStructTy()) {
std::vector<LatticeVal> IVs = Solver.getStructLatticeValueFor(V);
- if (any_of(IVs, [](const LatticeVal &LV) { return LV.isOverdefined(); }))
+ if (llvm::any_of(IVs,
+ [](const LatticeVal &LV) { return LV.isOverdefined(); }))
return false;
std::vector<Constant *> ConstVals;
auto *ST = dyn_cast<StructType>(V->getType());
@@ -1573,10 +1659,19 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
}
Const = ConstantStruct::get(ST, ConstVals);
} else {
- LatticeVal IV = Solver.getLatticeValueFor(V);
+ const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
if (IV.isOverdefined())
return false;
- Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType());
+
+ if (IV.isConstantRange()) {
+ if (IV.getConstantRange().isSingleElement())
+ Const =
+ ConstantInt::get(V->getType(), IV.asConstantInteger().getValue());
+ else
+ return false;
+ } else
+ Const =
+ IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType());
}
assert(Const && "Constant is nullptr here!");
DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n');
@@ -1588,7 +1683,6 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
// runSCCP() - Run the Sparse Conditional Constant Propagation algorithm,
// and return true if the function was modified.
-//
static bool runSCCP(Function &F, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
@@ -1628,7 +1722,6 @@ static bool runSCCP(Function &F, const DataLayout &DL,
// Iterate over all of the instructions in a function, replacing them with
// constants if we have found them to be of constant values.
- //
for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) {
Instruction *Inst = &*BI++;
if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst))
@@ -1659,6 +1752,7 @@ PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) {
}
namespace {
+
//===--------------------------------------------------------------------===//
//
/// SCCP Class - This class uses the SCCPSolver to implement a per-function
@@ -1666,18 +1760,20 @@ namespace {
///
class SCCPLegacyPass : public FunctionPass {
public:
+ // Pass identification, replacement for typeid
+ static char ID;
+
+ SCCPLegacyPass() : FunctionPass(ID) {
+ initializeSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
- static char ID; // Pass identification, replacement for typeid
- SCCPLegacyPass() : FunctionPass(ID) {
- initializeSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
- }
// runOnFunction - Run the Sparse Conditional Constant Propagation
// algorithm, and return true if the function was modified.
- //
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
@@ -1687,9 +1783,11 @@ public:
return runSCCP(F, DL, TLI);
}
};
+
} // end anonymous namespace
char SCCPLegacyPass::ID = 0;
+
INITIALIZE_PASS_BEGIN(SCCPLegacyPass, "sccp",
"Sparse Conditional Constant Propagation", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
@@ -1699,38 +1797,11 @@ INITIALIZE_PASS_END(SCCPLegacyPass, "sccp",
// createSCCPPass - This is the public interface to this file.
FunctionPass *llvm::createSCCPPass() { return new SCCPLegacyPass(); }
-static bool AddressIsTaken(const GlobalValue *GV) {
- // Delete any dead constantexpr klingons.
- GV->removeDeadConstantUsers();
-
- for (const Use &U : GV->uses()) {
- const User *UR = U.getUser();
- if (const auto *SI = dyn_cast<StoreInst>(UR)) {
- if (SI->getOperand(0) == GV || SI->isVolatile())
- return true; // Storing addr of GV.
- } else if (isa<InvokeInst>(UR) || isa<CallInst>(UR)) {
- // Make sure we are calling the function, not passing the address.
- ImmutableCallSite CS(cast<Instruction>(UR));
- if (!CS.isCallee(&U))
- return true;
- } else if (const auto *LI = dyn_cast<LoadInst>(UR)) {
- if (LI->isVolatile())
- return true;
- } else if (isa<BlockAddress>(UR)) {
- // blockaddress doesn't take the address of the function, it takes addr
- // of label.
- } else {
- return true;
- }
- }
- return false;
-}
-
static void findReturnsToZap(Function &F,
- SmallPtrSet<Function *, 32> &AddressTakenFunctions,
- SmallVector<ReturnInst *, 8> &ReturnsToZap) {
+ SmallVector<ReturnInst *, 8> &ReturnsToZap,
+ SCCPSolver &Solver) {
// We can only do this if we know that nothing else can call the function.
- if (!F.hasLocalLinkage() || AddressTakenFunctions.count(&F))
+ if (!Solver.isArgumentTrackedFunction(&F))
return;
for (BasicBlock &BB : F)
@@ -1743,39 +1814,22 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
SCCPSolver Solver(DL, TLI);
- // AddressTakenFunctions - This set keeps track of the address-taken functions
- // that are in the input. As IPSCCP runs through and simplifies code,
- // functions that were address taken can end up losing their
- // address-taken-ness. Because of this, we keep track of their addresses from
- // the first pass so we can use them for the later simplification pass.
- SmallPtrSet<Function*, 32> AddressTakenFunctions;
-
// Loop over all functions, marking arguments to those with their addresses
// taken or that are external as overdefined.
- //
for (Function &F : M) {
if (F.isDeclaration())
continue;
- // If this is an exact definition of this function, then we can propagate
- // information about its result into callsites of it.
- // Don't touch naked functions. They may contain asm returning a
- // value we don't see, so we may end up interprocedurally propagating
- // the return value incorrectly.
- if (F.hasExactDefinition() && !F.hasFnAttribute(Attribute::Naked))
+ // Determine if we can track the function's return values. If so, add the
+ // function to the solver's set of return-tracked functions.
+ if (canTrackReturnsInterprocedurally(&F))
Solver.AddTrackedFunction(&F);
- // If this function only has direct calls that we can see, we can track its
- // arguments and return value aggressively, and can assume it is not called
- // unless we see evidence to the contrary.
- if (F.hasLocalLinkage()) {
- if (F.hasAddressTaken()) {
- AddressTakenFunctions.insert(&F);
- }
- else {
- Solver.AddArgumentTrackedFunction(&F);
- continue;
- }
+ // Determine if we can track the function's arguments. If so, add the
+ // function to the solver's set of argument-tracked functions.
+ if (canTrackArgumentsInterprocedurally(&F)) {
+ Solver.AddArgumentTrackedFunction(&F);
+ continue;
}
// Assume the function is called.
@@ -1786,13 +1840,14 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
Solver.markOverdefined(&AI);
}
- // Loop over global variables. We inform the solver about any internal global
- // variables that do not have their 'addresses taken'. If they don't have
- // their addresses taken, we can propagate constants through them.
- for (GlobalVariable &G : M.globals())
- if (!G.isConstant() && G.hasLocalLinkage() &&
- G.hasDefinitiveInitializer() && !AddressIsTaken(&G))
+ // Determine if we can track any of the module's global variables. If so, add
+ // the global variables we can track to the solver's set of tracked global
+ // variables.
+ for (GlobalVariable &G : M.globals()) {
+ G.removeDeadConstantUsers();
+ if (canTrackGlobalVariableInterprocedurally(&G))
Solver.TrackValueOfGlobalVariable(&G);
+ }
// Solve for constants.
bool ResolvedUndefs = true;
@@ -1809,7 +1864,6 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
// Iterate over all of the instructions in the module, replacing them with
// constants if we have found them to be of constant values.
- //
SmallVector<BasicBlock*, 512> BlocksToErase;
for (Function &F : M) {
@@ -1818,9 +1872,15 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
if (Solver.isBlockExecutable(&F.front()))
for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;
- ++AI)
- if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI))
+ ++AI) {
+ if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI)) {
++IPNumArgsElimed;
+ continue;
+ }
+
+ if (!AI->use_empty() && tryToReplaceWithConstantRange(Solver, &*AI))
+ ++IPNumRangeInfoUsed;
+ }
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
if (!Solver.isBlockExecutable(&*BB)) {
@@ -1897,7 +1957,7 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
Function *F = I.first;
if (I.second.isOverdefined() || F->getReturnType()->isVoidTy())
continue;
- findReturnsToZap(*F, AddressTakenFunctions, ReturnsToZap);
+ findReturnsToZap(*F, ReturnsToZap, Solver);
}
for (const auto &F : Solver.getMRVFunctionsTracked()) {
@@ -1905,7 +1965,7 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
"The return type should be a struct");
StructType *STy = cast<StructType>(F->getReturnType());
if (Solver.isStructLatticeConstant(F, STy))
- findReturnsToZap(*F, AddressTakenFunctions, ReturnsToZap);
+ findReturnsToZap(*F, ReturnsToZap, Solver);
}
// Zap all returns which we've identified as zap to change.
@@ -1943,6 +2003,7 @@ PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) {
}
namespace {
+
//===--------------------------------------------------------------------===//
//
/// IPSCCP Class - This class implements interprocedural Sparse Conditional
@@ -1969,9 +2030,11 @@ public:
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
};
+
} // end anonymous namespace
char IPSCCPLegacyPass::ID = 0;
+
INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp",
"Interprocedural Sparse Conditional Constant Propagation",
false, false)
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index b9cee5b2ba95..bfe3754f0769 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -24,28 +24,54 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/SROA.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/PtrUseVisitor.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantFolder.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
-#include "llvm/Support/Chrono.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
@@ -55,6 +81,17 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
#ifndef NDEBUG
// We only use this for a debug check.
@@ -87,11 +124,18 @@ static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
cl::Hidden);
+/// Hidden option to allow more aggressive splitting.
+static cl::opt<bool>
+SROASplitNonWholeAllocaSlices("sroa-split-nonwhole-alloca-slices",
+ cl::init(false), cl::Hidden);
+
namespace {
+
/// \brief A custom IRBuilder inserter which prefixes all names, but only in
/// Assert builds.
class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter {
std::string Prefix;
+
const Twine getNameWithPrefix(const Twine &Name) const {
return Name.isTriviallyEmpty() ? Name : Prefix + Name;
}
@@ -107,11 +151,9 @@ protected:
}
};
-/// \brief Provide a typedef for IRBuilder that drops names in release builds.
-using IRBuilderTy = llvm::IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
-}
+/// \brief Provide a type for IRBuilder that drops names in release builds.
+using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
-namespace {
/// \brief A used slice of an alloca.
///
/// This structure represents a slice of an alloca used by some instruction. It
@@ -120,17 +162,18 @@ namespace {
/// or not when forming partitions of the alloca.
class Slice {
/// \brief The beginning offset of the range.
- uint64_t BeginOffset;
+ uint64_t BeginOffset = 0;
/// \brief The ending offset, not included in the range.
- uint64_t EndOffset;
+ uint64_t EndOffset = 0;
/// \brief Storage for both the use of this slice and whether it can be
/// split.
PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
public:
- Slice() : BeginOffset(), EndOffset() {}
+ Slice() = default;
+
Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
: BeginOffset(BeginOffset), EndOffset(EndOffset),
UseAndIsSplittable(U, IsSplittable) {}
@@ -180,12 +223,15 @@ public:
}
bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
};
+
} // end anonymous namespace
namespace llvm {
+
template <typename T> struct isPodLike;
template <> struct isPodLike<Slice> { static const bool value = true; };
-}
+
+} // end namespace llvm
/// \brief Representation of the alloca slices.
///
@@ -207,13 +253,15 @@ public:
/// \brief Support for iterating over the slices.
/// @{
- typedef SmallVectorImpl<Slice>::iterator iterator;
- typedef iterator_range<iterator> range;
+ using iterator = SmallVectorImpl<Slice>::iterator;
+ using range = iterator_range<iterator>;
+
iterator begin() { return Slices.begin(); }
iterator end() { return Slices.end(); }
- typedef SmallVectorImpl<Slice>::const_iterator const_iterator;
- typedef iterator_range<const_iterator> const_range;
+ using const_iterator = SmallVectorImpl<Slice>::const_iterator;
+ using const_range = iterator_range<const_iterator>;
+
const_iterator begin() const { return Slices.begin(); }
const_iterator end() const { return Slices.end(); }
/// @}
@@ -264,6 +312,7 @@ public:
private:
template <typename DerivedT, typename RetT = void> class BuilderBase;
class SliceBuilder;
+
friend class AllocaSlices::SliceBuilder;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -320,7 +369,7 @@ private:
friend class AllocaSlices;
friend class AllocaSlices::partition_iterator;
- typedef AllocaSlices::iterator iterator;
+ using iterator = AllocaSlices::iterator;
/// \brief The beginning and ending offsets of the alloca for this
/// partition.
@@ -403,12 +452,12 @@ class AllocaSlices::partition_iterator
/// \brief We also need to keep track of the maximum split end offset seen.
/// FIXME: Do we really?
- uint64_t MaxSplitSliceEndOffset;
+ uint64_t MaxSplitSliceEndOffset = 0;
/// \brief Sets the partition to be empty at given iterator, and sets the
/// end iterator.
partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
- : P(SI), SE(SE), MaxSplitSliceEndOffset(0) {
+ : P(SI), SE(SE) {
// If not already at the end, advance our state to form the initial
// partition.
if (SI != SE)
@@ -432,19 +481,21 @@ class AllocaSlices::partition_iterator
// Remove the uses which have ended in the prior partition. This
// cannot change the max split slice end because we just checked that
// the prior partition ended prior to that max.
- P.SplitTails.erase(
- remove_if(P.SplitTails,
- [&](Slice *S) { return S->endOffset() <= P.EndOffset; }),
- P.SplitTails.end());
- assert(any_of(P.SplitTails,
- [&](Slice *S) {
- return S->endOffset() == MaxSplitSliceEndOffset;
- }) &&
+ P.SplitTails.erase(llvm::remove_if(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() <=
+ P.EndOffset;
+ }),
+ P.SplitTails.end());
+ assert(llvm::any_of(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() == MaxSplitSliceEndOffset;
+ }) &&
"Could not find the current max split slice offset!");
- assert(all_of(P.SplitTails,
- [&](Slice *S) {
- return S->endOffset() <= MaxSplitSliceEndOffset;
- }) &&
+ assert(llvm::all_of(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() <= MaxSplitSliceEndOffset;
+ }) &&
"Max split slice end offset is not actually the max!");
}
}
@@ -608,7 +659,8 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) {
class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
friend class PtrUseVisitor<SliceBuilder>;
friend class InstVisitor<SliceBuilder>;
- typedef PtrUseVisitor<SliceBuilder> Base;
+
+ using Base = PtrUseVisitor<SliceBuilder>;
const uint64_t AllocSize;
AllocaSlices &AS;
@@ -996,8 +1048,9 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
return;
}
- Slices.erase(remove_if(Slices, [](const Slice &S) { return S.isDead(); }),
- Slices.end());
+ Slices.erase(
+ llvm::remove_if(Slices, [](const Slice &S) { return S.isDead(); }),
+ Slices.end());
#ifndef NDEBUG
if (SROARandomShuffleSlices) {
@@ -1820,11 +1873,12 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
// do that until all the backends are known to produce good code for all
// integer vector types.
if (!HaveCommonEltTy) {
- CandidateTys.erase(remove_if(CandidateTys,
- [](VectorType *VTy) {
- return !VTy->getElementType()->isIntegerTy();
- }),
- CandidateTys.end());
+ CandidateTys.erase(
+ llvm::remove_if(CandidateTys,
+ [](VectorType *VTy) {
+ return !VTy->getElementType()->isIntegerTy();
+ }),
+ CandidateTys.end());
// If there were no integer vector types, give up.
if (CandidateTys.empty())
@@ -2151,8 +2205,9 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
class llvm::sroa::AllocaSliceRewriter
: public InstVisitor<AllocaSliceRewriter, bool> {
// Befriend the base class so it can delegate to private visit methods.
- friend class llvm::InstVisitor<AllocaSliceRewriter, bool>;
- typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base;
+ friend class InstVisitor<AllocaSliceRewriter, bool>;
+
+ using Base = InstVisitor<AllocaSliceRewriter, bool>;
const DataLayout &DL;
AllocaSlices &AS;
@@ -2182,16 +2237,18 @@ class llvm::sroa::AllocaSliceRewriter
// The original offset of the slice currently being rewritten relative to
// the original alloca.
- uint64_t BeginOffset, EndOffset;
+ uint64_t BeginOffset = 0;
+ uint64_t EndOffset = 0;
+
// The new offsets of the slice currently being rewritten relative to the
// original alloca.
uint64_t NewBeginOffset, NewEndOffset;
uint64_t SliceSize;
- bool IsSplittable;
- bool IsSplit;
- Use *OldUse;
- Instruction *OldPtr;
+ bool IsSplittable = false;
+ bool IsSplit = false;
+ Use *OldUse = nullptr;
+ Instruction *OldPtr = nullptr;
// Track post-rewrite users which are PHI nodes and Selects.
SmallSetVector<PHINode *, 8> &PHIUsers;
@@ -2221,8 +2278,7 @@ public:
VecTy(PromotableVecTy),
ElementTy(VecTy ? VecTy->getElementType() : nullptr),
ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
- BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
- OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
+ PHIUsers(PHIUsers), SelectUsers(SelectUsers),
IRB(NewAI.getContext(), ConstantFolder()) {
if (VecTy) {
assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
@@ -2987,6 +3043,7 @@ private:
};
namespace {
+
/// \brief Visitor to rewrite aggregate loads and stores as scalar.
///
/// This pass aggressively rewrites all aggregate loads and stores on
@@ -2994,7 +3051,7 @@ namespace {
/// with scalar loads and stores.
class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
// Befriend the base class so it can delegate to private visit methods.
- friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
+ friend class InstVisitor<AggLoadStoreRewriter, bool>;
/// Queue of pointer uses to analyze and potentially rewrite.
SmallVector<Use *, 8> Queue;
@@ -3037,12 +3094,15 @@ private:
protected:
/// The builder used to form new instructions.
IRBuilderTy IRB;
+
/// The indices which to be used with insert- or extractvalue to select the
/// appropriate value within the aggregate.
SmallVector<unsigned, 4> Indices;
+
/// The indices to a GEP instruction which will move Ptr to the correct slot
/// within the aggregate.
SmallVector<Value *, 4> GEPIndices;
+
/// The base pointer of the original op, used as a base for GEPing the
/// split operations.
Value *Ptr;
@@ -3193,7 +3253,8 @@ private:
return false;
}
};
-}
+
+} // end anonymous namespace
/// \brief Strip aggregate type wrapping.
///
@@ -3485,58 +3546,60 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
// match relative to their starting offset. We have to verify this prior to
// any rewriting.
Stores.erase(
- remove_if(Stores,
- [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
- // Lookup the load we are storing in our map of split
- // offsets.
- auto *LI = cast<LoadInst>(SI->getValueOperand());
- // If it was completely unsplittable, then we're done,
- // and this store can't be pre-split.
- if (UnsplittableLoads.count(LI))
- return true;
+ llvm::remove_if(Stores,
+ [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
+ // Lookup the load we are storing in our map of split
+ // offsets.
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ // If it was completely unsplittable, then we're done,
+ // and this store can't be pre-split.
+ if (UnsplittableLoads.count(LI))
+ return true;
- auto LoadOffsetsI = SplitOffsetsMap.find(LI);
- if (LoadOffsetsI == SplitOffsetsMap.end())
- return false; // Unrelated loads are definitely safe.
- auto &LoadOffsets = LoadOffsetsI->second;
+ auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+ if (LoadOffsetsI == SplitOffsetsMap.end())
+ return false; // Unrelated loads are definitely safe.
+ auto &LoadOffsets = LoadOffsetsI->second;
- // Now lookup the store's offsets.
- auto &StoreOffsets = SplitOffsetsMap[SI];
+ // Now lookup the store's offsets.
+ auto &StoreOffsets = SplitOffsetsMap[SI];
- // If the relative offsets of each split in the load and
- // store match exactly, then we can split them and we
- // don't need to remove them here.
- if (LoadOffsets.Splits == StoreOffsets.Splits)
- return false;
+ // If the relative offsets of each split in the load and
+ // store match exactly, then we can split them and we
+ // don't need to remove them here.
+ if (LoadOffsets.Splits == StoreOffsets.Splits)
+ return false;
- DEBUG(dbgs() << " Mismatched splits for load and store:\n"
- << " " << *LI << "\n"
- << " " << *SI << "\n");
+ DEBUG(dbgs()
+ << " Mismatched splits for load and store:\n"
+ << " " << *LI << "\n"
+ << " " << *SI << "\n");
- // We've found a store and load that we need to split
- // with mismatched relative splits. Just give up on them
- // and remove both instructions from our list of
- // candidates.
- UnsplittableLoads.insert(LI);
- return true;
- }),
+ // We've found a store and load that we need to split
+ // with mismatched relative splits. Just give up on them
+ // and remove both instructions from our list of
+ // candidates.
+ UnsplittableLoads.insert(LI);
+ return true;
+ }),
Stores.end());
// Now we have to go *back* through all the stores, because a later store may
// have caused an earlier store's load to become unsplittable and if it is
// unsplittable for the later store, then we can't rely on it being split in
// the earlier store either.
- Stores.erase(remove_if(Stores,
- [&UnsplittableLoads](StoreInst *SI) {
- auto *LI = cast<LoadInst>(SI->getValueOperand());
- return UnsplittableLoads.count(LI);
- }),
+ Stores.erase(llvm::remove_if(Stores,
+ [&UnsplittableLoads](StoreInst *SI) {
+ auto *LI =
+ cast<LoadInst>(SI->getValueOperand());
+ return UnsplittableLoads.count(LI);
+ }),
Stores.end());
// Once we've established all the loads that can't be split for some reason,
  // filter out any that made it into our list.
- Loads.erase(remove_if(Loads,
- [&UnsplittableLoads](LoadInst *LI) {
- return UnsplittableLoads.count(LI);
- }),
+ Loads.erase(llvm::remove_if(Loads,
+ [&UnsplittableLoads](LoadInst *LI) {
+ return UnsplittableLoads.count(LI);
+ }),
Loads.end());
// If no loads or stores are left, there is no pre-splitting to be done for
@@ -3804,7 +3867,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
}
  // Remove the killed slices that have been pre-split.
- AS.erase(remove_if(AS, [](const Slice &S) { return S.isDead(); }), AS.end());
+ AS.erase(llvm::remove_if(AS, [](const Slice &S) { return S.isDead(); }),
+ AS.end());
// Insert our new slices. This will sort and merge them into the sorted
// sequence.
@@ -3819,7 +3883,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
  // Finally, don't try to promote any allocas that now require re-splitting.
// They have already been added to the worklist above.
PromotableAllocas.erase(
- remove_if(
+ llvm::remove_if(
PromotableAllocas,
[&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
PromotableAllocas.end());
@@ -3989,27 +4053,58 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
// First try to pre-split loads and stores.
Changed |= presplitLoadsAndStores(AI, AS);
- // Now that we have identified any pre-splitting opportunities, mark any
- // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
- // to split these during pre-splitting, we want to force them to be
- // rewritten into a partition.
+  // Now that we have identified any pre-splitting opportunities, mark loads
+  // and stores as unsplittable, with one exception: a slice stays splittable
+  // when every other slice is either disjoint from it or fully contained in
+  // it, whole-alloca loads and stores being the common case. If we fail to
+  // split the others during pre-splitting, we want to force them to be
+  // rewritten into a partition.
bool IsSorted = true;
- for (Slice &S : AS) {
- if (!S.isSplittable())
- continue;
- // FIXME: We currently leave whole-alloca splittable loads and stores. This
- // used to be the only splittable loads and stores and we need to be
- // confident that the above handling of splittable loads and stores is
- // completely sufficient before we forcibly disable the remaining handling.
- if (S.beginOffset() == 0 &&
- S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType()))
- continue;
- if (isa<LoadInst>(S.getUse()->getUser()) ||
- isa<StoreInst>(S.getUse()->getUser())) {
- S.makeUnsplittable();
- IsSorted = false;
+
+ uint64_t AllocaSize = DL.getTypeAllocSize(AI.getAllocatedType());
+ const uint64_t MaxBitVectorSize = 1024;
+ if (SROASplitNonWholeAllocaSlices && AllocaSize <= MaxBitVectorSize) {
+    // If a byte offset falls strictly inside any load or store, a slice
+    // starting or ending at that offset is not splittable.
+ SmallBitVector SplittableOffset(AllocaSize + 1, true);
+ for (Slice &S : AS)
+ for (unsigned O = S.beginOffset() + 1;
+ O < S.endOffset() && O < AllocaSize; O++)
+ SplittableOffset.reset(O);
+
+ for (Slice &S : AS) {
+ if (!S.isSplittable())
+ continue;
+
+ if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
+ (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
+ continue;
+
+ if (isa<LoadInst>(S.getUse()->getUser()) ||
+ isa<StoreInst>(S.getUse()->getUser())) {
+ S.makeUnsplittable();
+ IsSorted = false;
+ }
}
-  }
+  } else {
+    // We only allow whole-alloca splittable loads and stores for a large
+    // alloca, to avoid creating too large a bit vector.
+ for (Slice &S : AS) {
+ if (!S.isSplittable())
+ continue;
+
+ if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
+ continue;
+
+ if (isa<LoadInst>(S.getUse()->getUser()) ||
+ isa<StoreInst>(S.getUse()->getUser())) {
+ S.makeUnsplittable();
+ IsSorted = false;
+ }
+ }
+ }
+
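A standalone sketch of the boundary rule above (hypothetical ToySlice type, not SROA's Slice) over an 8-byte alloca: offsets strictly inside any slice are cleared, and a slice keeps its splittable status only if both of its endpoints survive.

#include <cassert>
#include <vector>

struct ToySlice { unsigned Begin, End; };

int main() {
  const unsigned AllocaSize = 8;
  std::vector<ToySlice> Slices = {{0, 4}, {4, 8}}; // two disjoint accesses
  std::vector<bool> Splittable(AllocaSize + 1, true);
  for (const ToySlice &S : Slices)
    for (unsigned O = S.Begin + 1; O < S.End && O < AllocaSize; ++O)
      Splittable[O] = false;               // O is interior to slice S
  for (const ToySlice &S : Slices)
    assert(Splittable[S.Begin] && Splittable[S.End]); // both stay splittable
  // A third access at [2, 6) would clear offsets 3-5, including the shared
  // boundary 4, making both slices above unsplittable.
}
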
if (!IsSorted)
std::sort(AS.begin(), AS.end());
@@ -4044,9 +4139,11 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
// Migrate debug information from the old alloca to the new alloca(s)
// and the individual partitions.
- if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) {
- auto *Var = DbgDecl->getVariable();
- auto *Expr = DbgDecl->getExpression();
+ TinyPtrVector<DbgInfoIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI);
+ if (!DbgDeclares.empty()) {
+ auto *Var = DbgDeclares.front()->getVariable();
+ auto *Expr = DbgDeclares.front()->getExpression();
+ auto VarSize = Var->getSizeInBits();
DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
uint64_t AllocaSize = DL.getTypeSizeInBits(AI.getAllocatedType());
for (auto Fragment : Fragments) {
@@ -4062,21 +4159,43 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
uint64_t Size = Fragment.Size;
if (ExprFragment) {
uint64_t AbsEnd =
- ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
+ ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
if (Start >= AbsEnd)
// No need to describe a SROAed padding.
continue;
Size = std::min(Size, AbsEnd - Start);
}
- FragmentExpr = DIB.createFragmentExpression(Start, Size);
+ // The new, smaller fragment is stenciled out from the old fragment.
+ if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
+ assert(Start >= OrigFragment->OffsetInBits &&
+ "new fragment is outside of original fragment");
+ Start -= OrigFragment->OffsetInBits;
+ }
+
+ // The alloca may be larger than the variable.
+ if (VarSize) {
+ if (Size > *VarSize)
+ Size = *VarSize;
+ if (Size == 0 || Start + Size > *VarSize)
+ continue;
+ }
+
+ // Avoid creating a fragment expression that covers the entire variable.
+ if (!VarSize || *VarSize != Size) {
+ if (auto E =
+ DIExpression::createFragmentExpression(Expr, Start, Size))
+ FragmentExpr = *E;
+ else
+ continue;
+ }
}
- // Remove any existing dbg.declare intrinsic describing the same alloca.
- if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Fragment.Alloca))
- OldDDI->eraseFromParent();
+ // Remove any existing intrinsics describing the same alloca.
+ for (DbgInfoIntrinsic *OldDII : FindDbgAddrUses(Fragment.Alloca))
+ OldDII->eraseFromParent();
DIB.insertDeclare(Fragment.Alloca, Var, FragmentExpr,
- DbgDecl->getDebugLoc(), &AI);
+ DbgDeclares.front()->getDebugLoc(), &AI);
}
}
return Changed;
@@ -4175,12 +4294,22 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
-void SROA::deleteDeadInstructions(
+bool SROA::deleteDeadInstructions(
SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
+ bool Changed = false;
while (!DeadInsts.empty()) {
Instruction *I = DeadInsts.pop_back_val();
DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
+ // If the instruction is an alloca, find the possible dbg.declare connected
+ // to it, and remove it too. We must do this before calling RAUW or we will
+ // not be able to find it.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+ DeletedAllocas.insert(AI);
+ for (DbgInfoIntrinsic *OldDII : FindDbgAddrUses(AI))
+ OldDII->eraseFromParent();
+ }
+
I->replaceAllUsesWith(UndefValue::get(I->getType()));
for (Use &Operand : I->operands())
@@ -4191,15 +4320,11 @@ void SROA::deleteDeadInstructions(
DeadInsts.insert(U);
}
- if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
- DeletedAllocas.insert(AI);
- if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI))
- DbgDecl->eraseFromParent();
- }
-
++NumDeleted;
I->eraseFromParent();
+ Changed = true;
}
+ return Changed;
}
/// \brief Promote the allocas, using the best available technique.
@@ -4241,7 +4366,7 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
do {
while (!Worklist.empty()) {
Changed |= runOnAlloca(*Worklist.pop_back_val());
- deleteDeadInstructions(DeletedAllocas);
+ Changed |= deleteDeadInstructions(DeletedAllocas);
// Remove the deleted allocas from various lists so that we don't try to
// continue processing them.
@@ -4249,7 +4374,7 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
Worklist.remove_if(IsInSet);
PostPromotionWorklist.remove_if(IsInSet);
- PromotableAllocas.erase(remove_if(PromotableAllocas, IsInSet),
+ PromotableAllocas.erase(llvm::remove_if(PromotableAllocas, IsInSet),
PromotableAllocas.end());
DeletedAllocas.clear();
}
@@ -4284,9 +4409,12 @@ class llvm::sroa::SROALegacyPass : public FunctionPass {
SROA Impl;
public:
+ static char ID;
+
SROALegacyPass() : FunctionPass(ID) {
initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
}
+
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
@@ -4296,6 +4424,7 @@ public:
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
return !PA.areAllPreserved();
}
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
@@ -4304,7 +4433,6 @@ public:
}
StringRef getPassName() const override { return "SROA"; }
- static char ID;
};
char SROALegacyPass::ID = 0;
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index ce6f93eb0c15..3b99ddff2e06 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -35,11 +35,13 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeADCELegacyPassPass(Registry);
initializeBDCELegacyPassPass(Registry);
initializeAlignmentFromAssumptionsPass(Registry);
+ initializeCallSiteSplittingLegacyPassPass(Registry);
initializeConstantHoistingLegacyPassPass(Registry);
initializeConstantPropagationPass(Registry);
initializeCorrelatedValuePropagationPass(Registry);
initializeDCELegacyPassPass(Registry);
initializeDeadInstEliminationPass(Registry);
+ initializeDivRemPairsLegacyPassPass(Registry);
initializeScalarizerPass(Registry);
initializeDSELegacyPassPass(Registry);
initializeGuardWideningLegacyPassPass(Registry);
@@ -73,17 +75,17 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLowerExpectIntrinsicPass(Registry);
initializeLowerGuardIntrinsicLegacyPassPass(Registry);
initializeMemCpyOptLegacyPassPass(Registry);
+ initializeMergeICmpsPass(Registry);
initializeMergedLoadStoreMotionLegacyPassPass(Registry);
initializeNaryReassociateLegacyPassPass(Registry);
initializePartiallyInlineLibCallsLegacyPassPass(Registry);
initializeReassociateLegacyPassPass(Registry);
initializeRegToMemPass(Registry);
- initializeRewriteStatepointsForGCPass(Registry);
+ initializeRewriteStatepointsForGCLegacyPassPass(Registry);
initializeSCCPLegacyPassPass(Registry);
initializeIPSCCPLegacyPassPass(Registry);
initializeSROALegacyPassPass(Registry);
initializeCFGSimplifyPassPass(Registry);
- initializeLateCFGSimplifyPassPass(Registry);
initializeStructurizeCFGPass(Registry);
initializeSimpleLoopUnswitchLegacyPassPass(Registry);
initializeSinkingLegacyPassPass(Registry);
@@ -98,6 +100,8 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopLoadEliminationPass(Registry);
initializeLoopSimplifyCFGLegacyPassPass(Registry);
initializeLoopVersioningPassPass(Registry);
+ initializeEntryExitInstrumenterPass(Registry);
+ initializePostInlineEntryExitInstrumenterPass(Registry);
}
void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
@@ -117,11 +121,7 @@ void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) {
}
void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createCFGSimplificationPass());
-}
-
-void LLVMAddLateCFGSimplificationPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLateCFGSimplificationPass());
+ unwrap(PM)->add(createCFGSimplificationPass(1, false, false, true));
}
void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp
index d11855f2f3a9..34ed126155be 100644
--- a/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/lib/Transforms/Scalar/Scalarizer.cpp
@@ -1,4 +1,4 @@
-//===--- Scalarizer.cpp - Scalarize vector operations ---------------------===//
+//===- Scalarizer.cpp - Scalarize vector operations -----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,36 +14,59 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Options.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <utility>
using namespace llvm;
#define DEBUG_TYPE "scalarizer"
namespace {
+
// Used to store the scattered form of a vector.
-typedef SmallVector<Value *, 8> ValueVector;
+using ValueVector = SmallVector<Value *, 8>;
// Used to map a vector Value to its scattered form. We use std::map
// because we want iterators to persist across insertion and because the
// values are relatively large.
-typedef std::map<Value *, ValueVector> ScatterMap;
+using ScatterMap = std::map<Value *, ValueVector>;
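
The iterator-persistence property relied on here is easy to demonstrate in isolation; a minimal sketch in plain C++ (int keys standing in for Value*): std::map never invalidates references or iterators on insertion, whereas hash-based maps invalidate iterators whenever they rehash.

#include <cassert>
#include <map>
#include <vector>

int main() {
  std::map<int, std::vector<int>> Scattered; // stand-in for ScatterMap
  std::vector<int> *Cached = &Scattered[1];  // cache one entry's vector
  Cached->push_back(42);
  for (int K = 2; K < 100; ++K)
    Scattered[K];                            // many later insertions
  assert(&Scattered[1] == Cached && Cached->front() == 42); // still valid
}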
// Lists Instructions that have been replaced with scalar implementations,
// along with a pointer to their scattered forms.
-typedef SmallVector<std::pair<Instruction *, ValueVector *>, 16> GatherList;
+using GatherList = SmallVector<std::pair<Instruction *, ValueVector *>, 16>;
// Provides a very limited vector-like interface for lazily accessing one
// component of a scattered vector or vector pointer.
class Scatterer {
public:
- Scatterer() {}
+ Scatterer() = default;
// Scatter V into Size components. If new instructions are needed,
// insert them before BBI in BB. If Cache is nonnull, use it to cache
@@ -71,10 +94,12 @@ private:
// called Name that compares X and Y in the same way as FCI.
struct FCmpSplitter {
FCmpSplitter(FCmpInst &fci) : FCI(fci) {}
+
Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
const Twine &Name) const {
return Builder.CreateFCmp(FCI.getPredicate(), Op0, Op1, Name);
}
+
FCmpInst &FCI;
};
@@ -82,10 +107,12 @@ struct FCmpSplitter {
// called Name that compares X and Y in the same way as ICI.
struct ICmpSplitter {
ICmpSplitter(ICmpInst &ici) : ICI(ici) {}
+
Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
const Twine &Name) const {
return Builder.CreateICmp(ICI.getPredicate(), Op0, Op1, Name);
}
+
ICmpInst &ICI;
};
@@ -93,16 +120,18 @@ struct ICmpSplitter {
// a binary operator like BO called Name with operands X and Y.
struct BinarySplitter {
BinarySplitter(BinaryOperator &bo) : BO(bo) {}
+
Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
const Twine &Name) const {
return Builder.CreateBinOp(BO.getOpcode(), Op0, Op1, Name);
}
+
BinaryOperator &BO;
};
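
A standalone sketch of the splitter-functor pattern (a toy driver over int lanes standing in for splitBinary and IRBuilder): one generic per-lane loop serves compares and binary operators alike, because each functor knows how to emit a single scalar operation.

#include <cassert>
#include <cstddef>
#include <functional>
#include <vector>

using Lane = int;
using Splitter = std::function<Lane(Lane, Lane)>;

static std::vector<Lane> splitBinaryToy(const std::vector<Lane> &A,
                                        const std::vector<Lane> &B,
                                        const Splitter &Op) {
  std::vector<Lane> Res;
  for (std::size_t I = 0; I < A.size(); ++I)
    Res.push_back(Op(A[I], B[I])); // emit one scalar op per vector lane
  return Res;
}

int main() {
  std::vector<Lane> A{1, 2, 3}, B{3, 2, 1};
  auto Add = [](Lane X, Lane Y) { return X + Y; };      // BinarySplitter role
  auto Lt = [](Lane X, Lane Y) { return Lane(X < Y); }; // ICmpSplitter role
  assert(splitBinaryToy(A, B, Add) == (std::vector<Lane>{4, 4, 4}));
  assert(splitBinaryToy(A, B, Lt) == (std::vector<Lane>{1, 0, 0}));
}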
// Information about a load or store that we're scalarizing.
struct VectorLayout {
- VectorLayout() : VecTy(nullptr), ElemTy(nullptr), VecAlign(0), ElemSize(0) {}
+ VectorLayout() = default;
// Return the alignment of element I.
uint64_t getElemAlign(unsigned I) {
@@ -110,16 +139,16 @@ struct VectorLayout {
}
// The type of the vector.
- VectorType *VecTy;
+ VectorType *VecTy = nullptr;
// The type of each element.
- Type *ElemTy;
+ Type *ElemTy = nullptr;
// The alignment of the vector.
- uint64_t VecAlign;
+ uint64_t VecAlign = 0;
// The size of each element.
- uint64_t ElemSize;
+ uint64_t ElemSize = 0;
};
class Scalarizer : public FunctionPass,
@@ -127,8 +156,7 @@ class Scalarizer : public FunctionPass,
public:
static char ID;
- Scalarizer() :
- FunctionPass(ID) {
+ Scalarizer() : FunctionPass(ID) {
initializeScalarizerPass(*PassRegistry::getPassRegistry());
}
@@ -137,19 +165,19 @@ public:
// InstVisitor methods. They return true if the instruction was scalarized,
// false if nothing changed.
- bool visitInstruction(Instruction &) { return false; }
+ bool visitInstruction(Instruction &I) { return false; }
bool visitSelectInst(SelectInst &SI);
- bool visitICmpInst(ICmpInst &);
- bool visitFCmpInst(FCmpInst &);
- bool visitBinaryOperator(BinaryOperator &);
- bool visitGetElementPtrInst(GetElementPtrInst &);
- bool visitCastInst(CastInst &);
- bool visitBitCastInst(BitCastInst &);
- bool visitShuffleVectorInst(ShuffleVectorInst &);
- bool visitPHINode(PHINode &);
- bool visitLoadInst(LoadInst &);
- bool visitStoreInst(StoreInst &);
- bool visitCallInst(CallInst &I);
+ bool visitICmpInst(ICmpInst &ICI);
+ bool visitFCmpInst(FCmpInst &FCI);
+ bool visitBinaryOperator(BinaryOperator &BO);
+ bool visitGetElementPtrInst(GetElementPtrInst &GEPI);
+ bool visitCastInst(CastInst &CI);
+ bool visitBitCastInst(BitCastInst &BCI);
+ bool visitShuffleVectorInst(ShuffleVectorInst &SVI);
+ bool visitPHINode(PHINode &PHI);
+ bool visitLoadInst(LoadInst &LI);
+ bool visitStoreInst(StoreInst &SI);
+  bool visitCallInst(CallInst &CI);
static void registerOptions() {
// This is disabled by default because having separate loads and stores
@@ -162,11 +190,12 @@ public:
}
private:
- Scatterer scatter(Instruction *, Value *);
- void gather(Instruction *, const ValueVector &);
+ Scatterer scatter(Instruction *Point, Value *V);
+ void gather(Instruction *Op, const ValueVector &CV);
bool canTransferMetadata(unsigned Kind);
- void transferMetadata(Instruction *, const ValueVector &);
- bool getVectorLayout(Type *, unsigned, VectorLayout &, const DataLayout &);
+ void transferMetadata(Instruction *Op, const ValueVector &CV);
+ bool getVectorLayout(Type *Ty, unsigned Alignment, VectorLayout &Layout,
+ const DataLayout &DL);
bool finish();
template<typename T> bool splitBinary(Instruction &, const T &);
@@ -179,9 +208,10 @@ private:
bool ScalarizeLoadStore;
};
-char Scalarizer::ID = 0;
} // end anonymous namespace
+char Scalarizer::ID = 0;
+
INITIALIZE_PASS_WITH_OPTIONS(Scalarizer, "scalarizer",
"Scalarize vector operations", false, false)
@@ -222,7 +252,7 @@ Value *Scatterer::operator[](unsigned I) {
// Search through a chain of InsertElementInsts looking for element I.
// Record other elements in the cache. The new V is still suitable
// for all uncached indices.
- for (;;) {
+ while (true) {
InsertElementInst *Insert = dyn_cast<InsertElementInst>(V);
if (!Insert)
break;
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 84675f41cdd5..209821ff21d7 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1,4 +1,4 @@
-//===-- SeparateConstOffsetFromGEP.cpp - ------------------------*- C++ -*-===//
+//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -156,27 +156,44 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -185,6 +202,7 @@ static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
"disable-separate-const-offset-from-gep", cl::init(false),
cl::desc("Do not separate the constant offset from a GEP instruction"),
cl::Hidden);
+
// Setting this flag may emit false positives when the input module already
// contains dead instructions. Therefore, we set it only in unit tests that are
// free of dead code.
@@ -219,6 +237,7 @@ public:
/// garbage-collect unused instructions in UserChain.
static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
User *&UserChainTail, const DominatorTree *DT);
+
/// Looks for a constant offset from the given GEP index without extracting
/// it. It returns the numeric value of the extracted constant offset (0 if
/// failed). The meaning of the arguments are the same as Extract.
@@ -229,6 +248,7 @@ private:
ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
: IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
}
+
/// Searches the expression that computes V for a non-zero constant C s.t.
/// V can be reassociated into the form V' + C. If the searching is
  /// successful, returns C and updates UserChain as a def-use chain from C to V;
  /// non-negative. Leveraging this, we can better split
/// non-negative. Levaraging this, we can better split
/// inbounds GEPs.
APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
+
/// A helper function to look into both operands of a binary operator.
APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
bool ZeroExtended);
+
/// After finding the constant offset C from the GEP index I, we build a new
/// index I' s.t. I' + C = I. This function builds and returns the new
/// index I' according to UserChain produced by function "find".
@@ -263,6 +285,7 @@ private:
/// (sext(a) + sext(b)) + 5.
/// Given this form, we know I' is sext(a) + sext(b).
Value *rebuildWithoutConstOffset();
+
/// After the first step of rebuilding the GEP index without the constant
/// offset, distribute s/zext to the operands of all operators in UserChain.
/// e.g., zext(sext(a + (b + 5)) (assuming no overflow) =>
@@ -279,8 +302,10 @@ private:
/// UserChain.size() - 1, and is decremented during
/// the recursion.
Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+
/// Reassociates the GEP index to the form I' + C and returns I'.
Value *removeConstOffset(unsigned ChainIndex);
+
/// A helper function to apply ExtInsts, a list of s/zext, to value V.
/// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
/// returns "sext i32 (zext i16 V to i32) to i64".
@@ -303,10 +328,14 @@ private:
///
/// This path helps to rebuild the new GEP index.
SmallVector<User *, 8> UserChain;
+
/// A data structure used in rebuildWithoutConstOffset. Contains all
/// sext/zext instructions along UserChain.
SmallVector<CastInst *, 16> ExtInsts;
- Instruction *IP; /// Insertion position of cloned instructions.
+
+ /// Insertion position of cloned instructions.
+ Instruction *IP;
+
const DataLayout &DL;
const DominatorTree *DT;
};
@@ -317,9 +346,10 @@ private:
class SeparateConstOffsetFromGEP : public FunctionPass {
public:
static char ID;
+
SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr,
bool LowerGEP = false)
- : FunctionPass(ID), DL(nullptr), DT(nullptr), TM(TM), LowerGEP(LowerGEP) {
+ : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) {
initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
}
@@ -336,12 +366,14 @@ public:
DL = &M.getDataLayout();
return false;
}
+
bool runOnFunction(Function &F) override;
private:
/// Tries to split the given GEP into a variadic base and a constant offset,
/// and returns true if the splitting succeeds.
bool splitGEP(GetElementPtrInst *GEP);
+
/// Lower a GEP with multiple indices into multiple GEPs with a single index.
/// Function splitGEP already split the original GEP into a variadic part and
/// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
@@ -351,6 +383,7 @@ private:
/// \p AccumulativeByteOffset The constant offset.
void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
int64_t AccumulativeByteOffset);
+
/// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
/// Function splitGEP already split the original GEP into a variadic part and
/// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
@@ -360,12 +393,14 @@ private:
/// \p AccumulativeByteOffset The constant offset.
void lowerToArithmetics(GetElementPtrInst *Variadic,
int64_t AccumulativeByteOffset);
+
/// Finds the constant offset within each index and accumulates them. If
/// LowerGEP is true, it finds in indices of both sequential and structure
/// types, otherwise it only finds in sequential indices. The output
/// NeedsExtraction indicates whether we successfully find a non-zero constant
/// offset.
int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
+
/// Canonicalize array indices to pointer-size integers. This helps to
/// simplify the logic of splitting a GEP. For example, if a + b is a
/// pointer-size integer, we have
@@ -382,6 +417,7 @@ private:
///
/// Verified in @i32_add in split-gep.ll
bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
+
/// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
/// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
/// the constant offset. After extraction, it becomes desirable to reunion the
@@ -392,8 +428,10 @@ private:
/// => constant extraction &a[sext(i) + sext(j)] + 5
/// => reunion &a[sext(i +nsw j)] + 5
bool reuniteExts(Function &F);
+
/// A helper that reunites sexts in an instruction.
bool reuniteExts(Instruction *I);
+
/// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
Instruction *findClosestMatchingDominator(const SCEV *Key,
Instruction *Dominatee);
@@ -401,27 +439,33 @@ private:
void verifyNoDeadCode(Function &F);
bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
+
  // Swap the index operands of two GEPs.
void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
+
  // Check if it is safe to swap the operands of two GEPs.
bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
Loop *CurLoop);
- const DataLayout *DL;
- DominatorTree *DT;
+ const DataLayout *DL = nullptr;
+ DominatorTree *DT = nullptr;
ScalarEvolution *SE;
const TargetMachine *TM;
LoopInfo *LI;
TargetLibraryInfo *TLI;
+
/// Whether to lower a GEP with multiple indices into arithmetic operations or
/// multiple GEPs with a single index.
bool LowerGEP;
+
DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs;
};
-} // anonymous namespace
+
+} // end anonymous namespace
char SeparateConstOffsetFromGEP::ID = 0;
+
INITIALIZE_PASS_BEGIN(
SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
"Split GEPs to a variadic base and a constant offset for better CSE", false,
diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index aaab5857e0f1..3d0fca0bc3a5 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
@@ -28,6 +29,7 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
@@ -36,11 +38,15 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GenericDomTree.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <iterator>
+#include <numeric>
#include <utility>
#define DEBUG_TYPE "simple-loop-unswitch"
@@ -51,6 +57,15 @@ STATISTIC(NumBranches, "Number of branches unswitched");
STATISTIC(NumSwitches, "Number of switches unswitched");
STATISTIC(NumTrivial, "Number of unswitches that are trivial");
+static cl::opt<bool> EnableNonTrivialUnswitch(
+ "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
+ cl::desc("Forcibly enables non-trivial loop unswitching rather than "
+ "following the configuration passed into the pass."));
+
+static cl::opt<int>
+ UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
+ cl::desc("The cost threshold for unswitching a loop."));
+
static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,
Constant &Replacement) {
assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
@@ -68,24 +83,95 @@ static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,
}
}
-/// Update the dominator tree after removing one exiting predecessor of a loop
-/// exit block.
-static void updateLoopExitIDom(BasicBlock *LoopExitBB, Loop &L,
- DominatorTree &DT) {
- assert(pred_begin(LoopExitBB) != pred_end(LoopExitBB) &&
- "Cannot have empty predecessors of the loop exit block if we split "
- "off a block to unswitch!");
+/// Update the IDom for a basic block whose predecessor set has changed.
+///
+/// This routine is designed to work when the domtree update is relatively
+/// localized by leveraging a known common dominator, often a loop header.
+///
+/// FIXME: Should consider hand-rolling a slightly more efficient non-DFS
+/// approach here as we can do that easily by persisting the candidate IDom's
+/// dominating set between each predecessor.
+///
+/// FIXME: Longer term, many uses of this can be replaced by an incremental
+/// domtree update strategy that starts from a known dominating block and
+/// rebuilds that subtree.
+static bool updateIDomWithKnownCommonDominator(BasicBlock *BB,
+ BasicBlock *KnownDominatingBB,
+ DominatorTree &DT) {
+ assert(pred_begin(BB) != pred_end(BB) &&
+ "This routine does not handle unreachable blocks!");
+
+ BasicBlock *OrigIDom = DT[BB]->getIDom()->getBlock();
+
+ BasicBlock *IDom = *pred_begin(BB);
+ assert(DT.dominates(KnownDominatingBB, IDom) &&
+ "Bad known dominating block!");
- BasicBlock *IDom = *pred_begin(LoopExitBB);
// Walk all of the other predecessors finding the nearest common dominator
// until all predecessors are covered or we reach the loop header. The loop
// header necessarily dominates all loop exit blocks in loop simplified form
// so we can early-exit the moment we hit that block.
- for (auto PI = std::next(pred_begin(LoopExitBB)), PE = pred_end(LoopExitBB);
- PI != PE && IDom != L.getHeader(); ++PI)
+ for (auto PI = std::next(pred_begin(BB)), PE = pred_end(BB);
+ PI != PE && IDom != KnownDominatingBB; ++PI) {
+ assert(DT.dominates(KnownDominatingBB, *PI) &&
+ "Bad known dominating block!");
IDom = DT.findNearestCommonDominator(IDom, *PI);
+ }
+
+ if (IDom == OrigIDom)
+ return false;
+
+ DT.changeImmediateDominator(BB, IDom);
+ return true;
+}
+
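+// For illustration, the routine above is typically invoked with the loop
+// header as the known dominator, e.g. (sketch):
+//
+//   updateIDomWithKnownCommonDominator(LoopExitBB, L.getHeader(), DT);
+//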
+// Note that we don't currently use the IDFCalculator here for two reasons:
+// 1) It computes dominator tree levels for the entire function on each run
+// of 'compute'. While this isn't terrible, given that we expect to update
+// relatively small subtrees of the domtree, it isn't necessarily the right
+// tradeoff.
+// 2) The interface doesn't fit this usage well. It doesn't operate in an
+// append-only fashion, and it builds several sets that we don't need.
+//
+// FIXME: Neither of these issues are a big deal and could be addressed with
+// some amount of refactoring of IDFCalculator. That would allow us to share
+// the core logic here (which is solving the same core problem).
+static void appendDomFrontier(DomTreeNode *Node,
+ SmallSetVector<BasicBlock *, 4> &Worklist,
+ SmallVectorImpl<DomTreeNode *> &DomNodes,
+ SmallPtrSetImpl<BasicBlock *> &DomSet) {
+ assert(DomNodes.empty() && "Must start with no dominator nodes.");
+ assert(DomSet.empty() && "Must start with an empty dominator set.");
+
+ // First flatten this subtree into sequence of nodes by doing a pre-order
+ // walk.
+ DomNodes.push_back(Node);
+ // We intentionally re-evaluate the size as each node can add new children.
+ // Because this is a tree walk, this cannot add any duplicates.
+ for (int i = 0; i < (int)DomNodes.size(); ++i)
+ DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end());
+
+ // Now create a set of the basic blocks so we can quickly test for
+ // dominated successors. We could in theory use the DFS numbers of the
+ // dominator tree for this, but we want this to remain predictably fast
+ // even while we mutate the dominator tree in ways that would invalidate
+ // the DFS numbering.
+ for (DomTreeNode *InnerN : DomNodes)
+ DomSet.insert(InnerN->getBlock());
- DT.changeImmediateDominator(LoopExitBB, IDom);
+ // Now re-walk the nodes, appending every successor of every node that isn't
+ // in the set. Note that we don't append the node itself: it does not strictly
+ // dominate itself, so when it is a successor of a node in the set it would
+ // qualify for the dominance frontier; but the node passed in came *from* the
+ // worklist and so it has already been processed.
+ for (DomTreeNode *InnerN : DomNodes)
+ for (BasicBlock *SuccBB : successors(InnerN->getBlock()))
+ if (!DomSet.count(SuccBB))
+ Worklist.insert(SuccBB);
+
+ DomNodes.clear();
+ DomSet.clear();
}
/// Update the dominator tree after unswitching a particular former exit block.
@@ -127,58 +213,14 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH,
// dominator frontier to see if it additionally should move up the dominator
// tree. This lambda appends the dominator frontier for a node on the
// worklist.
- //
- // Note that we don't currently use the IDFCalculator here for two reasons:
- // 1) It computes dominator tree levels for the entire function on each run
- // of 'compute'. While this isn't terrible, given that we expect to update
- // relatively small subtrees of the domtree, it isn't necessarily the right
- // tradeoff.
- // 2) The interface doesn't fit this usage well. It doesn't operate in
- // append-only, and builds several sets that we don't need.
- //
- // FIXME: Neither of these issues are a big deal and could be addressed with
- // some amount of refactoring of IDFCalculator. That would allow us to share
- // the core logic here (which is solving the same core problem).
SmallSetVector<BasicBlock *, 4> Worklist;
+
+ // Scratch data structures reused by domfrontier finding.
SmallVector<DomTreeNode *, 4> DomNodes;
SmallPtrSet<BasicBlock *, 4> DomSet;
- auto AppendDomFrontier = [&](DomTreeNode *Node) {
- assert(DomNodes.empty() && "Must start with no dominator nodes.");
- assert(DomSet.empty() && "Must start with an empty dominator set.");
-
- // First flatten this subtree into sequence of nodes by doing a pre-order
- // walk.
- DomNodes.push_back(Node);
- // We intentionally re-evaluate the size as each node can add new children.
- // Because this is a tree walk, this cannot add any duplicates.
- for (int i = 0; i < (int)DomNodes.size(); ++i)
- DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end());
-
- // Now create a set of the basic blocks so we can quickly test for
- // dominated successors. We could in theory use the DFS numbers of the
- // dominator tree for this, but we want this to remain predictably fast
- // even while we mutate the dominator tree in ways that would invalidate
- // the DFS numbering.
- for (DomTreeNode *InnerN : DomNodes)
- DomSet.insert(InnerN->getBlock());
-
- // Now re-walk the nodes, appending every successor of every node that isn't
- // in the set. Note that we don't append the node itself, even though if it
- // is a successor it does not strictly dominate itself and thus it would be
- // part of the dominance frontier. The reason we don't append it is that
- // the node passed in came *from* the worklist and so it has already been
- // processed.
- for (DomTreeNode *InnerN : DomNodes)
- for (BasicBlock *SuccBB : successors(InnerN->getBlock()))
- if (!DomSet.count(SuccBB))
- Worklist.insert(SuccBB);
-
- DomNodes.clear();
- DomSet.clear();
- };
// Append the initial dom frontier nodes.
- AppendDomFrontier(UnswitchedNode);
+ appendDomFrontier(UnswitchedNode, Worklist, DomNodes, DomSet);
// Walk the worklist. We grow the list in the loop and so must recompute size.
for (int i = 0; i < (int)Worklist.size(); ++i) {
@@ -197,7 +239,7 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH,
DT.changeImmediateDominator(Node, OldPHNode);
// Now add this node's dominator frontier to the worklist as well.
- AppendDomFrontier(Node);
+ appendDomFrontier(Node, Worklist, DomNodes, DomSet);
}
}
@@ -395,7 +437,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
// one of the predecessors for the loop exit block and may need to update its
// idom.
if (UnswitchedBB != LoopExitBB)
- updateLoopExitIDom(LoopExitBB, L, DT);
+ updateIDomWithKnownCommonDominator(LoopExitBB, L.getHeader(), DT);
// Since this is an i1 condition we can also trivially replace uses of it
// within the loop with a constant.
@@ -540,7 +582,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
*ParentBB, *OldPH);
- updateLoopExitIDom(DefaultExitBB, L, DT);
+ updateIDomWithKnownCommonDominator(DefaultExitBB, L.getHeader(), DT);
DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
}
}
@@ -567,7 +609,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
*ParentBB, *OldPH);
- updateLoopExitIDom(ExitBB, L, DT);
+ updateIDomWithKnownCommonDominator(ExitBB, L.getHeader(), DT);
}
// Update the case pair to point to the split block.
CasePair.second = SplitExitBB;
@@ -708,15 +750,1172 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
return Changed;
}
+/// Build the cloned blocks for an unswitched copy of the given loop.
+///
+/// The cloned blocks are inserted before the loop preheader (`LoopPH`) and
+/// after the split block (`SplitBB`) that will be used to select between the
+/// cloned and original loop.
+///
+/// This routine handles cloning all of the necessary loop blocks and exit
+/// blocks including rewriting their instructions and the relevant PHI nodes.
+/// It skips loop and exit blocks that are not necessary based on the provided
+/// set. It also correctly creates the unconditional branch in the cloned
+/// unswitched parent block to only point at the unswitched successor.
+///
+/// This does not handle most of the necessary updates to `LoopInfo`. Only exit
+/// block splitting is correctly reflected in `LoopInfo`; essentially all of
+/// the cloned blocks (and their loops) are left without full `LoopInfo`
+/// updates. This also doesn't fully update `DominatorTree`: it adds the cloned
+/// blocks to it but doesn't create the cloned `DominatorTree` structure, and
+/// instead the caller must recompute an accurate DT. It *does* correctly
+/// update the `AssumptionCache` provided in `AC`.
+static BasicBlock *buildClonedLoopBlocks(
+ Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB,
+ ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB,
+ BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB,
+ const SmallPtrSetImpl<BasicBlock *> &SkippedLoopAndExitBlocks,
+ ValueToValueMapTy &VMap, AssumptionCache &AC, DominatorTree &DT,
+ LoopInfo &LI) {
+ SmallVector<BasicBlock *, 4> NewBlocks;
+ NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size());
+
+ // We will need to clone a bunch of blocks, wrap up the clone operation in
+ // a helper.
+ auto CloneBlock = [&](BasicBlock *OldBB) {
+ // Clone the basic block and insert it before the new preheader.
+ BasicBlock *NewBB = CloneBasicBlock(OldBB, VMap, ".us", OldBB->getParent());
+ NewBB->moveBefore(LoopPH);
+
+ // Record this block and the mapping.
+ NewBlocks.push_back(NewBB);
+ VMap[OldBB] = NewBB;
+
+ // Add the block to the domtree. We'll move it to the correct position
+ // below.
+ DT.addNewBlock(NewBB, SplitBB);
+
+ return NewBB;
+ };
+
+ // First, clone the preheader.
+ auto *ClonedPH = CloneBlock(LoopPH);
+
+ // Then clone all the loop blocks, skipping the ones that aren't necessary.
+ for (auto *LoopBB : L.blocks())
+ if (!SkippedLoopAndExitBlocks.count(LoopBB))
+ CloneBlock(LoopBB);
+
+ // Split all the loop exit edges so that when we clone the exit blocks, if
+ // any of the exit blocks are *also* a preheader for some other loop, we
+ // don't create multiple predecessors entering the loop header.
+ for (auto *ExitBB : ExitBlocks) {
+ if (SkippedLoopAndExitBlocks.count(ExitBB))
+ continue;
+
+ // When we are going to clone an exit, we don't need to clone all the
+ // instructions in the exit block and we want to ensure we have an easy
+ // place to merge the CFG, so split the exit first. This is always safe to
+ // do because there cannot be any non-loop predecessors of a loop exit in
+ // loop simplified form.
+ auto *MergeBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
+
+ // Rearrange the names to make it easier to write test cases by having the
+ // exit block carry the suffix rather than the merge block carrying the
+ // suffix.
+ MergeBB->takeName(ExitBB);
+ ExitBB->setName(Twine(MergeBB->getName()) + ".split");
+
+ // Now clone the original exit block.
+ auto *ClonedExitBB = CloneBlock(ExitBB);
+ assert(ClonedExitBB->getTerminator()->getNumSuccessors() == 1 &&
+ "Exit block should have been split to have one successor!");
+ assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB &&
+ "Cloned exit block has the wrong successor!");
+
+ // Move the merge block's idom to be the split point as one exit is
+ // dominated by one header, and the other by another, so we know the split
+ // point dominates both. While the dominator tree isn't fully accurate, we
+ // want sub-trees within the original loop to correctly reflect
+ // dominance within that original loop (at least) and that requires moving
+ // the merge block out of that subtree.
+ // FIXME: This is very brittle as we essentially have a partial contract on
+ // the dominator tree. We really need to instead update it and keep it
+ // valid or stop relying on it.
+ DT.changeImmediateDominator(MergeBB, SplitBB);
+
+ // Remap any cloned instructions and create a merge phi node for them.
+ for (auto ZippedInsts : llvm::zip_first(
+ llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())),
+ llvm::make_range(ClonedExitBB->begin(),
+ std::prev(ClonedExitBB->end())))) {
+ Instruction &I = std::get<0>(ZippedInsts);
+ Instruction &ClonedI = std::get<1>(ZippedInsts);
+
+ // The only instructions in the exit block should be PHI nodes and
+ // potentially a landing pad.
+ assert(
+ (isa<PHINode>(I) || isa<LandingPadInst>(I) || isa<CatchPadInst>(I)) &&
+ "Bad instruction in exit block!");
+ // We should have a value map between the instruction and its clone.
+ assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!");
+
+ auto *MergePN =
+ PHINode::Create(I.getType(), /*NumReservedValues*/ 2, ".us-phi",
+ &*MergeBB->getFirstInsertionPt());
+ I.replaceAllUsesWith(MergePN);
+ MergePN->addIncoming(&I, ExitBB);
+ MergePN->addIncoming(&ClonedI, ClonedExitBB);
+ }
+ }
+
+ // Rewrite the instructions in the cloned blocks to refer to the cloned
+ // instructions rather than the originals. We have to do this as a second
+ // pass so that we have everything available. Also, we have inserted new
+ // instructions which may
+ // include assume intrinsics, so we update the assumption cache while
+ // processing this.
+ for (auto *ClonedBB : NewBlocks)
+ for (Instruction &I : *ClonedBB) {
+ RemapInstruction(&I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC.registerAssumption(II);
+ }
+
+ // Remove the cloned parent as a predecessor of the cloned continue successor
+ // if we did in fact clone it.
+ auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
+ if (auto *ClonedContinueSuccBB =
+ cast_or_null<BasicBlock>(VMap.lookup(ContinueSuccBB)))
+ ClonedContinueSuccBB->removePredecessor(ClonedParentBB,
+ /*DontDeleteUselessPHIs*/ true);
+ // Replace the cloned branch with an unconditional branch to the cloned
+ // unswitched successor.
+ auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
+ ClonedParentBB->getTerminator()->eraseFromParent();
+ BranchInst::Create(ClonedSuccBB, ClonedParentBB);
+
+ // Update any PHI nodes in the cloned successors of the skipped blocks to not
+ // have spurious incoming values.
+ for (auto *LoopBB : L.blocks())
+ if (SkippedLoopAndExitBlocks.count(LoopBB))
+ for (auto *SuccBB : successors(LoopBB))
+ if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)))
+ for (PHINode &PN : ClonedSuccBB->phis())
+ PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false);
+
+ return ClonedPH;
+}
+
+/// Recursively clone the specified loop and all of its children.
+///
+/// The target parent loop for the clone should be provided, or can be null if
+/// the clone is a top-level loop. While cloning, all the blocks are mapped
+/// with the provided value map. The entire original loop must be present in
+/// the value map. The cloned loop is returned.
+static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
+ const ValueToValueMapTy &VMap, LoopInfo &LI) {
+ auto AddClonedBlocksToLoop = [&](Loop &OrigL, Loop &ClonedL) {
+ assert(ClonedL.getBlocks().empty() && "Must start with an empty loop!");
+ ClonedL.reserveBlocks(OrigL.getNumBlocks());
+ for (auto *BB : OrigL.blocks()) {
+ auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB));
+ ClonedL.addBlockEntry(ClonedBB);
+ if (LI.getLoopFor(BB) == &OrigL) {
+ assert(!LI.getLoopFor(ClonedBB) &&
+ "Should not have an existing loop for this block!");
+ LI.changeLoopFor(ClonedBB, &ClonedL);
+ }
+ }
+ };
+
+ // We specially handle the first loop because it may get cloned into
+ // a different parent and because we most commonly are cloning leaf loops.
+ Loop *ClonedRootL = LI.AllocateLoop();
+ if (RootParentL)
+ RootParentL->addChildLoop(ClonedRootL);
+ else
+ LI.addTopLevelLoop(ClonedRootL);
+ AddClonedBlocksToLoop(OrigRootL, *ClonedRootL);
+
+ if (OrigRootL.empty())
+ return ClonedRootL;
+
+ // If we have a nest, we can quickly clone the entire loop nest using an
+ // iterative approach because it is a tree. We keep the cloned parent in the
+ // data structure to avoid repeatedly querying through a map to find it.
+ SmallVector<std::pair<Loop *, Loop *>, 16> LoopsToClone;
+ // Build up the loops to clone in reverse order as we'll clone them from the
+ // back.
+ for (Loop *ChildL : llvm::reverse(OrigRootL))
+ LoopsToClone.push_back({ClonedRootL, ChildL});
+ do {
+ Loop *ClonedParentL, *L;
+ std::tie(ClonedParentL, L) = LoopsToClone.pop_back_val();
+ Loop *ClonedL = LI.AllocateLoop();
+ ClonedParentL->addChildLoop(ClonedL);
+ AddClonedBlocksToLoop(*L, *ClonedL);
+ for (Loop *ChildL : llvm::reverse(*L))
+ LoopsToClone.push_back({ClonedL, ChildL});
+ } while (!LoopsToClone.empty());
+
+ return ClonedRootL;
+}
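+// For illustration (a sketch, with names from the caller below): cloning a
+// child nest into an already-cloned parent looks like
+//
+//   Loop *ClonedChild = cloneLoopNest(*ChildL, ClonedL, VMap, LI);
+//
+// while passing a null parent registers the clone as a top-level loop.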
+
+/// Build the cloned loops of an original loop from unswitching.
+///
+/// Because unswitching simplifies the CFG of the loop, this isn't a trivial
+/// operation. We need to re-verify that there even is a loop (as the backedge
+/// may not have been cloned), and even if there are remaining backedges the
+/// backedge set may be different. However, we know that each child loop is
+/// undisturbed, we only need to find where to place each child loop within
+/// either any parent loop or within a cloned version of the original loop.
+///
+/// Because child loops may end up cloned outside of any cloned version of the
+/// original loop, multiple cloned sibling loops may be created. All of them
+/// are returned so that the newly introduced loop nest roots can be
+/// identified.
+static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
+ const ValueToValueMapTy &VMap, LoopInfo &LI,
+ SmallVectorImpl<Loop *> &NonChildClonedLoops) {
+ Loop *ClonedL = nullptr;
+
+ auto *OrigPH = OrigL.getLoopPreheader();
+ auto *OrigHeader = OrigL.getHeader();
+
+ auto *ClonedPH = cast<BasicBlock>(VMap.lookup(OrigPH));
+ auto *ClonedHeader = cast<BasicBlock>(VMap.lookup(OrigHeader));
+
+ // We need to know the loops of the cloned exit blocks to even compute the
+ // accurate parent loop. If we only clone exits to some parent of the
+ // original parent, we want to clone into that outer loop. We also keep track
+ // of the loops that our cloned exit blocks participate in.
+ Loop *ParentL = nullptr;
+ SmallVector<BasicBlock *, 4> ClonedExitsInLoops;
+ SmallDenseMap<BasicBlock *, Loop *, 16> ExitLoopMap;
+ ClonedExitsInLoops.reserve(ExitBlocks.size());
+ for (auto *ExitBB : ExitBlocks)
+ if (auto *ClonedExitBB = cast_or_null<BasicBlock>(VMap.lookup(ExitBB)))
+ if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
+ ExitLoopMap[ClonedExitBB] = ExitL;
+ ClonedExitsInLoops.push_back(ClonedExitBB);
+ if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
+ ParentL = ExitL;
+ }
+ assert((!ParentL || ParentL == OrigL.getParentLoop() ||
+ ParentL->contains(OrigL.getParentLoop())) &&
+ "The computed parent loop should always contain (or be) the parent of "
+ "the original loop.");
+
+ // We build the set of blocks dominated by the cloned header from the set of
+ // cloned blocks out of the original loop. While not all of these will
+ // necessarily be in the cloned loop, it is enough to establish that they
+ // aren't in unreachable cycles, etc.
+ SmallSetVector<BasicBlock *, 16> ClonedLoopBlocks;
+ for (auto *BB : OrigL.blocks())
+ if (auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB)))
+ ClonedLoopBlocks.insert(ClonedBB);
+
+ // Rebuild the set of blocks that will end up in the cloned loop. We may have
+ // skipped cloning some region of this loop which can in turn skip some of
+ // the backedges so we have to rebuild the blocks in the loop based on the
+ // backedges that remain after cloning.
+ SmallVector<BasicBlock *, 16> Worklist;
+ SmallPtrSet<BasicBlock *, 16> BlocksInClonedLoop;
+ for (auto *Pred : predecessors(ClonedHeader)) {
+ // The only possible non-loop header predecessor is the preheader because
+ // we know we cloned the loop in simplified form.
+ if (Pred == ClonedPH)
+ continue;
+
+ // Because the loop was in simplified form, the only non-loop predecessor
+ // should be the preheader.
+ assert(ClonedLoopBlocks.count(Pred) && "Found a predecessor of the loop "
+ "header other than the preheader "
+ "that is not part of the loop!");
+
+ // Insert this block into the loop set and on the first visit (and if it
+ // isn't the header we're currently walking) put it into the worklist to
+ // recurse through.
+ if (BlocksInClonedLoop.insert(Pred).second && Pred != ClonedHeader)
+ Worklist.push_back(Pred);
+ }
+
+ // If we had any backedges then there *is* a cloned loop. Put the header into
+ // the loop set and then walk the worklist backwards to find all the blocks
+ // that remain within the loop after cloning.
+ if (!BlocksInClonedLoop.empty()) {
+ BlocksInClonedLoop.insert(ClonedHeader);
+
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.pop_back_val();
+ assert(BlocksInClonedLoop.count(BB) &&
+ "Didn't put block into the loop set!");
+
+ // Insert any predecessors that are in the possible set into the cloned
+ // set, and if the insert is successful, add them to the worklist. Note
+ // that we filter on the blocks that are definitely reachable via the
+ // backedge to the loop header so we may prune out dead code within the
+ // cloned loop.
+ for (auto *Pred : predecessors(BB))
+ if (ClonedLoopBlocks.count(Pred) &&
+ BlocksInClonedLoop.insert(Pred).second)
+ Worklist.push_back(Pred);
+ }
+
+ ClonedL = LI.AllocateLoop();
+ if (ParentL) {
+ ParentL->addBasicBlockToLoop(ClonedPH, LI);
+ ParentL->addChildLoop(ClonedL);
+ } else {
+ LI.addTopLevelLoop(ClonedL);
+ }
+
+ ClonedL->reserveBlocks(BlocksInClonedLoop.size());
+ // We don't want to just add the cloned loop blocks based on how we
+ // discovered them. The original order of blocks was carefully built in
+ // a way that doesn't rely on predecessor ordering. Rather than re-invent
+ // that logic, we just re-walk the original blocks (and those of the child
+ // loops) and filter them as we add them into the cloned loop.
+ for (auto *BB : OrigL.blocks()) {
+ auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB));
+ if (!ClonedBB || !BlocksInClonedLoop.count(ClonedBB))
+ continue;
+
+ // Directly add the blocks that are only in this loop.
+ if (LI.getLoopFor(BB) == &OrigL) {
+ ClonedL->addBasicBlockToLoop(ClonedBB, LI);
+ continue;
+ }
+
+ // We want to manually add it to this loop and parents.
+ // Registering it with LoopInfo will happen when we clone the top
+ // loop for this block.
+ for (Loop *PL = ClonedL; PL; PL = PL->getParentLoop())
+ PL->addBlockEntry(ClonedBB);
+ }
+
+ // Now add each child loop whose header remains within the cloned loop. All
+ // of the blocks within the loop must satisfy the same constraints as the
+ // header so once we pass the header checks we can just clone the entire
+ // child loop nest.
+ for (Loop *ChildL : OrigL) {
+ auto *ClonedChildHeader =
+ cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
+ if (!ClonedChildHeader || !BlocksInClonedLoop.count(ClonedChildHeader))
+ continue;
+
+#ifndef NDEBUG
+ // We should never have a cloned child loop header but fail to have
+ // all of the blocks for that child loop.
+ for (auto *ChildLoopBB : ChildL->blocks())
+ assert(BlocksInClonedLoop.count(
+ cast<BasicBlock>(VMap.lookup(ChildLoopBB))) &&
+ "Child cloned loop has a header within the cloned outer "
+ "loop but not all of its blocks!");
+#endif
+
+ cloneLoopNest(*ChildL, ClonedL, VMap, LI);
+ }
+ }
+
+ // Now that we've handled all the components of the original loop that were
+ // cloned into a new loop, we still need to handle anything from the original
+ // loop that wasn't in a cloned loop.
+
+ // Figure out what blocks are left to place within any loop nest containing
+ // the unswitched loop. If we never formed a loop, the cloned PH is one of
+ // them.
+ SmallPtrSet<BasicBlock *, 16> UnloopedBlockSet;
+ if (BlocksInClonedLoop.empty())
+ UnloopedBlockSet.insert(ClonedPH);
+ for (auto *ClonedBB : ClonedLoopBlocks)
+ if (!BlocksInClonedLoop.count(ClonedBB))
+ UnloopedBlockSet.insert(ClonedBB);
+
+ // Copy the cloned exits and sort them in ascending loop depth; we'll work
+ // backwards across these to process them inside out. The order shouldn't
+ // matter as we're just trying to build up the map from inside-out; we use
+ // the map in a more stably ordered way below.
+ auto OrderedClonedExitsInLoops = ClonedExitsInLoops;
+ std::sort(OrderedClonedExitsInLoops.begin(), OrderedClonedExitsInLoops.end(),
+ [&](BasicBlock *LHS, BasicBlock *RHS) {
+ return ExitLoopMap.lookup(LHS)->getLoopDepth() <
+ ExitLoopMap.lookup(RHS)->getLoopDepth();
+ });
+
+ // Populate the existing ExitLoopMap with everything reachable from each
+ // exit, starting from the innermost exit.
+ while (!UnloopedBlockSet.empty() && !OrderedClonedExitsInLoops.empty()) {
+ assert(Worklist.empty() && "Didn't clear worklist!");
+
+ BasicBlock *ExitBB = OrderedClonedExitsInLoops.pop_back_val();
+ Loop *ExitL = ExitLoopMap.lookup(ExitBB);
+
+ // Walk the CFG back until we hit the cloned PH adding everything reachable
+ // and in the unlooped set to this exit block's loop.
+ Worklist.push_back(ExitBB);
+ do {
+ BasicBlock *BB = Worklist.pop_back_val();
+ // We can stop recursing at the cloned preheader (if we get there).
+ if (BB == ClonedPH)
+ continue;
+
+ for (BasicBlock *PredBB : predecessors(BB)) {
+ // If this pred has already been moved to our set or is part of some
+ // (inner) loop, no update needed.
+ if (!UnloopedBlockSet.erase(PredBB)) {
+ assert(
+ (BlocksInClonedLoop.count(PredBB) || ExitLoopMap.count(PredBB)) &&
+ "Predecessor not mapped to a loop!");
+ continue;
+ }
+
+ // We just insert into the loop set here. We'll add these blocks to the
+ // exit loop after we build up the set in an order that doesn't rely on
+ // predecessor order (which in turn relies on use list order).
+ bool Inserted = ExitLoopMap.insert({PredBB, ExitL}).second;
+ (void)Inserted;
+ assert(Inserted && "Should only visit an unlooped block once!");
+
+ // And recurse through to its predecessors.
+ Worklist.push_back(PredBB);
+ }
+ } while (!Worklist.empty());
+ }
+
+ // Now that the ExitLoopMap gives us a mapping for all the non-looping cloned
+ // blocks to their outer loops, walk the cloned blocks and the cloned exits
+ // in their original order adding them to the correct loop.
+
+ // We need a stable insertion order. We use the order of the original loop
+ // order and map into the correct parent loop.
+ for (auto *BB : llvm::concat<BasicBlock *const>(
+ makeArrayRef(ClonedPH), ClonedLoopBlocks, ClonedExitsInLoops))
+ if (Loop *OuterL = ExitLoopMap.lookup(BB))
+ OuterL->addBasicBlockToLoop(BB, LI);
+
+#ifndef NDEBUG
+ for (auto &BBAndL : ExitLoopMap) {
+ auto *BB = BBAndL.first;
+ auto *OuterL = BBAndL.second;
+ assert(LI.getLoopFor(BB) == OuterL &&
+ "Failed to put all blocks into outer loops!");
+ }
+#endif
+
+ // Now that all the blocks are placed into the correct containing loop in the
+ // absence of child loops, find all the potentially cloned child loops and
+ // clone them into whatever outer loop we placed their header into.
+ for (Loop *ChildL : OrigL) {
+ auto *ClonedChildHeader =
+ cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
+ if (!ClonedChildHeader || BlocksInClonedLoop.count(ClonedChildHeader))
+ continue;
+
+#ifndef NDEBUG
+ for (auto *ChildLoopBB : ChildL->blocks())
+ assert(VMap.count(ChildLoopBB) &&
+ "Cloned a child loop header but not all of that loops blocks!");
+#endif
+
+ NonChildClonedLoops.push_back(cloneLoopNest(
+ *ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI));
+ }
+
+ // Return the main cloned loop if any.
+ return ClonedL;
+}
+
+static void deleteDeadBlocksFromLoop(Loop &L, BasicBlock *DeadSubtreeRoot,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ DominatorTree &DT, LoopInfo &LI) {
+ // Walk the dominator tree to build up the set of blocks we will delete here.
+ // The order is designed to allow us to always delete bottom-up and avoid any
+ // dangling uses.
+ SmallSetVector<BasicBlock *, 16> DeadBlocks;
+ DeadBlocks.insert(DeadSubtreeRoot);
+ for (int i = 0; i < (int)DeadBlocks.size(); ++i)
+ for (DomTreeNode *ChildN : *DT[DeadBlocks[i]]) {
+ // FIXME: This assert should pass and that means we don't change nearly
+ // as much below! Consider rewriting all of this to avoid deleting
+ // blocks. They are always cloned before being deleted, and so instead
+ // could just be moved.
+ // FIXME: This in turn means that we might actually be more able to
+ // update the domtree.
+ assert((L.contains(ChildN->getBlock()) ||
+ llvm::find(ExitBlocks, ChildN->getBlock()) != ExitBlocks.end()) &&
+ "Should never reach beyond the loop and exits when deleting!");
+ DeadBlocks.insert(ChildN->getBlock());
+ }
+
+ // Filter out the dead blocks from the exit blocks list so that it can be
+ // used in the caller.
+ llvm::erase_if(ExitBlocks,
+ [&](BasicBlock *BB) { return DeadBlocks.count(BB); });
+
+ // Remove these blocks from their successors.
+ for (auto *BB : DeadBlocks)
+ for (BasicBlock *SuccBB : successors(BB))
+ SuccBB->removePredecessor(BB, /*DontDeleteUselessPHIs*/ true);
+
+ // Walk from this loop up through its parents removing all of the dead blocks.
+ for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) {
+ for (auto *BB : DeadBlocks)
+ ParentL->getBlocksSet().erase(BB);
+ llvm::erase_if(ParentL->getBlocksVector(),
+ [&](BasicBlock *BB) { return DeadBlocks.count(BB); });
+ }
+
+ // Now delete the dead child loops. This raw delete will clear them
+ // recursively.
+ llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) {
+ if (!DeadBlocks.count(ChildL->getHeader()))
+ return false;
+
+ assert(llvm::all_of(ChildL->blocks(),
+ [&](BasicBlock *ChildBB) {
+ return DeadBlocks.count(ChildBB);
+ }) &&
+ "If the child loop header is dead all blocks in the child loop must "
+ "be dead as well!");
+ LI.destroy(ChildL);
+ return true;
+ });
+
+ // Remove the mappings for the dead blocks.
+ for (auto *BB : DeadBlocks)
+ LI.changeLoopFor(BB, nullptr);
+
+ // Drop all the references from these blocks to others to handle cyclic
+ // references as we start deleting the blocks themselves.
+ for (auto *BB : DeadBlocks)
+ BB->dropAllReferences();
+
+ for (auto *BB : llvm::reverse(DeadBlocks)) {
+ DT.eraseNode(BB);
+ BB->eraseFromParent();
+ }
+}
+
+/// Recompute the set of blocks in a loop after unswitching.
+///
+/// This walks from the original header's predecessors to rebuild the loop. We
+/// take advantage of the fact that new blocks can't have been added, and so we
+/// filter by the original loop's blocks. This also handles potentially
+/// unreachable code that we don't want to explore but might be found examining
+/// the predecessors of the header.
+///
+/// If the original loop is no longer a loop, this will return an empty set. If
+/// it remains a loop, all the blocks within it will be added to the set
+/// (including those blocks in inner loops).
+static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
+ LoopInfo &LI) {
+ SmallPtrSet<const BasicBlock *, 16> LoopBlockSet;
+
+ auto *PH = L.getLoopPreheader();
+ auto *Header = L.getHeader();
+
+ // A worklist to use while walking backwards from the header.
+ SmallVector<BasicBlock *, 16> Worklist;
+
+ // First walk the predecessors of the header to find the backedges. This will
+ // form the basis of our walk.
+ for (auto *Pred : predecessors(Header)) {
+ // Skip the preheader.
+ if (Pred == PH)
+ continue;
+
+ // Because the loop was in simplified form, the only non-loop predecessor
+ // is the preheader.
+ assert(L.contains(Pred) && "Found a predecessor of the loop header other "
+ "than the preheader that is not part of the "
+ "loop!");
+
+ // Insert this block into the loop set on the first visit and, if it
+ // isn't the header we're currently walking, put it into the worklist to
+ // recurse through.
+ if (LoopBlockSet.insert(Pred).second && Pred != Header)
+ Worklist.push_back(Pred);
+ }
+
+ // If no backedges were found, we're done.
+ if (LoopBlockSet.empty())
+ return LoopBlockSet;
+
+ // Add the loop header to the set.
+ LoopBlockSet.insert(Header);
+
+ // We found backedges, recurse through them to identify the loop blocks.
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.pop_back_val();
+ assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!");
+
+ // Because we know the inner loop structure remains valid we can use the
+ // loop structure to jump immediately across the entire nested loop.
+ // Further, because it is in loop simplified form, we can directly jump
+ // to its preheader afterward.
+ if (Loop *InnerL = LI.getLoopFor(BB))
+ if (InnerL != &L) {
+ assert(L.contains(InnerL) &&
+ "Should not reach a loop *outside* this loop!");
+ // The preheader is the only possible predecessor of the loop so
+ // insert it into the set and check whether it was already handled.
+ auto *InnerPH = InnerL->getLoopPreheader();
+ assert(L.contains(InnerPH) && "Cannot contain an inner loop block "
+ "but not contain the inner loop "
+ "preheader!");
+ if (!LoopBlockSet.insert(InnerPH).second)
+ // The only way to reach the preheader is through the loop body
+ // itself so if it has been visited the loop is already handled.
+ continue;
+
+ // Insert all of the blocks (other than those already present) into
+ // the loop set. The only block we expect to already be in the set is
+ // the one we used to find this loop as we immediately handle the
+ // others the first time we encounter the loop.
+ for (auto *InnerBB : InnerL->blocks()) {
+ if (InnerBB == BB) {
+ assert(LoopBlockSet.count(InnerBB) &&
+ "Block should already be in the set!");
+ continue;
+ }
+
+ bool Inserted = LoopBlockSet.insert(InnerBB).second;
+ (void)Inserted;
+ assert(Inserted && "Should only insert an inner loop once!");
+ }
+
+ // Add the preheader to the worklist so we will continue past the
+ // loop body.
+ Worklist.push_back(InnerPH);
+ continue;
+ }
+
+ // Insert any predecessors that were in the original loop into the new
+ // set, and if the insert is successful, add them to the worklist.
+ for (auto *Pred : predecessors(BB))
+ if (L.contains(Pred) && LoopBlockSet.insert(Pred).second)
+ Worklist.push_back(Pred);
+ }
+
+ // We've found all the blocks participating in the loop; return our completed
+ // set.
+ return LoopBlockSet;
+}
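+// A sketch of the intended use (mirroring rebuildLoopAfterUnswitch below): an
+// empty result means no backedge survived and the loop must be torn down.
+//
+//   auto LoopBlockSet = recomputeLoopBlockSet(L, LI);
+//   if (LoopBlockSet.empty())
+//     /* remove L from LoopInfo rather than updating it */;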
+
+/// Rebuild a loop after unswitching removes some subset of blocks and edges.
+///
+/// The removal may have removed some child loops entirely but cannot have
+/// disturbed any remaining child loops. However, they may need to be hoisted
+/// to the parent loop (or to be top-level loops). The original loop may be
+/// completely removed.
+///
+/// The sibling loops resulting from this update are returned. If the original
+/// loop remains a valid loop, it will be the first entry in this list with all
+/// of the newly created sibling loops following it.
+///
+/// Returns true if the loop remains a loop after unswitching, and false if it
+/// is no longer a loop after unswitching (and should not continue to be
+/// referenced).
+static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
+ LoopInfo &LI,
+ SmallVectorImpl<Loop *> &HoistedLoops) {
+ auto *PH = L.getLoopPreheader();
+
+ // Compute the actual parent loop from the exit blocks. Because we may have
+ // pruned some exits, this may differ from the original parent loop.
+ Loop *ParentL = nullptr;
+ SmallVector<Loop *, 4> ExitLoops;
+ SmallVector<BasicBlock *, 4> ExitsInLoops;
+ ExitsInLoops.reserve(ExitBlocks.size());
+ for (auto *ExitBB : ExitBlocks)
+ if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
+ ExitLoops.push_back(ExitL);
+ ExitsInLoops.push_back(ExitBB);
+ if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
+ ParentL = ExitL;
+ }
+
+ // Recompute the blocks participating in this loop. This may be empty if it
+ // is no longer a loop.
+ auto LoopBlockSet = recomputeLoopBlockSet(L, LI);
+
+ // If we still have a loop, we need to re-set the loop's parent as the exit
+ // block set changing may have moved it within the loop nest. Note that this
+ // can only happen when this loop has a parent as it can only hoist the loop
+ // *up* the nest.
+ if (!LoopBlockSet.empty() && L.getParentLoop() != ParentL) {
+ // Remove this loop's (original) blocks from all of the intervening loops.
+ for (Loop *IL = L.getParentLoop(); IL != ParentL;
+ IL = IL->getParentLoop()) {
+ IL->getBlocksSet().erase(PH);
+ for (auto *BB : L.blocks())
+ IL->getBlocksSet().erase(BB);
+ llvm::erase_if(IL->getBlocksVector(), [&](BasicBlock *BB) {
+ return BB == PH || L.contains(BB);
+ });
+ }
+
+ LI.changeLoopFor(PH, ParentL);
+ L.getParentLoop()->removeChildLoop(&L);
+ if (ParentL)
+ ParentL->addChildLoop(&L);
+ else
+ LI.addTopLevelLoop(&L);
+ }
+
+ // Now we update all the blocks which are no longer within the loop.
+ auto &Blocks = L.getBlocksVector();
+ auto BlocksSplitI =
+ LoopBlockSet.empty()
+ ? Blocks.begin()
+ : std::stable_partition(
+ Blocks.begin(), Blocks.end(),
+ [&](BasicBlock *BB) { return LoopBlockSet.count(BB); });
+
+ // Before we erase the list of unlooped blocks, build a set of them.
+ SmallPtrSet<BasicBlock *, 16> UnloopedBlocks(BlocksSplitI, Blocks.end());
+ if (LoopBlockSet.empty())
+ UnloopedBlocks.insert(PH);
+
+ // Now erase these blocks from the loop.
+ for (auto *BB : make_range(BlocksSplitI, Blocks.end()))
+ L.getBlocksSet().erase(BB);
+ Blocks.erase(BlocksSplitI, Blocks.end());
+
+ // Sort the exits in ascending loop depth; we'll work backwards across these
+ // to process them inside out.
+ std::stable_sort(ExitsInLoops.begin(), ExitsInLoops.end(),
+ [&](BasicBlock *LHS, BasicBlock *RHS) {
+ return LI.getLoopDepth(LHS) < LI.getLoopDepth(RHS);
+ });
+
+ // We'll build up a set for each exit loop.
+ SmallPtrSet<BasicBlock *, 16> NewExitLoopBlocks;
+ Loop *PrevExitL = L.getParentLoop(); // The deepest possible exit loop.
+
+ auto RemoveUnloopedBlocksFromLoop =
+ [](Loop &L, SmallPtrSetImpl<BasicBlock *> &UnloopedBlocks) {
+ for (auto *BB : UnloopedBlocks)
+ L.getBlocksSet().erase(BB);
+ llvm::erase_if(L.getBlocksVector(), [&](BasicBlock *BB) {
+ return UnloopedBlocks.count(BB);
+ });
+ };
+
+ SmallVector<BasicBlock *, 16> Worklist;
+ while (!UnloopedBlocks.empty() && !ExitsInLoops.empty()) {
+ assert(Worklist.empty() && "Didn't clear worklist!");
+ assert(NewExitLoopBlocks.empty() && "Didn't clear loop set!");
+
+ // Grab the next exit block, in decreasing loop depth order.
+ BasicBlock *ExitBB = ExitsInLoops.pop_back_val();
+ Loop &ExitL = *LI.getLoopFor(ExitBB);
+ assert(ExitL.contains(&L) && "Exit loop must contain the inner loop!");
+
+ // Erase all of the unlooped blocks from the loops between the previous
+ // exit loop and this exit loop. This works because the ExitsInLoops list is
+ // sorted in increasing order of loop depth and thus we visit loops in
+ // decreasing order of loop depth.
+ for (; PrevExitL != &ExitL; PrevExitL = PrevExitL->getParentLoop())
+ RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
+
+ // Walk the CFG back until we hit the preheader, adding everything reachable
+ // and in the unlooped set to this exit block's loop.
+ Worklist.push_back(ExitBB);
+ do {
+ BasicBlock *BB = Worklist.pop_back_val();
+ // We can stop recursing at the preheader (if we get there).
+ if (BB == PH)
+ continue;
+
+ for (BasicBlock *PredBB : predecessors(BB)) {
+ // If this pred has already been moved to our set or is part of some
+ // (inner) loop, no update needed.
+ if (!UnloopedBlocks.erase(PredBB)) {
+ assert((NewExitLoopBlocks.count(PredBB) ||
+ ExitL.contains(LI.getLoopFor(PredBB))) &&
+ "Predecessor not in a nested loop (or already visited)!");
+ continue;
+ }
+
+ // We just insert into the loop set here. We'll add these blocks to the
+ // exit loop after we build up the set in a deterministic order rather
+ // than the predecessor-influenced visit order.
+ bool Inserted = NewExitLoopBlocks.insert(PredBB).second;
+ (void)Inserted;
+ assert(Inserted && "Should only visit an unlooped block once!");
+
+ // And recurse through to its predecessors.
+ Worklist.push_back(PredBB);
+ }
+ } while (!Worklist.empty());
+
+ // If blocks in this exit loop were directly part of the original loop (as
+ // opposed to a child loop) update the map to point to this exit loop. This
+ // just updates a map and so the fact that the order is unstable is fine.
+ for (auto *BB : NewExitLoopBlocks)
+ if (Loop *BBL = LI.getLoopFor(BB))
+ if (BBL == &L || !L.contains(BBL))
+ LI.changeLoopFor(BB, &ExitL);
+
+ // We will remove the remaining unlooped blocks from this loop in the next
+ // iteration or below.
+ NewExitLoopBlocks.clear();
+ }
+
+ // Any remaining unlooped blocks are no longer part of any loop unless they
+ // are part of some child loop.
+ for (; PrevExitL; PrevExitL = PrevExitL->getParentLoop())
+ RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
+ for (auto *BB : UnloopedBlocks)
+ if (Loop *BBL = LI.getLoopFor(BB))
+ if (BBL == &L || !L.contains(BBL))
+ LI.changeLoopFor(BB, nullptr);
+
+ // Sink all the child loops whose headers are no longer in the loop set to
+ // the parent (or to be top level loops). We reach into the loop and directly
+ // update its subloop vector to make this batch update efficient.
+ auto &SubLoops = L.getSubLoopsVector();
+ auto SubLoopsSplitI =
+ LoopBlockSet.empty()
+ ? SubLoops.begin()
+ : std::stable_partition(
+ SubLoops.begin(), SubLoops.end(), [&](Loop *SubL) {
+ return LoopBlockSet.count(SubL->getHeader());
+ });
+ for (auto *HoistedL : make_range(SubLoopsSplitI, SubLoops.end())) {
+ HoistedLoops.push_back(HoistedL);
+ HoistedL->setParentLoop(nullptr);
+
+ // To compute the new parent of this hoisted loop we look at where we
+ // placed the preheader above. We can't look up the header itself because we
+ // retained the mapping from the header to the hoisted loop. But the
+ // preheader and header should have the exact same new parent computed
+ // based on the set of exit blocks from the original loop as the preheader
+ // is a predecessor of the header and so reached in the reverse walk. And
+ // because the loops were all in simplified form the preheader of the
+ // hoisted loop can't be part of some *other* loop.
+ if (auto *NewParentL = LI.getLoopFor(HoistedL->getLoopPreheader()))
+ NewParentL->addChildLoop(HoistedL);
+ else
+ LI.addTopLevelLoop(HoistedL);
+ }
+ SubLoops.erase(SubLoopsSplitI, SubLoops.end());
+
+ // Actually delete the loop if nothing remained within it.
+ if (Blocks.empty()) {
+ assert(SubLoops.empty() &&
+ "Failed to remove all subloops from the original loop!");
+ if (Loop *ParentL = L.getParentLoop())
+ ParentL->removeChildLoop(llvm::find(*ParentL, &L));
+ else
+ LI.removeLoop(llvm::find(LI, &L));
+ LI.destroy(&L);
+ return false;
+ }
+
+ return true;
+}
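+// Usage sketch (as in unswitchInvariantBranch below):
+//
+//   SmallVector<Loop *, 4> HoistedLoops;
+//   bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops);
+//   // If !IsStillLoop, `L` has been destroyed and must not be referenced.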
+
+/// Helper to visit a dominator subtree, invoking a callable on each node.
+///
+/// Returning false at any point will stop walking past that node of the tree.
+template <typename CallableT>
+void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
+ SmallVector<DomTreeNode *, 4> DomWorklist;
+ DomWorklist.push_back(DT[BB]);
+#ifndef NDEBUG
+ SmallPtrSet<DomTreeNode *, 4> Visited;
+ Visited.insert(DT[BB]);
+#endif
+ do {
+ DomTreeNode *N = DomWorklist.pop_back_val();
+
+ // Visit this node.
+ if (!Callable(N->getBlock()))
+ continue;
+
+ // Accumulate the child nodes.
+ for (DomTreeNode *ChildN : *N) {
+ assert(Visited.insert(ChildN).second &&
+ "Cannot visit a node twice when walking a tree!");
+ DomWorklist.push_back(ChildN);
+ }
+ } while (!DomWorklist.empty());
+}
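+// For example (a sketch with a hypothetical `Root` block), collecting every
+// block dominated by `Root`:
+//
+//   SmallPtrSet<BasicBlock *, 8> Dominated;
+//   visitDomSubTree(DT, Root, [&](BasicBlock *BB) {
+//     Dominated.insert(BB);
+//     return true; // Returning false would prune the walk below BB.
+//   });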
+
+/// Take an invariant branch that has been determined to be safe and worthwhile
+/// to unswitch despite being non-trivial to do so and perform the unswitch.
+///
+/// This directly updates the CFG to hoist the predicate out of the loop, and
+/// clone the necessary parts of the loop to maintain behavior.
+///
+/// It also updates both the dominator tree and LoopInfo based on the
+/// unswitching.
+///
+/// Once unswitching has been performed it runs the provided callback to report
+/// the new loops and no-longer valid loops to the caller.
+static bool unswitchInvariantBranch(
+ Loop &L, BranchInst &BI, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC,
+ function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
+ assert(BI.isConditional() && "Can only unswitch a conditional branch!");
+ assert(L.isLoopInvariant(BI.getCondition()) &&
+ "Can only unswitch an invariant branch condition!");
+
+ // A constant index plus BBs tracking the cloned and continuing successors.
+ const int ClonedSucc = 0;
+ auto *ParentBB = BI.getParent();
+ auto *UnswitchedSuccBB = BI.getSuccessor(ClonedSucc);
+ auto *ContinueSuccBB = BI.getSuccessor(1 - ClonedSucc);
+
+ assert(UnswitchedSuccBB != ContinueSuccBB &&
+ "Should not unswitch a branch that always goes to the same place!");
+
+ // The branch should be in this exact loop. Any inner loop's invariant branch
+ // should be handled by unswitching that inner loop. The caller of this
+ // routine should filter out any candidates that remain (but were skipped for
+ // whatever reason).
+ assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!");
+
+ SmallVector<BasicBlock *, 4> ExitBlocks;
+ L.getUniqueExitBlocks(ExitBlocks);
+
+ // We cannot unswitch if exit blocks contain a cleanuppad instruction as we
+ // don't know how to split those exit blocks.
+ // FIXME: We should teach SplitBlock to handle this and remove this
+ // restriction.
+ for (auto *ExitBB : ExitBlocks)
+ if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI()))
+ return false;
+
+ SmallPtrSet<BasicBlock *, 4> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+
+ // Compute the parent loop now before we start hacking on things.
+ Loop *ParentL = L.getParentLoop();
+
+ // Compute the outer-most loop containing one of our exit blocks. This is the
+ // furthest up our loop nest which can be mutated, which we will use below to
+ // update things.
+ Loop *OuterExitL = &L;
+ for (auto *ExitBB : ExitBlocks) {
+ Loop *NewOuterExitL = LI.getLoopFor(ExitBB);
+ if (!NewOuterExitL) {
+ // We exited the entire nest with this block, so we're done.
+ OuterExitL = nullptr;
+ break;
+ }
+ if (NewOuterExitL != OuterExitL && NewOuterExitL->contains(OuterExitL))
+ OuterExitL = NewOuterExitL;
+ }
+
+ // If the edge we *aren't* cloning in the unswitch (the continuing edge)
+ // dominates its target, we can skip cloning the dominated region of the loop
+ // and its exits. We compute this as a set of nodes to be skipped.
+ SmallPtrSet<BasicBlock *, 4> SkippedLoopAndExitBlocks;
+ if (ContinueSuccBB->getUniquePredecessor() ||
+ llvm::all_of(predecessors(ContinueSuccBB), [&](BasicBlock *PredBB) {
+ return PredBB == ParentBB || DT.dominates(ContinueSuccBB, PredBB);
+ })) {
+ visitDomSubTree(DT, ContinueSuccBB, [&](BasicBlock *BB) {
+ SkippedLoopAndExitBlocks.insert(BB);
+ return true;
+ });
+ }
+ // Similarly, if the edge we *are* cloning in the unswitch (the unswitched
+ // edge) dominates its target, we will end up with dead nodes in the original
+ // loop and its exits that will need to be deleted. Here, we just record that
+ // the property holds and will compute the deleted set later.
+ bool DeleteUnswitchedSucc =
+ UnswitchedSuccBB->getUniquePredecessor() ||
+ llvm::all_of(predecessors(UnswitchedSuccBB), [&](BasicBlock *PredBB) {
+ return PredBB == ParentBB || DT.dominates(UnswitchedSuccBB, PredBB);
+ });
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the conditional branch. We will change the preheader to have a conditional
+ // branch on LoopCond. The original preheader will become the split point
+ // between the unswitched versions, and we will have a new preheader for the
+ // original loop.
+ BasicBlock *SplitBB = L.getLoopPreheader();
+ BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI);
+
+ // Keep a mapping for the cloned values.
+ ValueToValueMapTy VMap;
+
+ // Build the cloned blocks from the loop.
+ auto *ClonedPH = buildClonedLoopBlocks(
+ L, LoopPH, SplitBB, ExitBlocks, ParentBB, UnswitchedSuccBB,
+ ContinueSuccBB, SkippedLoopAndExitBlocks, VMap, AC, DT, LI);
+
+ // Build the cloned loop structure itself. This may be substantially
+ // different from the original structure due to the simplified CFG. This also
+ // handles inserting all the cloned blocks into the correct loops.
+ SmallVector<Loop *, 4> NonChildClonedLoops;
+ Loop *ClonedL =
+ buildClonedLoops(L, ExitBlocks, VMap, LI, NonChildClonedLoops);
+
+ // Remove the parent as a predecessor of the unswitched successor.
+ UnswitchedSuccBB->removePredecessor(ParentBB, /*DontDeleteUselessPHIs*/ true);
+
+ // Now splice the branch from the original loop and use it to select between
+ // the two loops.
+ SplitBB->getTerminator()->eraseFromParent();
+ SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), BI);
+ BI.setSuccessor(ClonedSucc, ClonedPH);
+ BI.setSuccessor(1 - ClonedSucc, LoopPH);
+
+ // Create a new unconditional branch to the continuing block (as opposed to
+ // the one cloned).
+ BranchInst::Create(ContinueSuccBB, ParentBB);
+
+ // Delete anything that was made dead in the original loop due to
+ // unswitching.
+ if (DeleteUnswitchedSucc)
+ deleteDeadBlocksFromLoop(L, UnswitchedSuccBB, ExitBlocks, DT, LI);
+
+ SmallVector<Loop *, 4> HoistedLoops;
+ bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops);
+
+ // This will have completely invalidated the dominator tree. We can't easily
+ // bound how much is invalid because in some cases we will refine the
+ // predecessor set of exit blocks of the loop which can move large unrelated
+ // regions of code into a new subtree.
+ //
+ // FIXME: Eventually, we should use an incremental update utility that
+ // leverages the existing information in the dominator tree (and potentially
+ // the nature of the change) to more efficiently update things.
+ DT.recalculate(*SplitBB->getParent());
+
+ // We can change which blocks are exit blocks of all the cloned sibling
+ // loops, the current loop, and any parent loops which shared exit blocks
+ // with the current loop. As a consequence, we need to re-form LCSSA for
+ // them. But we shouldn't need to re-form LCSSA for any child loops.
+ // FIXME: This could be made more efficient by tracking which exit blocks are
+ // new, and focusing on them, but that isn't likely to be necessary.
+ //
+ // In order to reasonably rebuild LCSSA we need to walk inside-out across the
+ // loop nest and update every loop that could have had its exits changed. We
+ // also need to cover any intervening loops. We add all of these loops to
+ // a list and sort them by loop depth to achieve this without updating
+ // unnecessary loops.
+ auto UpdateLCSSA = [&](Loop &UpdateL) {
+#ifndef NDEBUG
+ for (Loop *ChildL : UpdateL)
+ assert(ChildL->isRecursivelyLCSSAForm(DT, LI) &&
+ "Perturbed a child loop's LCSSA form!");
+#endif
+ formLCSSA(UpdateL, DT, &LI, nullptr);
+ };
+
+ // For non-child cloned loops and hoisted loops, we just need to update LCSSA
+ // and we can do it in any order as they don't nest relative to each other.
+ for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
+ UpdateLCSSA(*UpdatedL);
+
+ // If the original loop had exit blocks, walk up through the outermost loop
+ // of those exit blocks to update LCSSA and form updated dedicated exits.
+ if (OuterExitL != &L) {
+ SmallVector<Loop *, 4> OuterLoops;
+ // We start with the cloned loop and the current loop if they are loops and
+ // move toward OuterExitL. Also, if either the cloned loop or the current
+ // loop have become top level loops we need to walk all the way out.
+ if (ClonedL) {
+ OuterLoops.push_back(ClonedL);
+ if (!ClonedL->getParentLoop())
+ OuterExitL = nullptr;
+ }
+ if (IsStillLoop) {
+ OuterLoops.push_back(&L);
+ if (!L.getParentLoop())
+ OuterExitL = nullptr;
+ }
+ // Grab all of the enclosing loops now.
+ for (Loop *OuterL = ParentL; OuterL != OuterExitL;
+ OuterL = OuterL->getParentLoop())
+ OuterLoops.push_back(OuterL);
+
+ // Finally, update our list of outer loops. This is nicely ordered to work
+ // inside-out.
+ for (Loop *OuterL : OuterLoops) {
+ // First build LCSSA for this loop so that we can preserve it when
+ // forming dedicated exits. We don't want to perturb some other loop's
+ // LCSSA while doing that CFG edit.
+ UpdateLCSSA(*OuterL);
+
+ // For loops reached by this loop's original exit blocks we may
+ // have introduced new, non-dedicated exits. At least try to re-form dedicated
+ // exits for these loops. This may fail if they couldn't have dedicated
+ // exits to start with.
+ formDedicatedExitBlocks(OuterL, &DT, &LI, /*PreserveLCSSA*/ true);
+ }
+ }
+
+#ifndef NDEBUG
+ // Verify the entire loop structure to catch any incorrect updates before we
+ // progress in the pass pipeline.
+ LI.verify(DT);
+#endif
+
+ // Now that we've unswitched something, make callbacks to report the changes.
+ // For that we need to merge together the updated loops and the cloned loops
+ // and check whether the original loop survived.
+ SmallVector<Loop *, 4> SibLoops;
+ for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
+ if (UpdatedL->getParentLoop() == ParentL)
+ SibLoops.push_back(UpdatedL);
+ NonTrivialUnswitchCB(IsStillLoop, SibLoops);
+
+ ++NumBranches;
+ return true;
+}
+
+/// Recursively compute the cost of a dominator subtree based on the per-block
+/// cost map provided.
+///
+/// The recursive computation is memoized into the provided DT-indexed cost map
+/// to allow querying it for most nodes in the domtree without it becoming
+/// quadratic.
+static int
+computeDomSubtreeCost(DomTreeNode &N,
+ const SmallDenseMap<BasicBlock *, int, 4> &BBCostMap,
+ SmallDenseMap<DomTreeNode *, int, 4> &DTCostMap) {
+ // Don't accumulate cost (or recurse through) blocks not in our block cost
+ // map and thus not part of the duplication cost being considered.
+ auto BBCostIt = BBCostMap.find(N.getBlock());
+ if (BBCostIt == BBCostMap.end())
+ return 0;
+
+ // Lookup this node to see if we already computed its cost.
+ auto DTCostIt = DTCostMap.find(&N);
+ if (DTCostIt != DTCostMap.end())
+ return DTCostIt->second;
+
+ // If not, we have to compute it. We can't use insert above and update
+ // because computing the cost may insert more things into the map.
+ int Cost = std::accumulate(
+ N.begin(), N.end(), BBCostIt->second, [&](int Sum, DomTreeNode *ChildN) {
+ return Sum + computeDomSubtreeCost(*ChildN, BBCostMap, DTCostMap);
+ });
+ bool Inserted = DTCostMap.insert({&N, Cost}).second;
+ (void)Inserted;
+ assert(Inserted && "Should not insert a node while visiting children!");
+ return Cost;
+}
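+// A sketch of querying it (assuming `BBCostMap` has been populated as in
+// unswitchLoop below, and `CandidateBB` is a hypothetical candidate's block):
+//
+//   SmallDenseMap<DomTreeNode *, int, 4> DTCostMap;
+//   int Cost = computeDomSubtreeCost(*DT[CandidateBB], BBCostMap, DTCostMap);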
+
/// Unswitch control flow predicated on loop invariant conditions.
///
/// This first hoists all branches or switches which are trivial (IE, do not
/// require duplicating any part of the loop) out of the loop body. It then
/// looks at other loop invariant control flows and tries to unswitch those as
/// well by cloning the loop if the result is small enough.
-static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
- AssumptionCache &AC) {
- assert(L.isLCSSAForm(DT) &&
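+///
+/// Schematically, for a hypothetical loop-invariant condition C, non-trivial
+/// unswitching rewrites
+///
+///   for (...) { if (C) A(); else B(); }
+///
+/// into two specialized loop bodies selected by a branch hoisted out of the
+/// loop:
+///
+///   if (C) { for (...) A(); } else { for (...) B(); }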
+static bool
+unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
+ TargetTransformInfo &TTI, bool NonTrivial,
+ function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
+ assert(L.isRecursivelyLCSSAForm(DT, LI) &&
"Loops must be in LCSSA form before unswitching.");
bool Changed = false;
@@ -727,7 +1926,136 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
// Try trivial unswitch first before loop over other basic blocks in the loop.
Changed |= unswitchAllTrivialConditions(L, DT, LI);
- // FIXME: Add support for non-trivial unswitching by cloning the loop.
+  // If we're not doing non-trivial unswitching, we're done. We accept
+  // a parameter but also check a local flag that can be used for testing
+  // and debugging.
+ if (!NonTrivial && !EnableNonTrivialUnswitch)
+ return Changed;
+
+ // Collect all remaining invariant branch conditions within this loop (as
+ // opposed to an inner loop which would be handled when visiting that inner
+ // loop).
+ SmallVector<TerminatorInst *, 4> UnswitchCandidates;
+ for (auto *BB : L.blocks())
+ if (LI.getLoopFor(BB) == &L)
+ if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+ if (BI->isConditional() && L.isLoopInvariant(BI->getCondition()) &&
+ BI->getSuccessor(0) != BI->getSuccessor(1))
+ UnswitchCandidates.push_back(BI);
+
+ // If we didn't find any candidates, we're done.
+ if (UnswitchCandidates.empty())
+ return Changed;
+
+ DEBUG(dbgs() << "Considering " << UnswitchCandidates.size()
+ << " non-trivial loop invariant conditions for unswitching.\n");
+
+  // Unswitching these terminators will require duplicating parts of the
+  // loop, so we need to be able to model that cost. Compute the ephemeral
+ // values and set up a data structure to hold per-BB costs. We cache each
+ // block's cost so that we don't recompute this when considering different
+ // subsets of the loop for duplication during unswitching.
+ SmallPtrSet<const Value *, 4> EphValues;
+ CodeMetrics::collectEphemeralValues(&L, &AC, EphValues);
+ SmallDenseMap<BasicBlock *, int, 4> BBCostMap;
+
+ // Compute the cost of each block, as well as the total loop cost. Also, bail
+ // out if we see instructions which are incompatible with loop unswitching
+ // (convergent, noduplicate, or cross-basic-block tokens).
+ // FIXME: We might be able to safely handle some of these in non-duplicated
+ // regions.
+ int LoopCost = 0;
+ for (auto *BB : L.blocks()) {
+ int Cost = 0;
+ for (auto &I : *BB) {
+ if (EphValues.count(&I))
+ continue;
+
+ if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
+ return Changed;
+ if (auto CS = CallSite(&I))
+ if (CS.isConvergent() || CS.cannotDuplicate())
+ return Changed;
+
+ Cost += TTI.getUserCost(&I);
+ }
+ assert(Cost >= 0 && "Must not have negative costs!");
+ LoopCost += Cost;
+ assert(LoopCost >= 0 && "Must not have negative loop costs!");
+ BBCostMap[BB] = Cost;
+ }
+ DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n");
+
+ // Now we find the best candidate by searching for the one with the following
+ // properties in order:
+ //
+ // 1) An unswitching cost below the threshold
+ // 2) The smallest number of duplicated unswitch candidates (to avoid
+ // creating redundant subsequent unswitching)
+ // 3) The smallest cost after unswitching.
+ //
+ // We prioritize reducing fanout of unswitch candidates provided the cost
+ // remains below the threshold because this has a multiplicative effect.
+ //
+ // This requires memoizing each dominator subtree to avoid redundant work.
+ //
+ // FIXME: Need to actually do the number of candidates part above.
+ SmallDenseMap<DomTreeNode *, int, 4> DTCostMap;
+ // Given a terminator which might be unswitched, computes the non-duplicated
+ // cost for that terminator.
+ auto ComputeUnswitchedCost = [&](TerminatorInst *TI) {
+ BasicBlock &BB = *TI->getParent();
+ SmallPtrSet<BasicBlock *, 4> Visited;
+
+ int Cost = LoopCost;
+ for (BasicBlock *SuccBB : successors(&BB)) {
+ // Don't count successors more than once.
+ if (!Visited.insert(SuccBB).second)
+ continue;
+
+ // This successor's domtree will not need to be duplicated after
+ // unswitching if the edge to the successor dominates it (and thus the
+ // entire tree). This essentially means there is no other path into this
+ // subtree and so it will end up live in only one clone of the loop.
+ if (SuccBB->getUniquePredecessor() ||
+ llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
+ return PredBB == &BB || DT.dominates(SuccBB, PredBB);
+ })) {
+ Cost -= computeDomSubtreeCost(*DT[SuccBB], BBCostMap, DTCostMap);
+ assert(Cost >= 0 &&
+ "Non-duplicated cost should never exceed total loop cost!");
+ }
+ }
+
+ // Now scale the cost by the number of unique successors minus one. We
+ // subtract one because there is already at least one copy of the entire
+ // loop. This is computing the new cost of unswitching a condition.
+ assert(Visited.size() > 1 &&
+ "Cannot unswitch a condition without multiple distinct successors!");
+ return Cost * (Visited.size() - 1);
+ };
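+  // As a worked example with hypothetical numbers: if LoopCost is 100, the
+  // candidate branch has two distinct successors, and one successor heads a
+  // dominator subtree of cost 40 reachable only through this branch, then
+  // the unswitched cost is (100 - 40) * (2 - 1) = 60.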
+ TerminatorInst *BestUnswitchTI = nullptr;
+  int BestUnswitchCost = 0;
+ for (TerminatorInst *CandidateTI : UnswitchCandidates) {
+ int CandidateCost = ComputeUnswitchedCost(CandidateTI);
+ DEBUG(dbgs() << " Computed cost of " << CandidateCost
+ << " for unswitch candidate: " << *CandidateTI << "\n");
+ if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) {
+ BestUnswitchTI = CandidateTI;
+ BestUnswitchCost = CandidateCost;
+ }
+ }
+
+ if (BestUnswitchCost < UnswitchThreshold) {
+ DEBUG(dbgs() << " Trying to unswitch non-trivial (cost = "
+ << BestUnswitchCost << ") branch: " << *BestUnswitchTI
+ << "\n");
+ Changed |= unswitchInvariantBranch(L, cast<BranchInst>(*BestUnswitchTI), DT,
+ LI, AC, NonTrivialUnswitchCB);
+ } else {
+ DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << BestUnswitchCost
+ << "\n");
+ }
return Changed;
}
@@ -740,7 +2068,25 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n");
- if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC))
+ // Save the current loop name in a variable so that we can report it even
+ // after it has been deleted.
+ std::string LoopName = L.getName();
+
+ auto NonTrivialUnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
+ ArrayRef<Loop *> NewLoops) {
+ // If we did a non-trivial unswitch, we have added new (cloned) loops.
+ U.addSiblingLoops(NewLoops);
+
+ // If the current loop remains valid, we should revisit it to catch any
+ // other unswitch opportunities. Otherwise, we need to mark it as deleted.
+ if (CurrentLoopValid)
+ U.revisitCurrentLoop();
+ else
+ U.markLoopAsDeleted(L, LoopName);
+ };
+
+ if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial,
+ NonTrivialUnswitchCB))
return PreservedAnalyses::all();
#ifndef NDEBUG
@@ -754,10 +2100,13 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
namespace {
class SimpleLoopUnswitchLegacyPass : public LoopPass {
+ bool NonTrivial;
+
public:
static char ID; // Pass ID, replacement for typeid
- explicit SimpleLoopUnswitchLegacyPass() : LoopPass(ID) {
+ explicit SimpleLoopUnswitchLegacyPass(bool NonTrivial = false)
+ : LoopPass(ID), NonTrivial(NonTrivial) {
initializeSimpleLoopUnswitchLegacyPassPass(
*PassRegistry::getPassRegistry());
}
@@ -766,6 +2115,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
getLoopAnalysisUsage(AU);
}
};
@@ -783,8 +2133,29 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ auto NonTrivialUnswitchCB = [&L, &LPM](bool CurrentLoopValid,
+ ArrayRef<Loop *> NewLoops) {
+ // If we did a non-trivial unswitch, we have added new (cloned) loops.
+ for (auto *NewL : NewLoops)
+ LPM.addLoop(*NewL);
+
+ // If the current loop remains valid, re-add it to the queue. This is
+ // a little wasteful as we'll finish processing the current loop as well,
+ // but it is the best we can do in the old PM.
+ if (CurrentLoopValid)
+ LPM.addLoop(*L);
+ else
+ LPM.markLoopAsDeleted(*L);
+ };
+
+ bool Changed =
+ unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, NonTrivialUnswitchCB);
- bool Changed = unswitchLoop(*L, DT, LI, AC);
+ // If anything was unswitched, also clear any cached information about this
+ // loop.
+ LPM.deleteSimpleAnalysisLoop(L);
#ifndef NDEBUG
// Historically this pass has had issues with the dominator tree so verify it
@@ -798,11 +2169,13 @@ char SimpleLoopUnswitchLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
"Simple unswitch loops", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
"Simple unswitch loops", false, false)
-Pass *llvm::createSimpleLoopUnswitchLegacyPass() {
- return new SimpleLoopUnswitchLegacyPass();
+Pass *llvm::createSimpleLoopUnswitchLegacyPass(bool NonTrivial) {
+ return new SimpleLoopUnswitchLegacyPass(NonTrivial);
}
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 8754c714c5b2..1522170dc3b9 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -45,9 +45,26 @@ using namespace llvm;
#define DEBUG_TYPE "simplifycfg"
-static cl::opt<unsigned>
-UserBonusInstThreshold("bonus-inst-threshold", cl::Hidden, cl::init(1),
- cl::desc("Control the number of bonus instructions (default = 1)"));
+static cl::opt<unsigned> UserBonusInstThreshold(
+ "bonus-inst-threshold", cl::Hidden, cl::init(1),
+ cl::desc("Control the number of bonus instructions (default = 1)"));
+
+static cl::opt<bool> UserKeepLoops(
+ "keep-loops", cl::Hidden, cl::init(true),
+ cl::desc("Preserve canonical loop structure (default = true)"));
+
+static cl::opt<bool> UserSwitchToLookup(
+ "switch-to-lookup", cl::Hidden, cl::init(false),
+ cl::desc("Convert switches to lookup tables (default = false)"));
+
+static cl::opt<bool> UserForwardSwitchCond(
+ "forward-switch-cond", cl::Hidden, cl::init(false),
+ cl::desc("Forward switch condition to phi ops (default = false)"));
+
+static cl::opt<bool> UserSinkCommonInsts(
+ "sink-common-insts", cl::Hidden, cl::init(false),
+ cl::desc("Sink common instructions (default = false)"));
+
STATISTIC(NumSimpl, "Number of blocks simplified");
@@ -129,9 +146,7 @@ static bool mergeEmptyReturnBlocks(Function &F) {
/// Call SimplifyCFG on all the blocks in the function,
/// iterating until no more changes are made.
static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
- AssumptionCache *AC,
- unsigned BonusInstThreshold,
- bool LateSimplifyCFG) {
+ const SimplifyCFGOptions &Options) {
bool Changed = false;
bool LocalChange = true;
@@ -146,7 +161,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
// Loop over all of the basic blocks and remove them if they are unneeded.
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
- if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders, LateSimplifyCFG)) {
+ if (simplifyCFG(&*BBIt++, TTI, Options, &LoopHeaders)) {
LocalChange = true;
++NumSimpl;
}
@@ -157,12 +172,10 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
}
static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
- AssumptionCache *AC, int BonusInstThreshold,
- bool LateSimplifyCFG) {
+ const SimplifyCFGOptions &Options) {
bool EverChanged = removeUnreachableBlocks(F);
EverChanged |= mergeEmptyReturnBlocks(F);
- EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold,
- LateSimplifyCFG);
+ EverChanged |= iterativelySimplifyCFG(F, TTI, Options);
// If neither pass changed anything, we're done.
if (!EverChanged) return false;
@@ -176,28 +189,37 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
return true;
do {
- EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold,
- LateSimplifyCFG);
+ EverChanged = iterativelySimplifyCFG(F, TTI, Options);
EverChanged |= removeUnreachableBlocks(F);
} while (EverChanged);
return true;
}
-SimplifyCFGPass::SimplifyCFGPass()
- : BonusInstThreshold(UserBonusInstThreshold),
- LateSimplifyCFG(true) {}
-
-SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold, bool LateSimplifyCFG)
- : BonusInstThreshold(BonusInstThreshold),
- LateSimplifyCFG(LateSimplifyCFG) {}
+// Command-line settings override compile-time settings.
+SimplifyCFGPass::SimplifyCFGPass(const SimplifyCFGOptions &Opts) {
+ Options.BonusInstThreshold = UserBonusInstThreshold.getNumOccurrences()
+ ? UserBonusInstThreshold
+ : Opts.BonusInstThreshold;
+ Options.ForwardSwitchCondToPhi = UserForwardSwitchCond.getNumOccurrences()
+ ? UserForwardSwitchCond
+ : Opts.ForwardSwitchCondToPhi;
+ Options.ConvertSwitchToLookupTable = UserSwitchToLookup.getNumOccurrences()
+ ? UserSwitchToLookup
+ : Opts.ConvertSwitchToLookupTable;
+ Options.NeedCanonicalLoop = UserKeepLoops.getNumOccurrences()
+ ? UserKeepLoops
+ : Opts.NeedCanonicalLoop;
+ Options.SinkCommonInsts = UserSinkCommonInsts.getNumOccurrences()
+ ? UserSinkCommonInsts
+ : Opts.SinkCommonInsts;
+}
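+
+// For example (a usage sketch): lookup table conversion can be forced on from
+// the command line even when the pass instance was constructed with it
+// disabled:
+//
+//   opt -simplifycfg -switch-to-lookup input.ll -S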
PreservedAnalyses SimplifyCFGPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
-
- if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold, LateSimplifyCFG))
+ Options.AC = &AM.getResult<AssumptionAnalysis>(F);
+ if (!simplifyFunctionCFG(F, TTI, Options))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
@@ -205,55 +227,54 @@ PreservedAnalyses SimplifyCFGPass::run(Function &F,
}
namespace {
-struct BaseCFGSimplifyPass : public FunctionPass {
- unsigned BonusInstThreshold;
+struct CFGSimplifyPass : public FunctionPass {
+ static char ID;
+ SimplifyCFGOptions Options;
std::function<bool(const Function &)> PredicateFtor;
- bool LateSimplifyCFG;
- BaseCFGSimplifyPass(int T, bool LateSimplifyCFG,
- std::function<bool(const Function &)> Ftor,
- char &ID)
- : FunctionPass(ID), PredicateFtor(std::move(Ftor)),
- LateSimplifyCFG(LateSimplifyCFG) {
- BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
+ CFGSimplifyPass(unsigned Threshold = 1, bool ForwardSwitchCond = false,
+ bool ConvertSwitch = false, bool KeepLoops = true,
+ bool SinkCommon = false,
+ std::function<bool(const Function &)> Ftor = nullptr)
+ : FunctionPass(ID), PredicateFtor(std::move(Ftor)) {
+
+ initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+
+ // Check for command-line overrides of options for debug/customization.
+ Options.BonusInstThreshold = UserBonusInstThreshold.getNumOccurrences()
+ ? UserBonusInstThreshold
+ : Threshold;
+
+ Options.ForwardSwitchCondToPhi = UserForwardSwitchCond.getNumOccurrences()
+ ? UserForwardSwitchCond
+ : ForwardSwitchCond;
+
+ Options.ConvertSwitchToLookupTable = UserSwitchToLookup.getNumOccurrences()
+ ? UserSwitchToLookup
+ : ConvertSwitch;
+
+ Options.NeedCanonicalLoop =
+ UserKeepLoops.getNumOccurrences() ? UserKeepLoops : KeepLoops;
+
+ Options.SinkCommonInsts = UserSinkCommonInsts.getNumOccurrences()
+ ? UserSinkCommonInsts
+ : SinkCommon;
}
+
bool runOnFunction(Function &F) override {
if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F)))
return false;
- AssumptionCache *AC =
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- const TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold, LateSimplifyCFG);
+ Options.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return simplifyFunctionCFG(F, TTI, Options);
}
-
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
-
-struct CFGSimplifyPass : public BaseCFGSimplifyPass {
- static char ID; // Pass identification, replacement for typeid
-
- CFGSimplifyPass(int T = -1,
- std::function<bool(const Function &)> Ftor = nullptr)
- : BaseCFGSimplifyPass(T, false, Ftor, ID) {
- initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
- }
-};
-
-struct LateCFGSimplifyPass : public BaseCFGSimplifyPass {
- static char ID; // Pass identification, replacement for typeid
-
- LateCFGSimplifyPass(int T = -1,
- std::function<bool(const Function &)> Ftor = nullptr)
- : BaseCFGSimplifyPass(T, true, Ftor, ID) {
- initializeLateCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
- }
-};
}
char CFGSimplifyPass::ID = 0;
@@ -264,24 +285,12 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
false)
-char LateCFGSimplifyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LateCFGSimplifyPass, "latesimplifycfg",
- "Simplify the CFG more aggressively", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_END(LateCFGSimplifyPass, "latesimplifycfg",
- "Simplify the CFG more aggressively", false, false)
-
// Public interface to the CFGSimplification pass
FunctionPass *
-llvm::createCFGSimplificationPass(int Threshold,
- std::function<bool(const Function &)> Ftor) {
- return new CFGSimplifyPass(Threshold, std::move(Ftor));
-}
-
-// Public interface to the LateCFGSimplification pass
-FunctionPass *
-llvm::createLateCFGSimplificationPass(int Threshold,
+llvm::createCFGSimplificationPass(unsigned Threshold, bool ForwardSwitchCond,
+ bool ConvertSwitch, bool KeepLoops,
+ bool SinkCommon,
std::function<bool(const Function &)> Ftor) {
- return new LateCFGSimplifyPass(Threshold, std::move(Ftor));
+ return new CFGSimplifyPass(Threshold, ForwardSwitchCond, ConvertSwitch,
+ KeepLoops, SinkCommon, std::move(Ftor));
}
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 5210f165b874..cfb8a062299f 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -68,7 +68,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
MemoryLocation Loc = MemoryLocation::get(L);
for (Instruction *S : Stores)
- if (AA.getModRefInfo(S, Loc) & MRI_Mod)
+ if (isModSet(AA.getModRefInfo(S, Loc)))
return false;
}
@@ -83,7 +83,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
return false;
for (Instruction *S : Stores)
- if (AA.getModRefInfo(S, CS) & MRI_Mod)
+ if (isModSet(AA.getModRefInfo(S, CS)))
return false;
}
diff --git a/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
new file mode 100644
index 000000000000..23156d5a4d83
--- /dev/null
+++ b/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@@ -0,0 +1,811 @@
+//===- SpeculateAroundPHIs.cpp --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "spec-phis"
+
+STATISTIC(NumPHIsSpeculated, "Number of PHI nodes we speculated around");
+STATISTIC(NumEdgesSplit,
+ "Number of critical edges which were split for speculation");
+STATISTIC(NumSpeculatedInstructions,
+ "Number of instructions we speculated around the PHI nodes");
+STATISTIC(NumNewRedundantInstructions,
+ "Number of new, redundant instructions inserted");
+
+/// Check whether speculating the users of a PHI node around the PHI
+/// will be safe.
+///
+/// This checks both that all of the users are safe and also that all of their
+/// operands are either recursively safe or already available along an incoming
+/// edge to the PHI.
+///
+/// This routine caches both all the safe nodes explored in `PotentialSpecSet`
+/// and the chain of nodes that definitively reach any unsafe node in
+/// `UnsafeSet`. By preserving these between repeated calls to this routine for
+/// PHIs in the same basic block, the exploration here can be reused. However,
+/// these caches must not be reused for PHIs in a different basic block as they
+/// reflect what is available along incoming edges.
+static bool
+isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
+ SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ SmallPtrSetImpl<Instruction *> &UnsafeSet) {
+ auto *PhiBB = PN.getParent();
+ SmallPtrSet<Instruction *, 4> Visited;
+ SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
+
+ // Walk each user of the PHI node.
+ for (Use &U : PN.uses()) {
+ auto *UI = cast<Instruction>(U.getUser());
+
+ // Ensure the use post-dominates the PHI node. This ensures that, in the
+ // absence of unwinding, the use will actually be reached.
+ // FIXME: We use a blunt hammer of requiring them to be in the same basic
+ // block. We should consider using actual post-dominance here in the
+ // future.
+ if (UI->getParent() != PhiBB) {
+ DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
+ return false;
+ }
+
+ // FIXME: This check is much too conservative. We're not going to move these
+ // instructions onto new dynamic paths through the program unless there is
+ // a call instruction between the use and the PHI node. And memory isn't
+ // changing unless there is a store in that same sequence. We should
+ // probably change this to do at least a limited scan of the intervening
+ // instructions and allow handling stores in easily proven safe cases.
+ if (mayBeMemoryDependent(*UI)) {
+ DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
+ return false;
+ }
+
+ // Now do a depth-first search of everything these users depend on to make
+ // sure they are transitively safe. This is a depth-first search, but we
+ // check nodes in preorder to minimize the amount of checking.
+ Visited.insert(UI);
+ DFSStack.push_back({UI, UI->value_op_begin()});
+ do {
+ User::value_op_iterator OpIt;
+ std::tie(UI, OpIt) = DFSStack.pop_back_val();
+
+ while (OpIt != UI->value_op_end()) {
+ auto *OpI = dyn_cast<Instruction>(*OpIt);
+ // Increment to the next operand for whenever we continue.
+ ++OpIt;
+ // No need to visit non-instructions, which can't form dependencies.
+ if (!OpI)
+ continue;
+
+ // Now do the main pre-order checks that this operand is a viable
+ // dependency of something we want to speculate.
+
+ // First do a few checks for instructions that won't require
+ // speculation at all because they are trivially available on the
+ // incoming edge (either through dominance or through an incoming value
+ // to a PHI).
+ //
+ // The cases in the current block will be trivially dominated by the
+ // edge.
+ auto *ParentBB = OpI->getParent();
+ if (ParentBB == PhiBB) {
+ if (isa<PHINode>(OpI)) {
+ // We can trivially map through phi nodes in the same block.
+ continue;
+ }
+ } else if (DT.dominates(ParentBB, PhiBB)) {
+ // Instructions from dominating blocks are already available.
+ continue;
+ }
+
+ // Once we know that we're considering speculating the operand, check
+ // if we've already explored this subgraph and found it to be safe.
+ if (PotentialSpecSet.count(OpI))
+ continue;
+
+ // If we've already explored this subgraph and found it unsafe, bail.
+ // If when we directly test whether this is safe it fails, bail.
+ if (UnsafeSet.count(OpI) || ParentBB != PhiBB ||
+ mayBeMemoryDependent(*OpI)) {
+ DEBUG(dbgs() << " Unsafe: can't speculate transitive use: " << *OpI
+ << "\n");
+ // Record the stack of instructions which reach this node as unsafe
+ // so we prune subsequent searches.
+ UnsafeSet.insert(OpI);
+ for (auto &StackPair : DFSStack) {
+ Instruction *I = StackPair.first;
+ UnsafeSet.insert(I);
+ }
+ return false;
+ }
+
+ // Skip any operands we're already recursively checking.
+ if (!Visited.insert(OpI).second)
+ continue;
+
+ // Push onto the stack and descend. We can directly continue this
+ // loop when ascending.
+ DFSStack.push_back({UI, OpIt});
+ UI = OpI;
+ OpIt = OpI->value_op_begin();
+ }
+
+ // This node and all its operands are safe. Go ahead and cache that for
+ // reuse later.
+ PotentialSpecSet.insert(UI);
+
+ // Continue with the next node on the stack.
+ } while (!DFSStack.empty());
+ }
+
+#ifndef NDEBUG
+ // Every visited operand should have been marked as safe for speculation at
+ // this point. Verify this and return success.
+ for (auto *I : Visited)
+ assert(PotentialSpecSet.count(I) &&
+ "Failed to mark a visited instruction as safe!");
+#endif
+ return true;
+}
+
+/// Check whether, in isolation, a given PHI node is both safe and profitable
+/// to speculate users around.
+///
+/// This handles checking whether there are any constant operands to a PHI
+/// which could represent a useful speculation candidate, whether the users of
+/// the PHI are safe to speculate including all their transitive dependencies,
+/// and whether after speculation there will be some cost savings (profit) to
+/// folding the operands into the users of the PHI node. Returns true if both
+/// safe and profitable with relevant cost savings updated in the map and with
+/// an update to the `PotentialSpecSet`. Returns false if either safety or
+/// profitability are absent. Some new entries may be made to the
+/// `PotentialSpecSet` even when this routine returns false, but they remain
+/// conservatively correct.
+///
+/// The profitability check here is a local one, but it checks this in an
+/// interesting way. Beyond checking that the total cost of materializing the
+/// constants will be less than the cost of folding them into their users, it
+/// also checks that no one incoming constant will have a higher cost when
+/// folded into its users rather than materialized. This higher cost could
+/// result in a dynamic *path* that is more expensive even when the total cost
+/// is lower. Currently, all of the interesting cases where this optimization
+/// should fire are ones where it is a no-loss operation in this sense. If we
+/// ever want to be more aggressive here, we would need to balance the
+/// different incoming edges' cost by looking at their respective
+/// probabilities.
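+///
+/// For example (hypothetical IR), given
+///
+///   %p = phi i64 [ 81985529216486895, %a ], [ %v, %b ]
+///   %sum = add i64 %p, %x
+///
+/// if materializing the wide constant costs more than folding it into an
+/// add, speculating %sum into the predecessors lets the %a edge fold the
+/// immediate while the %b edge simply computes "add i64 %v, %x".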
+static bool isSafeAndProfitableToSpeculateAroundPHI(
+ PHINode &PN, SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
+ SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ SmallPtrSetImpl<Instruction *> &UnsafeSet, DominatorTree &DT,
+ TargetTransformInfo &TTI) {
+ // First see whether there is any cost savings to speculating around this
+ // PHI, and build up a map of the constant inputs to how many times they
+ // occur.
+ bool NonFreeMat = false;
+ struct CostsAndCount {
+ int MatCost = TargetTransformInfo::TCC_Free;
+ int FoldedCost = TargetTransformInfo::TCC_Free;
+ int Count = 0;
+ };
+ SmallDenseMap<ConstantInt *, CostsAndCount, 16> CostsAndCounts;
+ SmallPtrSet<BasicBlock *, 16> IncomingConstantBlocks;
+ for (int i : llvm::seq<int>(0, PN.getNumIncomingValues())) {
+ auto *IncomingC = dyn_cast<ConstantInt>(PN.getIncomingValue(i));
+ if (!IncomingC)
+ continue;
+
+ // Only visit each incoming edge with a constant input once.
+ if (!IncomingConstantBlocks.insert(PN.getIncomingBlock(i)).second)
+ continue;
+
+ auto InsertResult = CostsAndCounts.insert({IncomingC, {}});
+    // Count how many edges share a given incoming constant.
+ ++InsertResult.first->second.Count;
+ // Only compute the cost the first time we see a particular constant.
+ if (!InsertResult.second)
+ continue;
+
+ int &MatCost = InsertResult.first->second.MatCost;
+ MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType());
+ NonFreeMat |= MatCost != TTI.TCC_Free;
+ }
+ if (!NonFreeMat) {
+ DEBUG(dbgs() << " Free: " << PN << "\n");
+ // No profit in free materialization.
+ return false;
+ }
+
+ // Now check that the uses of this PHI can actually be speculated,
+ // otherwise we'll still have to materialize the PHI value.
+ if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
+ DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
+ return false;
+ }
+
+ // Compute how much (if any) savings are available by speculating around this
+ // PHI.
+ for (Use &U : PN.uses()) {
+ auto *UserI = cast<Instruction>(U.getUser());
+ // Now check whether there is any savings to folding the incoming constants
+ // into this use.
+ unsigned Idx = U.getOperandNo();
+
+ // If we have a binary operator that is commutative, an actual constant
+ // operand would end up on the RHS, so pretend the use of the PHI is on the
+ // RHS.
+ //
+ // Technically, this is a bit weird if *both* operands are PHIs we're
+ // speculating. But if that is the case, giving an "optimistic" cost isn't
+ // a bad thing because after speculation it will constant fold. And
+ // moreover, such cases should likely have been constant folded already by
+ // some other pass, so we shouldn't worry about "modeling" them terribly
+ // accurately here. Similarly, if the other operand is a constant, it still
+ // seems fine to be "optimistic" in our cost modeling, because when the
+ // incoming operand from the PHI node is also a constant, we will end up
+ // constant folding.
+ if (UserI->isBinaryOp() && UserI->isCommutative() && Idx != 1)
+ // Assume we will commute the constant to the RHS to be canonical.
+ Idx = 1;
+
+    // Get the intrinsic ID if this user is an intrinsic.
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ if (auto *UserII = dyn_cast<IntrinsicInst>(UserI))
+ IID = UserII->getIntrinsicID();
+
+ for (auto &IncomingConstantAndCostsAndCount : CostsAndCounts) {
+ ConstantInt *IncomingC = IncomingConstantAndCostsAndCount.first;
+ int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
+ int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
+ if (IID)
+ FoldedCost += TTI.getIntImmCost(IID, Idx, IncomingC->getValue(),
+ IncomingC->getType());
+ else
+ FoldedCost +=
+ TTI.getIntImmCost(UserI->getOpcode(), Idx, IncomingC->getValue(),
+ IncomingC->getType());
+
+ // If we accumulate more folded cost for this incoming constant than
+ // materialized cost, then we'll regress any edge with this constant so
+ // just bail. We're only interested in cases where folding the incoming
+ // constants is at least break-even on all paths.
+ if (FoldedCost > MatCost) {
+ DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC << "\n"
+ " Materializing cost: " << MatCost << "\n"
+ " Accumulated folded cost: " << FoldedCost << "\n");
+ return false;
+ }
+ }
+ }
+
+ // Compute the total cost savings afforded by this PHI node.
+ int TotalMatCost = TTI.TCC_Free, TotalFoldedCost = TTI.TCC_Free;
+ for (auto IncomingConstantAndCostsAndCount : CostsAndCounts) {
+ int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
+ int FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
+ int Count = IncomingConstantAndCostsAndCount.second.Count;
+
+ TotalMatCost += MatCost * Count;
+ TotalFoldedCost += FoldedCost * Count;
+ }
+ assert(TotalFoldedCost <= TotalMatCost && "If each constant's folded cost is "
+                                            "less than its materialized cost, "
+ "the sum must be as well.");
+
+ DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
+ << ": " << PN << "\n");
+ CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
+ return true;
+}
+
+/// Simple helper to walk all the users of a list of phis depth first, and call
+/// a visit function on each one in post-order.
+///
+/// All of the PHIs should be in the same basic block, and this is primarily
+/// used to make a single depth-first walk across their collective users
+/// without revisiting any subgraphs. Callers should provide a fast, idempotent
+/// callable to test whether a node has been visited and the more important
+/// callable to actually visit a particular node.
+///
+/// Depth-first and postorder here refer to the *operand* graph -- we start
+/// from a collection of users of PHI nodes and walk "up" the operands
+/// depth-first.
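+///
+/// A minimal usage sketch (hypothetical) that collects the transitive
+/// operand graph of the users into a set, in postorder:
+///
+///   SmallPtrSet<Instruction *, 16> Seen;
+///   visitPHIUsersAndDepsInPostOrder(
+///       PNs, /*IsVisited*/ [&](Instruction *I) { return Seen.count(I); },
+///       /*Visit*/ [&](Instruction *I) { Seen.insert(I); });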
+template <typename IsVisitedT, typename VisitT>
+static void visitPHIUsersAndDepsInPostOrder(ArrayRef<PHINode *> PNs,
+ IsVisitedT IsVisited,
+ VisitT Visit) {
+ SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
+ for (auto *PN : PNs)
+ for (Use &U : PN->uses()) {
+ auto *UI = cast<Instruction>(U.getUser());
+ if (IsVisited(UI))
+ // Already visited this user, continue across the roots.
+ continue;
+
+ // Otherwise, walk the operand graph depth-first and visit each
+ // dependency in postorder.
+ DFSStack.push_back({UI, UI->value_op_begin()});
+ do {
+ User::value_op_iterator OpIt;
+ std::tie(UI, OpIt) = DFSStack.pop_back_val();
+ while (OpIt != UI->value_op_end()) {
+ auto *OpI = dyn_cast<Instruction>(*OpIt);
+ // Increment to the next operand for whenever we continue.
+ ++OpIt;
+ // No need to visit non-instructions, which can't form dependencies,
+ // or instructions outside of our potential dependency set that we
+ // were given. Finally, if we've already visited the node, continue
+ // to the next.
+ if (!OpI || IsVisited(OpI))
+ continue;
+
+ // Push onto the stack and descend. We can directly continue this
+ // loop when ascending.
+ DFSStack.push_back({UI, OpIt});
+ UI = OpI;
+ OpIt = OpI->value_op_begin();
+ }
+
+ // Finished visiting children, visit this node.
+ assert(!IsVisited(UI) && "Should not have already visited a node!");
+ Visit(UI);
+ } while (!DFSStack.empty());
+ }
+}
+
+/// Find profitable PHIs to speculate.
+///
+/// For a PHI node to be profitable, we need the cost of speculating its users
+/// (and their dependencies) to not exceed the savings of folding the PHI's
+/// constant operands into the speculated users.
+///
+/// Computing this is surprisingly challenging. Because users of two different
+/// PHI nodes can depend on each other or on common other instructions, it may
+/// be profitable to speculate two PHI nodes together even though neither one
+/// in isolation is profitable. The straightforward way to find all the
+/// profitable PHIs would be to check each combination of PHIs' cost, but this
+/// is exponential in complexity.
+///
+/// Even if we assume that we only care about cases where we can consider each
+/// PHI node in isolation (rather than considering cases where none are
+/// profitable in isolation but some subset are profitable as a set), we still
+/// have a challenge. The obvious way to find all individually profitable PHIs
+/// is to iterate until reaching a fixed point, but this will be quadratic in
+/// complexity. =/
+///
+/// This code currently uses a linear-to-compute order for a greedy approach.
+/// It won't find cases where a set of PHIs must be considered together, but it
+/// handles most cases of order dependence without quadratic iteration. The
+/// specific order used is the post-order across the operand DAG. When the last
+/// user of a PHI is visited in this postorder walk, we check it for
+/// profitability.
+///
+/// There is an orthogonal extra complexity to all of this: computing the cost
+/// itself can easily become a linear computation making everything again (at
+/// best) quadratic. Using a postorder over the operand graph makes it
+/// particularly easy to avoid this through dynamic programming. As we do the
+/// postorder walk, we build the transitive cost of that subgraph. It is also
+/// straightforward to then update these costs when we mark a PHI for
+/// speculation so that subsequent PHIs don't re-pay the cost of already
+/// speculated instructions.
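+///
+/// The final profitability check then reduces to simple arithmetic: with
+/// hypothetical numbers, given three predecessors and users whose memoized
+/// speculation costs sum to 4, a PHI costs 4 * (NumPreds - 1) = 8 to
+/// speculate and is kept only when its recorded cost savings are at least 8.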
+static SmallVector<PHINode *, 16>
+findProfitablePHIs(ArrayRef<PHINode *> PNs,
+ const SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
+ const SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ int NumPreds, DominatorTree &DT, TargetTransformInfo &TTI) {
+ SmallVector<PHINode *, 16> SpecPNs;
+
+ // First, establish a reverse mapping from immediate users of the PHI nodes
+ // to the nodes themselves, and count how many users each PHI node has in
+ // a way we can update while processing them.
+ SmallDenseMap<Instruction *, TinyPtrVector<PHINode *>, 16> UserToPNMap;
+ SmallDenseMap<PHINode *, int, 16> PNUserCountMap;
+ SmallPtrSet<Instruction *, 16> UserSet;
+ for (auto *PN : PNs) {
+ assert(UserSet.empty() && "Must start with an empty user set!");
+ for (Use &U : PN->uses())
+ UserSet.insert(cast<Instruction>(U.getUser()));
+ PNUserCountMap[PN] = UserSet.size();
+ for (auto *UI : UserSet)
+ UserToPNMap.insert({UI, {}}).first->second.push_back(PN);
+ UserSet.clear();
+ }
+
+ // Now do a DFS across the operand graph of the users, computing cost as we
+ // go and when all costs for a given PHI are known, checking that PHI for
+ // profitability.
+ SmallDenseMap<Instruction *, int, 16> SpecCostMap;
+ visitPHIUsersAndDepsInPostOrder(
+ PNs,
+ /*IsVisited*/
+ [&](Instruction *I) {
+ // We consider anything that isn't potentially speculated to be
+ // "visited" as it is already handled. Similarly, anything that *is*
+ // potentially speculated but for which we have an entry in our cost
+ // map, we're done.
+ return !PotentialSpecSet.count(I) || SpecCostMap.count(I);
+ },
+ /*Visit*/
+ [&](Instruction *I) {
+ // We've fully visited the operands, so sum their cost with this node
+ // and update the cost map.
+ int Cost = TTI.TCC_Free;
+ for (Value *OpV : I->operand_values())
+ if (auto *OpI = dyn_cast<Instruction>(OpV)) {
+ auto CostMapIt = SpecCostMap.find(OpI);
+ if (CostMapIt != SpecCostMap.end())
+ Cost += CostMapIt->second;
+ }
+ Cost += TTI.getUserCost(I);
+ bool Inserted = SpecCostMap.insert({I, Cost}).second;
+ (void)Inserted;
+ assert(Inserted && "Must not re-insert a cost during the DFS!");
+
+ // Now check if this node had a corresponding PHI node using it. If so,
+ // we need to decrement the outstanding user count for it.
+ auto UserPNsIt = UserToPNMap.find(I);
+ if (UserPNsIt == UserToPNMap.end())
+ return;
+ auto &UserPNs = UserPNsIt->second;
+ auto UserPNsSplitIt = std::stable_partition(
+ UserPNs.begin(), UserPNs.end(), [&](PHINode *UserPN) {
+ int &PNUserCount = PNUserCountMap.find(UserPN)->second;
+ assert(
+ PNUserCount > 0 &&
+ "Should never re-visit a PN after its user count hits zero!");
+ --PNUserCount;
+ return PNUserCount != 0;
+ });
+
+ // FIXME: Rather than one at a time, we should sum the savings as the
+ // cost will be completely shared.
+ SmallVector<Instruction *, 16> SpecWorklist;
+ for (auto *PN : llvm::make_range(UserPNsSplitIt, UserPNs.end())) {
+ int SpecCost = TTI.TCC_Free;
+ for (Use &U : PN->uses())
+ SpecCost +=
+ SpecCostMap.find(cast<Instruction>(U.getUser()))->second;
+ SpecCost *= (NumPreds - 1);
+ // When the user count of a PHI node hits zero, we should check its
+ // profitability. If profitable, we should mark it for speculation
+ // and zero out the cost of everything it depends on.
+ int CostSavings = CostSavingsMap.find(PN)->second;
+ if (SpecCost > CostSavings) {
+ DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN << "\n"
+ " Cost savings: " << CostSavings << "\n"
+ " Speculation cost: " << SpecCost << "\n");
+ continue;
+ }
+
+ // We're going to speculate this user-associated PHI. Copy it out and
+ // add its users to the worklist to update their cost.
+ SpecPNs.push_back(PN);
+ for (Use &U : PN->uses()) {
+ auto *UI = cast<Instruction>(U.getUser());
+ auto CostMapIt = SpecCostMap.find(UI);
+ if (CostMapIt->second == 0)
+ continue;
+ // Zero out this cost entry to avoid duplicates.
+ CostMapIt->second = 0;
+ SpecWorklist.push_back(UI);
+ }
+ }
+
+ // Now walk all the operands of the users in the worklist transitively
+ // to zero out all the memoized costs.
+ while (!SpecWorklist.empty()) {
+ Instruction *SpecI = SpecWorklist.pop_back_val();
+ assert(SpecCostMap.find(SpecI)->second == 0 &&
+ "Didn't zero out a cost!");
+
+ // Walk the operands recursively to zero out their cost as well.
+ for (auto *OpV : SpecI->operand_values()) {
+ auto *OpI = dyn_cast<Instruction>(OpV);
+ if (!OpI)
+ continue;
+ auto CostMapIt = SpecCostMap.find(OpI);
+ if (CostMapIt == SpecCostMap.end() || CostMapIt->second == 0)
+ continue;
+ CostMapIt->second = 0;
+ SpecWorklist.push_back(OpI);
+ }
+ }
+ });
+
+ return SpecPNs;
+}
+
+/// Speculate users around a set of PHI nodes.
+///
+/// This routine does the actual speculation around a set of PHI nodes where we
+/// have determined this to be both safe and profitable.
+///
+/// This routine handles any splitting of critical edges necessary to create
+/// a safe block to speculate into as well as cloning the instructions and
+/// rewriting all uses.
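+///
+/// Schematically (hypothetical IR), speculating the single user of
+///
+///   %p = phi i32 [ 7, %a ], [ %v, %b ]
+///   %sum = add i32 %p, %x
+///
+/// clones the add into both predecessors ("%sum.0 = add i32 %x, 7" once the
+/// constant is commuted to the RHS, and "%sum.1 = add i32 %v, %x"), rewrites
+/// any remaining uses of %sum through a new "%sum.phi" merging the clones,
+/// and erases %sum and %p.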
+static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
+ SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ SmallSetVector<BasicBlock *, 16> &PredSet,
+ DominatorTree &DT) {
+ DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
+ NumPHIsSpeculated += SpecPNs.size();
+
+ // Split any critical edges so that we have a block to hoist into.
+ auto *ParentBB = SpecPNs[0]->getParent();
+ SmallVector<BasicBlock *, 16> SpecPreds;
+ SpecPreds.reserve(PredSet.size());
+ for (auto *PredBB : PredSet) {
+ auto *NewPredBB = SplitCriticalEdge(
+ PredBB, ParentBB,
+ CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
+ if (NewPredBB) {
+ ++NumEdgesSplit;
+ DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
+ << "\n");
+ SpecPreds.push_back(NewPredBB);
+ } else {
+ assert(PredBB->getSingleSuccessor() == ParentBB &&
+ "We need a non-critical predecessor to speculate into.");
+ assert(!isa<InvokeInst>(PredBB->getTerminator()) &&
+ "Cannot have a non-critical invoke!");
+
+ // Already non-critical, use existing pred.
+ SpecPreds.push_back(PredBB);
+ }
+ }
+
+ SmallPtrSet<Instruction *, 16> SpecSet;
+ SmallVector<Instruction *, 16> SpecList;
+ visitPHIUsersAndDepsInPostOrder(SpecPNs,
+ /*IsVisited*/
+ [&](Instruction *I) {
+ // This is visited if we don't need to
+ // speculate it or we already have
+ // speculated it.
+ return !PotentialSpecSet.count(I) ||
+ SpecSet.count(I);
+ },
+ /*Visit*/
+ [&](Instruction *I) {
+ // All operands scheduled, schedule this
+ // node.
+ SpecSet.insert(I);
+ SpecList.push_back(I);
+ });
+
+ int NumSpecInsts = SpecList.size() * SpecPreds.size();
+ int NumRedundantInsts = NumSpecInsts - SpecList.size();
+ DEBUG(dbgs() << " Inserting " << NumSpecInsts << " speculated instructions, "
+ << NumRedundantInsts << " redundancies\n");
+ NumSpeculatedInstructions += NumSpecInsts;
+ NumNewRedundantInstructions += NumRedundantInsts;
+
+ // Each predecessor is numbered by its index in `SpecPreds`, so for each
+ // instruction we speculate, the speculated instruction is stored in that
+  // index of the vector associated with the original instruction. We also
+ // store the incoming values for each predecessor from any PHIs used.
+ SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;
+
+ // Inject the synthetic mappings to rewrite PHIs to the appropriate incoming
+ // value. This handles both the PHIs we are speculating around and any other
+ // PHIs that happen to be used.
+ for (auto *OrigI : SpecList)
+ for (auto *OpV : OrigI->operand_values()) {
+ auto *OpPN = dyn_cast<PHINode>(OpV);
+ if (!OpPN || OpPN->getParent() != ParentBB)
+ continue;
+
+ auto InsertResult = SpeculatedValueMap.insert({OpPN, {}});
+ if (!InsertResult.second)
+ continue;
+
+ auto &SpeculatedVals = InsertResult.first->second;
+
+ // Populating our structure for mapping is particularly annoying because
+ // finding an incoming value for a particular predecessor block in a PHI
+ // node is a linear time operation! To avoid quadratic behavior, we build
+ // a map for this PHI node's incoming values and then translate it into
+ // the more compact representation used below.
+ SmallDenseMap<BasicBlock *, Value *, 16> IncomingValueMap;
+ for (int i : llvm::seq<int>(0, OpPN->getNumIncomingValues()))
+ IncomingValueMap[OpPN->getIncomingBlock(i)] = OpPN->getIncomingValue(i);
+
+ for (auto *PredBB : SpecPreds)
+ SpeculatedVals.push_back(IncomingValueMap.find(PredBB)->second);
+ }
+
+ // Speculate into each predecessor.
+ for (int PredIdx : llvm::seq<int>(0, SpecPreds.size())) {
+ auto *PredBB = SpecPreds[PredIdx];
+ assert(PredBB->getSingleSuccessor() == ParentBB &&
+ "We need a non-critical predecessor to speculate into.");
+
+ for (auto *OrigI : SpecList) {
+ auto *NewI = OrigI->clone();
+ NewI->setName(Twine(OrigI->getName()) + "." + Twine(PredIdx));
+ NewI->insertBefore(PredBB->getTerminator());
+
+ // Rewrite all the operands to the previously speculated instructions.
+ // Because we're walking in-order, the defs must precede the uses and we
+ // should already have these mappings.
+ for (Use &U : NewI->operands()) {
+ auto *OpI = dyn_cast<Instruction>(U.get());
+ if (!OpI)
+ continue;
+ auto MapIt = SpeculatedValueMap.find(OpI);
+ if (MapIt == SpeculatedValueMap.end())
+ continue;
+ const auto &SpeculatedVals = MapIt->second;
+ assert(SpeculatedVals[PredIdx] &&
+ "Must have a speculated value for this predecessor!");
+ assert(SpeculatedVals[PredIdx]->getType() == OpI->getType() &&
+ "Speculated value has the wrong type!");
+
+ // Rewrite the use to this predecessor's speculated instruction.
+ U.set(SpeculatedVals[PredIdx]);
+ }
+
+ // Commute instructions which now have a constant in the LHS but not the
+ // RHS.
+ if (NewI->isBinaryOp() && NewI->isCommutative() &&
+ isa<Constant>(NewI->getOperand(0)) &&
+ !isa<Constant>(NewI->getOperand(1)))
+ NewI->getOperandUse(0).swap(NewI->getOperandUse(1));
+
+ SpeculatedValueMap[OrigI].push_back(NewI);
+ assert(SpeculatedValueMap[OrigI][PredIdx] == NewI &&
+ "Mismatched speculated instruction index!");
+ }
+ }
+
+ // Walk the speculated instruction list and if they have uses, insert a PHI
+ // for them from the speculated versions, and replace the uses with the PHI.
+ // Then erase the instructions as they have been fully speculated. The walk
+  // needs to be in reverse so that we don't mistake instructions we will
+  // eventually remove for remaining users.
+ IRBuilder<> IRB(SpecPNs[0]);
+ for (auto *OrigI : llvm::reverse(SpecList)) {
+ // Check if we need a PHI for any remaining users and if so, insert it.
+ if (!OrigI->use_empty()) {
+ auto *SpecIPN = IRB.CreatePHI(OrigI->getType(), SpecPreds.size(),
+ Twine(OrigI->getName()) + ".phi");
+ // Add the incoming values we speculated.
+ auto &SpeculatedVals = SpeculatedValueMap.find(OrigI)->second;
+ for (int PredIdx : llvm::seq<int>(0, SpecPreds.size()))
+ SpecIPN->addIncoming(SpeculatedVals[PredIdx], SpecPreds[PredIdx]);
+
+ // And replace the uses with the PHI node.
+ OrigI->replaceAllUsesWith(SpecIPN);
+ }
+
+ // It is important to immediately erase this so that it stops using other
+ // instructions. This avoids inserting needless PHIs of them.
+ OrigI->eraseFromParent();
+ }
+
+ // All of the uses of the speculated phi nodes should be removed at this
+ // point, so erase them.
+ for (auto *SpecPN : SpecPNs) {
+ assert(SpecPN->use_empty() && "All users should have been speculated!");
+ SpecPN->eraseFromParent();
+ }
+}
+
+/// Try to speculate around a series of PHIs from a single basic block.
+///
+/// This routine checks whether any of these PHIs are profitable to speculate
+/// users around. If safe and profitable, it does the speculation. It returns
+/// true when at least some speculation occurs.
+static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
+ DominatorTree &DT, TargetTransformInfo &TTI) {
+ DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
+
+ // Savings in cost from speculating around a PHI node.
+ SmallDenseMap<PHINode *, int, 16> CostSavingsMap;
+
+ // Remember the set of instructions that are candidates for speculation so
+ // that we can quickly walk things within that space. This prunes out
+ // instructions already available along edges, etc.
+ SmallPtrSet<Instruction *, 16> PotentialSpecSet;
+
+ // Remember the set of instructions that are (transitively) unsafe to
+ // speculate into the incoming edges of this basic block. This avoids
+ // recomputing them for each PHI node we check. This set is specific to this
+ // block though as things are pruned out of it based on what is available
+ // along incoming edges.
+ SmallPtrSet<Instruction *, 16> UnsafeSet;
+
+ // For each PHI node in this block, check whether there are immediate folding
+ // opportunities from speculation, and whether that speculation will be
+  // valid. This determines the set of safe PHIs to speculate.
+ PNs.erase(llvm::remove_if(PNs,
+ [&](PHINode *PN) {
+ return !isSafeAndProfitableToSpeculateAroundPHI(
+ *PN, CostSavingsMap, PotentialSpecSet,
+ UnsafeSet, DT, TTI);
+ }),
+ PNs.end());
+ // If no PHIs were profitable, skip.
+ if (PNs.empty()) {
+ DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
+ return false;
+ }
+
+  // We need to know how much speculation will cost, which is determined by how
+ // many incoming edges will need a copy of each speculated instruction.
+ SmallSetVector<BasicBlock *, 16> PredSet;
+ for (auto *PredBB : PNs[0]->blocks()) {
+ if (!PredSet.insert(PredBB))
+ continue;
+
+ // We cannot speculate when a predecessor is an indirect branch.
+ // FIXME: We also can't reliably create a non-critical edge block for
+ // speculation if the predecessor is an invoke. This doesn't seem
+ // fundamental and we should probably be splitting critical edges
+ // differently.
+ if (isa<IndirectBrInst>(PredBB->getTerminator()) ||
+ isa<InvokeInst>(PredBB->getTerminator())) {
+ DEBUG(dbgs() << " Invalid: predecessor terminator: " << PredBB->getName()
+ << "\n");
+ return false;
+ }
+ }
+ if (PredSet.size() < 2) {
+ DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
+ return false;
+ }
+
+ SmallVector<PHINode *, 16> SpecPNs = findProfitablePHIs(
+ PNs, CostSavingsMap, PotentialSpecSet, PredSet.size(), DT, TTI);
+ if (SpecPNs.empty())
+ // Nothing to do.
+ return false;
+
+ speculatePHIs(SpecPNs, PotentialSpecSet, PredSet, DT);
+ return true;
+}
+
+PreservedAnalyses SpeculateAroundPHIsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+
+ bool Changed = false;
+ for (auto *BB : ReversePostOrderTraversal<Function *>(&F)) {
+ SmallVector<PHINode *, 16> PNs;
+ auto BBI = BB->begin();
+ while (auto *PN = dyn_cast<PHINode>(&*BBI)) {
+ PNs.push_back(PN);
+ ++BBI;
+ }
+
+ if (PNs.empty())
+ continue;
+
+ Changed |= tryToSpeculatePHIs(PNs, DT, TTI);
+ }
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ return PA;
+}
diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 8b8d6590aa6a..ce40af1223f6 100644
--- a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -1,4 +1,4 @@
-//===-- StraightLineStrengthReduce.cpp - ------------------------*- C++ -*-===//
+//===- StraightLineStrengthReduce.cpp - -----------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -55,26 +55,45 @@
//
// - When (i' - i) is constant but i and i' are not, we could still perform
// SLSR.
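+//
+//   For example (a hypothetical case), given x = b + i * S and
+//   x' = b + i' * S with i' - i == 3, x' could still be rewritten as
+//   x + 3 * S even though neither i nor i' is a constant.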
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <limits>
#include <list>
#include <vector>
using namespace llvm;
using namespace PatternMatch;
-namespace {
+static const unsigned UnknownAddressSpace =
+ std::numeric_limits<unsigned>::max();
-static const unsigned UnknownAddressSpace = ~0u;
+namespace {
class StraightLineStrengthReduce : public FunctionPass {
public:
@@ -88,20 +107,22 @@ public:
GEP, // &B[..][i * S][..]
};
- Candidate()
- : CandidateKind(Invalid), Base(nullptr), Index(nullptr),
- Stride(nullptr), Ins(nullptr), Basis(nullptr) {}
+ Candidate() = default;
Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
Instruction *I)
- : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I),
- Basis(nullptr) {}
- Kind CandidateKind;
- const SCEV *Base;
+ : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I) {}
+
+ Kind CandidateKind = Invalid;
+
+ const SCEV *Base = nullptr;
+
// Note that Index and Stride of a GEP candidate do not necessarily have the
// same integer type. In that case, during rewriting, Stride will be
// sign-extended or truncated to Index's type.
- ConstantInt *Index;
- Value *Stride;
+ ConstantInt *Index = nullptr;
+
+ Value *Stride = nullptr;
+
// The instruction this candidate corresponds to. It helps us to rewrite a
// candidate with respect to its immediate basis. Note that one instruction
// can correspond to multiple candidates depending on how you associate the
@@ -116,16 +137,16 @@ public:
// or
//
// <Base: b, Index: 2, Stride: a + 1>
- Instruction *Ins;
+ Instruction *Ins = nullptr;
+
// Points to the immediate basis of this candidate, or nullptr if we cannot
// find any basis for this candidate.
- Candidate *Basis;
+ Candidate *Basis = nullptr;
};
static char ID;
- StraightLineStrengthReduce()
- : FunctionPass(ID), DL(nullptr), DT(nullptr), TTI(nullptr) {
+ StraightLineStrengthReduce() : FunctionPass(ID) {
initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
}
@@ -148,46 +169,58 @@ private:
// Returns true if Basis is a basis for C, i.e., Basis dominates C and they
// share the same base and stride.
bool isBasisFor(const Candidate &Basis, const Candidate &C);
+
// Returns whether the candidate can be folded into an addressing mode.
bool isFoldable(const Candidate &C, TargetTransformInfo *TTI,
const DataLayout *DL);
+
// Returns true if C is already in a simplest form and not worth being
// rewritten.
bool isSimplestForm(const Candidate &C);
+
// Checks whether I is in a candidate form. If so, adds all the matching forms
// to Candidates, and tries to find the immediate basis for each of them.
void allocateCandidatesAndFindBasis(Instruction *I);
+
// Allocate candidates and find bases for Add instructions.
void allocateCandidatesAndFindBasisForAdd(Instruction *I);
+
// Given I = LHS + RHS, factors RHS into i * S and makes (LHS + i * S) a
// candidate.
void allocateCandidatesAndFindBasisForAdd(Value *LHS, Value *RHS,
Instruction *I);
// Allocate candidates and find bases for Mul instructions.
void allocateCandidatesAndFindBasisForMul(Instruction *I);
+
// Splits LHS into Base + Index and, if succeeds, calls
// allocateCandidatesAndFindBasis.
void allocateCandidatesAndFindBasisForMul(Value *LHS, Value *RHS,
Instruction *I);
+
// Allocate candidates and find bases for GetElementPtr instructions.
void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP);
+
// A helper function that scales Idx with ElementSize before invoking
// allocateCandidatesAndFindBasis.
void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
Value *S, uint64_t ElementSize,
Instruction *I);
+
// Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
// basis.
void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B,
ConstantInt *Idx, Value *S,
Instruction *I);
+
// Rewrites candidate C with respect to Basis.
void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+
  // A helper function that factors ArrayIdx into a product of a stride and a
// constant index, and invokes allocateCandidatesAndFindBasis with the
// factorings.
void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
GetElementPtrInst *GEP);
+
// Emit code that computes the "bump" from Basis to C. If the candidate is a
// GEP and the bump is not divisible by the element size of the GEP, this
// function sets the BumpWithUglyGEP flag to notify its caller to bump the
@@ -196,19 +229,22 @@ private:
IRBuilder<> &Builder, const DataLayout *DL,
bool &BumpWithUglyGEP);
- const DataLayout *DL;
- DominatorTree *DT;
+ const DataLayout *DL = nullptr;
+ DominatorTree *DT = nullptr;
ScalarEvolution *SE;
- TargetTransformInfo *TTI;
+ TargetTransformInfo *TTI = nullptr;
std::list<Candidate> Candidates;
+
// Temporarily holds all instructions that are unlinked (but not deleted) by
// rewriteCandidateWithBasis. These instructions will be actually removed
// after all rewriting finishes.
std::vector<Instruction *> UnlinkedInstructions;
};
-} // anonymous namespace
+
+} // end anonymous namespace
char StraightLineStrengthReduce::ID = 0;
+
INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr",
"Straight line strength reduction", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
@@ -650,8 +686,8 @@ void StraightLineStrengthReduce::rewriteCandidateWithBasis(
else
Reduced = Builder.CreateGEP(nullptr, Basis.Ins, Bump);
}
+ break;
}
- break;
default:
llvm_unreachable("C.CandidateKind is invalid");
};
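The last SLSR hunk only relocates a `break`: it moves from just after the closing brace of the GEP case's block to just inside it. Control flow is identical, since both positions exit the switch; the new placement keeps the statement visually inside the case it terminates. A compilable illustration with hypothetical names:

    enum class Kind { GEP, Other };

    void handle(Kind K) {
      switch (K) {
      case Kind::GEP: {
        // ... case body ...
        break; // inside the braces: clearly belongs to this case
      }
      default:
        break;
      }
    }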
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index 0cccb415efdb..2972e1cff9a4 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -1,4 +1,4 @@
-//===-- StructurizeCFG.cpp ------------------------------------------------===//
+//===- StructurizeCFG.cpp -------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,49 +7,72 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
#include "llvm/Analysis/RegionPass.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <cassert>
+#include <utility>
using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "structurizecfg"
+// The name for newly created blocks.
+static const char *const FlowBlockName = "Flow";
+
namespace {
// Definition of the complex types used in this pass.
-typedef std::pair<BasicBlock *, Value *> BBValuePair;
+using BBValuePair = std::pair<BasicBlock *, Value *>;
-typedef SmallVector<RegionNode*, 8> RNVector;
-typedef SmallVector<BasicBlock*, 8> BBVector;
-typedef SmallVector<BranchInst*, 8> BranchVector;
-typedef SmallVector<BBValuePair, 2> BBValueVector;
+using RNVector = SmallVector<RegionNode *, 8>;
+using BBVector = SmallVector<BasicBlock *, 8>;
+using BranchVector = SmallVector<BranchInst *, 8>;
+using BBValueVector = SmallVector<BBValuePair, 2>;
-typedef SmallPtrSet<BasicBlock *, 8> BBSet;
+using BBSet = SmallPtrSet<BasicBlock *, 8>;
-typedef MapVector<PHINode *, BBValueVector> PhiMap;
-typedef MapVector<BasicBlock *, BBVector> BB2BBVecMap;
+using PhiMap = MapVector<PHINode *, BBValueVector>;
+using BB2BBVecMap = MapVector<BasicBlock *, BBVector>;
-typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap;
-typedef DenseMap<BasicBlock *, Value *> BBPredicates;
-typedef DenseMap<BasicBlock *, BBPredicates> PredMap;
-typedef DenseMap<BasicBlock *, BasicBlock*> BB2BBMap;
-
-// The name for newly created blocks.
-static const char *const FlowBlockName = "Flow";
+using BBPhiMap = DenseMap<BasicBlock *, PhiMap>;
+using BBPredicates = DenseMap<BasicBlock *, Value *>;
+using PredMap = DenseMap<BasicBlock *, BBPredicates>;
+using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
/// Finds the nearest common dominator of a set of BasicBlocks.
///
@@ -736,7 +759,6 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed,
changeExit(PrevNode, Node->getEntry(), true);
}
PrevNode = Node;
-
} else {
// Insert extra prefix node (or reuse last one)
BasicBlock *Flow = needPrefix(false);
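The block of typedef changes above is a mechanical conversion to C++11 alias declarations (the hunk also hoists FlowBlockName above the anonymous namespace and expands the include list to what the file actually uses). A `using` alias reads left to right and, unlike `typedef`, can be templated, which is the usual motivation for the LLVM-wide migration. A small sketch with generic types, not the pass's:

    #include <map>
    #include <string>
    #include <vector>

    using IntVec = std::vector<int>;            // equivalent to the typedef form
    typedef std::vector<int> IntVecOld;

    template <typename T>
    using NameMap = std::map<std::string, T>;   // alias template: typedef cannot do this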
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 90c5c243f464..2a1106b41de2 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -60,6 +60,7 @@
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
@@ -78,7 +79,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
#define DEBUG_TYPE "tailcallelim"
@@ -177,7 +177,8 @@ struct AllocaDerivedValueTracker {
};
}
-static bool markTails(Function &F, bool &AllCallsAreTailCalls) {
+static bool markTails(Function &F, bool &AllCallsAreTailCalls,
+ OptimizationRemarkEmitter *ORE) {
if (F.callsFunctionThatReturnsTwice())
return false;
AllCallsAreTailCalls = true;
@@ -228,7 +229,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls) {
Escaped = ESCAPED;
CallInst *CI = dyn_cast<CallInst>(&I);
- if (!CI || CI->isTailCall())
+ if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I))
continue;
bool IsNoTail = CI->isNoTailCall() || CI->hasOperandBundles();
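The added `isa<DbgInfoIntrinsic>(&I)` guard makes tail-call marking skip debug intrinsics entirely, presumably so that the presence of `llvm.dbg.*` calls does not perturb which calls get marked. The shape of the test, as in the hunk (with `I` being the visited instruction):

    CallInst *CI = dyn_cast<CallInst>(&I);
    if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I))
      continue; // not a call this pass needs to consider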
@@ -252,9 +253,11 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls) {
break;
}
if (SafeToTail) {
- emitOptimizationRemark(
- F.getContext(), "tailcallelim", F, CI->getDebugLoc(),
- "marked this readnone call a tail call candidate");
+ using namespace ore;
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "tailcall-readnone", CI)
+ << "marked as tail call candidate (readnone)";
+ });
CI->setTailCall();
Modified = true;
continue;
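These hunks port the pass from the old `emitOptimizationRemark` free function to the `OptimizationRemarkEmitter` (ORE) interface: the remark is built inside a lambda that `emit` invokes only when remarks are enabled for the function, so message construction costs nothing on ordinary runs. The pattern, exactly as the diff uses it:

    // ORE is an OptimizationRemarkEmitter *, CI the call being marked;
    // DEBUG_TYPE names the pass, "tailcall-readnone" is the remark id.
    ORE->emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "tailcall-readnone", CI)
             << "marked as tail call candidate (readnone)";
    });

The `using namespace ore;` line pulls in the stream helpers (such as ore::NV) that remark messages commonly use, though this particular message streams only a string literal.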
@@ -299,9 +302,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls) {
if (Visited[CI->getParent()] != ESCAPED) {
// If the escape point was part way through the block, calls after the
// escape point wouldn't have been put into DeferredTails.
- emitOptimizationRemark(F.getContext(), "tailcallelim", F,
- CI->getDebugLoc(),
- "marked this call a tail call candidate");
+ DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
CI->setTailCall();
Modified = true;
} else {
@@ -330,7 +331,7 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
// Writes to memory only matter if they may alias the pointer
// being loaded from.
const DataLayout &DL = L->getModule()->getDataLayout();
- if ((AA->getModRefInfo(CI, MemoryLocation::get(L)) & MRI_Mod) ||
+ if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) ||
!isSafeToLoadUnconditionally(L->getPointerOperand(),
L->getAlignment(), DL, L))
return false;
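The change in canMoveAboveCall tracks an AliasAnalysis API cleanup: instead of masking the getModRefInfo result with the MRI_Mod bit, the named predicate isModSet answers "may this call modify the location?". As used in the hunk (with AA, CI, and the load L as above):

    // Old: AA->getModRefInfo(CI, MemoryLocation::get(L)) & MRI_Mod
    if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))))
      return false; // the call may clobber the memory the load reads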
@@ -491,7 +492,8 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
BasicBlock *&OldEntry,
bool &TailCallsAreMarkedTail,
SmallVectorImpl<PHINode *> &ArgumentPHIs,
- AliasAnalysis *AA) {
+ AliasAnalysis *AA,
+ OptimizationRemarkEmitter *ORE) {
// If we are introducing accumulator recursion to eliminate operations after
// the call instruction that are both associative and commutative, the initial
// value for the accumulator is placed in this variable. If this value is set
@@ -551,8 +553,11 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
BasicBlock *BB = Ret->getParent();
Function *F = BB->getParent();
- emitOptimizationRemark(F->getContext(), "tailcallelim", *F, CI->getDebugLoc(),
- "transforming tail recursion to loop");
+ using namespace ore;
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "tailcall-recursion", CI)
+ << "transforming tail recursion into loop";
+ });
// OK! We can transform this tail call. If this is the first one found,
// create the new entry block, allowing us to branch back to the old entry.
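The remark text describes the pass's core rewrite: a self-recursive tail call is replaced by a branch back to a loop header split off from the entry block, with PHI nodes carrying what used to be the arguments. A source-level illustration of the effect only, not code from the pass:

    // Before: the recursive call is in tail position.
    unsigned fact(unsigned N, unsigned Acc) {
      if (N <= 1)
        return Acc;
      return fact(N - 1, Acc * N);
    }

    // After, in spirit: the arguments become loop-carried values and
    // the call becomes a back edge.
    unsigned factLoop(unsigned N, unsigned Acc) {
      for (;;) {
        if (N <= 1)
          return Acc;
        Acc *= N;
        N = N - 1;
      }
    }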
@@ -666,13 +671,11 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
return true;
}
-static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret,
- BasicBlock *&OldEntry,
- bool &TailCallsAreMarkedTail,
- SmallVectorImpl<PHINode *> &ArgumentPHIs,
- bool CannotTailCallElimCallsMarkedTail,
- const TargetTransformInfo *TTI,
- AliasAnalysis *AA) {
+static bool foldReturnAndProcessPred(
+ BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs,
+ bool CannotTailCallElimCallsMarkedTail, const TargetTransformInfo *TTI,
+ AliasAnalysis *AA, OptimizationRemarkEmitter *ORE) {
bool Change = false;
// Make sure this block is a trivial return block.
@@ -708,7 +711,7 @@ static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret,
BB->eraseFromParent();
eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
- ArgumentPHIs, AA);
+ ArgumentPHIs, AA, ORE);
++NumRetDuped;
Change = true;
}
@@ -722,23 +725,25 @@ static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
SmallVectorImpl<PHINode *> &ArgumentPHIs,
bool CannotTailCallElimCallsMarkedTail,
const TargetTransformInfo *TTI,
- AliasAnalysis *AA) {
+ AliasAnalysis *AA,
+ OptimizationRemarkEmitter *ORE) {
CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI);
if (!CI)
return false;
return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail,
- ArgumentPHIs, AA);
+ ArgumentPHIs, AA, ORE);
}
static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
- AliasAnalysis *AA) {
+ AliasAnalysis *AA,
+ OptimizationRemarkEmitter *ORE) {
if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
return false;
bool MadeChange = false;
bool AllCallsAreTailCalls = false;
- MadeChange |= markTails(F, AllCallsAreTailCalls);
+ MadeChange |= markTails(F, AllCallsAreTailCalls, ORE);
if (!AllCallsAreTailCalls)
return MadeChange;
@@ -765,13 +770,13 @@ static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) {
BasicBlock *BB = &*BBI++; // foldReturnAndProcessPred may delete BB.
if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
- bool Change =
- processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
- ArgumentPHIs, !CanTRETailMarkedCall, TTI, AA);
+ bool Change = processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
+ ArgumentPHIs, !CanTRETailMarkedCall,
+ TTI, AA, ORE);
if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
Change = foldReturnAndProcessPred(BB, Ret, OldEntry,
TailCallsAreMarkedTail, ArgumentPHIs,
- !CanTRETailMarkedCall, TTI, AA);
+ !CanTRETailMarkedCall, TTI, AA, ORE);
MadeChange |= Change;
}
}
@@ -802,6 +807,7 @@ struct TailCallElim : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
@@ -811,7 +817,8 @@ struct TailCallElim : public FunctionPass {
return eliminateTailRecursion(
F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
- &getAnalysis<AAResultsWrapperPass>().getAAResults());
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE());
}
};
}
@@ -820,6 +827,7 @@ char TailCallElim::ID = 0;
INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
false, false)
@@ -833,8 +841,9 @@ PreservedAnalyses TailCallElimPass::run(Function &F,
TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- bool Changed = eliminateTailRecursion(F, &TTI, &AA);
+ bool Changed = eliminateTailRecursion(F, &TTI, &AA, &ORE);
if (!Changed)
return PreservedAnalyses::all();