Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar')
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ADCE.cpp  29
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp  90
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/BDCE.cpp  4
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp  295
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp  117
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp  173
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/DCE.cpp  5
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp  277
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp  335
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp  13
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVN.cpp  133
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp  33
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp  35
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp  207
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp  53
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp  589
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp  36
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp  144
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp  522
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LICM.cpp  210
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp  14
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp  25
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp  127
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp  133
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp  208
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp  574
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp  82
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp  211
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp  360
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp  619
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp  19
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp  24
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp  394
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp  447
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp  149
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp  83
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp  148
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp  101
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp  295
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp  84
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp  18
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp  483
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp  12
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp  109
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp  3
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp  90
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SCCP.cpp  507
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SROA.cpp  509
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Scalar.cpp  30
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp  45
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp  1485
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Sink.cpp  12
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp  61
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp  10
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp  163
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp  8
61 files changed, 6277 insertions, 4675 deletions
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
index 1e683db50206..ce09a477b5f5 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -174,8 +174,8 @@ class AggressiveDeadCodeElimination {
/// marked live.
void markLiveBranchesFromControlDependences();
- /// Remove instructions not marked live, return if any any instruction
- /// was removed.
+ /// Remove instructions not marked live, return if any instruction was
+ /// removed.
bool removeDeadInstructions();
/// Identify connected sections of the control flow graph which have
@@ -298,8 +298,8 @@ void AggressiveDeadCodeElimination::initialize() {
auto &Info = BlockInfo[BB];
// Real function return
if (isa<ReturnInst>(Info.Terminator)) {
- DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName()
- << '\n';);
+ LLVM_DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName()
+ << '\n';);
continue;
}
@@ -356,7 +356,7 @@ void AggressiveDeadCodeElimination::markLiveInstructions() {
// where we need to mark the inputs as live.
while (!Worklist.empty()) {
Instruction *LiveInst = Worklist.pop_back_val();
- DEBUG(dbgs() << "work live: "; LiveInst->dump(););
+ LLVM_DEBUG(dbgs() << "work live: "; LiveInst->dump(););
for (Use &OI : LiveInst->operands())
if (Instruction *Inst = dyn_cast<Instruction>(OI))
@@ -378,7 +378,7 @@ void AggressiveDeadCodeElimination::markLive(Instruction *I) {
if (Info.Live)
return;
- DEBUG(dbgs() << "mark live: "; I->dump());
+ LLVM_DEBUG(dbgs() << "mark live: "; I->dump());
Info.Live = true;
Worklist.push_back(I);
@@ -402,7 +402,7 @@ void AggressiveDeadCodeElimination::markLive(Instruction *I) {
void AggressiveDeadCodeElimination::markLive(BlockInfoType &BBInfo) {
if (BBInfo.Live)
return;
- DEBUG(dbgs() << "mark block live: " << BBInfo.BB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "mark block live: " << BBInfo.BB->getName() << '\n');
BBInfo.Live = true;
if (!BBInfo.CFLive) {
BBInfo.CFLive = true;
@@ -463,7 +463,7 @@ void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
if (BlocksWithDeadTerminators.empty())
return;
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "new live blocks:\n";
for (auto *BB : NewLiveBlocks)
dbgs() << "\t" << BB->getName() << '\n';
@@ -487,7 +487,7 @@ void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
// Dead terminators which control live blocks are now marked live.
for (auto *BB : IDFBlocks) {
- DEBUG(dbgs() << "live control in: " << BB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "live control in: " << BB->getName() << '\n');
markLive(BB->getTerminator());
}
}
@@ -501,7 +501,7 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() {
// Updates control and dataflow around dead blocks
updateDeadRegions();
- DEBUG({
+ LLVM_DEBUG({
for (Instruction &I : instructions(F)) {
// Check if the instruction is alive.
if (isLive(&I))
@@ -555,7 +555,7 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() {
// A dead region is the set of dead blocks with a common live post-dominator.
void AggressiveDeadCodeElimination::updateDeadRegions() {
- DEBUG({
+ LLVM_DEBUG({
dbgs() << "final dead terminator blocks: " << '\n';
for (auto *BB : BlocksWithDeadTerminators)
dbgs() << '\t' << BB->getName()
@@ -607,8 +607,9 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
// It might have happened that the same successor appeared multiple times
// and the CFG edge wasn't really removed.
if (Succ != PreferredSucc->BB) {
- DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion"
- << BB->getName() << " -> " << Succ->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion"
+ << BB->getName() << " -> " << Succ->getName()
+ << "\n");
DeletedEdges.push_back({DominatorTree::Delete, BB, Succ});
}
}
@@ -652,7 +653,7 @@ void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
InstInfo[PredTerm].Live = true;
return;
}
- DEBUG(dbgs() << "making unconditional " << BB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "making unconditional " << BB->getName() << '\n');
NumBranchesRemoved += 1;
IRBuilder<> Builder(PredTerm);
auto *NewTerm = Builder.CreateBr(Target);
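
The hunks above are mostly the DEBUG-to-LLVM_DEBUG rename, but they also trace the shape of ADCE's liveness analysis: a worklist seeded with inherently live roots (returns, stores, other side-effecting instructions), drained by marking each popped instruction's operands live. A minimal standalone sketch of that loop, using a hypothetical Inst type rather than LLVM's classes:

```cpp
#include <unordered_set>
#include <vector>

struct Inst {
  std::vector<Inst *> Operands; // values this instruction reads
};

// Minimal sketch of ADCE's mark-live worklist. Roots holds the
// instructions that are live a priori; everything reachable from them
// through operand edges is live, the rest is dead and removable.
std::unordered_set<Inst *> computeLive(const std::vector<Inst *> &Roots) {
  std::unordered_set<Inst *> Live(Roots.begin(), Roots.end());
  std::vector<Inst *> Worklist(Roots.begin(), Roots.end());
  while (!Worklist.empty()) {
    Inst *LiveInst = Worklist.back();
    Worklist.pop_back();
    // "work live": every operand of a live instruction becomes live.
    for (Inst *Op : LiveInst->Operands)
      if (Live.insert(Op).second)
        Worklist.push_back(Op);
  }
  return Live;
}
```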
diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index 99480f12da9e..fa7bcec677f7 100644
--- a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -98,8 +98,8 @@ static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV);
const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV);
- DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " <<
- *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
+ LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is "
+ << *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
if (const SCEVConstant *ConstDUSCEV =
dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {
@@ -139,12 +139,12 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
// address. This address is displaced by the provided offset.
DiffSCEV = SE->getMinusSCEV(DiffSCEV, OffSCEV);
- DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to " <<
- *AlignSCEV << " and offset " << *OffSCEV <<
- " using diff " << *DiffSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to "
+ << *AlignSCEV << " and offset " << *OffSCEV
+ << " using diff " << *DiffSCEV << "\n");
unsigned NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE);
- DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n");
if (NewAlignment) {
return NewAlignment;
@@ -160,8 +160,8 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
const SCEV *DiffStartSCEV = DiffARSCEV->getStart();
const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE);
- DEBUG(dbgs() << "\ttrying start/inc alignment using start " <<
- *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "\ttrying start/inc alignment using start "
+ << *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
// Now compute the new alignment using the displacement to the value in the
// first iteration, and also the alignment using the per-iteration delta.
@@ -170,26 +170,26 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
unsigned NewIncAlignment = getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);
- DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n");
- DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n");
if (!NewAlignment || !NewIncAlignment) {
return 0;
} else if (NewAlignment > NewIncAlignment) {
if (NewAlignment % NewIncAlignment == 0) {
- DEBUG(dbgs() << "\tnew start/inc alignment: " <<
- NewIncAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << NewIncAlignment
+ << "\n");
return NewIncAlignment;
}
} else if (NewIncAlignment > NewAlignment) {
if (NewIncAlignment % NewAlignment == 0) {
- DEBUG(dbgs() << "\tnew start/inc alignment: " <<
- NewAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << NewAlignment
+ << "\n");
return NewAlignment;
}
} else if (NewIncAlignment == NewAlignment) {
- DEBUG(dbgs() << "\tnew start/inc alignment: " <<
- NewAlignment << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << NewAlignment
+ << "\n");
return NewAlignment;
}
}
@@ -339,55 +339,24 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) {
unsigned NewDestAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
MI->getDest(), SE);
- // For memory transfers, we need a common alignment for both the
- // source and destination. If we have a new alignment for this
- // instruction, but only for one operand, save it. If we reach the
- // other operand through another assumption later, then we may
- // change the alignment at that point.
+ LLVM_DEBUG(dbgs() << "\tmem inst: " << NewDestAlignment << "\n";);
+ if (NewDestAlignment > MI->getDestAlignment()) {
+ MI->setDestAlignment(NewDestAlignment);
+ ++NumMemIntAlignChanged;
+ }
+
+ // For memory transfers, there is also a source alignment that
+ // can be set.
if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
unsigned NewSrcAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
MTI->getSource(), SE);
- DenseMap<MemTransferInst *, unsigned>::iterator DI =
- NewDestAlignments.find(MTI);
- unsigned AltDestAlignment = (DI == NewDestAlignments.end()) ?
- 0 : DI->second;
-
- DenseMap<MemTransferInst *, unsigned>::iterator SI =
- NewSrcAlignments.find(MTI);
- unsigned AltSrcAlignment = (SI == NewSrcAlignments.end()) ?
- 0 : SI->second;
-
- DEBUG(dbgs() << "\tmem trans: " << NewDestAlignment << " " <<
- AltDestAlignment << " " << NewSrcAlignment <<
- " " << AltSrcAlignment << "\n");
-
- // Of these four alignments, pick the largest possible...
- unsigned NewAlignment = 0;
- if (NewDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
- NewAlignment = std::max(NewAlignment, NewDestAlignment);
- if (AltDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
- NewAlignment = std::max(NewAlignment, AltDestAlignment);
- if (NewSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
- NewAlignment = std::max(NewAlignment, NewSrcAlignment);
- if (AltSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
- NewAlignment = std::max(NewAlignment, AltSrcAlignment);
-
- if (NewAlignment > MI->getAlignment()) {
- MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
- MI->getParent()->getContext()), NewAlignment));
+ LLVM_DEBUG(dbgs() << "\tmem trans: " << NewSrcAlignment << "\n";);
+
+ if (NewSrcAlignment > MTI->getSourceAlignment()) {
+ MTI->setSourceAlignment(NewSrcAlignment);
++NumMemIntAlignChanged;
}
-
- NewDestAlignments.insert(std::make_pair(MTI, NewDestAlignment));
- NewSrcAlignments.insert(std::make_pair(MTI, NewSrcAlignment));
- } else if (NewDestAlignment > MI->getAlignment()) {
- assert((!isa<MemIntrinsic>(MI) || isa<MemSetInst>(MI)) &&
- "Unknown memory intrinsic");
-
- MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
- MI->getParent()->getContext()), NewDestAlignment));
- ++NumMemIntAlignChanged;
}
}
@@ -421,9 +390,6 @@ bool AlignmentFromAssumptionsPass::runImpl(Function &F, AssumptionCache &AC,
SE = SE_;
DT = DT_;
- NewDestAlignments.clear();
- NewSrcAlignments.clear();
-
bool Changed = false;
for (auto &AssumeVH : AC.assumptions())
if (AssumeVH)
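
The start/inc hunk above boils down to one divisibility rule: the start alignment holds on iteration zero and the increment alignment holds on every step, so an alignment valid on all iterations is the smaller candidate, provided it divides the larger. A hedged restatement with a hypothetical helper name (the real pass works on SCEV expressions and simply makes no claim in the failing case):

```cpp
#include <algorithm>

// Sketch of the start/inc combination rule in getNewAlignment, not the
// real helper. Returns the alignment valid on every loop iteration, or
// 0 when the two candidates are incompatible.
unsigned combineStartIncAlignment(unsigned StartAlign, unsigned IncAlign) {
  if (!StartAlign || !IncAlign)
    return 0;
  unsigned Big = std::max(StartAlign, IncAlign);
  unsigned Small = std::min(StartAlign, IncAlign);
  return (Big % Small == 0) ? Small : 0; // equal case falls out naturally
}
```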
diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
index 851efa000f65..3a8ef073cb48 100644
--- a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Pass.h"
@@ -99,7 +100,7 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
// For live instructions that have all dead bits, first make them dead by
// replacing all uses with something else. Then, if they don't need to
// remain live (because they have side effects, etc.) we can remove them.
- DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+ LLVM_DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
clearAssumptionsOfUsers(&I, DB);
@@ -114,6 +115,7 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
if (!DB.isInstructionDead(&I))
continue;
+ salvageDebugInfo(I);
Worklist.push_back(&I);
I.dropAllReferences();
Changed = true;
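
For context on the first hunk's "all bits dead" case, here is a toy illustration (not LLVM's analysis) of the pattern BDCE's demanded-bits tracking targets:

```cpp
#include <cstdint>

// Only the low 8 bits of T are ever observed, so a demanded-bits
// analysis can prove the OR that writes bits 8..31 contributes nothing:
// that work is dead even though T itself is "live".
uint8_t demandedBitsDemo(uint32_t X) {
  uint32_t T = (X & 0xFF) | 0xFFFFFF00u; // upper-bit work is unobserved
  return static_cast<uint8_t>(T);        // demanded bits of T: 0xFF
}
```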
diff --git a/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
index 7488cd5af8be..5ebfbf8a879b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -59,12 +59,14 @@
#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;
using namespace PatternMatch;
@@ -73,9 +75,16 @@ using namespace PatternMatch;
STATISTIC(NumCallSiteSplit, "Number of call-site split");
-static void addNonNullAttribute(Instruction *CallI, Instruction *NewCallI,
- Value *Op) {
- CallSite CS(NewCallI);
+/// Only allow instructions before a call, if their CodeSize cost is below
+/// DuplicationThreshold. Those instructions need to be duplicated in all
+/// split blocks.
+static cl::opt<unsigned>
+ DuplicationThreshold("callsite-splitting-duplication-threshold", cl::Hidden,
+ cl::desc("Only allow instructions before a call, if "
+ "their cost is below DuplicationThreshold"),
+ cl::init(5));
+
+static void addNonNullAttribute(CallSite CS, Value *Op) {
unsigned ArgNo = 0;
for (auto &I : CS.args()) {
if (&*I == Op)
@@ -84,13 +93,16 @@ static void addNonNullAttribute(Instruction *CallI, Instruction *NewCallI,
}
}
-static void setConstantInArgument(Instruction *CallI, Instruction *NewCallI,
- Value *Op, Constant *ConstValue) {
- CallSite CS(NewCallI);
+static void setConstantInArgument(CallSite CS, Value *Op,
+ Constant *ConstValue) {
unsigned ArgNo = 0;
for (auto &I : CS.args()) {
- if (&*I == Op)
+ if (&*I == Op) {
+ // It is possible we have already added the non-null attribute to the
+ // parameter by using an earlier constraining condition.
+ CS.removeParamAttr(ArgNo, Attribute::NonNull);
CS.setArgument(ArgNo, ConstValue);
+ }
++ArgNo;
}
}
@@ -111,11 +123,13 @@ static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) {
return false;
}
+typedef std::pair<ICmpInst *, unsigned> ConditionTy;
+typedef SmallVector<ConditionTy, 2> ConditionsTy;
+
/// If From has a conditional jump to To, add the condition to Conditions,
/// if it is relevant to any argument at CS.
-static void
-recordCondition(const CallSite &CS, BasicBlock *From, BasicBlock *To,
- SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
+static void recordCondition(CallSite CS, BasicBlock *From, BasicBlock *To,
+ ConditionsTy &Conditions) {
auto *BI = dyn_cast<BranchInst>(From->getTerminator());
if (!BI || !BI->isConditional())
return;
@@ -134,11 +148,10 @@ recordCondition(const CallSite &CS, BasicBlock *From, BasicBlock *To,
}
/// Record ICmp conditions relevant to any argument in CS following Pred's
-/// single successors. If there are conflicting conditions along a path, like
+/// single predecessors. If there are conflicting conditions along a path, like
/// x == 1 and x == 0, the first condition will be used.
-static void
-recordConditions(const CallSite &CS, BasicBlock *Pred,
- SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
+static void recordConditions(CallSite CS, BasicBlock *Pred,
+ ConditionsTy &Conditions) {
recordCondition(CS, Pred, CS.getInstruction()->getParent(), Conditions);
BasicBlock *From = Pred;
BasicBlock *To = Pred;
@@ -151,24 +164,17 @@ recordConditions(const CallSite &CS, BasicBlock *Pred,
}
}
-static Instruction *
-addConditions(CallSite &CS,
- SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
- if (Conditions.empty())
- return nullptr;
-
- Instruction *NewCI = CS.getInstruction()->clone();
+static void addConditions(CallSite CS, const ConditionsTy &Conditions) {
for (auto &Cond : Conditions) {
Value *Arg = Cond.first->getOperand(0);
Constant *ConstVal = cast<Constant>(Cond.first->getOperand(1));
if (Cond.second == ICmpInst::ICMP_EQ)
- setConstantInArgument(CS.getInstruction(), NewCI, Arg, ConstVal);
+ setConstantInArgument(CS, Arg, ConstVal);
else if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
assert(Cond.second == ICmpInst::ICMP_NE);
- addNonNullAttribute(CS.getInstruction(), NewCI, Arg);
+ addNonNullAttribute(CS, Arg);
}
}
- return NewCI;
}
static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
@@ -177,28 +183,39 @@ static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
return Preds;
}
-static bool canSplitCallSite(CallSite CS) {
+static bool canSplitCallSite(CallSite CS, TargetTransformInfo &TTI) {
// FIXME: As of now we handle only CallInst. InvokeInst could be handled
// without too much effort.
Instruction *Instr = CS.getInstruction();
if (!isa<CallInst>(Instr))
return false;
- // Allow splitting a call-site only when there is no instruction before the
- // call-site in the basic block. Based on this constraint, we only clone the
- // call instruction, and we do not move a call-site across any other
- // instruction.
BasicBlock *CallSiteBB = Instr->getParent();
- if (Instr != CallSiteBB->getFirstNonPHIOrDbg())
- return false;
-
// Need 2 predecessors and cannot split an edge from an IndirectBrInst.
SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB));
if (Preds.size() != 2 || isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
isa<IndirectBrInst>(Preds[1]->getTerminator()))
return false;
- return CallSiteBB->canSplitPredecessors();
+ // BasicBlock::canSplitPredecessors is more aggressive, so checking for
+ // BasicBlock::isEHPad as well.
+ if (!CallSiteBB->canSplitPredecessors() || CallSiteBB->isEHPad())
+ return false;
+
+ // Allow splitting a call-site only when the CodeSize cost of the
+ // instructions before the call is less than DuplicationThreshold. The
+ // instructions before the call will be duplicated in the split blocks and
+ // corresponding uses will be updated.
+ unsigned Cost = 0;
+ for (auto &InstBeforeCall :
+ llvm::make_range(CallSiteBB->begin(), Instr->getIterator())) {
+ Cost += TTI.getInstructionCost(&InstBeforeCall,
+ TargetTransformInfo::TCK_CodeSize);
+ if (Cost >= DuplicationThreshold)
+ return false;
+ }
+
+ return true;
}
static Instruction *cloneInstForMustTail(Instruction *I, Instruction *Before,
@@ -224,11 +241,11 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
bool IsVoid = SplitBB->getParent()->getReturnType()->isVoidTy();
auto II = std::next(CI->getIterator());
- BitCastInst *BCI = dyn_cast<BitCastInst>(&*II);
+ BitCastInst* BCI = dyn_cast<BitCastInst>(&*II);
if (BCI)
++II;
- ReturnInst *RI = dyn_cast<ReturnInst>(&*II);
+ ReturnInst* RI = dyn_cast<ReturnInst>(&*II);
assert(RI && "`musttail` call must be followed by `ret` instruction");
TerminatorInst *TI = SplitBB->getTerminator();
@@ -241,14 +258,15 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
// that prevents doing this now.
}
-/// Return true if the CS is split into its new predecessors which are directly
-/// hooked to each of its original predecessors pointed by PredBB1 and PredBB2.
-/// CallInst1 and CallInst2 will be the new call-sites placed in the new
-/// predecessors split for PredBB1 and PredBB2, respectively.
+/// For each (predecessor, conditions from predecessors) pair, it will split the
+/// basic block containing the call site, hook it up to the predecessor and
+/// replace the call instruction with new call instructions, which contain
+/// constraints based on the conditions from their predecessors.
/// For example, in the IR below with an OR condition, the call-site can
-/// be split. Assuming PredBB1=Header and PredBB2=TBB, CallInst1 will be the
-/// call-site placed between Header and Tail, and CallInst2 will be the
-/// call-site between TBB and Tail.
+/// be split. In this case, Preds for Tail is [(Header, a == null),
+/// (TBB, a != null, b == null)]. Tail is replaced by 2 split blocks, containing
+/// CallInst1, which has constraints based on the conditions from Header, and
+/// CallInst2, which has constraints based on the conditions coming from TBB.
///
/// From :
///
@@ -281,61 +299,59 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
/// Note that in case any arguments at the call-site are constrained by its
/// predecessors, new call-sites with more constrained arguments will be
/// created in createCallSitesOnPredicatedArgument().
-static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
- Instruction *CallInst1, Instruction *CallInst2) {
+static void splitCallSite(
+ CallSite CS,
+ const SmallVectorImpl<std::pair<BasicBlock *, ConditionsTy>> &Preds,
+ DominatorTree *DT) {
Instruction *Instr = CS.getInstruction();
BasicBlock *TailBB = Instr->getParent();
bool IsMustTailCall = CS.isMustTailCall();
- assert(Instr == (TailBB->getFirstNonPHIOrDbg()) && "Unexpected call-site");
-
- BasicBlock *SplitBlock1 =
- SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split");
- BasicBlock *SplitBlock2 =
- SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split");
-
- assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split.");
- if (!CallInst1)
- CallInst1 = Instr->clone();
- if (!CallInst2)
- CallInst2 = Instr->clone();
-
- CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt());
- CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt());
-
- CallSite CS1(CallInst1);
- CallSite CS2(CallInst2);
-
- // Handle PHIs used as arguments in the call-site.
- for (PHINode &PN : TailBB->phis()) {
- unsigned ArgNo = 0;
- for (auto &CI : CS.args()) {
- if (&*CI == &PN) {
- CS1.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock1));
- CS2.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock2));
+ PHINode *CallPN = nullptr;
+
+ // `musttail` calls must be followed by an optional `bitcast` and a `ret`.
+ // The split blocks will be terminated right after that, so there are no
+ // users for this phi in `TailBB`.
+ if (!IsMustTailCall && !Instr->use_empty())
+ CallPN = PHINode::Create(Instr->getType(), Preds.size(), "phi.call");
+
+ LLVM_DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
+
+ assert(Preds.size() == 2 && "The ValueToValueMaps array has size 2.");
+ // ValueToValueMapTy is neither copyable nor movable, so we use a simple array
+ // here.
+ ValueToValueMapTy ValueToValueMaps[2];
+ for (unsigned i = 0; i < Preds.size(); i++) {
+ BasicBlock *PredBB = Preds[i].first;
+ BasicBlock *SplitBlock = DuplicateInstructionsInSplitBetween(
+ TailBB, PredBB, &*std::next(Instr->getIterator()), ValueToValueMaps[i],
+ DT);
+ assert(SplitBlock && "Unexpected new basic block split.");
+
+ Instruction *NewCI =
+ &*std::prev(SplitBlock->getTerminator()->getIterator());
+ CallSite NewCS(NewCI);
+ addConditions(NewCS, Preds[i].second);
+
+ // Handle PHIs used as arguments in the call-site.
+ for (PHINode &PN : TailBB->phis()) {
+ unsigned ArgNo = 0;
+ for (auto &CI : CS.args()) {
+ if (&*CI == &PN) {
+ NewCS.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock));
+ }
+ ++ArgNo;
}
- ++ArgNo;
}
+ LLVM_DEBUG(dbgs() << " " << *NewCI << " in " << SplitBlock->getName()
+ << "\n");
+ if (CallPN)
+ CallPN->addIncoming(NewCI, SplitBlock);
+
+ // Clone and place bitcast and return instructions before `TI`
+ if (IsMustTailCall)
+ copyMustTailReturn(SplitBlock, Instr, NewCI);
}
- // Clone and place bitcast and return instructions before `TI`
- if (IsMustTailCall) {
- copyMustTailReturn(SplitBlock1, CS.getInstruction(), CallInst1);
- copyMustTailReturn(SplitBlock2, CS.getInstruction(), CallInst2);
- }
-
- // Replace users of the original call with a PHI mering call-sites split.
- if (!IsMustTailCall && Instr->getNumUses()) {
- PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call",
- TailBB->getFirstNonPHI());
- PN->addIncoming(CallInst1, SplitBlock1);
- PN->addIncoming(CallInst2, SplitBlock2);
- Instr->replaceAllUsesWith(PN);
- }
- DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
- DEBUG(dbgs() << " " << *CallInst1 << " in " << SplitBlock1->getName()
- << "\n");
- DEBUG(dbgs() << " " << *CallInst2 << " in " << SplitBlock2->getName()
- << "\n");
NumCallSiteSplit++;
@@ -354,7 +370,41 @@ static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
TailBB->eraseFromParent();
return;
}
- Instr->eraseFromParent();
+
+ auto *OriginalBegin = &*TailBB->begin();
+ // Replace users of the original call with a PHI merging the split call-sites.
+ if (CallPN) {
+ CallPN->insertBefore(OriginalBegin);
+ Instr->replaceAllUsesWith(CallPN);
+ }
+
+ // Remove instructions moved to split blocks from TailBB, from the duplicated
+ // call instruction to the beginning of the basic block. If an instruction
+ // has any uses, add a new PHI node to combine the values coming from the
+ // split blocks. The new PHI nodes are placed before the first original
+ // instruction, so we do not end up deleting them. By using reverse-order, we
+ // do not introduce unnecessary PHI nodes for def-use chains from the call
+ // instruction to the beginning of the block.
+ auto I = Instr->getReverseIterator();
+ while (I != TailBB->rend()) {
+ Instruction *CurrentI = &*I++;
+ if (!CurrentI->use_empty()) {
+ // If an existing PHI has users after the call, there is no need to create
+ // a new one.
+ if (isa<PHINode>(CurrentI))
+ continue;
+ PHINode *NewPN = PHINode::Create(CurrentI->getType(), Preds.size());
+ for (auto &Mapping : ValueToValueMaps)
+ NewPN->addIncoming(Mapping[CurrentI],
+ cast<Instruction>(Mapping[CurrentI])->getParent());
+ NewPN->insertBefore(&*TailBB->begin());
+ CurrentI->replaceAllUsesWith(NewPN);
+ }
+ CurrentI->eraseFromParent();
+ // We are done once we handled the first original instruction in TailBB.
+ if (CurrentI == OriginalBegin)
+ break;
+ }
}
// Return true if the call-site has an argument which is a PHI with only
@@ -385,45 +435,59 @@ static bool isPredicatedOnPHI(CallSite CS) {
return false;
}
-static bool tryToSplitOnPHIPredicatedArgument(CallSite CS) {
+static bool tryToSplitOnPHIPredicatedArgument(CallSite CS, DominatorTree *DT) {
if (!isPredicatedOnPHI(CS))
return false;
auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
- splitCallSite(CS, Preds[0], Preds[1], nullptr, nullptr);
+ SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS = {
+ {Preds[0], {}}, {Preds[1], {}}};
+ splitCallSite(CS, PredsCS, DT);
return true;
}
-static bool tryToSplitOnPredicatedArgument(CallSite CS) {
+static bool tryToSplitOnPredicatedArgument(CallSite CS, DominatorTree *DT) {
auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
if (Preds[0] == Preds[1])
return false;
- SmallVector<std::pair<ICmpInst *, unsigned>, 2> C1, C2;
- recordConditions(CS, Preds[0], C1);
- recordConditions(CS, Preds[1], C2);
+ SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS;
+ for (auto *Pred : make_range(Preds.rbegin(), Preds.rend())) {
+ ConditionsTy Conditions;
+ recordConditions(CS, Pred, Conditions);
+ PredsCS.push_back({Pred, Conditions});
+ }
- Instruction *CallInst1 = addConditions(CS, C1);
- Instruction *CallInst2 = addConditions(CS, C2);
- if (!CallInst1 && !CallInst2)
+ if (std::all_of(PredsCS.begin(), PredsCS.end(),
+ [](const std::pair<BasicBlock *, ConditionsTy> &P) {
+ return P.second.empty();
+ }))
return false;
- splitCallSite(CS, Preds[1], Preds[0], CallInst2, CallInst1);
+ splitCallSite(CS, PredsCS, DT);
return true;
}
-static bool tryToSplitCallSite(CallSite CS) {
- if (!CS.arg_size() || !canSplitCallSite(CS))
+static bool tryToSplitCallSite(CallSite CS, TargetTransformInfo &TTI,
+ DominatorTree *DT) {
+ if (!CS.arg_size() || !canSplitCallSite(CS, TTI))
return false;
- return tryToSplitOnPredicatedArgument(CS) ||
- tryToSplitOnPHIPredicatedArgument(CS);
+ return tryToSplitOnPredicatedArgument(CS, DT) ||
+ tryToSplitOnPHIPredicatedArgument(CS, DT);
}
-static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI,
+ TargetTransformInfo &TTI, DominatorTree *DT) {
bool Changed = false;
for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
BasicBlock &BB = *BI++;
- for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+ auto II = BB.getFirstNonPHIOrDbg()->getIterator();
+ auto IE = BB.getTerminator()->getIterator();
+ // Iterate until we reach the terminator instruction. tryToSplitCallSite
+ // can replace BB's terminator in case BB is a successor of itself. In that
+ // case, IE will be invalidated and we also have to check the current
+ // terminator.
+ while (II != IE && &*II != BB.getTerminator()) {
Instruction *I = &*II++;
CallSite CS(cast<Value>(I));
if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI))
@@ -437,7 +501,7 @@ static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
// Check if such path is possible before attempting the splitting.
bool IsMustTail = CS.isMustTailCall();
- Changed |= tryToSplitCallSite(CS);
+ Changed |= tryToSplitCallSite(CS, TTI, DT);
// There're no interesting instructions after this. The call site
// itself might have been erased on splitting.
@@ -457,6 +521,8 @@ struct CallSiteSplittingLegacyPass : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
@@ -465,7 +531,10 @@ struct CallSiteSplittingLegacyPass : public FunctionPass {
return false;
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- return doCallSiteSplitting(F, TLI);
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ return doCallSiteSplitting(F, TLI, TTI,
+ DTWP ? &DTWP->getDomTree() : nullptr);
}
};
} // namespace
@@ -474,6 +543,7 @@ char CallSiteSplittingLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
"Call-site splitting", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
"Call-site splitting", false, false)
FunctionPass *llvm::createCallSiteSplittingPass() {
@@ -483,9 +553,12 @@ FunctionPass *llvm::createCallSiteSplittingPass() {
PreservedAnalyses CallSiteSplittingPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
- if (!doCallSiteSplitting(F, TLI))
+ if (!doCallSiteSplitting(F, TLI, TTI, DT))
return PreservedAnalyses::all();
PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
return PA;
}
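
The new duplication gate in canSplitCallSite is worth isolating: every instruction preceding the call is cloned into each split block, so the transform is only allowed when their summed code-size cost stays under the threshold. A minimal sketch under that assumption, with a hypothetical CostOf callback standing in for TTI.getInstructionCost(I, TCK_CodeSize):

```cpp
#include <functional>
#include <vector>

struct Inst; // stand-in for llvm::Instruction

// Refuse to split when the code duplicated into each predecessor block
// would cost too much; same early exit as the loop in canSplitCallSite.
bool cheapEnoughToSplit(const std::vector<Inst *> &InstsBeforeCall,
                        const std::function<unsigned(Inst *)> &CostOf,
                        unsigned DuplicationThreshold = 5) {
  unsigned Cost = 0;
  for (Inst *I : InstsBeforeCall) {
    Cost += CostOf(I);
    if (Cost >= DuplicationThreshold)
      return false;
  }
  return true;
}
```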
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index e4b08c5ed305..3a675b979017 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -43,8 +43,10 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
@@ -59,8 +61,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/IR/DebugInfoMetadata.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -84,7 +84,7 @@ static cl::opt<bool> ConstHoistWithBlockFrequency(
namespace {
-/// \brief The constant hoisting pass.
+/// The constant hoisting pass.
class ConstantHoistingLegacyPass : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid
@@ -127,13 +127,13 @@ FunctionPass *llvm::createConstantHoistingPass() {
return new ConstantHoistingLegacyPass();
}
-/// \brief Perform the constant hoisting optimization for the given function.
+/// Perform the constant hoisting optimization for the given function.
bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
if (skipFunction(Fn))
return false;
- DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
- DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
+ LLVM_DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
bool MadeChange =
Impl.runImpl(Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
@@ -144,16 +144,16 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
Fn.getEntryBlock());
if (MadeChange) {
- DEBUG(dbgs() << "********** Function after Constant Hoisting: "
- << Fn.getName() << '\n');
- DEBUG(dbgs() << Fn);
+ LLVM_DEBUG(dbgs() << "********** Function after Constant Hoisting: "
+ << Fn.getName() << '\n');
+ LLVM_DEBUG(dbgs() << Fn);
}
- DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
+ LLVM_DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
return MadeChange;
}
-/// \brief Find the constant materialization insertion point.
+/// Find the constant materialization insertion point.
Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
unsigned Idx) const {
// If the operand is a cast instruction, then we have to materialize the
@@ -187,7 +187,7 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
return IDom->getBlock()->getTerminator();
}
-/// \brief Given \p BBs as input, find another set of BBs which collectively
+/// Given \p BBs as input, find another set of BBs which collectively
/// dominates \p BBs and have the minimal sum of frequencies. Return the BB
/// set found in \p BBs.
static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
@@ -289,7 +289,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
}
}
-/// \brief Find an insertion point that dominates all uses.
+/// Find an insertion point that dominates all uses.
SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
const ConstantInfo &ConstInfo) const {
assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
@@ -335,7 +335,7 @@ SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
return InsertPts;
}
-/// \brief Record constant integer ConstInt for instruction Inst at operand
+/// Record constant integer ConstInt for instruction Inst at operand
/// index Idx.
///
/// The operand at index Idx is not necessarily the constant integer itself. It
@@ -364,18 +364,17 @@ void ConstantHoistingPass::collectConstantCandidates(
Itr->second = ConstCandVec.size() - 1;
}
ConstCandVec[Itr->second].addUser(Inst, Idx, Cost);
- DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx)))
- dbgs() << "Collect constant " << *ConstInt << " from " << *Inst
+ LLVM_DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx))) dbgs()
+ << "Collect constant " << *ConstInt << " from " << *Inst
<< " with cost " << Cost << '\n';
- else
- dbgs() << "Collect constant " << *ConstInt << " indirectly from "
- << *Inst << " via " << *Inst->getOperand(Idx) << " with cost "
- << Cost << '\n';
- );
+ else dbgs() << "Collect constant " << *ConstInt
+ << " indirectly from " << *Inst << " via "
+ << *Inst->getOperand(Idx) << " with cost " << Cost
+ << '\n';);
}
}
-/// \brief Check the operand for instruction Inst at index Idx.
+/// Check the operand for instruction Inst at index Idx.
void ConstantHoistingPass::collectConstantCandidates(
ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) {
Value *Opnd = Inst->getOperand(Idx);
@@ -416,7 +415,7 @@ void ConstantHoistingPass::collectConstantCandidates(
}
}
-/// \brief Scan the instruction for expensive integer constants and record them
+/// Scan the instruction for expensive integer constants and record them
/// in the constant candidate vector.
void ConstantHoistingPass::collectConstantCandidates(
ConstCandMapType &ConstCandMap, Instruction *Inst) {
@@ -436,7 +435,7 @@ void ConstantHoistingPass::collectConstantCandidates(
} // end of for all operands
}
-/// \brief Collect all integer constants in the function that cannot be folded
+/// Collect all integer constants in the function that cannot be folded
/// into an instruction itself.
void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
ConstCandMapType ConstCandMap;
@@ -501,20 +500,21 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
return NumUses;
}
- DEBUG(dbgs() << "== Maximize constants in range ==\n");
+ LLVM_DEBUG(dbgs() << "== Maximize constants in range ==\n");
int MaxCost = -1;
for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
auto Value = ConstCand->ConstInt->getValue();
Type *Ty = ConstCand->ConstInt->getType();
int Cost = 0;
NumUses += ConstCand->Uses.size();
- DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue() << "\n");
+ LLVM_DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue()
+ << "\n");
for (auto User : ConstCand->Uses) {
unsigned Opcode = User.Inst->getOpcode();
unsigned OpndIdx = User.OpndIdx;
Cost += TTI->getIntImmCost(Opcode, OpndIdx, Value, Ty);
- DEBUG(dbgs() << "Cost: " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Cost: " << Cost << "\n");
for (auto C2 = S; C2 != E; ++C2) {
Optional<APInt> Diff = calculateOffsetDiff(
@@ -524,24 +524,24 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
const int ImmCosts =
TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty);
Cost -= ImmCosts;
- DEBUG(dbgs() << "Offset " << Diff.getValue() << " "
- << "has penalty: " << ImmCosts << "\n"
- << "Adjusted cost: " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Offset " << Diff.getValue() << " "
+ << "has penalty: " << ImmCosts << "\n"
+ << "Adjusted cost: " << Cost << "\n");
}
}
}
- DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n");
if (Cost > MaxCost) {
MaxCost = Cost;
MaxCostItr = ConstCand;
- DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue()
- << "\n");
+ LLVM_DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue()
+ << "\n");
}
}
return NumUses;
}
-/// \brief Find the base constant within the given range and rebase all other
+/// Find the base constant within the given range and rebase all other
/// constants with respect to the base constant.
void ConstantHoistingPass::findAndMakeBaseConstant(
ConstCandVecType::iterator S, ConstCandVecType::iterator E) {
@@ -567,12 +567,12 @@ void ConstantHoistingPass::findAndMakeBaseConstant(
ConstantVec.push_back(std::move(ConstInfo));
}
-/// \brief Finds and combines constant candidates that can be easily
+/// Finds and combines constant candidates that can be easily
/// rematerialized with an add from a common base constant.
void ConstantHoistingPass::findBaseConstants() {
// Sort the constants by value and type. This invalidates the mapping!
- std::sort(ConstCandVec.begin(), ConstCandVec.end(),
- [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) {
+ llvm::sort(ConstCandVec.begin(), ConstCandVec.end(),
+ [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) {
if (LHS.ConstInt->getType() != RHS.ConstInt->getType())
return LHS.ConstInt->getType()->getBitWidth() <
RHS.ConstInt->getType()->getBitWidth();
@@ -601,7 +601,7 @@ void ConstantHoistingPass::findBaseConstants() {
findAndMakeBaseConstant(MinValItr, ConstCandVec.end());
}
-/// \brief Updates the operand at Idx in instruction Inst with the result of
+/// Updates the operand at Idx in instruction Inst with the result of
/// instruction Mat. If the instruction is a PHI node then special
/// handling for duplicate values form the same incoming basic block is
/// required.
@@ -629,7 +629,7 @@ static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) {
return true;
}
-/// \brief Emit materialization code for all rebased constants and update their
+/// Emit materialization code for all rebased constants and update their
/// users.
void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
Constant *Offset,
@@ -641,19 +641,20 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
Mat = BinaryOperator::Create(Instruction::Add, Base, Offset,
"const_mat", InsertionPt);
- DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
- << " + " << *Offset << ") in BB "
- << Mat->getParent()->getName() << '\n' << *Mat << '\n');
+ LLVM_DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
+ << " + " << *Offset << ") in BB "
+ << Mat->getParent()->getName() << '\n'
+ << *Mat << '\n');
Mat->setDebugLoc(ConstUser.Inst->getDebugLoc());
}
Value *Opnd = ConstUser.Inst->getOperand(ConstUser.OpndIdx);
// Visit constant integer.
if (isa<ConstantInt>(Opnd)) {
- DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat) && Offset)
Mat->eraseFromParent();
- DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
return;
}
@@ -669,13 +670,13 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
ClonedCastInst->insertAfter(CastInst);
// Use the same debug location as the original cast instruction.
ClonedCastInst->setDebugLoc(CastInst->getDebugLoc());
- DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
- << "To : " << *ClonedCastInst << '\n');
+ LLVM_DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
+ << "To : " << *ClonedCastInst << '\n');
}
- DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ClonedCastInst);
- DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
return;
}
@@ -689,20 +690,20 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
// Use the same debug location as the instruction we are about to update.
ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc());
- DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
- << "From : " << *ConstExpr << '\n');
- DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
+ << "From : " << *ConstExpr << '\n');
+ LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ConstExprInst)) {
ConstExprInst->eraseFromParent();
if (Offset)
Mat->eraseFromParent();
}
- DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
return;
}
}
-/// \brief Hoist and hide the base constant behind a bitcast and emit
+/// Hoist and hide the base constant behind a bitcast and emit
/// materialization code for derived constants.
bool ConstantHoistingPass::emitBaseConstants() {
bool MadeChange = false;
@@ -720,9 +721,9 @@ bool ConstantHoistingPass::emitBaseConstants() {
Base->setDebugLoc(IP->getDebugLoc());
- DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant
- << ") to BB " << IP->getParent()->getName() << '\n'
- << *Base << '\n');
+ LLVM_DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant
+ << ") to BB " << IP->getParent()->getName() << '\n'
+ << *Base << '\n');
// Emit materialization code for all rebased constants.
unsigned Uses = 0;
@@ -765,7 +766,7 @@ bool ConstantHoistingPass::emitBaseConstants() {
return MadeChange;
}
-/// \brief Check all cast instructions we made a copy of and remove them if they
+/// Check all cast instructions we made a copy of and remove them if they
/// have no more users.
void ConstantHoistingPass::deleteDeadCastInst() const {
for (auto const &I : ClonedCastMap)
@@ -773,7 +774,7 @@ void ConstantHoistingPass::deleteDeadCastInst() const {
I.first->eraseFromParent();
}
-/// \brief Optimize expensive integer constants in the given function.
+/// Optimize expensive integer constants in the given function.
bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
DominatorTree &DT, BlockFrequencyInfo *BFI,
BasicBlock &Entry) {
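
The payoff of the pass, visible in emitBaseConstants above, is rematerializing each expensive constant as a cheap add from one hoisted base. A toy arithmetic sketch of the rebasing; the real pass chooses the base via TTI immediate costs and emits a "const_mat" add at each rebased use:

```cpp
#include <cstdint>
#include <vector>

// Pick one base constant, hoist it once, and rewrite every other
// candidate C as Base + (C - Base), which targets can often encode as
// a small add immediate instead of a full constant materialization.
std::vector<int64_t> rebaseAgainst(int64_t Base,
                                   const std::vector<int64_t> &Candidates) {
  std::vector<int64_t> Offsets;
  Offsets.reserve(Candidates.size());
  for (int64_t C : Candidates)
    Offsets.push_back(C - Base); // "const_mat" = Base + Offset at each use
  return Offsets;
}
```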
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
index 4fa27891a974..46915889ce7c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
@@ -21,12 +21,12 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <set>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 8f468ebf8949..ea148b728a10 100644
--- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -28,11 +29,11 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
@@ -43,7 +44,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
@@ -52,12 +52,14 @@ using namespace llvm;
#define DEBUG_TYPE "correlated-value-propagation"
STATISTIC(NumPhis, "Number of phis propagated");
+STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value");
STATISTIC(NumSelects, "Number of selects propagated");
STATISTIC(NumMemAccess, "Number of memory access targets propagated");
STATISTIC(NumCmps, "Number of comparisons propagated");
STATISTIC(NumReturns, "Number of return values propagated");
STATISTIC(NumDeadCases, "Number of switch cases removed");
STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
+STATISTIC(NumUDivs, "Number of udivs whose width was decreased");
STATISTIC(NumAShrs, "Number of ashr converted to lshr");
STATISTIC(NumSRems, "Number of srem converted to urem");
STATISTIC(NumOverflows, "Number of overflow checks removed");
@@ -77,8 +79,10 @@ namespace {
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
}
};
@@ -88,6 +92,7 @@ char CorrelatedValuePropagation::ID = 0;
INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
"Value Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
"Value Propagation", false, false)
@@ -101,14 +106,14 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
if (S->getType()->isVectorTy()) return false;
if (isa<Constant>(S->getOperand(0))) return false;
- Constant *C = LVI->getConstant(S->getOperand(0), S->getParent(), S);
+ Constant *C = LVI->getConstant(S->getCondition(), S->getParent(), S);
if (!C) return false;
ConstantInt *CI = dyn_cast<ConstantInt>(C);
if (!CI) return false;
- Value *ReplaceWith = S->getOperand(1);
- Value *Other = S->getOperand(2);
+ Value *ReplaceWith = S->getTrueValue();
+ Value *Other = S->getFalseValue();
if (!CI->isOne()) std::swap(ReplaceWith, Other);
if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType());
@@ -120,7 +125,63 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
return true;
}
-static bool processPHI(PHINode *P, LazyValueInfo *LVI,
+/// Try to simplify a phi with constant incoming values that match the edge
+/// values of a non-constant value on all other edges:
+/// bb0:
+/// %isnull = icmp eq i8* %x, null
+/// br i1 %isnull, label %bb2, label %bb1
+/// bb1:
+/// br label %bb2
+/// bb2:
+/// %r = phi i8* [ %x, %bb1 ], [ null, %bb0 ]
+/// -->
+/// %r = %x
+static bool simplifyCommonValuePhi(PHINode *P, LazyValueInfo *LVI,
+ DominatorTree *DT) {
+ // Collect incoming constants and initialize possible common value.
+ SmallVector<std::pair<Constant *, unsigned>, 4> IncomingConstants;
+ Value *CommonValue = nullptr;
+ for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) {
+ Value *Incoming = P->getIncomingValue(i);
+ if (auto *IncomingConstant = dyn_cast<Constant>(Incoming)) {
+ IncomingConstants.push_back(std::make_pair(IncomingConstant, i));
+ } else if (!CommonValue) {
+ // The potential common value is initialized to the first non-constant.
+ CommonValue = Incoming;
+ } else if (Incoming != CommonValue) {
+ // There can be only one non-constant common value.
+ return false;
+ }
+ }
+
+ if (!CommonValue || IncomingConstants.empty())
+ return false;
+
+ // The common value must be valid in all incoming blocks.
+ BasicBlock *ToBB = P->getParent();
+ if (auto *CommonInst = dyn_cast<Instruction>(CommonValue))
+ if (!DT->dominates(CommonInst, ToBB))
+ return false;
+
+ // We have a phi with exactly 1 variable incoming value and 1 or more constant
+ // incoming values. See if all constant incoming values can be mapped back to
+ // the same incoming variable value.
+ for (auto &IncomingConstant : IncomingConstants) {
+ Constant *C = IncomingConstant.first;
+ BasicBlock *IncomingBB = P->getIncomingBlock(IncomingConstant.second);
+ if (C != LVI->getConstantOnEdge(CommonValue, IncomingBB, ToBB, P))
+ return false;
+ }
+
+ // All constant incoming values map to the same variable along the incoming
+ // edges of the phi. The phi is unnecessary.
+ P->replaceAllUsesWith(CommonValue);
+ P->eraseFromParent();
+ ++NumPhiCommon;
+ return true;
+}
+
+static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT,
const SimplifyQuery &SQ) {
bool Changed = false;
@@ -168,7 +229,7 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI,
V = SI->getTrueValue();
}
- DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
+ LLVM_DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
}
P->setIncomingValue(i, V);
@@ -181,6 +242,9 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI,
Changed = true;
}
+ if (!Changed)
+ Changed = simplifyCommonValuePhi(P, LVI, DT);
+
if (Changed)
++NumPhis;
@@ -243,7 +307,7 @@ static bool processCmp(CmpInst *C, LazyValueInfo *LVI) {
/// that cannot fire no matter what the incoming edge can safely be removed. If
/// a case fires on every incoming edge then the entire switch can be removed
/// and replaced with a branch to the case destination.
-static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
+static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI, DominatorTree *DT) {
Value *Cond = SI->getCondition();
BasicBlock *BB = SI->getParent();
@@ -258,6 +322,10 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
// Analyse each switch case in turn.
bool Changed = false;
+ DenseMap<BasicBlock*, int> SuccessorsCount;
+ for (auto *Succ : successors(BB))
+ SuccessorsCount[Succ]++;
+
for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
ConstantInt *Case = CI->getCaseValue();
@@ -292,7 +360,8 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
if (State == LazyValueInfo::False) {
// This case never fires - remove it.
- CI->getCaseSuccessor()->removePredecessor(BB);
+ BasicBlock *Succ = CI->getCaseSuccessor();
+ Succ->removePredecessor(BB);
CI = SI->removeCase(CI);
CE = SI->case_end();
@@ -302,6 +371,8 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
++NumDeadCases;
Changed = true;
+ if (--SuccessorsCount[Succ] == 0)
+ DT->deleteEdge(BB, Succ);
continue;
}
if (State == LazyValueInfo::True) {
@@ -318,10 +389,14 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
++CI;
}
- if (Changed)
+ if (Changed) {
// If the switch has been simplified to the point where it can be replaced
// by a branch then do so now.
- ConstantFoldTerminator(BB);
+ DeferredDominance DDT(*DT);
+ ConstantFoldTerminator(BB, /*DeleteDeadConditions = */ false,
+ /*TLI = */ nullptr, &DDT);
+ DDT.flush();
+ }
return Changed;
}
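As a hedged illustration (hypothetical IR), suppose LVI proves %x is never 0 at this switch:

    switch i32 %x, label %default [
      i32 0, label %case0
      i32 1, label %case1
    ]

Case 0 can never fire, so it is removed; if that was the block's last edge to %case0, the new SuccessorsCount bookkeeping deletes the dominator-tree edge as well. Once only the default remains, ConstantFoldTerminator rewrites the switch into an unconditional branch, with the deferred-dominance updates flushed afterwards.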
@@ -430,9 +505,50 @@ static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) {
return true;
}
+/// Try to shrink a udiv/urem's width down to the smallest power of two that's
+/// sufficient to contain its operands.
+static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) {
+ assert(Instr->getOpcode() == Instruction::UDiv ||
+ Instr->getOpcode() == Instruction::URem);
+ if (Instr->getType()->isVectorTy())
+ return false;
+
+ // Find the smallest power of two bitwidth that's sufficient to hold Instr's
+ // operands.
+ auto OrigWidth = Instr->getType()->getIntegerBitWidth();
+ ConstantRange OperandRange(OrigWidth, /*isFullset=*/false);
+ for (Value *Operand : Instr->operands()) {
+ OperandRange = OperandRange.unionWith(
+ LVI->getConstantRange(Operand, Instr->getParent()));
+ }
+ // Don't shrink below 8 bits wide.
+ unsigned NewWidth = std::max<unsigned>(
+ PowerOf2Ceil(OperandRange.getUnsignedMax().getActiveBits()), 8);
+ // NewWidth might be greater than OrigWidth if OrigWidth is not a power of
+ // two.
+ if (NewWidth >= OrigWidth)
+ return false;
+
+ ++NumUDivs;
+ auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth);
+ auto *LHS = CastInst::Create(Instruction::Trunc, Instr->getOperand(0), TruncTy,
+ Instr->getName() + ".lhs.trunc", Instr);
+ auto *RHS = CastInst::Create(Instruction::Trunc, Instr->getOperand(1), TruncTy,
+ Instr->getName() + ".rhs.trunc", Instr);
+ auto *BO =
+ BinaryOperator::Create(Instr->getOpcode(), LHS, RHS, Instr->getName(), Instr);
+ auto *Zext = CastInst::Create(Instruction::ZExt, BO, Instr->getType(),
+ Instr->getName() + ".zext", Instr);
+ if (BO->getOpcode() == Instruction::UDiv)
+ BO->setIsExact(Instr->isExact());
+
+ Instr->replaceAllUsesWith(Zext);
+ Instr->eraseFromParent();
+ return true;
+}
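A sketch of the rewrite, assuming LVI proves both operands fit in 16 bits (so PowerOf2Ceil picks a 16-bit type):

    %q = udiv i32 %a, %b
      ; becomes
    %q.lhs.trunc = trunc i32 %a to i16
    %q.rhs.trunc = trunc i32 %b to i16
    %q1 = udiv i16 %q.lhs.trunc, %q.rhs.trunc
    %q.zext = zext i16 %q1 to i32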
+
static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
- if (SDI->getType()->isVectorTy() ||
- !hasPositiveOperands(SDI, LVI))
+ if (SDI->getType()->isVectorTy() || !hasPositiveOperands(SDI, LVI))
return false;
++NumSRems;
@@ -440,6 +556,10 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
SDI->getName(), SDI);
SDI->replaceAllUsesWith(BO);
SDI->eraseFromParent();
+
+ // Try to process our new urem.
+ processUDivOrURem(BO, LVI);
+
return true;
}
@@ -449,8 +569,7 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
/// conditions, this can sometimes prove conditions instcombine can't by
/// exploiting range information.
static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
- if (SDI->getType()->isVectorTy() ||
- !hasPositiveOperands(SDI, LVI))
+ if (SDI->getType()->isVectorTy() || !hasPositiveOperands(SDI, LVI))
return false;
++NumSDivs;
@@ -460,6 +579,9 @@ static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
SDI->replaceAllUsesWith(BO);
SDI->eraseFromParent();
+ // Try to simplify our new udiv.
+ processUDivOrURem(BO, LVI);
+
return true;
}
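For instance (hypothetical IR), with both operands known non-negative:

    %d = sdiv i32 %a, %b
      ; becomes
    %d1 = udiv i32 %a, %b

and the follow-on processUDivOrURem call may then narrow %d1 further when the operand ranges are small enough.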
@@ -559,7 +681,8 @@ static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
ConstantInt::getFalse(C->getContext());
}
-static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
+static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
+ const SimplifyQuery &SQ) {
bool FnChanged = false;
// Visiting in a pre-order depth-first traversal causes us to simplify early
// blocks before querying later blocks (which require us to analyze early
@@ -575,7 +698,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
BBChanged |= processSelect(cast<SelectInst>(II), LVI);
break;
case Instruction::PHI:
- BBChanged |= processPHI(cast<PHINode>(II), LVI, SQ);
+ BBChanged |= processPHI(cast<PHINode>(II), LVI, DT, SQ);
break;
case Instruction::ICmp:
case Instruction::FCmp:
@@ -595,6 +718,10 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
case Instruction::SDiv:
BBChanged |= processSDiv(cast<BinaryOperator>(II), LVI);
break;
+ case Instruction::UDiv:
+ case Instruction::URem:
+ BBChanged |= processUDivOrURem(cast<BinaryOperator>(II), LVI);
+ break;
case Instruction::AShr:
BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
break;
@@ -607,7 +734,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
Instruction *Term = BB->getTerminator();
switch (Term->getOpcode()) {
case Instruction::Switch:
- BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI);
+ BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI, DT);
break;
case Instruction::Ret: {
auto *RI = cast<ReturnInst>(Term);
@@ -636,18 +763,22 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) {
return false;
LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
- return runImpl(F, LVI, getBestSimplifyQuery(*this, F));
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ return runImpl(F, LVI, DT, getBestSimplifyQuery(*this, F));
}
PreservedAnalyses
CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
-
LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
- bool Changed = runImpl(F, LVI, getBestSimplifyQuery(AM, F));
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+
+ bool Changed = runImpl(F, LVI, DT, getBestSimplifyQuery(AM, F));
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
index fa4806e884c3..6078967a0f94 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -20,11 +20,11 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
#define DEBUG_TYPE "dce"
@@ -50,6 +50,7 @@ namespace {
for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
Instruction *Inst = &*DI++;
if (isInstructionTriviallyDead(Inst, TLI)) {
+ salvageDebugInfo(*Inst);
Inst->eraseFromParent();
Changed = true;
++DIEEliminated;
@@ -76,6 +77,8 @@ static bool DCEInstruction(Instruction *I,
SmallSetVector<Instruction *, 16> &WorkList,
const TargetLibraryInfo *TLI) {
if (isInstructionTriviallyDead(I, TLI)) {
+ salvageDebugInfo(*I);
+
// Null out all of the instruction's operands to see if any operand becomes
// dead as we go.
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
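A rough illustration of what salvaging buys (metadata and expression spelling illustrative, not taken from this change): when a dead instruction feeds a dbg.value, the variable location is rewritten in terms of a live operand instead of being dropped.

    %add = add i32 %x, 1    ; trivially dead apart from debug info
    call void @llvm.dbg.value(metadata i32 %add, metadata !v,
                              metadata !DIExpression())
      ; after DCE with salvage, roughly:
    call void @llvm.dbg.value(metadata i32 %x, metadata !v,
                              metadata !DIExpression(DW_OP_plus_uconst, 1,
                                                     DW_OP_stack_value))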
diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index b665d94a70aa..dd1a2a6adb82 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -30,6 +30,7 @@
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -56,11 +57,10 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
-#include <cstdint>
#include <cstddef>
+#include <cstdint>
#include <iterator>
#include <map>
#include <utility>
@@ -115,6 +115,9 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
Instruction *DeadInst = NowDeadInsts.pop_back_val();
++NumFastOther;
+ // Try to preserve debug information attached to the dead instruction.
+ salvageDebugInfo(*DeadInst);
+
// This instruction is dead, zap it, in stages. Start by removing it from
// MemDep, which needs to know the operands and needs it to be in the
// function.
@@ -146,7 +149,8 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
/// Does this instruction write some memory? This only returns true for things
/// that we can analyze with other helpers below.
-static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
+static bool hasAnalyzableMemoryWrite(Instruction *I,
+ const TargetLibraryInfo &TLI) {
if (isa<StoreInst>(I))
return true;
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
@@ -156,6 +160,9 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
case Intrinsic::memset:
case Intrinsic::memmove:
case Intrinsic::memcpy:
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memmove_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
case Intrinsic::init_trampoline:
case Intrinsic::lifetime_end:
return true;
@@ -180,43 +187,45 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
/// Return a Location stored to by the specified instruction. If isRemovable
/// returns true, this function and getLocForRead completely describe the memory
/// operations for this instruction.
-static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
+static MemoryLocation getLocForWrite(Instruction *Inst) {
+
if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
return MemoryLocation::get(SI);
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) {
+ if (auto *MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
// memcpy/memmove/memset.
MemoryLocation Loc = MemoryLocation::getForDest(MI);
return Loc;
}
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
- if (!II)
- return MemoryLocation();
-
- switch (II->getIntrinsicID()) {
- default:
- return MemoryLocation(); // Unhandled intrinsic.
- case Intrinsic::init_trampoline:
- // FIXME: We don't know the size of the trampoline, so we can't really
- // handle it here.
- return MemoryLocation(II->getArgOperand(0));
- case Intrinsic::lifetime_end: {
- uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
- return MemoryLocation(II->getArgOperand(1), Len);
- }
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ return MemoryLocation(); // Unhandled intrinsic.
+ case Intrinsic::init_trampoline:
+ return MemoryLocation(II->getArgOperand(0));
+ case Intrinsic::lifetime_end: {
+ uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
+ return MemoryLocation(II->getArgOperand(1), Len);
+ }
+ }
}
+ if (auto CS = CallSite(Inst))
+ // All the supported TLI functions so far happen to have dest as their
+ // first argument.
+ return MemoryLocation(CS.getArgument(0));
+ return MemoryLocation();
}
-/// Return the location read by the specified "hasMemoryWrite" instruction if
-/// any.
+/// Return the location read by the specified "hasAnalyzableMemoryWrite"
+/// instruction if any.
static MemoryLocation getLocForRead(Instruction *Inst,
const TargetLibraryInfo &TLI) {
- assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case");
+ assert(hasAnalyzableMemoryWrite(Inst, TLI) && "Unknown instruction case");
// The only instructions that both read and write are the mem transfer
// instructions (memcpy/memmove).
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst))
+ if (auto *MTI = dyn_cast<AnyMemTransferInst>(Inst))
return MemoryLocation::getForSource(MTI);
return MemoryLocation();
}
@@ -230,7 +239,7 @@ static bool isRemovable(Instruction *I) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
- default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate");
+ default: llvm_unreachable("doesn't pass 'hasAnalyzableMemoryWrite' predicate");
case Intrinsic::lifetime_end:
// Never remove dead lifetime_end's, e.g. because it is followed by a
// free.
@@ -243,9 +252,14 @@ static bool isRemovable(Instruction *I) {
case Intrinsic::memcpy:
// Don't remove volatile memory intrinsics.
return !cast<MemIntrinsic>(II)->isVolatile();
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memmove_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
+ return true;
}
}
+  // Note: we only get here for calls with analyzable writes, i.e. libcalls.
if (auto CS = CallSite(I))
return CS.getInstruction()->use_empty();
@@ -264,6 +278,8 @@ static bool isShortenableAtTheEnd(Instruction *I) {
default: return false;
case Intrinsic::memset:
case Intrinsic::memcpy:
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
// Do shorten memory intrinsics.
// FIXME: Add memmove if it's also safe to transform.
return true;
@@ -280,35 +296,27 @@ static bool isShortenableAtTheEnd(Instruction *I) {
static bool isShortenableAtTheBeginning(Instruction *I) {
// FIXME: Handle only memset for now. Supporting memcpy/memmove should be
// easily done by offsetting the source address.
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
- return II && II->getIntrinsicID() == Intrinsic::memset;
+ return isa<AnyMemSetInst>(I);
}
/// Return the pointer that is being written to.
static Value *getStoredPointerOperand(Instruction *I) {
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->getPointerOperand();
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
- return MI->getDest();
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default: llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::init_trampoline:
- return II->getArgOperand(0);
- }
- }
-
- CallSite CS(I);
- // All the supported functions so far happen to have dest as their first
- // argument.
- return CS.getArgument(0);
+  // TODO: factor this to reuse getLocForWrite
+ MemoryLocation Loc = getLocForWrite(I);
+ assert(Loc.Ptr &&
+ "unable to find pointer written for analyzable instruction?");
+ // TODO: most APIs don't expect const Value *
+ return const_cast<Value*>(Loc.Ptr);
}
static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
- const TargetLibraryInfo &TLI) {
+ const TargetLibraryInfo &TLI,
+ const Function *F) {
uint64_t Size;
- if (getObjectSize(V, Size, DL, &TLI))
+ ObjectSizeOpts Opts;
+ Opts.NullIsUnknownSize = NullPointerIsDefined(F);
+
+ if (getObjectSize(V, Size, DL, &TLI, Opts))
return Size;
return MemoryLocation::UnknownSize;
}
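The new Function parameter matters where null is a defined address: in a function carrying the "null-pointer-is-valid"="true" attribute (the condition NullPointerIsDefined checks), an object based on null must not be given a known size, so the query falls back to UnknownSize rather than letting DSE treat accesses through null as fully understood. A minimal sketch:

    define void @f() "null-pointer-is-valid"="true" {
      ; here getPointerSize(null, DL, TLI, &F) yields UnknownSize, not 0
      ret void
    }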
@@ -338,7 +346,9 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
const TargetLibraryInfo &TLI,
int64_t &EarlierOff, int64_t &LaterOff,
Instruction *DepWrite,
- InstOverlapIntervalsTy &IOL) {
+ InstOverlapIntervalsTy &IOL,
+ AliasAnalysis &AA,
+ const Function *F) {
// If we don't know the sizes of either access, then we can't do a comparison.
if (Later.Size == MemoryLocation::UnknownSize ||
Earlier.Size == MemoryLocation::UnknownSize)
@@ -349,7 +359,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// If the start pointers are the same, we just have to compare sizes to see if
// the later store was larger than the earlier store.
- if (P1 == P2) {
+ if (P1 == P2 || AA.isMustAlias(P1, P2)) {
// Make sure that the Later size is >= the Earlier size.
if (Later.Size >= Earlier.Size)
return OW_Complete;
@@ -367,7 +377,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
return OW_Unknown;
// If the "Later" store is to a recognizable object, get its size.
- uint64_t ObjectSize = getPointerSize(UO2, DL, TLI);
+ uint64_t ObjectSize = getPointerSize(UO2, DL, TLI, F);
if (ObjectSize != MemoryLocation::UnknownSize)
if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size)
return OW_Complete;
@@ -415,9 +425,10 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// Insert our part of the overlap into the map.
auto &IM = IOL[DepWrite];
- DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff << ", " <<
- int64_t(EarlierOff + Earlier.Size) << ") Later [" <<
- LaterOff << ", " << int64_t(LaterOff + Later.Size) << ")\n");
+ LLVM_DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff
+ << ", " << int64_t(EarlierOff + Earlier.Size)
+ << ") Later [" << LaterOff << ", "
+ << int64_t(LaterOff + Later.Size) << ")\n");
// Make sure that we only insert non-overlapping intervals and combine
// adjacent intervals. The intervals are stored in the map with the ending
@@ -454,11 +465,11 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
ILI = IM.begin();
if (ILI->second <= EarlierOff &&
ILI->first >= int64_t(EarlierOff + Earlier.Size)) {
- DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier [" <<
- EarlierOff << ", " <<
- int64_t(EarlierOff + Earlier.Size) <<
- ") Composite Later [" <<
- ILI->second << ", " << ILI->first << ")\n");
+ LLVM_DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier ["
+ << EarlierOff << ", "
+ << int64_t(EarlierOff + Earlier.Size)
+ << ") Composite Later [" << ILI->second << ", "
+ << ILI->first << ")\n");
++NumCompletePartials;
return OW_Complete;
}
@@ -469,10 +480,11 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (EnablePartialStoreMerging && LaterOff >= EarlierOff &&
int64_t(EarlierOff + Earlier.Size) > LaterOff &&
uint64_t(LaterOff - EarlierOff) + Later.Size <= Earlier.Size) {
- DEBUG(dbgs() << "DSE: Partial overwrite an earlier load [" << EarlierOff
- << ", " << int64_t(EarlierOff + Earlier.Size)
- << ") by a later store [" << LaterOff << ", "
- << int64_t(LaterOff + Later.Size) << ")\n");
+ LLVM_DEBUG(dbgs() << "DSE: Partial overwrite an earlier load ["
+ << EarlierOff << ", "
+ << int64_t(EarlierOff + Earlier.Size)
+ << ") by a later store [" << LaterOff << ", "
+ << int64_t(LaterOff + Later.Size) << ")\n");
// TODO: Maybe come up with a better name?
return OW_PartialEarlierWithFullLater;
}
@@ -514,8 +526,8 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
/// memory region into an identical pointer) then it doesn't actually make its
/// input dead in the traditional sense. Consider this case:
///
-/// memcpy(A <- B)
-/// memcpy(A <- A)
+/// memmove(A <- B)
+/// memmove(A <- A)
///
/// In this case, the second store to A does not make the first store to A dead.
/// The usual situation isn't an explicit A<-A store like this (which can be
@@ -531,24 +543,35 @@ static bool isPossibleSelfRead(Instruction *Inst,
// Self reads can only happen for instructions that read memory. Get the
// location read.
MemoryLocation InstReadLoc = getLocForRead(Inst, TLI);
- if (!InstReadLoc.Ptr) return false; // Not a reading instruction.
+ if (!InstReadLoc.Ptr)
+ return false; // Not a reading instruction.
// If the read and written loc obviously don't alias, it isn't a read.
- if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false;
-
- // Okay, 'Inst' may copy over itself. However, we can still remove a the
- // DepWrite instruction if we can prove that it reads from the same location
- // as Inst. This handles useful cases like:
- // memcpy(A <- B)
- // memcpy(A <- B)
- // Here we don't know if A/B may alias, but we do know that B/B are must
- // aliases, so removing the first memcpy is safe (assuming it writes <= #
- // bytes as the second one.
- MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
-
- if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
+ if (AA.isNoAlias(InstReadLoc, InstStoreLoc))
return false;
+ if (isa<AnyMemCpyInst>(Inst)) {
+ // LLVM's memcpy overlap semantics are not fully fleshed out (see PR11763)
+ // but in practice memcpy(A <- B) either means that A and B are disjoint or
+  // are equal (i.e. there are no partial overlaps). Given that, if we have:
+ //
+ // memcpy/memmove(A <- B) // DepWrite
+ // memcpy(A <- B) // Inst
+ //
+  // with Inst reading/writing a size >= DepWrite's size, we can reason as
+ // follows:
+ //
+ // - If A == B then both the copies are no-ops, so the DepWrite can be
+ // removed.
+ // - If A != B then A and B are disjoint locations in Inst. Since
+  //   Inst.size >= DepWrite.size, A and B are disjoint in DepWrite too.
+ // Therefore DepWrite can be removed.
+ MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
+
+ if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
+ return false;
+ }
+
// If DepWrite doesn't read memory or if we can't prove it is a must alias,
// then it can't be considered dead.
return true;
@@ -650,7 +673,8 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA,
MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB);
while (Dep.isDef() || Dep.isClobber()) {
Instruction *Dependency = Dep.getInst();
- if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency))
+ if (!hasAnalyzableMemoryWrite(Dependency, *TLI) ||
+ !isRemovable(Dependency))
break;
Value *DepPointer =
@@ -660,8 +684,9 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA,
if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
break;
- DEBUG(dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: "
- << *Dependency << '\n');
+ LLVM_DEBUG(
+ dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: "
+ << *Dependency << '\n');
// DCE instructions only used to calculate that store.
BasicBlock::iterator BBI(Dependency);
@@ -690,7 +715,8 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA,
static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
SmallSetVector<Value *, 16> &DeadStackObjects,
const DataLayout &DL, AliasAnalysis *AA,
- const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI,
+ const Function *F) {
const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL);
// A constant can't be in the dead pointer set.
@@ -707,7 +733,7 @@ static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
// Remove objects that could alias LoadedLoc.
DeadStackObjects.remove_if([&](Value *I) {
// See if the loaded location could alias the stack location.
- MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI));
+ MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI, F));
return !AA->isNoAlias(StackLoc, LoadedLoc);
});
}
@@ -754,7 +780,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
--BBI;
// If we find a store, check to see if it points into a dead stack value.
- if (hasMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
+ if (hasAnalyzableMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
// See through pointer-to-pointer bitcasts
SmallVector<Value *, 4> Pointers;
GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL);
@@ -770,15 +796,16 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
if (AllDead) {
Instruction *Dead = &*BBI;
- DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
- << *Dead << "\n Objects: ";
- for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
- E = Pointers.end(); I != E; ++I) {
- dbgs() << **I;
- if (std::next(I) != E)
- dbgs() << ", ";
- }
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
+ << *Dead << "\n Objects: ";
+ for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
+ E = Pointers.end();
+ I != E; ++I) {
+ dbgs() << **I;
+ if (std::next(I) != E)
+ dbgs() << ", ";
+ } dbgs()
+ << '\n');
// DCE instructions only used to calculate that store.
deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, InstrOrdering, &DeadStackObjects);
@@ -790,8 +817,8 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
// Remove any dead non-memory-mutating instructions.
if (isInstructionTriviallyDead(&*BBI, TLI)) {
- DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
- << *&*BBI << '\n');
+ LLVM_DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
+ << *&*BBI << '\n');
deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, InstrOrdering, &DeadStackObjects);
++NumFastOther;
MadeChange = true;
@@ -820,7 +847,8 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
// the call is live.
DeadStackObjects.remove_if([&](Value *I) {
// See if the call site touches the value.
- return isRefSet(AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI)));
+ return isRefSet(AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI,
+ BB.getParent())));
});
// If all of the allocas were clobbered by the call then we're not going
@@ -848,8 +876,6 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
LoadedLoc = MemoryLocation::get(L);
} else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) {
LoadedLoc = MemoryLocation::get(V);
- } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) {
- LoadedLoc = MemoryLocation::getForSource(MTI);
} else if (!BBI->mayReadFromMemory()) {
// Instruction doesn't read memory. Note that stores that weren't removed
// above will hit this case.
@@ -861,7 +887,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
// Remove any allocas from the DeadPointer set that are loaded, as this
// makes any stores above the access live.
- removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI);
+ removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI, BB.getParent());
// If all of the allocas were clobbered by the access then we're not going
// to find anything else to process.
@@ -881,8 +907,8 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
// Power-of-2 vector writes are probably always a bad idea to optimize:
// any store/memset/memcpy is likely using vector instructions, so
// shortening it to a non-vector size is likely to be slower.
- MemIntrinsic *EarlierIntrinsic = cast<MemIntrinsic>(EarlierWrite);
- unsigned EarlierWriteAlign = EarlierIntrinsic->getAlignment();
+ auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite);
+ unsigned EarlierWriteAlign = EarlierIntrinsic->getDestAlignment();
if (!IsOverwriteEnd)
LaterOffset = int64_t(LaterOffset + LaterSize);
@@ -890,15 +916,23 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
!((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
return false;
- DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
- << (IsOverwriteEnd ? "END" : "BEGIN") << ": " << *EarlierWrite
- << "\n KILLER (offset " << LaterOffset << ", " << EarlierSize
- << ")\n");
-
int64_t NewLength = IsOverwriteEnd
? LaterOffset - EarlierOffset
: EarlierSize - (LaterOffset - EarlierOffset);
+ if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) {
+ // When shortening an atomic memory intrinsic, the newly shortened
+ // length must remain an integer multiple of the element size.
+ const uint32_t ElementSize = AMI->getElementSizeInBytes();
+ if (0 != NewLength % ElementSize)
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
+ << (IsOverwriteEnd ? "END" : "BEGIN") << ": "
+ << *EarlierWrite << "\n KILLER (offset " << LaterOffset
+ << ", " << EarlierSize << ")\n");
+
Value *EarlierWriteLength = EarlierIntrinsic->getLength();
Value *TrimmedLength =
ConstantInt::get(EarlierWriteLength->getType(), NewLength);
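Worked example of the new divisibility check (numbers hypothetical): shortening a 32-byte memset_element_unordered_atomic with element size 4 because its last 10 bytes are overwritten would give NewLength = 22, which is not a multiple of 4, so the trim is rejected; a 12-byte overwrite at the end (NewLength = 20) would be accepted.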
@@ -966,7 +1000,7 @@ static bool removePartiallyOverlappedStores(AliasAnalysis *AA,
bool Changed = false;
for (auto OI : IOL) {
Instruction *EarlierWrite = OI.first;
- MemoryLocation Loc = getLocForWrite(EarlierWrite, *AA);
+ MemoryLocation Loc = getLocForWrite(EarlierWrite);
assert(isRemovable(EarlierWrite) && "Expect only removable instruction");
assert(Loc.Size != MemoryLocation::UnknownSize && "Unexpected mem loc");
@@ -1002,8 +1036,9 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
isRemovable(SI) && memoryIsNotModifiedBetween(DepLoad, SI, AA)) {
- DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
- << *DepLoad << "\n STORE: " << *SI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
+ << *DepLoad << "\n STORE: " << *SI << '\n');
deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, InstrOrdering);
++NumRedundantStores;
@@ -1019,7 +1054,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
memoryIsNotModifiedBetween(UnderlyingPointer, SI, AA)) {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
<< *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
@@ -1067,7 +1102,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
}
// Check to see if Inst writes to memory. If not, continue.
- if (!hasMemoryWrite(Inst, *TLI))
+ if (!hasAnalyzableMemoryWrite(Inst, *TLI))
continue;
// eliminateNoopStore will update in iterator, if necessary.
@@ -1085,7 +1120,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
continue;
// Figure out what location is being stored to.
- MemoryLocation Loc = getLocForWrite(Inst, *AA);
+ MemoryLocation Loc = getLocForWrite(Inst);
// If we didn't get a useful location, fail.
if (!Loc.Ptr)
@@ -1107,7 +1142,9 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
//
// Find out what memory location the dependent instruction stores.
Instruction *DepWrite = InstDep.getInst();
- MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA);
+ if (!hasAnalyzableMemoryWrite(DepWrite, *TLI))
+ break;
+ MemoryLocation DepLoc = getLocForWrite(DepWrite);
// If we didn't get a useful location, or if it isn't a size, bail out.
if (!DepLoc.Ptr)
break;
@@ -1145,12 +1182,12 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
if (isRemovable(DepWrite) &&
!isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
int64_t InstWriteOffset, DepWriteOffset;
- OverwriteResult OR =
- isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset,
- DepWrite, IOL);
+ OverwriteResult OR = isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset,
+ InstWriteOffset, DepWrite, IOL, *AA,
+ BB.getParent());
if (OR == OW_Complete) {
- DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
- << *DepWrite << "\n KILLER: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite
+ << "\n KILLER: " << *Inst << '\n');
// Delete the store and now-dead instructions that feed it.
deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, &InstrOrdering);
@@ -1208,9 +1245,9 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
// store, shifted appropriately.
APInt Merged =
(EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
- DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *DepWrite
- << "\n Later: " << *Inst
- << "\n Merged Value: " << Merged << '\n');
+ LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *DepWrite
+ << "\n Later: " << *Inst
+ << "\n Merged Value: " << Merged << '\n');
auto *SI = new StoreInst(
ConstantInt::get(Earlier->getValueOperand()->getType(), Merged),
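Worked example of the merge arithmetic (little-endian, values hypothetical): an earlier store i32 0xAABBCCDD partially overwritten by a later store i8 0x11 at byte offset 1 gives LShiftAmount = 8 and Mask = 0x0000FF00, so

    Merged = (0xAABBCCDD & ~0x0000FF00) | (0x11 << 8) = 0xAABB11DD

and the two stores are merged into a single store of that constant.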
diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 5798e1c4ee99..565745d12e99 100644
--- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -27,6 +27,7 @@
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -49,10 +50,10 @@
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/RecyclingAllocator.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <deque>
#include <memory>
@@ -70,13 +71,16 @@ STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
STATISTIC(NumCSECall, "Number of call instructions CSE'd");
STATISTIC(NumDSE, "Number of trivial dead stores removed");
+DEBUG_COUNTER(CSECounter, "early-cse",
+ "Controls which instructions are removed");
+
//===----------------------------------------------------------------------===//
// SimpleValue
//===----------------------------------------------------------------------===//
namespace {
-/// \brief Struct representing the available values in the scoped hash table.
+/// Struct representing the available values in the scoped hash table.
struct SimpleValue {
Instruction *Inst;
@@ -151,12 +155,15 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
SelectPatternFlavor SPF = matchSelectPattern(Inst, A, B).Flavor;
// TODO: We should also detect FP min/max.
if (SPF == SPF_SMIN || SPF == SPF_SMAX ||
- SPF == SPF_UMIN || SPF == SPF_UMAX ||
- SPF == SPF_ABS || SPF == SPF_NABS) {
+ SPF == SPF_UMIN || SPF == SPF_UMAX) {
if (A > B)
std::swap(A, B);
return hash_combine(Inst->getOpcode(), SPF, A, B);
}
+ if (SPF == SPF_ABS || SPF == SPF_NABS) {
+ // ABS/NABS always puts the input in A and its negation in B.
+ return hash_combine(Inst->getOpcode(), SPF, A, B);
+ }
if (CastInst *CI = dyn_cast<CastInst>(Inst))
return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));
@@ -226,8 +233,13 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
LSPF == SPF_ABS || LSPF == SPF_NABS) {
Value *RHSA, *RHSB;
SelectPatternFlavor RSPF = matchSelectPattern(RHSI, RHSA, RHSB).Flavor;
- return (LSPF == RSPF && ((LHSA == RHSA && LHSB == RHSB) ||
- (LHSA == RHSB && LHSB == RHSA)));
+ if (LSPF == RSPF) {
+ // Abs results are placed in a defined order by matchSelectPattern.
+ if (LSPF == SPF_ABS || LSPF == SPF_NABS)
+ return LHSA == RHSA && LHSB == RHSB;
+ return ((LHSA == RHSA && LHSB == RHSB) ||
+ (LHSA == RHSB && LHSB == RHSA));
+ }
}
return false;
@@ -239,7 +251,7 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
namespace {
-/// \brief Struct representing the available call values in the scoped hash
+/// Struct representing the available call values in the scoped hash
/// table.
struct CallValue {
Instruction *Inst;
@@ -305,7 +317,7 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
namespace {
-/// \brief A simple and fast domtree-based CSE pass.
+/// A simple and fast domtree-based CSE pass.
///
/// This pass does a simple depth-first walk over the dominator tree,
/// eliminating trivially redundant instructions and using instsimplify to
@@ -329,7 +341,7 @@ public:
ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
AllocatorTy>;
- /// \brief A scoped hash table of the current values of all of our simple
+ /// A scoped hash table of the current values of all of our simple
/// scalar expressions.
///
/// As we walk down the domtree, we look to see if instructions are in this:
@@ -337,8 +349,8 @@ public:
/// that dominated values can succeed in their lookup.
ScopedHTType AvailableValues;
- /// A scoped hash table of the current values of previously encounted memory
- /// locations.
+ /// A scoped hash table of the current values of previously encountered
+ /// memory locations.
///
/// This allows us to get efficient access to dominating loads or stores when
/// we have a fully redundant load. In addition to the most recent load, we
@@ -356,13 +368,12 @@ public:
unsigned Generation = 0;
int MatchingId = -1;
bool IsAtomic = false;
- bool IsInvariant = false;
LoadValue() = default;
LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId,
- bool IsAtomic, bool IsInvariant)
+ bool IsAtomic)
: DefInst(Inst), Generation(Generation), MatchingId(MatchingId),
- IsAtomic(IsAtomic), IsInvariant(IsInvariant) {}
+ IsAtomic(IsAtomic) {}
};
using LoadMapAllocator =
@@ -373,8 +384,19 @@ public:
LoadMapAllocator>;
LoadHTType AvailableLoads;
+
+ // A scoped hash table mapping memory locations (represented as typed
+ // addresses) to generation numbers at which that memory location became
+ // (henceforth indefinitely) invariant.
+ using InvariantMapAllocator =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<MemoryLocation, unsigned>>;
+ using InvariantHTType =
+ ScopedHashTable<MemoryLocation, unsigned, DenseMapInfo<MemoryLocation>,
+ InvariantMapAllocator>;
+ InvariantHTType AvailableInvariants;
- /// \brief A scoped hash table of the current values of read-only call
+ /// A scoped hash table of the current values of read-only call
/// values.
///
/// It uses the same generation count as loads.
@@ -382,10 +404,10 @@ public:
ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
CallHTType AvailableCalls;
- /// \brief This is the current generation of the memory value.
+ /// This is the current generation of the memory value.
unsigned CurrentGeneration = 0;
- /// \brief Set up the EarlyCSE runner for a particular function.
+ /// Set up the EarlyCSE runner for a particular function.
EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI,
const TargetTransformInfo &TTI, DominatorTree &DT,
AssumptionCache &AC, MemorySSA *MSSA)
@@ -401,15 +423,16 @@ private:
class NodeScope {
public:
NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
- CallHTType &AvailableCalls)
- : Scope(AvailableValues), LoadScope(AvailableLoads),
- CallScope(AvailableCalls) {}
+ InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
+ : Scope(AvailableValues), LoadScope(AvailableLoads),
+ InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
NodeScope(const NodeScope &) = delete;
NodeScope &operator=(const NodeScope &) = delete;
private:
ScopedHTType::ScopeTy Scope;
LoadHTType::ScopeTy LoadScope;
+ InvariantHTType::ScopeTy InvariantScope;
CallHTType::ScopeTy CallScope;
};
@@ -420,10 +443,13 @@ private:
class StackNode {
public:
StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
- CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
- DomTreeNode::iterator child, DomTreeNode::iterator end)
+ InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
+ unsigned cg, DomTreeNode *n, DomTreeNode::iterator child,
+ DomTreeNode::iterator end)
: CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
- EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls)
+ EndIter(end),
+ Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
+ AvailableCalls)
{}
StackNode(const StackNode &) = delete;
StackNode &operator=(const StackNode &) = delete;
@@ -455,7 +481,7 @@ private:
bool Processed = false;
};
- /// \brief Wrapper class to handle memory instructions, including loads,
+ /// Wrapper class to handle memory instructions, including loads,
/// stores and intrinsic loads and stores defined by the target.
class ParseMemoryInst {
public:
@@ -532,12 +558,7 @@ private:
Value *getPointerOperand() const {
if (IsTargetMemInst) return Info.PtrVal;
- if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
- return LI->getPointerOperand();
- } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- return SI->getPointerOperand();
- }
- return nullptr;
+ return getLoadStorePointerOperand(Inst);
}
bool mayReadFromMemory() const {
@@ -558,6 +579,9 @@ private:
bool processNode(DomTreeNode *Node);
+ bool handleBranchCondition(Instruction *CondInst, const BranchInst *BI,
+ const BasicBlock *BB, const BasicBlock *Pred);
+
Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
if (auto *LI = dyn_cast<LoadInst>(Inst))
return LI;
@@ -568,6 +592,10 @@ private:
ExpectedType);
}
+ /// Return true if the instruction is known to only operate on memory
+ /// provably invariant in the given "generation".
+ bool isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt);
+
bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
Instruction *EarlierInst, Instruction *LaterInst);
@@ -661,6 +689,79 @@ bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
return MSSA->dominates(LaterDef, EarlierMA);
}
+bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) {
+ // A location loaded from with an invariant_load is assumed to *never* change
+ // within the visible scope of the compilation.
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ if (LI->getMetadata(LLVMContext::MD_invariant_load))
+ return true;
+
+ auto MemLocOpt = MemoryLocation::getOrNone(I);
+ if (!MemLocOpt)
+ // "target" intrinsic forms of loads aren't currently known to
+ // MemoryLocation::get. TODO
+ return false;
+ MemoryLocation MemLoc = *MemLocOpt;
+ if (!AvailableInvariants.count(MemLoc))
+ return false;
+
+ // Is the generation at which this became invariant older than the
+ // current one?
+ return AvailableInvariants.lookup(MemLoc) <= GenAt;
+}
+
+bool EarlyCSE::handleBranchCondition(Instruction *CondInst,
+ const BranchInst *BI, const BasicBlock *BB,
+ const BasicBlock *Pred) {
+ assert(BI->isConditional() && "Should be a conditional branch!");
+ assert(BI->getCondition() == CondInst && "Wrong condition?");
+ assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
+ auto *TorF = (BI->getSuccessor(0) == BB)
+ ? ConstantInt::getTrue(BB->getContext())
+ : ConstantInt::getFalse(BB->getContext());
+ auto MatchBinOp = [](Instruction *I, unsigned Opcode) {
+ if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(I))
+ return BOp->getOpcode() == Opcode;
+ return false;
+ };
+  // If the condition is an AND operation, we can propagate its operands into
+  // the true branch. If it is an OR operation, we can propagate them into
+  // the false branch.
+ unsigned PropagateOpcode =
+ (BI->getSuccessor(0) == BB) ? Instruction::And : Instruction::Or;
+
+ bool MadeChanges = false;
+ SmallVector<Instruction *, 4> WorkList;
+ SmallPtrSet<Instruction *, 4> Visited;
+ WorkList.push_back(CondInst);
+ while (!WorkList.empty()) {
+ Instruction *Curr = WorkList.pop_back_val();
+
+ AvailableValues.insert(Curr, TorF);
+ LLVM_DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
+ << Curr->getName() << "' as " << *TorF << " in "
+ << BB->getName() << "\n");
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ } else {
+ // Replace all dominated uses with the known value.
+ if (unsigned Count = replaceDominatedUsesWith(Curr, TorF, DT,
+ BasicBlockEdge(Pred, BB))) {
+ NumCSECVP += Count;
+ MadeChanges = true;
+ }
+ }
+
+ if (MatchBinOp(Curr, PropagateOpcode))
+ for (auto &Op : cast<BinaryOperator>(Curr)->operands())
+ if (Instruction *OPI = dyn_cast<Instruction>(Op))
+ if (SimpleValue::canHandle(OPI) && Visited.insert(OPI).second)
+ WorkList.push_back(OPI);
+ }
+
+ return MadeChanges;
+}
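A sketch of what the worklist walk achieves (hypothetical IR): on the true edge of an and-chain, every conjunct is known true.

    %a = icmp slt i32 %i, %n
    %b = icmp sgt i32 %i, 0
    %c = and i1 %a, %b
    br i1 %c, label %taken, label %exit

In %taken, %c, %a and %b are each inserted into AvailableValues as true, and their uses dominated by the edge are rewritten to the constant (the OR case is symmetric on the false edge).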
+
bool EarlyCSE::processNode(DomTreeNode *Node) {
bool Changed = false;
BasicBlock *BB = Node->getBlock();
@@ -684,22 +785,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
if (BI && BI->isConditional()) {
auto *CondInst = dyn_cast<Instruction>(BI->getCondition());
- if (CondInst && SimpleValue::canHandle(CondInst)) {
- assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
- auto *TorF = (BI->getSuccessor(0) == BB)
- ? ConstantInt::getTrue(BB->getContext())
- : ConstantInt::getFalse(BB->getContext());
- AvailableValues.insert(CondInst, TorF);
- DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
- << CondInst->getName() << "' as " << *TorF << " in "
- << BB->getName() << "\n");
- // Replace all dominated uses with the known value.
- if (unsigned Count = replaceDominatedUsesWith(
- CondInst, TorF, DT, BasicBlockEdge(Pred, BB))) {
- Changed = true;
- NumCSECVP += Count;
- }
- }
+ if (CondInst && SimpleValue::canHandle(CondInst))
+ Changed |= handleBranchCondition(CondInst, BI, BB, Pred);
}
}
@@ -716,7 +803,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// Dead instructions should just be removed.
if (isInstructionTriviallyDead(Inst, &TLI)) {
- DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
+ salvageDebugInfo(*Inst);
removeMSSA(Inst);
Inst->eraseFromParent();
Changed = true;
@@ -732,31 +824,44 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
auto *CondI =
dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0));
if (CondI && SimpleValue::canHandle(CondI)) {
- DEBUG(dbgs() << "EarlyCSE considering assumption: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE considering assumption: " << *Inst
+ << '\n');
AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
} else
- DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
continue;
}
// Skip sideeffect intrinsics, for the same reason as assume intrinsics.
if (match(Inst, m_Intrinsic<Intrinsic::sideeffect>())) {
- DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << *Inst << '\n');
continue;
}
- // Skip invariant.start intrinsics since they only read memory, and we can
- // forward values across it. Also, we dont need to consume the last store
- // since the semantics of invariant.start allow us to perform DSE of the
- // last store, if there was a store following invariant.start. Consider:
+    // We can skip all invariant.start intrinsics since they only read memory,
+    // and we can forward values across them. For invariant starts without
+    // invariant ends, we can use the fact that the invariantness never ends to
+    // start a scope in the current generation which is true for all future
+    // generations. Also, we don't need to consume the last store since the
+ // semantics of invariant.start allow us to perform DSE of the last
+ // store, if there was a store following invariant.start. Consider:
//
// store 30, i8* p
// invariant.start(p)
// store 40, i8* p
// We can DSE the store of 30, since the store of 40 to the invariant
// location p causes undefined behaviour.
- if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>()))
+ if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>())) {
+ // If there are any uses, the scope might end.
+ if (!Inst->use_empty())
+ continue;
+ auto *CI = cast<CallInst>(Inst);
+ MemoryLocation MemLoc = MemoryLocation::getForArgument(CI, 1, TLI);
+      // Don't start a scope if we already have a better one pushed.
+ if (!AvailableInvariants.count(MemLoc))
+ AvailableInvariants.insert(MemLoc, CurrentGeneration);
continue;
+ }
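Illustration of the scope being started (hypothetical IR; note the intrinsic's result must be unused, since a use could feed an invariant.end):

    call {}* @llvm.invariant.start.p0i8(i64 1, i8* %p)  ; result unused
    %a = load i8, i8* %p
    call void @clobber()     ; bumps CurrentGeneration
    %b = load i8, i8* %p     ; still CSE'd to %a: %p's location became
                             ; invariant in an older generation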
if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) {
if (auto *CondI =
@@ -767,7 +872,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// Is the condition known to be true?
if (isa<ConstantInt>(KnownCond) &&
cast<ConstantInt>(KnownCond)->isOne()) {
- DEBUG(dbgs() << "EarlyCSE removing guard: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs()
+ << "EarlyCSE removing guard: " << *Inst << '\n');
removeMSSA(Inst);
Inst->eraseFromParent();
Changed = true;
@@ -792,29 +898,39 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// If the instruction can be simplified (e.g. X+0 = X) then replace it with
// its simpler value.
if (Value *V = SimplifyInstruction(Inst, SQ)) {
- DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n');
- bool Killed = false;
- if (!Inst->use_empty()) {
- Inst->replaceAllUsesWith(V);
- Changed = true;
- }
- if (isInstructionTriviallyDead(Inst, &TLI)) {
- removeMSSA(Inst);
- Inst->eraseFromParent();
- Changed = true;
- Killed = true;
+ LLVM_DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V
+ << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ } else {
+ bool Killed = false;
+ if (!Inst->use_empty()) {
+ Inst->replaceAllUsesWith(V);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(Inst, &TLI)) {
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ Killed = true;
+ }
+ if (Changed)
+ ++NumSimplify;
+ if (Killed)
+ continue;
}
- if (Changed)
- ++NumSimplify;
- if (Killed)
- continue;
}
// If this is a simple instruction that we can value number, process it.
if (SimpleValue::canHandle(Inst)) {
// See if the instruction has an available value. If so, use it.
if (Value *V = AvailableValues.lookup(Inst)) {
- DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V
+ << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
if (auto *I = dyn_cast<Instruction>(V))
I->andIRFlags(Inst);
Inst->replaceAllUsesWith(V);
@@ -840,6 +956,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
++CurrentGeneration;
}
+ if (MemInst.isInvariantLoad()) {
+ // If we pass an invariant load, we know that memory location is
+ // indefinitely constant from the moment of first dereferenceability.
+ // We conservatively treat the invariant_load as that moment. If we
+      // pass an invariant load after already establishing a scope, don't
+ // restart it since we want to preserve the earliest point seen.
+ auto MemLoc = MemoryLocation::get(Inst);
+ if (!AvailableInvariants.count(MemLoc))
+ AvailableInvariants.insert(MemLoc, CurrentGeneration);
+ }
+
// If we have an available version of this load, and if it is the right
// generation or the load is known to be from an invariant location,
// replace this instruction.
@@ -854,13 +981,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
!MemInst.isVolatile() && MemInst.isUnordered() &&
// We can't replace an atomic load with one which isn't also atomic.
InVal.IsAtomic >= MemInst.isAtomic() &&
- (InVal.IsInvariant || MemInst.isInvariantLoad() ||
+ (isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
isSameMemGeneration(InVal.Generation, CurrentGeneration,
InVal.DefInst, Inst))) {
Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType());
if (Op != nullptr) {
- DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
- << " to: " << *InVal.DefInst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
+ << " to: " << *InVal.DefInst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
if (!Inst->use_empty())
Inst->replaceAllUsesWith(Op);
removeMSSA(Inst);
@@ -875,7 +1006,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
AvailableLoads.insert(
MemInst.getPointerOperand(),
LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
- MemInst.isAtomic(), MemInst.isInvariantLoad()));
+ MemInst.isAtomic()));
LastStore = nullptr;
continue;
}
@@ -898,8 +1029,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (InVal.first != nullptr &&
isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first,
Inst)) {
- DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
- << " to: " << *InVal.first << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
+ << " to: " << *InVal.first << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
if (!Inst->use_empty())
Inst->replaceAllUsesWith(InVal.first);
removeMSSA(Inst);
@@ -938,8 +1073,9 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
InVal.MatchingId == MemInst.getMatchingId() &&
// We don't yet handle removing stores with ordering of any kind.
!MemInst.isVolatile() && MemInst.isUnordered() &&
- isSameMemGeneration(InVal.Generation, CurrentGeneration,
- InVal.DefInst, Inst)) {
+ (isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
+ isSameMemGeneration(InVal.Generation, CurrentGeneration,
+ InVal.DefInst, Inst))) {
// It is okay to have a LastStore to a different pointer here if MemorySSA
// tells us that the load and store are from the same memory generation.
// In that case, LastStore should keep its present value since we're
@@ -949,7 +1085,11 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
MemInst.getPointerOperand() ||
MSSA) &&
"can't have an intervening store if not using MemorySSA!");
- DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
removeMSSA(Inst);
Inst->eraseFromParent();
Changed = true;
@@ -980,13 +1120,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
!LastStoreMemInst.isVolatile() &&
"Violated invariant");
if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
- DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
- << " due to: " << *Inst << '\n');
- removeMSSA(LastStore);
- LastStore->eraseFromParent();
- Changed = true;
- ++NumDSE;
- LastStore = nullptr;
+ LLVM_DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
+ << " due to: " << *Inst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ } else {
+ removeMSSA(LastStore);
+ LastStore->eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ LastStore = nullptr;
+ }
}
// fallthrough - we can exploit information about this store
}
@@ -999,7 +1143,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
AvailableLoads.insert(
MemInst.getPointerOperand(),
LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
- MemInst.isAtomic(), /*IsInvariant=*/false));
+ MemInst.isAtomic()));
// Remember that this was the last unordered store we saw for DSE. We
// don't yet handle DSE on ordered or volatile stores since we don't
@@ -1031,8 +1175,9 @@ bool EarlyCSE::run() {
// Process the root node.
nodesToProcess.push_back(new StackNode(
- AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
- DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));
+ AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
+ CurrentGeneration, DT.getRootNode(),
+ DT.getRootNode()->begin(), DT.getRootNode()->end()));
// Save the current generation.
unsigned LiveOutGeneration = CurrentGeneration;
@@ -1056,9 +1201,9 @@ bool EarlyCSE::run() {
// Push the next child onto the stack.
DomTreeNode *child = NodeToProcess->nextChild();
nodesToProcess.push_back(
- new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
- NodeToProcess->childGeneration(), child, child->begin(),
- child->end()));
+ new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
+ AvailableCalls, NodeToProcess->childGeneration(),
+ child, child->begin(), child->end()));
} else {
// It has been processed, and there are no more children to process,
// so delete it and pop it off the stack.
@@ -1097,7 +1242,7 @@ PreservedAnalyses EarlyCSEPass::run(Function &F,
namespace {
-/// \brief A simple and fast domtree-based CSE pass.
+/// A simple and fast domtree-based CSE pass.
///
/// This pass does a simple depth-first walk over the dominator tree,
/// eliminating trivially redundant instructions and using instsimplify to
diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
index 063df779a30b..117b19fb8a42 100644
--- a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -12,10 +12,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CFG.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
#define DEBUG_TYPE "flattencfg"
diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
index b105ece8dc7c..f2828e80bc58 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -138,7 +138,7 @@ void Float2IntPass::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) {
// Helper - mark I as having been traversed, having range R.
void Float2IntPass::seen(Instruction *I, ConstantRange R) {
- DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
+ LLVM_DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
auto IT = SeenInsts.find(I);
if (IT != SeenInsts.end())
IT->second = std::move(R);
@@ -359,7 +359,7 @@ bool Float2IntPass::validateAndTransform() {
for (User *U : I->users()) {
Instruction *UI = dyn_cast<Instruction>(U);
if (!UI || SeenInsts.find(UI) == SeenInsts.end()) {
- DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n");
+ LLVM_DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n");
Fail = true;
break;
}
@@ -380,7 +380,7 @@ bool Float2IntPass::validateAndTransform() {
// lower limits, plus one so it can be signed.
unsigned MinBW = std::max(R.getLower().getMinSignedBits(),
R.getUpper().getMinSignedBits()) + 1;
- DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n");
+ LLVM_DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n");
// If we've run off the realms of the exactly representable integers,
// the floating point result will differ from an integer approximation.
@@ -391,11 +391,12 @@ bool Float2IntPass::validateAndTransform() {
unsigned MaxRepresentableBits
= APFloat::semanticsPrecision(ConvertedToTy->getFltSemantics()) - 1;
if (MinBW > MaxRepresentableBits) {
- DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
+ LLVM_DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
continue;
}
if (MinBW > 64) {
- DEBUG(dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
+ LLVM_DEBUG(
+ dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
continue;
}
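A worked instance of the MinBW computation checked in the two hunks above (a sketch using only APInt; the helper name is ours):

    #include "llvm/ADT/APInt.h"
    #include <algorithm>

    // For the range [-100, 1000): getMinSignedBits() is 8 for -100 and 11 for
    // 1000, so MinBW = max(8, 11) + 1 = 12. That value is then compared
    // against the float type's precision and the 64-bit cap exactly as above.
    unsigned minBitWidth(const llvm::APInt &Lower, const llvm::APInt &Upper) {
      return std::max(Lower.getMinSignedBits(), Upper.getMinSignedBits()) + 1;
    }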
@@ -490,7 +491,7 @@ void Float2IntPass::cleanup() {
}
bool Float2IntPass::runImpl(Function &F) {
- DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
// Clear out all state.
ECs = EquivalenceClasses<Instruction*>();
SeenInsts.clear();
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
index e2c1eaf58e43..1e0a22cb14b3 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -38,7 +38,9 @@
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
@@ -69,7 +71,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/VNCoercion.h"
#include <algorithm>
@@ -765,6 +766,15 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
if (SSAUpdate.HasValueForBlock(BB))
continue;
+ // If the value is the load that we will be eliminating, and the block it's
+ // available in is the block that the load is in, then don't add it, as
+ // SSAUpdater will resolve the value to the relevant phi, which may let it
+ // avoid phi construction entirely if there's actually only one value.
+ if (BB == LI->getParent() &&
+ ((AV.AV.isSimpleValue() && AV.AV.getSimpleValue() == LI) ||
+ (AV.AV.isCoercedLoadValue() && AV.AV.getCoercedLoadValue() == LI)))
+ continue;
+
SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LI, gvn));
}
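The early continue added above leans on a property of SSAUpdater that is easy to miss. A minimal sketch of that contract (the function name is ours, API from llvm/Transforms/Utils/SSAUpdater.h):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/Transforms/Utils/SSAUpdater.h"
    using namespace llvm;

    // AddAvailableValue registers one value per block; GetValueInMiddleOfBlock
    // then inserts PHIs only where distinct values actually meet. By not
    // registering the soon-to-die load in its own block, the code above lets
    // the updater collapse to the single incoming value when only one exists,
    // instead of building a PHI that merges the load with itself.
    Value *materialize(SSAUpdater &Updater, BasicBlock *UseBB) {
      // ... Updater.AddAvailableValue(PredBB, AvailVal) for each pred ...
      return Updater.GetValueInMiddleOfBlock(UseBB);
    }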
@@ -783,9 +793,10 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
if (Res->getType() != LoadTy) {
Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
- DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " "
- << *getSimpleValue() << '\n'
- << *Res << '\n' << "\n\n\n");
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset
+ << " " << *getSimpleValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
}
} else if (isCoercedLoadValue()) {
LoadInst *Load = getCoercedLoadValue();
@@ -799,20 +810,21 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
// but then all of the operations based on it would need to be
// rehashed. Just leave the dead load around.
gvn.getMemDep().removeInstruction(Load);
- DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " "
- << *getCoercedLoadValue() << '\n'
- << *Res << '\n'
- << "\n\n\n");
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset
+ << " " << *getCoercedLoadValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
}
} else if (isMemIntrinValue()) {
Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
InsertPt, DL);
- DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
- << " " << *getMemIntrinValue() << '\n'
- << *Res << '\n' << "\n\n\n");
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
+ << " " << *getMemIntrinValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
} else {
assert(isUndefValue() && "Should be UndefVal");
- DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
return UndefValue::get(LoadTy);
}
assert(Res && "failed to materialize?");
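All three coercion paths above reduce to reusing bytes of a wider in-memory value for a narrower load at an offset. A conceptual model on plain integers (not the VNCoercion implementation), assuming a little-endian target:

    #include <cstdint>
    #include <cstring>

    // What getStoreValueForLoad computes, in miniature: a 4-byte store
    // followed by a 1-byte load at +Offset must see exactly this byte, so
    // GVN can forward it without touching memory.
    uint8_t byteOfStoredValue(uint32_t StoredVal, unsigned Offset) {
      uint8_t Bytes[4];
      std::memcpy(Bytes, &StoredVal, sizeof(Bytes)); // bytes the store wrote
      return Bytes[Offset];                          // byte the load reads
    }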
@@ -825,7 +837,7 @@ static bool isLifetimeStart(const Instruction *Inst) {
return false;
}
-/// \brief Try to locate the three instruction involved in a missed
+/// Try to locate the three instructions involved in a missed
/// load-elimination case that is due to an intervening store.
static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo,
DominatorTree *DT,
@@ -914,13 +926,11 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
}
}
// Nothing known about this clobber, have to be conservative
- DEBUG(
- // fast print dep, using operator<< on instruction is too slow.
- dbgs() << "GVN: load ";
- LI->printAsOperand(dbgs());
- Instruction *I = DepInfo.getInst();
- dbgs() << " is clobbered by " << *I << '\n';
- );
+ LLVM_DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load "; LI->printAsOperand(dbgs());
+ Instruction *I = DepInfo.getInst();
+ dbgs() << " is clobbered by " << *I << '\n';);
if (ORE->allowExtraAnalysis(DEBUG_TYPE))
reportMayClobberedLoad(LI, DepInfo, DT, ORE);
@@ -978,12 +988,10 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
}
// Unknown def - must be conservative
- DEBUG(
- // fast print dep, using operator<< on instruction is too slow.
- dbgs() << "GVN: load ";
- LI->printAsOperand(dbgs());
- dbgs() << " has unknown def " << *DepInst << '\n';
- );
+ LLVM_DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load "; LI->printAsOperand(dbgs());
+ dbgs() << " has unknown def " << *DepInst << '\n';);
return false;
}
@@ -1065,7 +1073,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// It is illegal to move the array access to any point above the guard,
// because if the index is out of bounds we should deoptimize rather than
// access the array.
- // Check that there is no guard in this block above our intruction.
+ // Check that there is no guard in this block above our instruction.
if (!IsSafeToSpeculativelyExecute) {
auto It = FirstImplicitControlFlowInsts.find(TmpBB);
if (It != FirstImplicitControlFlowInsts.end()) {
@@ -1113,9 +1121,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// If any predecessor block is an EH pad that does not allow non-PHI
// instructions before the terminator, we can't PRE the load.
if (Pred->getTerminator()->isEHPad()) {
- DEBUG(dbgs()
- << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '"
- << Pred->getName() << "': " << *LI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '"
+ << Pred->getName() << "': " << *LI << '\n');
return false;
}
@@ -1125,15 +1133,16 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
if (Pred->getTerminator()->getNumSuccessors() != 1) {
if (isa<IndirectBrInst>(Pred->getTerminator())) {
- DEBUG(dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
- << Pred->getName() << "': " << *LI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
return false;
}
if (LoadBB->isEHPad()) {
- DEBUG(dbgs()
- << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '"
- << Pred->getName() << "': " << *LI << '\n');
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
return false;
}
@@ -1161,8 +1170,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB);
assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!");
PredLoads[NewPred] = nullptr;
- DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
- << LoadBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
+ << LoadBB->getName() << '\n');
}
// Check if the load can safely be moved to all the unavailable predecessors.
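For readers less familiar with load PRE, a source-level analogue of what the edge splitting above enables (a sketch; this is not the IR the pass emits):

    // Before PRE: *p is loaded on every path but is already available (via
    // the store) on only one of them, so the load is partially redundant.
    int before(bool c, int *p, int v) {
      if (c)
        *p = v;   // *p fully available on this path
      return *p;  // partially redundant load
    }

    // After PRE: the reload runs only on the path where the value was
    // missing; the split critical edge is where that reload gets inserted.
    int after(bool c, int *p, int v) {
      int t;
      if (c) {
        *p = v;
        t = v;    // reuse the stored value
      } else {
        t = *p;   // reload hoisted into the (split) predecessor
      }
      return t;   // the "phi" of the two available values
    }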
@@ -1186,8 +1195,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// If we couldn't find or insert a computation of this phi translated value,
// we fail PRE.
if (!LoadPtr) {
- DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
- << *LI->getPointerOperand() << "\n");
+ LLVM_DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
+ << *LI->getPointerOperand() << "\n");
CanDoPRE = false;
break;
}
@@ -1208,10 +1217,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// Okay, we can eliminate this load by inserting a reload in the predecessor
// and using PHI construction to get the value in the other predecessors, do
// it.
- DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n');
- DEBUG(if (!NewInsts.empty())
- dbgs() << "INSERTED " << NewInsts.size() << " INSTS: "
- << *NewInsts.back() << '\n');
+ LLVM_DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n');
+ LLVM_DEBUG(if (!NewInsts.empty()) dbgs()
+ << "INSERTED " << NewInsts.size() << " INSTS: " << *NewInsts.back()
+ << '\n');
// Assign value numbers to the new instructions.
for (Instruction *I : NewInsts) {
@@ -1262,7 +1271,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,
NewLoad));
MD->invalidateCachedPointerInfo(LoadPtr);
- DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n');
+ LLVM_DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n');
}
// Perform PHI construction.
@@ -1320,11 +1329,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
// clobber in the current block. Reject this early.
if (NumDeps == 1 &&
!Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) {
- DEBUG(
- dbgs() << "GVN: non-local load ";
- LI->printAsOperand(dbgs());
- dbgs() << " has unknown dependencies\n";
- );
+ LLVM_DEBUG(dbgs() << "GVN: non-local load "; LI->printAsOperand(dbgs());
+ dbgs() << " has unknown dependencies\n";);
return false;
}
@@ -1353,7 +1359,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
// load, then it is fully redundant and we can use PHI insertion to compute
// its value. Insert PHIs and remove the fully redundant value now.
if (UnavailableBlocks.empty()) {
- DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
+ LLVM_DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
// Perform PHI construction.
Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
@@ -1506,12 +1512,10 @@ bool GVN::processLoad(LoadInst *L) {
// Only handle the local case below
if (!Dep.isDef() && !Dep.isClobber()) {
// This might be a NonFuncLocal or an Unknown
- DEBUG(
- // fast print dep, using operator<< on instruction is too slow.
- dbgs() << "GVN: load ";
- L->printAsOperand(dbgs());
- dbgs() << " has unknown dependence\n";
- );
+ LLVM_DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load "; L->printAsOperand(dbgs());
+ dbgs() << " has unknown dependence\n";);
return false;
}
@@ -1695,8 +1699,8 @@ bool GVN::replaceOperandsWithConsts(Instruction *Instr) const {
if (it != ReplaceWithConstMap.end()) {
assert(!isa<Constant>(Operand) &&
"Replacing constants with constants is invalid");
- DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second
- << " in instruction " << *Instr << '\n');
+ LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with "
+ << *it->second << " in instruction " << *Instr << '\n');
Instr->setOperand(OpNum, it->second);
Changed = true;
}
@@ -2038,7 +2042,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
unsigned Iteration = 0;
while (ShouldContinue) {
- DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
+ LLVM_DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
ShouldContinue = iterateOnFunction(F);
Changed |= ShouldContinue;
++Iteration;
@@ -2104,9 +2108,10 @@ bool GVN::processBlock(BasicBlock *BB) {
const Instruction *MaybeFirstICF = FirstImplicitControlFlowInsts.lookup(BB);
for (auto *I : InstrsToErase) {
assert(I->getParent() == BB && "Removing instruction from wrong block?");
- DEBUG(dbgs() << "GVN removed: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "GVN removed: " << *I << '\n');
+ salvageDebugInfo(*I);
if (MD) MD->removeInstruction(I);
- DEBUG(verifyRemoved(I));
+ LLVM_DEBUG(verifyRemoved(I));
if (MaybeFirstICF == I) {
// We have erased the first ICF in block. The map needs to be updated.
InvalidateImplicitCF = true;
@@ -2288,7 +2293,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
PREInstr = CurInst->clone();
if (!performScalarPREInsertion(PREInstr, PREPred, CurrentBlock, ValNo)) {
// If we failed insertion, make sure we remove the instruction.
- DEBUG(verifyRemoved(PREInstr));
+ LLVM_DEBUG(verifyRemoved(PREInstr));
PREInstr->deleteValue();
return false;
}
@@ -2326,10 +2331,10 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
VN.erase(CurInst);
removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
- DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
+ LLVM_DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
if (MD)
MD->removeInstruction(CurInst);
- DEBUG(verifyRemoved(CurInst));
+ LLVM_DEBUG(verifyRemoved(CurInst));
bool InvalidateImplicitCF =
FirstImplicitControlFlowInsts.lookup(CurInst->getParent()) == CurInst;
// FIXME: Intended to be markInstructionForDeletion(CurInst), but it causes
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
index 026fab5dbd3b..6d2b25cf6013 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
@@ -48,6 +48,7 @@
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -72,7 +73,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <iterator>
@@ -534,7 +534,7 @@ private:
if (NewBB == DBB && !MSSA->isLiveOnEntryDef(D))
if (auto *UD = dyn_cast<MemoryUseOrDef>(D))
- if (firstInBB(NewPt, UD->getMemoryInst()))
+ if (!firstInBB(UD->getMemoryInst(), NewPt))
// Cannot move the load or store to NewPt above its definition in D.
return false;
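The flipped call above is not a pure refactor: with a strict comes-before order, !firstInBB(Def, NewPt) also rejects the case where NewPt coincides with the defining access, which firstInBB(NewPt, Def) would have let through. A toy model of the distinction (our reading of the change):

    // Stand-in for a strict "A comes before B in the block" order.
    bool firstInBB(int A, int B) { return A < B; }

    // Old form: block the hoist only if NewPt is strictly above Def.
    bool oldBlocks(int NewPt, int Def) { return firstInBB(NewPt, Def); }
    // New form: block unless Def is strictly above NewPt; the tie
    // (NewPt == Def) is now conservatively blocked as well.
    bool newBlocks(int NewPt, int Def) { return !firstInBB(Def, NewPt); }
    // oldBlocks(5, 5) == false, but newBlocks(5, 5) == true.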
@@ -570,7 +570,7 @@ private:
// The idea is inspired by:
// "Partial Redundancy Elimination in SSA Form"
// ROBERT KENNEDY, SUN CHAN, SHIN-MING LIU, RAYMOND LO, PENG TU and FRED CHOW
- // They use similar idea in the forward graph to to find fully redundant and
+ // They use a similar idea in the forward graph to find fully redundant and
// partially redundant expressions, here it is used in the inverse graph to
// find fully anticipable instructions at merge point (post-dominator in
// the inverse CFG).
@@ -578,7 +578,7 @@ private:
// Returns true when the values are flowing out to each edge.
bool valueAnticipable(CHIArgs C, TerminatorInst *TI) const {
- if (TI->getNumSuccessors() > (unsigned)std::distance(C.begin(), C.end()))
+ if (TI->getNumSuccessors() > (unsigned)size(C))
return false; // Not enough args in this CHI.
for (auto CHI : C) {
@@ -622,7 +622,7 @@ private:
// Iterate in reverse order to keep lower ranked values on the top.
for (std::pair<VNType, Instruction *> &VI : reverse(it1->second)) {
// Get the value of instruction I
- DEBUG(dbgs() << "\nPushing on stack: " << *VI.second);
+ LLVM_DEBUG(dbgs() << "\nPushing on stack: " << *VI.second);
RenameStack[VI.first].push_back(VI.second);
}
}
@@ -636,7 +636,7 @@ private:
if (P == CHIBBs.end()) {
continue;
}
- DEBUG(dbgs() << "\nLooking at CHIs in: " << Pred->getName(););
+ LLVM_DEBUG(dbgs() << "\nLooking at CHIs in: " << Pred->getName(););
// A CHI is found (BB -> Pred is an edge in the CFG)
// Pop the stack until Top(V) = Ve.
auto &VCHI = P->second;
@@ -651,9 +651,9 @@ private:
DT->properlyDominates(Pred, si->second.back()->getParent())) {
C.Dest = BB; // Assign the edge
C.I = si->second.pop_back_val(); // Assign the argument
- DEBUG(dbgs() << "\nCHI Inserted in BB: " << C.Dest->getName()
- << *C.I << ", VN: " << C.VN.first << ", "
- << C.VN.second);
+ LLVM_DEBUG(dbgs()
+ << "\nCHI Inserted in BB: " << C.Dest->getName() << *C.I
+ << ", VN: " << C.VN.first << ", " << C.VN.second);
}
// Move to next CHI of a different value
It = std::find_if(It, VCHI.end(),
@@ -748,11 +748,11 @@ private:
// TODO: Remove fully-redundant expressions.
// Get instruction from the Map, assume that all the Instructions
// with same VNs have same rank (this is an approximation).
- std::sort(Ranks.begin(), Ranks.end(),
- [this, &Map](const VNType &r1, const VNType &r2) {
- return (rank(*Map.lookup(r1).begin()) <
- rank(*Map.lookup(r2).begin()));
- });
+ llvm::sort(Ranks.begin(), Ranks.end(),
+ [this, &Map](const VNType &r1, const VNType &r2) {
+ return (rank(*Map.lookup(r1).begin()) <
+ rank(*Map.lookup(r2).begin()));
+ });
// - Sort VNs according to their rank, and start with lowest ranked VN
// - Take a VN and for each instruction with same VN
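The std::sort -> llvm::sort swaps here and in the following hunks are behavior-preserving in release builds. Our understanding (not stated in this diff) is that llvm::sort shuffles the range first under LLVM_ENABLE_EXPENSIVE_CHECKS, so a comparator that leaves ties under-specified shows up as nondeterministic output instead of hiding behind one standard library's incidental ordering of equal elements:

    #include "llvm/ADT/STLExtras.h"
    #include <vector>

    void sortValues(std::vector<int> &V) {
      // Drop-in replacement for std::sort; randomizes the order of equal
      // elements under expensive checks to flush out unstable comparators.
      llvm::sort(V.begin(), V.end());
    }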
@@ -798,8 +798,8 @@ private:
// Ignore spurious PDFs.
if (DT->properlyDominates(IDFB, V[i]->getParent())) {
OutValue[IDFB].push_back(C);
- DEBUG(dbgs() << "\nInsertion a CHI for BB: " << IDFB->getName()
- << ", for Insn: " << *V[i]);
+ LLVM_DEBUG(dbgs() << "\nInserting a CHI for BB: " << IDFB->getName()
+ << ", for Insn: " << *V[i]);
}
}
}
@@ -1200,6 +1200,7 @@ INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist",
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(GVNHoistLegacyPass, "gvn-hoist",
"Early GVN Hoisting of Expressions", false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp
index 5594c29bbd9f..28c5940db1e0 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -48,6 +48,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -71,7 +72,6 @@
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/GVNExpression.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -239,7 +239,7 @@ public:
SmallVector<std::pair<BasicBlock *, Value *>, 4> Ops;
for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I)
Ops.push_back({PN->getIncomingBlock(I), PN->getIncomingValue(I)});
- std::sort(Ops.begin(), Ops.end());
+ llvm::sort(Ops.begin(), Ops.end());
for (auto &P : Ops) {
Blocks.push_back(P.first);
Values.push_back(P.second);
@@ -361,7 +361,7 @@ public:
for (auto &U : I->uses())
op_push_back(U.getUser());
- std::sort(op_begin(), op_end());
+ llvm::sort(op_begin(), op_end());
}
void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; }
@@ -561,7 +561,8 @@ public:
GVNSink() = default;
bool run(Function &F) {
- DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName()
+ << "\n");
unsigned NumSunk = 0;
ReversePostOrderTraversal<Function*> RPOT(&F);
@@ -629,15 +630,15 @@ Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents) {
auto Insts = *LRI;
- DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I
- : Insts) {
+ LLVM_DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I
+ : Insts) {
I->dump();
} dbgs() << " ]\n";);
DenseMap<uint32_t, unsigned> VNums;
for (auto *I : Insts) {
uint32_t N = VN.lookupOrAdd(I);
- DEBUG(dbgs() << " VN=" << Twine::utohexstr(N) << " for" << *I << "\n");
+ LLVM_DEBUG(dbgs() << " VN=" << Twine::utohexstr(N) << " for" << *I << "\n");
if (N == ~0U)
return None;
VNums[N]++;
@@ -749,8 +750,8 @@ Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
}
unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
- DEBUG(dbgs() << "GVNSink: running on basic block ";
- BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "GVNSink: running on basic block ";
+ BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
SmallVector<BasicBlock *, 4> Preds;
for (auto *B : predecessors(BBEnd)) {
auto *T = B->getTerminator();
@@ -761,7 +762,7 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
}
if (Preds.size() < 2)
return 0;
- std::sort(Preds.begin(), Preds.end());
+ llvm::sort(Preds.begin(), Preds.end());
unsigned NumOrigPreds = Preds.size();
// We can only sink instructions through unconditional branches.
@@ -794,23 +795,23 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
Candidates.begin(), Candidates.end(),
[](const SinkingInstructionCandidate &A,
const SinkingInstructionCandidate &B) { return A > B; });
- DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C
- : Candidates) dbgs()
- << " " << C << "\n";);
+ LLVM_DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C
+ : Candidates) dbgs()
+ << " " << C << "\n";);
// Pick the top candidate, as long it is positive!
if (Candidates.empty() || Candidates.front().Cost <= 0)
return 0;
auto C = Candidates.front();
- DEBUG(dbgs() << " -- Sinking: " << C << "\n");
+ LLVM_DEBUG(dbgs() << " -- Sinking: " << C << "\n");
BasicBlock *InsertBB = BBEnd;
if (C.Blocks.size() < NumOrigPreds) {
- DEBUG(dbgs() << " -- Splitting edge to "; BBEnd->printAsOperand(dbgs());
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " -- Splitting edge to ";
+ BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split");
if (!InsertBB) {
- DEBUG(dbgs() << " -- FAILED to split edge!\n");
+ LLVM_DEBUG(dbgs() << " -- FAILED to split edge!\n");
// Edge couldn't be split.
return 0;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index c4aeccb85ca7..ad1598d7b8bf 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -40,9 +40,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/GuardWidening.h"
+#include <functional>
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/ConstantRange.h"
@@ -53,6 +55,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
@@ -62,9 +65,14 @@ namespace {
class GuardWideningImpl {
DominatorTree &DT;
- PostDominatorTree &PDT;
+ PostDominatorTree *PDT;
LoopInfo &LI;
+ /// Together, these describe the region of interest. This might be all of
+ /// the blocks within a function, or only a given loop's blocks and preheader.
+ DomTreeNode *Root;
+ std::function<bool(BasicBlock*)> BlockFilter;
+
/// The set of guards whose conditions have been widened into dominating
/// guards.
SmallVector<IntrinsicInst *, 16> EliminatedGuards;
@@ -205,39 +213,15 @@ class GuardWideningImpl {
}
public:
- explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree &PDT,
- LoopInfo &LI)
- : DT(DT), PDT(PDT), LI(LI) {}
+
+ explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree *PDT,
+ LoopInfo &LI, DomTreeNode *Root,
+ std::function<bool(BasicBlock*)> BlockFilter)
+ : DT(DT), PDT(PDT), LI(LI), Root(Root), BlockFilter(BlockFilter) {}
/// The entry point for this pass.
bool run();
};
-
-struct GuardWideningLegacyPass : public FunctionPass {
- static char ID;
- GuardWideningPass Impl;
-
- GuardWideningLegacyPass() : FunctionPass(ID) {
- initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- return GuardWideningImpl(
- getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(),
- getAnalysis<LoopInfoWrapperPass>().getLoopInfo()).run();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- }
-};
-
}
bool GuardWideningImpl::run() {
@@ -246,9 +230,12 @@ bool GuardWideningImpl::run() {
DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> GuardsInBlock;
bool Changed = false;
- for (auto DFI = df_begin(DT.getRootNode()), DFE = df_end(DT.getRootNode());
+ for (auto DFI = df_begin(Root), DFE = df_end(Root);
DFI != DFE; ++DFI) {
auto *BB = (*DFI)->getBlock();
+ if (!BlockFilter(BB))
+ continue;
+
auto &CurrentList = GuardsInBlock[BB];
for (auto &I : *BB)
@@ -259,6 +246,7 @@ bool GuardWideningImpl::run() {
Changed |= eliminateGuardViaWidening(II, DFI, GuardsInBlock);
}
+ assert(EliminatedGuards.empty() || Changed);
for (auto *II : EliminatedGuards)
if (!WidenedGuards.count(II))
II->eraseFromParent();
@@ -278,6 +266,8 @@ bool GuardWideningImpl::eliminateGuardViaWidening(
// for the most profit.
for (unsigned i = 0, e = DFSI.getPathLength(); i != e; ++i) {
auto *CurBB = DFSI.getPath(i)->getBlock();
+ if (!BlockFilter(CurBB))
+ break;
auto *CurLoop = LI.getLoopFor(CurBB);
assert(GuardsInBlock.count(CurBB) && "Must have been populated by now!");
const auto &GuardsInCurBB = GuardsInBlock.find(CurBB)->second;
@@ -312,9 +302,9 @@ bool GuardWideningImpl::eliminateGuardViaWidening(
for (auto *Candidate : make_range(I, E)) {
auto Score =
computeWideningScore(GuardInst, GuardInstLoop, Candidate, CurLoop);
- DEBUG(dbgs() << "Score between " << *GuardInst->getArgOperand(0)
- << " and " << *Candidate->getArgOperand(0) << " is "
- << scoreTypeToString(Score) << "\n");
+ LLVM_DEBUG(dbgs() << "Score between " << *GuardInst->getArgOperand(0)
+ << " and " << *Candidate->getArgOperand(0) << " is "
+ << scoreTypeToString(Score) << "\n");
if (Score > BestScoreSoFar) {
BestScoreSoFar = Score;
BestSoFar = Candidate;
@@ -323,15 +313,16 @@ bool GuardWideningImpl::eliminateGuardViaWidening(
}
if (BestScoreSoFar == WS_IllegalOrNegative) {
- DEBUG(dbgs() << "Did not eliminate guard " << *GuardInst << "\n");
+ LLVM_DEBUG(dbgs() << "Did not eliminate guard " << *GuardInst << "\n");
return false;
}
assert(BestSoFar != GuardInst && "Should have never visited same guard!");
assert(DT.dominates(BestSoFar, GuardInst) && "Should be!");
- DEBUG(dbgs() << "Widening " << *GuardInst << " into " << *BestSoFar
- << " with score " << scoreTypeToString(BestScoreSoFar) << "\n");
+ LLVM_DEBUG(dbgs() << "Widening " << *GuardInst << " into " << *BestSoFar
+ << " with score " << scoreTypeToString(BestScoreSoFar)
+ << "\n");
widenGuard(BestSoFar, GuardInst->getArgOperand(0));
GuardInst->setArgOperand(0, ConstantInt::getTrue(GuardInst->getContext()));
EliminatedGuards.push_back(GuardInst);
@@ -345,6 +336,8 @@ GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore(
bool HoistingOutOfLoop = false;
if (DominatingGuardLoop != DominatedGuardLoop) {
+ // Be conservative and don't widen into a sibling loop. TODO: If the
+ // sibling is colder, we should consider allowing this.
if (DominatingGuardLoop &&
!DominatingGuardLoop->contains(DominatedGuardLoop))
return WS_IllegalOrNegative;
@@ -355,9 +348,14 @@ GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore(
if (!isAvailableAt(DominatedGuard->getArgOperand(0), DominatingGuard))
return WS_IllegalOrNegative;
- bool HoistingOutOfIf =
- !PDT.dominates(DominatedGuard->getParent(), DominatingGuard->getParent());
-
+ // If the guard was conditionally executed, it may never be reached
+ // dynamically. There are two potential downsides to hoisting it out of the
+ // conditionally executed region: 1) we may spuriously deopt without need and
+ // 2) we have the extra cost of computing the guard condition in the common
+ // case. At the moment, we really only consider the second in our heuristic
+ // here. TODO: evaluate cost model for spurious deopt
+ // NOTE: As written, this also lets us hoist right over another guard which
+ // is essentially just another spelling for control flow.
if (isWideningCondProfitable(DominatedGuard->getArgOperand(0),
DominatingGuard->getArgOperand(0)))
return HoistingOutOfLoop ? WS_VeryPositive : WS_Positive;
@@ -365,7 +363,26 @@ GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore(
if (HoistingOutOfLoop)
return WS_Positive;
- return HoistingOutOfIf ? WS_IllegalOrNegative : WS_Neutral;
+ // Returns true if we might be hoisting above explicit control flow. Note
+ // that this completely ignores implicit control flow (guards, calls which
+ // throw, etc...). That choice appears arbitrary.
+ auto MaybeHoistingOutOfIf = [&]() {
+ auto *DominatingBlock = DominatingGuard->getParent();
+ auto *DominatedBlock = DominatedGuard->getParent();
+
+ // Same Block?
+ if (DominatedBlock == DominatingBlock)
+ return false;
+ // Obvious successor (common loop header/preheader case)
+ if (DominatedBlock == DominatingBlock->getUniqueSuccessor())
+ return false;
+ // TODO: diamond, triangle cases
+ if (!PDT) return true;
+ return !PDT->dominates(DominatedGuard->getParent(),
+ DominatingGuard->getParent());
+ };
+
+ return MaybeHoistingOutOfIf() ? WS_IllegalOrNegative : WS_Neutral;
}
bool GuardWideningImpl::isAvailableAt(Value *V, Instruction *Loc,
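A source-level analogue of the spurious-deopt concern discussed in the comment above (a sketch; the guards are shown as comments rather than the intrinsic):

    // Before widening: the range check is evaluated only when it is needed.
    void before(bool rare, int *a, int n, int idx) {
      if (rare) {
        // guard(0 <= idx && idx < n)  -- checked only on the rare path
        a[idx] = 0;
      }
    }

    // After hoisting the guard above the branch: the check (and a possible
    // deoptimization) now runs even when !rare, which is exactly the cost
    // the widening score tries to weigh.
    void after(bool rare, int *a, int n, int idx) {
      // guard(0 <= idx && idx < n)  -- may deopt on executions that would
      //                                never have touched a[idx]
      if (rare)
        a[idx] = 0;
    }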
@@ -581,9 +598,9 @@ bool GuardWideningImpl::combineRangeChecks(
// CurrentChecks.size() will typically be 3 here, but so far there has been
// no need to hard-code that fact.
- std::sort(CurrentChecks.begin(), CurrentChecks.end(),
- [&](const GuardWideningImpl::RangeCheck &LHS,
- const GuardWideningImpl::RangeCheck &RHS) {
+ llvm::sort(CurrentChecks.begin(), CurrentChecks.end(),
+ [&](const GuardWideningImpl::RangeCheck &LHS,
+ const GuardWideningImpl::RangeCheck &RHS) {
return LHS.getOffsetValue().slt(RHS.getOffsetValue());
});
@@ -651,19 +668,6 @@ bool GuardWideningImpl::combineRangeChecks(
return RangeChecksOut.size() != OldCount;
}
-PreservedAnalyses GuardWideningPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
- if (!GuardWideningImpl(DT, PDT, LI).run())
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-
#ifndef NDEBUG
StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
switch (WS) {
@@ -681,7 +685,82 @@ StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
}
#endif
+PreservedAnalyses GuardWideningPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ if (!GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
+ [](BasicBlock*) { return true; } ).run())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+struct GuardWideningLegacyPass : public FunctionPass {
+ static char ID;
+
+ GuardWideningLegacyPass() : FunctionPass(ID) {
+ initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ return GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
+ [](BasicBlock*) { return true; } ).run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ }
+};
+
+/// Same as above, but restricted to a single loop at a time. Can be
+/// scheduled with other loop passes without breaking out of the LPM.
+struct LoopGuardWideningLegacyPass : public LoopPass {
+ static char ID;
+
+ LoopGuardWideningLegacyPass() : LoopPass(ID) {
+ initializeLoopGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+ auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+ BasicBlock *RootBB = L->getLoopPredecessor();
+ if (!RootBB)
+ RootBB = L->getHeader();
+ auto BlockFilter = [&](BasicBlock *BB) {
+ return BB == RootBB || L->contains(BB);
+ };
+ return GuardWideningImpl(DT, PDT, LI,
+ DT.getNode(RootBB), BlockFilter).run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ getLoopAnalysisUsage(AU);
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ }
+};
+}
+
char GuardWideningLegacyPass::ID = 0;
+char LoopGuardWideningLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards",
false, false)
@@ -691,6 +770,20 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards",
false, false)
+INITIALIZE_PASS_BEGIN(LoopGuardWideningLegacyPass, "loop-guard-widening",
+ "Widen guards (within a single loop, as a loop pass)",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LoopGuardWideningLegacyPass, "loop-guard-widening",
+ "Widen guards (within a single loop, as a loop pass)",
+ false, false)
+
FunctionPass *llvm::createGuardWideningPass() {
return new GuardWideningLegacyPass();
}
+
+Pass *llvm::createLoopGuardWideningPass() {
+ return new LoopGuardWideningLegacyPass();
+}
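A hypothetical sketch of how the new loop-scoped entry point slots into a legacy pipeline, assuming the declaration lands in llvm/Transforms/Scalar.h alongside its siblings:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Transforms/Scalar.h"

    void addGuardWidening(llvm::legacy::PassManager &PM) {
      // The loop-scoped variant batches with other loop passes inside the
      // LPM instead of forcing a function-pass boundary.
      PM.add(llvm::createLoopGuardWideningPass());
    }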
diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 221fe57581ca..8656e88b79cb 100644
--- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -43,6 +43,7 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
@@ -77,7 +78,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include <cassert>
@@ -210,8 +210,8 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) {
if (FromBase == ToBase)
return true;
- DEBUG(dbgs() << "INDVARS: GEP rewrite bail out "
- << *FromBase << " != " << *ToBase << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: GEP rewrite bail out " << *FromBase
+ << " != " << *ToBase << "\n");
return false;
}
@@ -653,8 +653,9 @@ void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
Value *ExitVal =
expandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, PN->getType());
- DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n'
- << " LoopVal = " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal
+ << '\n'
+ << " LoopVal = " << *Inst << "\n");
if (!isValidRewrite(Inst, ExitVal)) {
DeadInsts.push_back(ExitVal);
@@ -1084,7 +1085,7 @@ Instruction *WidenIV::cloneBitwiseIVUser(NarrowIVDefUse DU) {
Instruction *NarrowDef = DU.NarrowDef;
Instruction *WideDef = DU.WideDef;
- DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n");
+ LLVM_DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n");
// Replace NarrowDef operands with WideDef. Otherwise, we don't know anything
// about the narrow operand yet so must insert a [sz]ext. It is probably loop
@@ -1115,7 +1116,7 @@ Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU,
Instruction *NarrowDef = DU.NarrowDef;
Instruction *WideDef = DU.WideDef;
- DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");
+ LLVM_DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");
unsigned IVOpIdx = (NarrowUse->getOperand(0) == NarrowDef) ? 0 : 1;
@@ -1315,8 +1316,8 @@ WidenIV::WidenedRecTy WidenIV::getWideRecurrence(NarrowIVDefUse DU) {
/// This IV user cannot be widened. Replace this use of the original narrow IV
/// with a truncation of the new wide IV to isolate and eliminate the narrow IV.
static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) {
- DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef
- << " for user " << *DU.NarrowUse << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef << " for user "
+ << *DU.NarrowUse << "\n");
IRBuilder<> Builder(
getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI));
Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType());
@@ -1396,8 +1397,8 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType());
UsePhi->replaceAllUsesWith(Trunc);
DeadInsts.emplace_back(UsePhi);
- DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi
- << " to " << *WidePhi << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi << " to "
+ << *WidePhi << "\n");
}
return nullptr;
}
@@ -1428,15 +1429,16 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
// A wider extend was hidden behind a narrower one. This may induce
// another round of IV widening in which the intermediate IV becomes
// dead. It should be very rare.
- DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
- << " not wide enough to subsume " << *DU.NarrowUse << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
+ << " not wide enough to subsume " << *DU.NarrowUse
+ << "\n");
DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
NewDef = DU.NarrowUse;
}
}
if (NewDef != DU.NarrowUse) {
- DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse
- << " replaced by " << *DU.WideDef << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse
+ << " replaced by " << *DU.WideDef << "\n");
++NumElimExt;
DU.NarrowUse->replaceAllUsesWith(NewDef);
DeadInsts.emplace_back(DU.NarrowUse);
@@ -1491,8 +1493,9 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
// absolutely guarantee it. Hence the following failsafe check. In rare cases
// where it fails, we simply throw away the newly created wide use.
if (WideAddRec.first != SE->getSCEV(WideUse)) {
- DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse
- << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec.first << "\n");
+ LLVM_DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse << ": "
+ << *SE->getSCEV(WideUse) << " != " << *WideAddRec.first
+ << "\n");
DeadInsts.emplace_back(WideUse);
return nullptr;
}
@@ -1597,7 +1600,7 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
WideInc->setDebugLoc(OrigInc->getDebugLoc());
}
- DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n");
+ LLVM_DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n");
++NumWidened;
// Traverse the def-use chain using a worklist starting at the original IV.
@@ -2231,12 +2234,12 @@ linearFunctionTestReplace(Loop *L,
else
P = ICmpInst::ICMP_EQ;
- DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
- << " LHS:" << *CmpIndVar << '\n'
- << " op:\t"
- << (P == ICmpInst::ICMP_NE ? "!=" : "==") << "\n"
- << " RHS:\t" << *ExitCnt << "\n"
- << " IVCount:\t" << *IVCount << "\n");
+ LLVM_DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
+ << " LHS:" << *CmpIndVar << '\n'
+ << " op:\t" << (P == ICmpInst::ICMP_NE ? "!=" : "==")
+ << "\n"
+ << " RHS:\t" << *ExitCnt << "\n"
+ << " IVCount:\t" << *IVCount << "\n");
IRBuilder<> Builder(BI);
@@ -2272,7 +2275,7 @@ linearFunctionTestReplace(Loop *L,
NewLimit = Start + Count;
ExitCnt = ConstantInt::get(CmpIndVar->getType(), NewLimit);
- DEBUG(dbgs() << " Widen RHS:\t" << *ExitCnt << "\n");
+ LLVM_DEBUG(dbgs() << " Widen RHS:\t" << *ExitCnt << "\n");
} else {
// We try to extend trip count first. If that doesn't work we truncate IV.
// Zext(trunc(IV)) == IV implies equivalence of the following two:
diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index cf98088111be..e2f29705f2dd 100644
--- a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -43,6 +43,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/InductiveRangeCheckElimination.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
@@ -52,6 +53,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -179,10 +181,7 @@ public:
OS << " Step: ";
Step->print(OS);
OS << " End: ";
- if (End)
- End->print(OS);
- else
- OS << "(null)";
+ End->print(OS);
OS << "\n CheckUse: ";
getCheckUse()->getUser()->print(OS);
OS << " Operand: " << getCheckUse()->getOperandNo() << "\n";
@@ -196,7 +195,7 @@ public:
Use *getCheckUse() const { return CheckUse; }
/// Represents a signed integer range [Range.getBegin(), Range.getEnd()). If
- /// R.getEnd() sle R.getBegin(), then R denotes the empty range.
+ /// R.getEnd() le R.getBegin(), then R denotes the empty range.
class Range {
const SCEV *Begin;
@@ -238,17 +237,31 @@ public:
/// checks, and hence don't end up in \p Checks.
static void
extractRangeChecksFromBranch(BranchInst *BI, Loop *L, ScalarEvolution &SE,
- BranchProbabilityInfo &BPI,
+ BranchProbabilityInfo *BPI,
SmallVectorImpl<InductiveRangeCheck> &Checks);
};
-class InductiveRangeCheckElimination : public LoopPass {
+class InductiveRangeCheckElimination {
+ ScalarEvolution &SE;
+ BranchProbabilityInfo *BPI;
+ DominatorTree &DT;
+ LoopInfo &LI;
+
+public:
+ InductiveRangeCheckElimination(ScalarEvolution &SE,
+ BranchProbabilityInfo *BPI, DominatorTree &DT,
+ LoopInfo &LI)
+ : SE(SE), BPI(BPI), DT(DT), LI(LI) {}
+
+ bool run(Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop);
+};
+
+class IRCELegacyPass : public LoopPass {
public:
static char ID;
- InductiveRangeCheckElimination() : LoopPass(ID) {
- initializeInductiveRangeCheckEliminationPass(
- *PassRegistry::getPassRegistry());
+ IRCELegacyPass() : LoopPass(ID) {
+ initializeIRCELegacyPassPass(*PassRegistry::getPassRegistry());
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -261,14 +274,14 @@ public:
} // end anonymous namespace
-char InductiveRangeCheckElimination::ID = 0;
+char IRCELegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce",
+INITIALIZE_PASS_BEGIN(IRCELegacyPass, "irce",
"Inductive range check elimination", false, false)
INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_END(InductiveRangeCheckElimination, "irce",
- "Inductive range check elimination", false, false)
+INITIALIZE_PASS_END(IRCELegacyPass, "irce", "Inductive range check elimination",
+ false, false)
StringRef InductiveRangeCheck::rangeCheckKindToStr(
InductiveRangeCheck::RangeCheckKind RCK) {
@@ -299,13 +312,8 @@ InductiveRangeCheck::RangeCheckKind
InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
ScalarEvolution &SE, Value *&Index,
Value *&Length, bool &IsSigned) {
- auto IsNonNegativeAndNotLoopVarying = [&SE, L](Value *V) {
- const SCEV *S = SE.getSCEV(V);
- if (isa<SCEVCouldNotCompute>(S))
- return false;
-
- return SE.getLoopDisposition(S, L) == ScalarEvolution::LoopInvariant &&
- SE.isKnownNonNegative(S);
+ auto IsLoopInvariant = [&SE, L](Value *V) {
+ return SE.isLoopInvariant(SE.getSCEV(V), L);
};
ICmpInst::Predicate Pred = ICI->getPredicate();
@@ -337,7 +345,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
return RANGE_CHECK_LOWER;
}
- if (IsNonNegativeAndNotLoopVarying(LHS)) {
+ if (IsLoopInvariant(LHS)) {
Index = RHS;
Length = LHS;
return RANGE_CHECK_UPPER;
@@ -349,7 +357,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
LLVM_FALLTHROUGH;
case ICmpInst::ICMP_UGT:
IsSigned = false;
- if (IsNonNegativeAndNotLoopVarying(LHS)) {
+ if (IsLoopInvariant(LHS)) {
Index = RHS;
Length = LHS;
return RANGE_CHECK_BOTH;
@@ -394,8 +402,23 @@ void InductiveRangeCheck::extractRangeChecksFromCond(
if (!IsAffineIndex)
return;
+ const SCEV *End = nullptr;
+ // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
+ // We can potentially do much better here.
+ if (Length)
+ End = SE.getSCEV(Length);
+ else {
+ assert(RCKind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!");
+ // So far we can only reach this point for a signed range check. This may
+ // change in the future; in that case we will need to pick the unsigned max
+ // for an unsigned range check.
+ unsigned BitWidth = cast<IntegerType>(IndexAddRec->getType())->getBitWidth();
+ const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
+ End = SIntMax;
+ }
+
InductiveRangeCheck IRC;
- IRC.End = Length ? SE.getSCEV(Length) : nullptr;
+ IRC.End = End;
IRC.Begin = IndexAddRec->getStart();
IRC.Step = IndexAddRec->getStepRecurrence(SE);
IRC.CheckUse = &ConditionUse;
@@ -405,15 +428,15 @@ void InductiveRangeCheck::extractRangeChecksFromCond(
}
void InductiveRangeCheck::extractRangeChecksFromBranch(
- BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo &BPI,
+ BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI,
SmallVectorImpl<InductiveRangeCheck> &Checks) {
if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
return;
BranchProbability LikelyTaken(15, 16);
- if (!SkipProfitabilityChecks &&
- BPI.getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken)
+ if (!SkipProfitabilityChecks && BPI &&
+ BPI->getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken)
return;
SmallPtrSet<Value *, 8> Visited;
@@ -504,9 +527,8 @@ struct LoopStructure {
}
static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &,
- BranchProbabilityInfo &BPI,
- Loop &,
- const char *&);
+ BranchProbabilityInfo *BPI,
+ Loop &, const char *&);
};
/// This class is used to constrain loops to run within a given iteration space.
@@ -573,7 +595,7 @@ class LoopConstrainer {
// Create the appropriate loop structure needed to describe a cloned copy of
// `Original`. The clone is described by `VM`.
Loop *createClonedLoopStructure(Loop *Original, Loop *Parent,
- ValueToValueMapTy &VM);
+ ValueToValueMapTy &VM, bool IsSubloop);
// Rewrite the iteration space of the loop denoted by (LS, Preheader). The
// iteration space of the rewritten loop ends at ExitLoopAt. The start of the
@@ -625,8 +647,8 @@ class LoopConstrainer {
LLVMContext &Ctx;
ScalarEvolution &SE;
DominatorTree &DT;
- LPPassManager &LPM;
LoopInfo &LI;
+ function_ref<void(Loop *, bool)> LPMAddNewLoop;
// Information about the original loop we started out with.
Loop &OriginalLoop;
@@ -646,12 +668,13 @@ class LoopConstrainer {
LoopStructure MainLoopStructure;
public:
- LoopConstrainer(Loop &L, LoopInfo &LI, LPPassManager &LPM,
+ LoopConstrainer(Loop &L, LoopInfo &LI,
+ function_ref<void(Loop *, bool)> LPMAddNewLoop,
const LoopStructure &LS, ScalarEvolution &SE,
DominatorTree &DT, InductiveRangeCheck::Range R)
: F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()),
- SE(SE), DT(DT), LPM(LPM), LI(LI), OriginalLoop(L), Range(R),
- MainLoopStructure(LS) {}
+ SE(SE), DT(DT), LI(LI), LPMAddNewLoop(LPMAddNewLoop), OriginalLoop(L),
+ Range(R), MainLoopStructure(LS) {}
// Entry point for the algorithm. Returns true on success.
bool run();
@@ -666,56 +689,141 @@ void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block,
PN->setIncomingBlock(i, ReplaceBy);
}
-static bool CanBeMax(ScalarEvolution &SE, const SCEV *S, bool Signed) {
- APInt Max = Signed ?
- APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth()) :
- APInt::getMaxValue(cast<IntegerType>(S->getType())->getBitWidth());
- return SE.getSignedRange(S).contains(Max) &&
- SE.getUnsignedRange(S).contains(Max);
+static bool CannotBeMaxInLoop(const SCEV *BoundSCEV, Loop *L,
+ ScalarEvolution &SE, bool Signed) {
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Max = Signed ? APInt::getSignedMaxValue(BitWidth) :
+ APInt::getMaxValue(BitWidth);
+ auto Predicate = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+ return SE.isAvailableAtLoopEntry(BoundSCEV, L) &&
+ SE.isLoopEntryGuardedByCond(L, Predicate, BoundSCEV,
+ SE.getConstant(Max));
+}
+
+/// Given a loop with a decreasing induction variable, is it possible to
+/// safely calculate the bounds of a new loop using the given Predicate.
+static bool isSafeDecreasingBound(const SCEV *Start,
+ const SCEV *BoundSCEV, const SCEV *Step,
+ ICmpInst::Predicate Pred,
+ unsigned LatchBrExitIdx,
+ Loop *L, ScalarEvolution &SE) {
+ if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT &&
+ Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT)
+ return false;
+
+ if (!SE.isAvailableAtLoopEntry(BoundSCEV, L))
+ return false;
+
+ assert(SE.isKnownNegative(Step) && "expecting negative step");
+
+ LLVM_DEBUG(dbgs() << "irce: isSafeDecreasingBound with:\n");
+ LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n");
+ LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n");
+
+ bool IsSigned = ICmpInst::isSigned(Pred);
+ // The predicate we need to check to ensure that the induction variable
+ // stays within bounds.
+ ICmpInst::Predicate BoundPred =
+ IsSigned ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
+
+ if (LatchBrExitIdx == 1)
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+
+ assert(LatchBrExitIdx == 0 &&
+ "LatchBrExitIdx should be either 0 or 1");
+
+ const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType()));
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Min = IsSigned ? APInt::getSignedMinValue(BitWidth) :
+ APInt::getMinValue(BitWidth);
+ const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Min), StepPlusOne);
+
+ const SCEV *MinusOne =
+ SE.getMinusSCEV(BoundSCEV, SE.getOne(BoundSCEV->getType()));
+
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, MinusOne) &&
+ SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit);
+}
+
+/// Given a loop with an increasing induction variable, is it possible to
+/// safely calculate the bounds of a new loop using the given Predicate.
+static bool isSafeIncreasingBound(const SCEV *Start,
+ const SCEV *BoundSCEV, const SCEV *Step,
+ ICmpInst::Predicate Pred,
+ unsigned LatchBrExitIdx,
+ Loop *L, ScalarEvolution &SE) {
+ if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT &&
+ Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT)
+ return false;
+
+ if (!SE.isAvailableAtLoopEntry(BoundSCEV, L))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "irce: isSafeIncreasingBound with:\n");
+ LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n");
+ LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n");
+
+ bool IsSigned = ICmpInst::isSigned(Pred);
+ // The predicate we need to check to ensure that the induction variable
+ // stays within bounds.
+ ICmpInst::Predicate BoundPred =
+ IsSigned ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
+
+ if (LatchBrExitIdx == 1)
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+
+ assert(LatchBrExitIdx == 0 && "LatchBrExitIdx should be 0 or 1");
+
+ const SCEV *StepMinusOne =
+ SE.getMinusSCEV(Step, SE.getOne(Step->getType()));
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Max = IsSigned ? APInt::getSignedMaxValue(BitWidth) :
+ APInt::getMaxValue(BitWidth);
+ const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Max), StepMinusOne);
+
+ return (SE.isLoopEntryGuardedByCond(L, BoundPred, Start,
+ SE.getAddExpr(BoundSCEV, Step)) &&
+ SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit));
}
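A worked instance of the Limit arithmetic in the two helpers above (our reading, shrunk to i8 for readability; the helper name is ours):

    #include <cstdint>

    // Increasing IV, LatchBrExitIdx == 0: the IV is advanced before the exit
    // test, so values up to Bound + Step - 1 can be formed. Requiring
    // Bound < SMAX - (Step - 1) guarantees that expression cannot overflow.
    bool increasingBoundIsSafe(int8_t Bound, int8_t Step) {
      int8_t Limit = INT8_MAX - (Step - 1); // Step = 4  ->  Limit = 124
      return Bound < Limit;                 // Bound = 123 ok, 124 rejected
    }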
-static bool SumCanReachMax(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2,
- bool Signed) {
- // S1 < INT_MAX - S2 ===> S1 + S2 < INT_MAX.
- assert(SE.isKnownNonNegative(S2) &&
- "We expected the 2nd arg to be non-negative!");
- const SCEV *Max = SE.getConstant(
- Signed ? APInt::getSignedMaxValue(
- cast<IntegerType>(S1->getType())->getBitWidth())
- : APInt::getMaxValue(
- cast<IntegerType>(S1->getType())->getBitWidth()));
- const SCEV *CapForS1 = SE.getMinusSCEV(Max, S2);
- return !SE.isKnownPredicate(Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
- S1, CapForS1);
+static bool CannotBeMinInLoop(const SCEV *BoundSCEV, Loop *L,
+ ScalarEvolution &SE, bool Signed) {
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Min = Signed ? APInt::getSignedMinValue(BitWidth) :
+ APInt::getMinValue(BitWidth);
+ auto Predicate = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+ return SE.isAvailableAtLoopEntry(BoundSCEV, L) &&
+ SE.isLoopEntryGuardedByCond(L, Predicate, BoundSCEV,
+ SE.getConstant(Min));
}
-static bool CanBeMin(ScalarEvolution &SE, const SCEV *S, bool Signed) {
- APInt Min = Signed ?
- APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth()) :
- APInt::getMinValue(cast<IntegerType>(S->getType())->getBitWidth());
- return SE.getSignedRange(S).contains(Min) &&
- SE.getUnsignedRange(S).contains(Min);
+static bool isKnownNonNegativeInLoop(const SCEV *BoundSCEV, const Loop *L,
+ ScalarEvolution &SE) {
+ const SCEV *Zero = SE.getZero(BoundSCEV->getType());
+ return SE.isAvailableAtLoopEntry(BoundSCEV, L) &&
+ SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SGE, BoundSCEV, Zero);
}
-static bool SumCanReachMin(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2,
- bool Signed) {
- // S1 > INT_MIN - S2 ===> S1 + S2 > INT_MIN.
- assert(SE.isKnownNonPositive(S2) &&
- "We expected the 2nd arg to be non-positive!");
- const SCEV *Max = SE.getConstant(
- Signed ? APInt::getSignedMinValue(
- cast<IntegerType>(S1->getType())->getBitWidth())
- : APInt::getMinValue(
- cast<IntegerType>(S1->getType())->getBitWidth()));
- const SCEV *CapForS1 = SE.getMinusSCEV(Max, S2);
- return !SE.isKnownPredicate(Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT,
- S1, CapForS1);
+static bool isKnownNegativeInLoop(const SCEV *BoundSCEV, const Loop *L,
+ ScalarEvolution &SE) {
+ const SCEV *Zero = SE.getZero(BoundSCEV->getType());
+ return SE.isAvailableAtLoopEntry(BoundSCEV, L) &&
+ SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SLT, BoundSCEV, Zero);
}
Optional<LoopStructure>
LoopStructure::parseLoopStructure(ScalarEvolution &SE,
- BranchProbabilityInfo &BPI,
- Loop &L, const char *&FailureReason) {
+ BranchProbabilityInfo *BPI, Loop &L,
+ const char *&FailureReason) {
if (!L.isLoopSimplifyForm()) {
FailureReason = "loop not in LoopSimplify form";
return None;
@@ -750,7 +858,8 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0;
BranchProbability ExitProbability =
- BPI.getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx);
+ BPI ? BPI->getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx)
+ : BranchProbability::getZero();
if (!SkipProfitabilityChecks &&
ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) {
@@ -816,43 +925,29 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap;
};
- // Here we check whether the suggested AddRec is an induction variable that
- // can be handled (i.e. with known constant step), and if yes, calculate its
- // step and identify whether it is increasing or decreasing.
- auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing,
- ConstantInt *&StepCI) {
- if (!AR->isAffine())
- return false;
-
- // Currently we only work with induction variables that have been proved to
- // not wrap. This restriction can potentially be lifted in the future.
-
- if (!HasNoSignedWrap(AR))
- return false;
-
- if (const SCEVConstant *StepExpr =
- dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) {
- StepCI = StepExpr->getValue();
- assert(!StepCI->isZero() && "Zero step?");
- IsIncreasing = !StepCI->isNegative();
- return true;
- }
-
- return false;
- };
-
// `ICI` is interpreted as taking the backedge if the *next* value of the
// induction variable satisfies some constraint.
const SCEVAddRecExpr *IndVarBase = cast<SCEVAddRecExpr>(LeftSCEV);
- bool IsIncreasing = false;
- bool IsSignedPredicate = true;
- ConstantInt *StepCI;
- if (!IsInductionVar(IndVarBase, IsIncreasing, StepCI)) {
+ if (!IndVarBase->isAffine()) {
FailureReason = "LHS in icmp not induction variable";
return None;
}
+ const SCEV *StepRec = IndVarBase->getStepRecurrence(SE);
+ if (!isa<SCEVConstant>(StepRec)) {
+ FailureReason = "LHS in icmp not induction variable";
+ return None;
+ }
+ ConstantInt *StepCI = cast<SCEVConstant>(StepRec)->getValue();
+
+ if (ICI->isEquality() && !HasNoSignedWrap(IndVarBase)) {
+ FailureReason = "LHS in icmp needs nsw for equality predicates";
+ return None;
+ }
+ assert(!StepCI->isZero() && "Zero step?");
+ bool IsIncreasing = !StepCI->isNegative();
+ bool IsSignedPredicate = ICmpInst::isSigned(Pred);
const SCEV *StartNext = IndVarBase->getStart();
const SCEV *Addend = SE.getNegativeSCEV(IndVarBase->getStepRecurrence(SE));
const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
@@ -870,22 +965,29 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
// If both parts are known non-negative, it is profitable to use
// unsigned comparison in increasing loop. This allows us to make the
// comparison check against "RightSCEV + 1" more optimistic.
- if (SE.isKnownNonNegative(IndVarStart) &&
- SE.isKnownNonNegative(RightSCEV))
+ if (isKnownNonNegativeInLoop(IndVarStart, &L, SE) &&
+ isKnownNonNegativeInLoop(RightSCEV, &L, SE))
Pred = ICmpInst::ICMP_ULT;
else
Pred = ICmpInst::ICMP_SLT;
- else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
- !CanBeMin(SE, RightSCEV, /* IsSignedPredicate */ true)) {
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
// while (true) { while (true) {
// if (++i == len) ---> if (++i > len - 1)
// break; break;
// ... ...
// } }
- // TODO: Insert ICMP_UGT if both are non-negative?
- Pred = ICmpInst::ICMP_SGT;
- RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
- DecreasedRightValueByOne = true;
+ if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
+ CannotBeMinInLoop(RightSCEV, &L, SE, /* Signed */ false)) {
+ Pred = ICmpInst::ICMP_UGT;
+ RightSCEV = SE.getMinusSCEV(RightSCEV,
+ SE.getOne(RightSCEV->getType()));
+ DecreasedRightValueByOne = true;
+ } else if (CannotBeMinInLoop(RightSCEV, &L, SE, /* Signed */ true)) {
+ Pred = ICmpInst::ICMP_SGT;
+ RightSCEV = SE.getMinusSCEV(RightSCEV,
+ SE.getOne(RightSCEV->getType()));
+ DecreasedRightValueByOne = true;
+ }
}
}
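The equality rewrite above is easiest to sanity-check in source form. A minimal sketch, with hypothetical trip-count helpers, showing that the EQ exit and its coerced inequality form run the same number of iterations whenever the bound cannot be the minimum value of its type (the condition CannotBeMinInLoop establishes):

    #include <cassert>

    // Count latch-taken iterations for "if (++i == len) break" ...
    int tripsEq(int Start, int Len) {
      int I = Start, N = 0;
      while (true) {
        if (++I == Len)
          break;
        ++N;
      }
      return N;
    }

    // ... and for the rewritten "if (++i > len - 1) break". The subtraction
    // is only safe because Len is known not to be the minimum value.
    int tripsSgt(int Start, int Len) {
      int I = Start, N = 0;
      while (true) {
        if (++I > Len - 1)
          break;
        ++N;
      }
      return N;
    }

    int main() {
      for (int Len : {1, 5, 100})
        assert(tripsEq(0, Len) == tripsSgt(0, Len));
    }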
@@ -899,36 +1001,18 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
return None;
}
- IsSignedPredicate =
- Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT;
-
+ IsSignedPredicate = ICmpInst::isSigned(Pred);
if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
FailureReason = "unsigned latch conditions are explicitly prohibited";
return None;
}
- // The predicate that we need to check that the induction variable lies
- // within bounds.
- ICmpInst::Predicate BoundPred =
- IsSignedPredicate ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
-
+ if (!isSafeIncreasingBound(IndVarStart, RightSCEV, Step, Pred,
+ LatchBrExitIdx, &L, SE)) {
+ FailureReason = "Unsafe loop bounds";
+ return None;
+ }
if (LatchBrExitIdx == 0) {
- const SCEV *StepMinusOne = SE.getMinusSCEV(Step,
- SE.getOne(Step->getType()));
- if (SumCanReachMax(SE, RightSCEV, StepMinusOne, IsSignedPredicate)) {
- // TODO: this restriction is easily removable -- we just have to
- // remember that the icmp was an slt and not an sle.
- FailureReason = "limit may overflow when coercing le to lt";
- return None;
- }
-
- if (!SE.isLoopEntryGuardedByCond(
- &L, BoundPred, IndVarStart,
- SE.getAddExpr(RightSCEV, Step))) {
- FailureReason = "Induction variable start not bounded by upper limit";
- return None;
- }
-
// We need to increase the right value unless we have already decreased
// it virtually when we replaced EQ with SGT.
if (!DecreasedRightValueByOne) {
@@ -936,10 +1020,6 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
RightValue = B.CreateAdd(RightValue, One);
}
} else {
- if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) {
- FailureReason = "Induction variable start not bounded by upper limit";
- return None;
- }
assert(!DecreasedRightValueByOne &&
"Right value can be decreased only for LatchBrExitIdx == 0!");
}
@@ -955,17 +1035,22 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
// that both operands are non-negative, because it will only pessimize
// our check against "RightSCEV - 1".
Pred = ICmpInst::ICMP_SGT;
- else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
- !CanBeMax(SE, RightSCEV, /* IsSignedPredicate */ true)) {
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
// while (true) { while (true) {
// if (--i == len) ---> if (--i < len + 1)
// break; break;
// ... ...
// } }
- // TODO: Insert ICMP_ULT if both are non-negative?
- Pred = ICmpInst::ICMP_SLT;
- RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
- IncreasedRightValueByOne = true;
+ if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
+ CannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ false)) {
+ Pred = ICmpInst::ICMP_ULT;
+ RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ IncreasedRightValueByOne = true;
+ } else if (CannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ true)) {
+ Pred = ICmpInst::ICMP_SLT;
+ RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ IncreasedRightValueByOne = true;
+ }
}
}
@@ -988,27 +1073,13 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
return None;
}
- // The predicate that we need to check that the induction variable lies
- // within bounds.
- ICmpInst::Predicate BoundPred =
- IsSignedPredicate ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
+ if (!isSafeDecreasingBound(IndVarStart, RightSCEV, Step, Pred,
+ LatchBrExitIdx, &L, SE)) {
+ FailureReason = "Unsafe bounds";
+ return None;
+ }
if (LatchBrExitIdx == 0) {
- const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType()));
- if (SumCanReachMin(SE, RightSCEV, StepPlusOne, IsSignedPredicate)) {
- // TODO: this restriction is easily removable -- we just have to
- // remember that the icmp was an sgt and not an sge.
- FailureReason = "limit may overflow when coercing ge to gt";
- return None;
- }
-
- if (!SE.isLoopEntryGuardedByCond(
- &L, BoundPred, IndVarStart,
- SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())))) {
- FailureReason = "Induction variable start not bounded by lower limit";
- return None;
- }
-
// We need to decrease the right value unless we have already increased
// it virtually when we replaced EQ with SLT.
if (!IncreasedRightValueByOne) {
@@ -1016,10 +1087,6 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
RightValue = B.CreateSub(RightValue, One);
}
} else {
- if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) {
- FailureReason = "Induction variable start not bounded by lower limit";
- return None;
- }
assert(!IncreasedRightValueByOne &&
"Right value can be increased only for LatchBrExitIdx == 0!");
}
@@ -1381,13 +1448,14 @@ void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
}
Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
- ValueToValueMapTy &VM) {
+ ValueToValueMapTy &VM,
+ bool IsSubloop) {
Loop &New = *LI.AllocateLoop();
if (Parent)
Parent->addChildLoop(&New);
else
LI.addTopLevelLoop(&New);
- LPM.addLoop(New);
+ LPMAddNewLoop(&New, IsSubloop);
// Add all of the blocks in Original to the new loop.
for (auto *BB : Original->blocks())
@@ -1396,7 +1464,7 @@ Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
// Add all of the subloops to the new loop.
for (Loop *SubLoop : *Original)
- createClonedLoopStructure(SubLoop, &New, VM);
+ createClonedLoopStructure(SubLoop, &New, VM, /* IsSubloop */ true);
return &New;
}
@@ -1414,7 +1482,7 @@ bool LoopConstrainer::run() {
bool IsSignedPredicate = MainLoopStructure.IsSignedPredicate;
Optional<SubRanges> MaybeSR = calculateSubRanges(IsSignedPredicate);
if (!MaybeSR.hasValue()) {
- DEBUG(dbgs() << "irce: could not compute subranges\n");
+ LLVM_DEBUG(dbgs() << "irce: could not compute subranges\n");
return false;
}
@@ -1446,19 +1514,22 @@ bool LoopConstrainer::run() {
if (Increasing)
ExitPreLoopAtSCEV = *SR.LowLimit;
else {
- if (CanBeMin(SE, *SR.HighLimit, IsSignedPredicate)) {
- DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
- << "preloop exit limit. HighLimit = " << *(*SR.HighLimit)
- << "\n");
+ if (CannotBeMinInLoop(*SR.HighLimit, &OriginalLoop, SE,
+ IsSignedPredicate))
+ ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
+ else {
+ LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "preloop exit limit. HighLimit = "
+ << *(*SR.HighLimit) << "\n");
return false;
}
- ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
}
if (!isSafeToExpandAt(ExitPreLoopAtSCEV, InsertPt, SE)) {
- DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
- << " preloop exit limit " << *ExitPreLoopAtSCEV
- << " at block " << InsertPt->getParent()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+ << " preloop exit limit " << *ExitPreLoopAtSCEV
+ << " at block " << InsertPt->getParent()->getName()
+ << "\n");
return false;
}
@@ -1472,19 +1543,22 @@ bool LoopConstrainer::run() {
if (Increasing)
ExitMainLoopAtSCEV = *SR.HighLimit;
else {
- if (CanBeMin(SE, *SR.LowLimit, IsSignedPredicate)) {
- DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
- << "mainloop exit limit. LowLimit = " << *(*SR.LowLimit)
- << "\n");
+ if (CannotBeMinInLoop(*SR.LowLimit, &OriginalLoop, SE,
+ IsSignedPredicate))
+ ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
+ else {
+ LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "mainloop exit limit. LowLimit = "
+ << *(*SR.LowLimit) << "\n");
return false;
}
- ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
}
if (!isSafeToExpandAt(ExitMainLoopAtSCEV, InsertPt, SE)) {
- DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
- << " main loop exit limit " << *ExitMainLoopAtSCEV
- << " at block " << InsertPt->getParent()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+ << " main loop exit limit " << *ExitMainLoopAtSCEV
+ << " at block " << InsertPt->getParent()->getName()
+ << "\n");
return false;
}
@@ -1546,13 +1620,15 @@ bool LoopConstrainer::run() {
// LI when LoopSimplifyForm is generated.
Loop *PreL = nullptr, *PostL = nullptr;
if (!PreLoop.Blocks.empty()) {
- PreL = createClonedLoopStructure(
- &OriginalLoop, OriginalLoop.getParentLoop(), PreLoop.Map);
+ PreL = createClonedLoopStructure(&OriginalLoop,
+ OriginalLoop.getParentLoop(), PreLoop.Map,
+ /* IsSubLoop */ false);
}
if (!PostLoop.Blocks.empty()) {
- PostL = createClonedLoopStructure(
- &OriginalLoop, OriginalLoop.getParentLoop(), PostLoop.Map);
+ PostL =
+ createClonedLoopStructure(&OriginalLoop, OriginalLoop.getParentLoop(),
+ PostLoop.Map, /* IsSubLoop */ false);
}
// This function canonicalizes the loop into Loop-Simplify and LCSSA forms.
@@ -1618,32 +1694,34 @@ InductiveRangeCheck::computeSafeIterationSpace(
unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth();
const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
- // Substract Y from X so that it does not go through border of the IV
+ // Subtract Y from X so that it does not go through border of the IV
// iteration space. Mathematically, it is equivalent to:
//
- // ClampedSubstract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX). [1]
+ // ClampedSubtract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX). [1]
//
- // In [1], 'X - Y' is a mathematical substraction (result is not bounded to
+ // In [1], 'X - Y' is a mathematical subtraction (result is not bounded to
// any width of bit grid). But after we take min/max, the result is
// guaranteed to be within [INT_MIN, INT_MAX].
//
// In [1], INT_MAX and INT_MIN are respectively signed and unsigned max/min
// values, depending on type of latch condition that defines IV iteration
// space.
- auto ClampedSubstract = [&](const SCEV *X, const SCEV *Y) {
- assert(SE.isKnownNonNegative(X) &&
- "We can only substract from values in [0; SINT_MAX]!");
+ auto ClampedSubtract = [&](const SCEV *X, const SCEV *Y) {
+ // FIXME: The current implementation assumes that X is in [0, SINT_MAX].
+ // This is required to ensure that SINT_MAX - X does not overflow in signed
+ // arithmetic and that X - Y does not overflow in unsigned arithmetic when Y
+ // is negative. Can we lift this restriction and make it work for negative X
+ // as well?
if (IsLatchSigned) {
// X is a number from signed range, Y is interpreted as signed.
// Even if Y is SINT_MAX, (X - Y) does not reach SINT_MIN. So the only
// thing we should care about is that we didn't cross SINT_MAX.
- // So, if Y is positive, we substract Y safely.
+ // So, if Y is positive, we subtract Y safely.
// Rule 1: Y > 0 ---> Y.
- // If 0 <= -Y <= (SINT_MAX - X), we substract Y safely.
+ // If 0 <= -Y <= (SINT_MAX - X), we subtract Y safely.
// Rule 2: Y >=s (X - SINT_MAX) ---> Y.
- // If 0 <= (SINT_MAX - X) < -Y, we can only substract (X - SINT_MAX).
+ // If 0 <= (SINT_MAX - X) < -Y, we can only subtract (X - SINT_MAX).
// Rule 3: Y <s (X - SINT_MAX) ---> (X - SINT_MAX).
- // It gives us smax(Y, X - SINT_MAX) to substract in all cases.
+ // It gives us smax(Y, X - SINT_MAX) to subtract in all cases.
const SCEV *XMinusSIntMax = SE.getMinusSCEV(X, SIntMax);
return SE.getMinusSCEV(X, SE.getSMaxExpr(Y, XMinusSIntMax),
SCEV::FlagNSW);
@@ -1651,29 +1729,45 @@ InductiveRangeCheck::computeSafeIterationSpace(
// X is a number from unsigned range, Y is interpreted as signed.
// Even if Y is SINT_MIN, (X - Y) does not reach UINT_MAX. So the only
// thing we should care about is that we didn't cross zero.
- // So, if Y is negative, we substract Y safely.
+ // So, if Y is negative, we subtract Y safely.
// Rule 1: Y <s 0 ---> Y.
- // If 0 <= Y <= X, we substract Y safely.
+ // If 0 <= Y <= X, we subtract Y safely.
// Rule 2: Y <=s X ---> Y.
- // If 0 <= X < Y, we should stop at 0 and can only substract X.
+ // If 0 <= X < Y, we should stop at 0 and can only subtract X.
// Rule 3: Y >s X ---> X.
- // It gives us smin(X, Y) to substract in all cases.
+ // It gives us smin(X, Y) to subtract in all cases.
return SE.getMinusSCEV(X, SE.getSMinExpr(X, Y), SCEV::FlagNUW);
};
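A minimal scalar model of ClampedSubtract, assuming a 32-bit IV and using 64-bit arithmetic in place of the smax/smin SCEV machinery; it computes the mathematical difference and then clamps it into the representable range exactly as formula [1] above describes:

    #include <cassert>
    #include <cstdint>

    // min(max(X - Y, INT_MIN), INT_MAX), computed without intermediate wrap
    // by widening to 64 bits first.
    int32_t clampedSubtract(int32_t X, int32_t Y) {
      int64_t Wide = int64_t(X) - int64_t(Y); // mathematical subtraction
      if (Wide > INT32_MAX)
        return INT32_MAX;
      if (Wide < INT32_MIN)
        return INT32_MIN;
      return int32_t(Wide);
    }

    int main() {
      assert(clampedSubtract(10, 3) == 7);
      assert(clampedSubtract(INT32_MAX, -5) == INT32_MAX); // saturates high
      assert(clampedSubtract(INT32_MIN, 5) == INT32_MIN);  // saturates low
    }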
const SCEV *M = SE.getMinusSCEV(C, A);
const SCEV *Zero = SE.getZero(M->getType());
- const SCEV *Begin = ClampedSubstract(Zero, M);
- const SCEV *L = nullptr;
- // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
- // We can potentially do much better here.
- if (const SCEV *EndLimit = getEnd())
- L = EndLimit;
- else {
- assert(Kind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!");
- L = SIntMax;
- }
- const SCEV *End = ClampedSubstract(L, M);
+ // This function returns a SCEV equal to 1 if X is non-negative and 0
+ // otherwise.
+ auto SCEVCheckNonNegative = [&](const SCEV *X) {
+ const Loop *L = IndVar->getLoop();
+ const SCEV *One = SE.getOne(X->getType());
+ // Can we trivially prove that X is a non-negative or negative value?
+ if (isKnownNonNegativeInLoop(X, L, SE))
+ return One;
+ else if (isKnownNegativeInLoop(X, L, SE))
+ return Zero;
+ // If not, we will have to figure it out at run time.
+ // The expression smax(smin(X, 0), -1) + 1 equals 1 if X >= 0 and 0 if X < 0.
+ const SCEV *NegOne = SE.getNegativeSCEV(One);
+ return SE.getAddExpr(SE.getSMaxExpr(SE.getSMinExpr(X, Zero), NegOne), One);
+ };
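The branch-free indicator built by SCEVCheckNonNegative is easy to verify in isolation; a minimal sketch over plain int:

    #include <algorithm>
    #include <cassert>

    // smax(smin(X, 0), -1) + 1: clamps X into {-1, 0}, then shifts to {0, 1}.
    int nonNegativeIndicator(int X) {
      return std::max(std::min(X, 0), -1) + 1;
    }

    int main() {
      assert(nonNegativeIndicator(42) == 1);
      assert(nonNegativeIndicator(0) == 1);
      assert(nonNegativeIndicator(-1) == 0);
      assert(nonNegativeIndicator(-1000) == 0);
    }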
+ // FIXME: The current implementation of ClampedSubtract implicitly assumes
+ // that X is non-negative (in the sense of a signed value). We need to
+ // re-implement this function so that it correctly handles negative X as well.
+ // We use it twice: for X = 0 everything is fine, but for X = getEnd() we can
+ // end up with a negative X and produce wrong results. So currently we ensure
+ // that if getEnd() is negative then both ends of the safe range are zero.
+ // Note that this may pessimize elimination of unsigned range checks against
+ // negative values.
+ const SCEV *REnd = getEnd();
+ const SCEV *EndIsNonNegative = SCEVCheckNonNegative(REnd);
+
+ const SCEV *Begin = SE.getMulExpr(ClampedSubtract(Zero, M), EndIsNonNegative);
+ const SCEV *End = SE.getMulExpr(ClampedSubtract(REnd, M), EndIsNonNegative);
return InductiveRangeCheck::Range(Begin, End);
}
@@ -1735,26 +1829,56 @@ IntersectUnsignedRange(ScalarEvolution &SE,
return Ret;
}
-bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
+PreservedAnalyses IRCEPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ Function *F = L.getHeader()->getParent();
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+ auto *BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(*F);
+ InductiveRangeCheckElimination IRCE(AR.SE, BPI, AR.DT, AR.LI);
+ auto LPMAddNewLoop = [&U](Loop *NL, bool IsSubloop) {
+ if (!IsSubloop)
+ U.addSiblingLoops(NL);
+ };
+ bool Changed = IRCE.run(&L, LPMAddNewLoop);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+bool IRCELegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
if (skipLoop(L))
return false;
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ BranchProbabilityInfo &BPI =
+ getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ InductiveRangeCheckElimination IRCE(SE, &BPI, DT, LI);
+ auto LPMAddNewLoop = [&LPM](Loop *NL, bool /* IsSubLoop */) {
+ LPM.addLoop(*NL);
+ };
+ return IRCE.run(L, LPMAddNewLoop);
+}
+
+bool InductiveRangeCheckElimination::run(
+ Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop) {
if (L->getBlocks().size() >= LoopSizeCutoff) {
- DEBUG(dbgs() << "irce: giving up constraining loop, too large\n";);
+ LLVM_DEBUG(dbgs() << "irce: giving up constraining loop, too large\n");
return false;
}
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader) {
- DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
+ LLVM_DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
return false;
}
LLVMContext &Context = Preheader->getContext();
SmallVector<InductiveRangeCheck, 16> RangeChecks;
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- BranchProbabilityInfo &BPI =
- getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
for (auto BBI : L->getBlocks())
if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
@@ -1772,7 +1896,7 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
IRC.print(OS);
};
- DEBUG(PrintRecognizedRangeChecks(dbgs()));
+ LLVM_DEBUG(PrintRecognizedRangeChecks(dbgs()));
if (PrintRangeChecks)
PrintRecognizedRangeChecks(errs());
@@ -1781,8 +1905,8 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
Optional<LoopStructure> MaybeLoopStructure =
LoopStructure::parseLoopStructure(SE, BPI, *L, FailureReason);
if (!MaybeLoopStructure.hasValue()) {
- DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason
- << "\n";);
+ LLVM_DEBUG(dbgs() << "irce: could not parse loop structure: "
+ << FailureReason << "\n";);
return false;
}
LoopStructure LS = MaybeLoopStructure.getValue();
@@ -1820,9 +1944,8 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
if (!SafeIterRange.hasValue())
return false;
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopConstrainer LC(*L, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), LPM,
- LS, SE, DT, SafeIterRange.getValue());
+ LoopConstrainer LC(*L, LI, LPMAddNewLoop, LS, SE, DT,
+ SafeIterRange.getValue());
bool Changed = LC.run();
if (Changed) {
@@ -1833,7 +1956,7 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
L->print(dbgs());
};
- DEBUG(PrintConstrainedLoopInfo());
+ LLVM_DEBUG(PrintConstrainedLoopInfo());
if (PrintChangedLoops)
PrintConstrainedLoopInfo();
@@ -1852,5 +1975,5 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
}
Pass *llvm::createInductiveRangeCheckEliminationPass() {
- return new InductiveRangeCheckElimination;
+ return new IRCELegacyPass();
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 7d66c0f73821..fbbc09eb487f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -97,6 +97,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -121,7 +122,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <cassert>
#include <iterator>
@@ -140,7 +140,7 @@ namespace {
using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
-/// \brief InferAddressSpaces
+/// InferAddressSpaces
class InferAddressSpaces : public FunctionPass {
 /// Target-specific address space, uses of which should be replaced if
 /// possible.
@@ -260,7 +260,10 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:{
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
if (!IsVolatile || !IsVolatile->isZero())
return false;
@@ -289,6 +292,9 @@ void InferAddressSpaces::collectRewritableIntrinsicOperands(
case Intrinsic::objectsize:
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax:
appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
PostorderStack, Visited);
break;
@@ -647,13 +653,13 @@ void InferAddressSpaces::inferAddressSpaces(
// Tries to update the address space of the stack top according to the
// address spaces of its operands.
- DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
+ LLVM_DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
if (!NewAS.hasValue())
continue;
// If any updates are made, grabs its users to the worklist because
// their address spaces can also be possibly updated.
- DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
+ LLVM_DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
(*InferredAddrSpace)[V] = NewAS.getValue();
for (Value *User : V->users()) {
@@ -779,7 +785,7 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
B.CreateMemSet(NewV, MSI->getValue(),
- MSI->getLength(), MSI->getAlignment(),
+ MSI->getLength(), MSI->getDestAlignment(),
false, // isVolatile
TBAA, ScopeMD, NoAliasMD);
} else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
@@ -795,14 +801,16 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
if (isa<MemCpyInst>(MTI)) {
MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
- B.CreateMemCpy(Dest, Src, MTI->getLength(),
- MTI->getAlignment(),
+ B.CreateMemCpy(Dest, MTI->getDestAlignment(),
+ Src, MTI->getSourceAlignment(),
+ MTI->getLength(),
false, // isVolatile
TBAA, TBAAStruct, ScopeMD, NoAliasMD);
} else {
assert(isa<MemMoveInst>(MTI));
- B.CreateMemMove(Dest, Src, MTI->getLength(),
- MTI->getAlignment(),
+ B.CreateMemMove(Dest, MTI->getDestAlignment(),
+ Src, MTI->getSourceAlignment(),
+ MTI->getLength(),
false, // isVolatile
TBAA, ScopeMD, NoAliasMD);
}
@@ -893,15 +901,15 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
if (NewV == nullptr)
continue;
- DEBUG(dbgs() << "Replacing the uses of " << *V
- << "\n with\n " << *NewV << '\n');
+ LLVM_DEBUG(dbgs() << "Replacing the uses of " << *V << "\n with\n "
+ << *NewV << '\n');
if (Constant *C = dyn_cast<Constant>(V)) {
Constant *Replace = ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
C->getType());
if (C != Replace) {
- DEBUG(dbgs() << "Inserting replacement const cast: "
- << Replace << ": " << *Replace << '\n');
+ LLVM_DEBUG(dbgs() << "Inserting replacement const cast: " << Replace
+ << ": " << *Replace << '\n');
C->replaceAllUsesWith(Replace);
V = Replace;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp b/contrib/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp
new file mode 100644
index 000000000000..05cd48d83267
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp
@@ -0,0 +1,144 @@
+//===- InstSimplifyPass.cpp -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "instsimplify"
+
+STATISTIC(NumSimplified, "Number of redundant instructions removed");
+
+static bool runImpl(Function &F, const SimplifyQuery &SQ,
+ OptimizationRemarkEmitter *ORE) {
+ SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+ bool Changed = false;
+
+ do {
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+ // Here be subtlety: the iterator must be incremented before the loop
+ // body (not sure why), so a range-for loop won't work here.
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+ Instruction *I = &*BI++;
+ // The first time through the loop ToSimplify is empty and we try to
+ // simplify all instructions. On later iterations ToSimplify is not
+ // empty and we only bother simplifying instructions that are in it.
+ if (!ToSimplify->empty() && !ToSimplify->count(I))
+ continue;
+
+ // Don't waste time simplifying unused instructions.
+ if (!I->use_empty()) {
+ if (Value *V = SimplifyInstruction(I, SQ, ORE)) {
+ // Mark all uses for resimplification next time round the loop.
+ for (User *U : I->users())
+ Next->insert(cast<Instruction>(U));
+ I->replaceAllUsesWith(V);
+ ++NumSimplified;
+ Changed = true;
+ }
+ }
+ if (RecursivelyDeleteTriviallyDeadInstructions(I, SQ.TLI)) {
+ // RecursivelyDeleteTriviallyDeadInstructions can remove more than one
+ // instruction, so simply incrementing the iterator does not work.
+ // When instructions get deleted re-iterate instead.
+ BI = BB->begin();
+ BE = BB->end();
+ Changed = true;
+ }
+ }
+ }
+
+ // Place the list of instructions to simplify on the next loop iteration
+ // into ToSimplify.
+ std::swap(ToSimplify, Next);
+ Next->clear();
+ } while (!ToSimplify->empty());
+
+ return Changed;
+}
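The two-set swap in runImpl is a generic worklist idiom: the first pass visits everything, later passes visit only what the previous pass queued. A minimal sketch with integers standing in for instructions (halving a value plays the role of marking an instruction's users for resimplification):

    #include <cassert>
    #include <set>
    #include <utility>

    int countPasses(std::set<int> Work) {
      std::set<int> S1 = std::move(Work), S2;
      std::set<int> *ToSimplify = &S1, *Next = &S2;
      int Passes = 0;
      do {
        ++Passes;
        for (int N : *ToSimplify)
          if (N > 1)
            Next->insert(N / 2); // queue follow-up work for the next pass
        std::swap(ToSimplify, Next);
        Next->clear();
      } while (!ToSimplify->empty());
      return Passes;
    }

    int main() { assert(countPasses({8}) == 4); } // 8 -> 4 -> 2 -> 1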
+
+namespace {
+struct InstSimplifyLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ InstSimplifyLegacyPass() : FunctionPass(ID) {
+ initializeInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+
+ /// runOnFunction - Remove instructions that simplify to an existing value.
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ const DominatorTree *DT =
+ &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ OptimizationRemarkEmitter *ORE =
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const SimplifyQuery SQ(DL, TLI, DT, AC);
+ return runImpl(F, SQ, ORE);
+ }
+};
+} // namespace
+
+char InstSimplifyLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(InstSimplifyLegacyPass, "instsimplify",
+ "Remove redundant instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(InstSimplifyLegacyPass, "instsimplify",
+ "Remove redundant instructions", false, false)
+
+// Public interface to the simplify instructions pass.
+FunctionPass *llvm::createInstSimplifyLegacyPass() {
+ return new InstSimplifyLegacyPass();
+}
+
+PreservedAnalyses InstSimplifyPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const SimplifyQuery SQ(DL, &TLI, &DT, &AC);
+ bool Changed = runImpl(F, SQ, &ORE);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
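For context, a hedged sketch of how the new-PM pass above would be scheduled from a custom pipeline (assumes the usual PassBuilder setup and compiles only against matching LLVM headers):

    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/Scalar/InstSimplifyPass.h"
    using namespace llvm;

    // Add the pass to a FunctionPassManager; the pass manager then invokes
    // InstSimplifyPass::run, which forwards to runImpl as shown above.
    void addInstSimplify(FunctionPassManager &FPM) {
      FPM.addPass(InstSimplifyPass());
    }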
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 2f1645433fb8..1d66472f93c8 100644
--- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -30,6 +30,7 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -64,7 +65,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
@@ -131,10 +131,11 @@ namespace {
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
- if (PrintLVIAfterJumpThreading)
- AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LazyValueInfoWrapperPass>();
+ AU.addPreserved<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
@@ -148,6 +149,7 @@ char JumpThreading::ID = 0;
INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
"Jump Threading", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
@@ -164,7 +166,7 @@ JumpThreadingPass::JumpThreadingPass(int T) {
}
// Update branch probability information according to conditional
-// branch probablity. This is usually made possible for cloned branches
+// branch probability. This is usually made possible for cloned branches
// in inline instances by the context specific profile in the caller.
// For instance,
//
@@ -278,8 +280,12 @@ bool JumpThreading::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ // Get DT analysis before LVI. When LVI is initialized it conditionally adds
+ // DT if it's available.
+ auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DeferredDominance DDT(*DT);
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
bool HasProfileData = F.hasProfileData();
@@ -289,12 +295,11 @@ bool JumpThreading::runOnFunction(Function &F) {
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- bool Changed = Impl.runImpl(F, TLI, LVI, AA, HasProfileData, std::move(BFI),
- std::move(BPI));
+ bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DDT, HasProfileData,
+ std::move(BFI), std::move(BPI));
if (PrintLVIAfterJumpThreading) {
dbgs() << "LVI for function '" << F.getName() << "':\n";
- LVI->printLVI(F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- dbgs());
+ LVI->printLVI(F, *DT, dbgs());
}
return Changed;
}
@@ -302,8 +307,12 @@ bool JumpThreading::runOnFunction(Function &F) {
PreservedAnalyses JumpThreadingPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ // Get DT analysis before LVI. When LVI is initialized it conditionally adds
+ // DT if it's available.
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &LVI = AM.getResult<LazyValueAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
+ DeferredDominance DDT(DT);
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
@@ -313,25 +322,28 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- bool Changed = runImpl(F, &TLI, &LVI, &AA, HasProfileData, std::move(BFI),
- std::move(BPI));
+ bool Changed = runImpl(F, &TLI, &LVI, &AA, &DDT, HasProfileData,
+ std::move(BFI), std::move(BPI));
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LazyValueAnalysis>();
return PA;
}
bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
LazyValueInfo *LVI_, AliasAnalysis *AA_,
- bool HasProfileData_,
+ DeferredDominance *DDT_, bool HasProfileData_,
std::unique_ptr<BlockFrequencyInfo> BFI_,
std::unique_ptr<BranchProbabilityInfo> BPI_) {
- DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
+ LLVM_DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
TLI = TLI_;
LVI = LVI_;
AA = AA_;
+ DDT = DDT_;
BFI.reset();
BPI.reset();
// When profile data is available, we need to update edge weights after
@@ -345,69 +357,66 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
BFI = std::move(BFI_);
}
- // Remove unreachable blocks from function as they may result in infinite
- // loop. We do threading if we found something profitable. Jump threading a
- // branch can create other opportunities. If these opportunities form a cycle
- // i.e. if any jump threading is undoing previous threading in the path, then
- // we will loop forever. We take care of this issue by not jump threading for
- // back edges. This works for normal cases but not for unreachable blocks as
- // they may have cycle with no back edge.
- bool EverChanged = false;
- EverChanged |= removeUnreachableBlocks(F, LVI);
+ // JumpThreading must not process blocks unreachable from entry. It's a
+ // waste of compute time and can potentially lead to hangs.
+ SmallPtrSet<BasicBlock *, 16> Unreachable;
+ DominatorTree &DT = DDT->flush();
+ for (auto &BB : F)
+ if (!DT.isReachableFromEntry(&BB))
+ Unreachable.insert(&BB);
FindLoopHeaders(F);
+ bool EverChanged = false;
bool Changed;
do {
Changed = false;
- for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
- BasicBlock *BB = &*I;
- // Thread all of the branches we can over this block.
- while (ProcessBlock(BB))
+ for (auto &BB : F) {
+ if (Unreachable.count(&BB))
+ continue;
+ while (ProcessBlock(&BB)) // Thread all of the branches we can over BB.
Changed = true;
+ // Stop processing BB if it's the entry or is now deleted. The following
+ // routines attempt to eliminate BB, and locating a suitable replacement
+ // for the entry is non-trivial.
+ if (&BB == &F.getEntryBlock() || DDT->pendingDeletedBB(&BB))
+ continue;
- ++I;
-
- // If the block is trivially dead, zap it. This eliminates the successor
- // edges which simplifies the CFG.
- if (pred_empty(BB) &&
- BB != &BB->getParent()->getEntryBlock()) {
- DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName()
- << "' with terminator: " << *BB->getTerminator() << '\n');
- LoopHeaders.erase(BB);
- LVI->eraseBlock(BB);
- DeleteDeadBlock(BB);
+ if (pred_empty(&BB)) {
+ // When ProcessBlock makes BB unreachable it doesn't bother to fix up
+ // the instructions in it. We must remove BB to prevent invalid IR.
+ LLVM_DEBUG(dbgs() << " JT: Deleting dead block '" << BB.getName()
+ << "' with terminator: " << *BB.getTerminator()
+ << '\n');
+ LoopHeaders.erase(&BB);
+ LVI->eraseBlock(&BB);
+ DeleteDeadBlock(&BB, DDT);
Changed = true;
continue;
}
- BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
-
- // Can't thread an unconditional jump, but if the block is "almost
- // empty", we can replace uses of it with uses of the successor and make
- // this dead.
- // We should not eliminate the loop header or latch either, because
- // eliminating a loop header or latch might later prevent LoopSimplify
- // from transforming nested loops into simplified form. We will rely on
- // later passes in backend to clean up empty blocks.
+ // ProcessBlock doesn't thread BBs with unconditional TIs. However, if BB
+ // is "almost empty", we attempt to merge BB with its sole successor.
+ auto *BI = dyn_cast<BranchInst>(BB.getTerminator());
if (BI && BI->isUnconditional() &&
- BB != &BB->getParent()->getEntryBlock() &&
- // If the terminator is the only non-phi instruction, try to nuke it.
- BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB) &&
- !LoopHeaders.count(BI->getSuccessor(0))) {
- // FIXME: It is always conservatively correct to drop the info
- // for a block even if it doesn't get erased. This isn't totally
- // awesome, but it allows us to use AssertingVH to prevent nasty
- // dangling pointer issues within LazyValueInfo.
- LVI->eraseBlock(BB);
- if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
- Changed = true;
+ // The terminator must be the only non-phi instruction in BB.
+ BB.getFirstNonPHIOrDbg()->isTerminator() &&
+ // Don't alter Loop headers and latches to ensure another pass can
+ // detect and transform nested loops later.
+ !LoopHeaders.count(&BB) && !LoopHeaders.count(BI->getSuccessor(0)) &&
+ TryToSimplifyUncondBranchFromEmptyBlock(&BB, DDT)) {
+ // BB is valid for cleanup here because we passed in DDT. F remains
+ // BB's parent until a DDT->flush() event.
+ LVI->eraseBlock(&BB);
+ Changed = true;
}
}
EverChanged |= Changed;
} while (Changed);
LoopHeaders.clear();
+ DDT->flush();
+ LVI->enableDT();
return EverChanged;
}
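The unreachable-block pre-pass replaces the old removeUnreachableBlocks call: blocks with no path from the entry are collected once and skipped, rather than deleted. A minimal sketch over a toy adjacency-list CFG, using DFS in place of the dominator-tree query used above:

    #include <cassert>
    #include <set>
    #include <vector>

    std::set<int> unreachableFrom(int Entry,
                                  const std::vector<std::vector<int>> &Succs) {
      std::set<int> Seen{Entry};
      std::vector<int> Stack{Entry};
      while (!Stack.empty()) {
        int B = Stack.back();
        Stack.pop_back();
        for (int S : Succs[B])
          if (Seen.insert(S).second)
            Stack.push_back(S);
      }
      std::set<int> Unreachable;
      for (int B = 0; B < (int)Succs.size(); ++B)
        if (!Seen.count(B))
          Unreachable.insert(B);
      return Unreachable;
    }

    int main() {
      // Block 2 branches to 1, but nothing reaches 2 from entry block 0.
      std::vector<std::vector<int>> Succs = {{1}, {}, {1}};
      assert(unreachableFrom(0, Succs) == std::set<int>{2});
    }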
@@ -600,6 +609,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
// "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
// Perhaps getConstantOnEdge should be smart enough to do this?
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
@@ -613,6 +626,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
/// If I is a PHI node, then we know the incoming values for any constants.
if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
Value *InVal = PN->getIncomingValue(i);
if (Constant *KC = getKnownConstant(InVal, Preference)) {
@@ -630,11 +647,9 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
}
// Handle Cast instructions. Only see through Cast when the source operand is
- // PHI or Cmp and the source type is i1 to save the compilation time.
+ // PHI or Cmp to save compilation time.
if (CastInst *CI = dyn_cast<CastInst>(I)) {
Value *Source = CI->getOperand(0);
- if (!Source->getType()->isIntegerTy(1))
- return false;
if (!isa<PHINode>(Source) && !isa<CmpInst>(Source))
return false;
ComputeValueKnownInPredecessors(Source, BB, Result, Preference, CxtI);
@@ -738,20 +753,36 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
CmpInst::Predicate Pred = Cmp->getPredicate();
PHINode *PN = dyn_cast<PHINode>(CmpLHS);
+ if (!PN)
+ PN = dyn_cast<PHINode>(CmpRHS);
if (PN && PN->getParent() == BB) {
const DataLayout &DL = PN->getModule()->getDataLayout();
// We can do this simplification if any comparisons fold to true or false.
// See if any do.
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PredBB = PN->getIncomingBlock(i);
- Value *LHS = PN->getIncomingValue(i);
- Value *RHS = CmpRHS->DoPHITranslation(BB, PredBB);
-
+ Value *LHS, *RHS;
+ if (PN == CmpLHS) {
+ LHS = PN->getIncomingValue(i);
+ RHS = CmpRHS->DoPHITranslation(BB, PredBB);
+ } else {
+ LHS = CmpLHS->DoPHITranslation(BB, PredBB);
+ RHS = PN->getIncomingValue(i);
+ }
Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL});
if (!Res) {
if (!isa<Constant>(RHS))
continue;
+ // The getPredicateOnEdge call makes no sense if LHS is defined in BB.
+ auto LHSInst = dyn_cast<Instruction>(LHS);
+ if (LHSInst && LHSInst->getParent() == BB)
+ continue;
+
LazyValueInfo::Tristate
ResT = LVI->getPredicateOnEdge(Pred, LHS,
cast<Constant>(RHS), PredBB, BB,
@@ -775,6 +806,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
if (!isa<Instruction>(CmpLHS) ||
cast<Instruction>(CmpLHS)->getParent() != BB) {
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
@@ -803,6 +838,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) {
if (!isa<Instruction>(AddLHS) ||
cast<Instruction>(AddLHS)->getParent() != BB) {
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a ConstantRange in
// a predecessor, use that information to try to thread this
@@ -884,6 +923,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
}
// If all else fails, see if LVI can figure out a constant value for us.
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
Constant *CI = LVI->getConstant(V, BB, CxtI);
if (Constant *KC = getKnownConstant(CI, Preference)) {
for (BasicBlock *Pred : predecessors(BB))
@@ -903,10 +946,10 @@ static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
unsigned MinSucc = 0;
BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
// Compute the successor with the minimum number of predecessors.
- unsigned MinNumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+ unsigned MinNumPreds = pred_size(TestBB);
for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
TestBB = BBTerm->getSuccessor(i);
- unsigned NumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+ unsigned NumPreds = pred_size(TestBB);
if (NumPreds < MinNumPreds) {
MinSucc = i;
MinNumPreds = NumPreds;
@@ -931,8 +974,8 @@ static bool hasAddressTakenAndUsed(BasicBlock *BB) {
bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// If the block is trivially dead, just return and let the caller nuke it.
// This simplifies other transformations.
- if (pred_empty(BB) &&
- BB != &BB->getParent()->getEntryBlock())
+ if (DDT->pendingDeletedBB(BB) ||
+ (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()))
return false;
// If this block has a single predecessor, and if that pred has a single
@@ -948,7 +991,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
LoopHeaders.insert(BB);
LVI->eraseBlock(SinglePred);
- MergeBasicBlockIntoOnlyPred(BB);
+ MergeBasicBlockIntoOnlyPred(BB, nullptr, DDT);
// Now that BB is merged into SinglePred (i.e. SinglePred Code followed by
// BB code within one basic block `BB`), we need to invalidate the LVI
@@ -977,9 +1020,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// Invalidate LVI information for BB if the LVI is not provably true for
// all of BB.
- if (any_of(*BB, [](Instruction &I) {
- return !isGuaranteedToTransferExecutionToSuccessor(&I);
- }))
+ if (!isGuaranteedToTransferExecutionToSuccessor(BB))
LVI->eraseBlock(BB);
return true;
}
@@ -1031,18 +1072,23 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// successors to branch to. Let GetBestDestForJumpOnUndef decide.
if (isa<UndefValue>(Condition)) {
unsigned BestSucc = GetBestDestForJumpOnUndef(BB);
+ std::vector<DominatorTree::UpdateType> Updates;
// Fold the branch/switch.
TerminatorInst *BBTerm = BB->getTerminator();
+ Updates.reserve(BBTerm->getNumSuccessors());
for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
if (i == BestSucc) continue;
- BBTerm->getSuccessor(i)->removePredecessor(BB, true);
+ BasicBlock *Succ = BBTerm->getSuccessor(i);
+ Succ->removePredecessor(BB, true);
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
}
- DEBUG(dbgs() << " In block '" << BB->getName()
- << "' folding undef terminator: " << *BBTerm << '\n');
+ LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
+ << "' folding undef terminator: " << *BBTerm << '\n');
BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
BBTerm->eraseFromParent();
+ DDT->applyUpdates(Updates);
return true;
}
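The Updates vector above is the deferred-update pattern: CFG edge removals are recorded as descriptors and applied to the dominator tree in one batch via DDT->applyUpdates. A hedged structural sketch with stub types, not the real LLVM API:

    #include <cassert>
    #include <vector>

    struct BlockStub {}; // stands in for llvm::BasicBlock
    enum class UpdateKind { Insert, Delete };
    struct Update {
      UpdateKind Kind;
      BlockStub *From, *To;
    };

    // A real DeferredDominance would validate and batch-apply these against
    // the dominator tree; here we only check the shape of the batch.
    void applyUpdates(const std::vector<Update> &Updates) {
      for (const Update &U : Updates)
        assert(U.From && U.To && "an edge update names both endpoints");
    }

    int main() {
      BlockStub BB, Succ0, Succ1;
      std::vector<Update> Updates;
      Updates.reserve(2);
      Updates.push_back({UpdateKind::Delete, &BB, &Succ0});
      Updates.push_back({UpdateKind::Delete, &BB, &Succ1});
      applyUpdates(Updates); // one batch, like DDT->applyUpdates(Updates)
    }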
@@ -1050,10 +1096,11 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// terminator to an unconditional branch. This can occur due to threading in
// other blocks.
if (getKnownConstant(Condition, Preference)) {
- DEBUG(dbgs() << " In block '" << BB->getName()
- << "' folding terminator: " << *BB->getTerminator() << '\n');
+ LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
+ << "' folding terminator: " << *BB->getTerminator()
+ << '\n');
++NumFolds;
- ConstantFoldTerminator(BB, true);
+ ConstantFoldTerminator(BB, true, nullptr, DDT);
return true;
}
@@ -1080,13 +1127,18 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// threading is concerned.
assert(CondBr->isConditional() && "Threading on unconditional terminator");
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
LazyValueInfo::Tristate Ret =
LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
CondConst, CondBr);
if (Ret != LazyValueInfo::Unknown) {
unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
- CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true);
+ BasicBlock *ToRemoveSucc = CondBr->getSuccessor(ToRemove);
+ ToRemoveSucc->removePredecessor(BB, true);
BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
CondBr->eraseFromParent();
if (CondCmp->use_empty())
@@ -1104,6 +1156,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
ConstantInt::getFalse(CondCmp->getType());
ReplaceFoldableUses(CondCmp, CI);
}
+ DDT->deleteEdge(BB, ToRemoveSucc);
return true;
}
@@ -1125,8 +1178,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// TODO: There are other places where load PRE would be profitable, such as
// more complex comparisons.
- if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
- if (SimplifyPartiallyRedundantLoad(LI))
+ if (LoadInst *LoadI = dyn_cast<LoadInst>(SimplifyValue))
+ if (SimplifyPartiallyRedundantLoad(LoadI))
return true;
// Before threading, try to propagate profile data backwards:
@@ -1182,9 +1235,12 @@ bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
Optional<bool> Implication =
isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue);
if (Implication) {
- BI->getSuccessor(*Implication ? 1 : 0)->removePredecessor(BB);
- BranchInst::Create(BI->getSuccessor(*Implication ? 0 : 1), BI);
+ BasicBlock *KeepSucc = BI->getSuccessor(*Implication ? 0 : 1);
+ BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 1 : 0);
+ RemoveSucc->removePredecessor(BB);
+ BranchInst::Create(KeepSucc, BI);
BI->eraseFromParent();
+ DDT->deleteEdge(BB, RemoveSucc);
return true;
}
CurrentBB = CurrentPred;
@@ -1202,17 +1258,17 @@ static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
return false;
}
-/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
-/// load instruction, eliminate it by replacing it with a PHI node. This is an
-/// important optimization that encourages jump threading, and needs to be run
-/// interlaced with other jump threading tasks.
-bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
+/// SimplifyPartiallyRedundantLoad - If LoadI is an obviously partially
+/// redundant load instruction, eliminate it by replacing it with a PHI node.
+/// This is an important optimization that encourages jump threading, and needs
+/// to be run interlaced with other jump threading tasks.
+bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) {
// Don't hack volatile and ordered loads.
- if (!LI->isUnordered()) return false;
+ if (!LoadI->isUnordered()) return false;
// If the load is defined in a block with exactly one predecessor, it can't be
// partially redundant.
- BasicBlock *LoadBB = LI->getParent();
+ BasicBlock *LoadBB = LoadI->getParent();
if (LoadBB->getSinglePredecessor())
return false;
@@ -1222,7 +1278,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (LoadBB->isEHPad())
return false;
- Value *LoadedPtr = LI->getOperand(0);
+ Value *LoadedPtr = LoadI->getOperand(0);
// If the loaded operand is defined in the LoadBB and its not a phi,
// it can't be available in predecessors.
@@ -1231,26 +1287,27 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// Scan a few instructions up from the load, to see if it is obviously live at
// the entry to its block.
- BasicBlock::iterator BBIt(LI);
+ BasicBlock::iterator BBIt(LoadI);
bool IsLoadCSE;
if (Value *AvailableVal = FindAvailableLoadedValue(
- LI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
+ LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
// If the value of the load is locally available within the block, just use
// it. This frequently occurs for reg2mem'd allocas.
if (IsLoadCSE) {
- LoadInst *NLI = cast<LoadInst>(AvailableVal);
- combineMetadataForCSE(NLI, LI);
+ LoadInst *NLoadI = cast<LoadInst>(AvailableVal);
+ combineMetadataForCSE(NLoadI, LoadI);
};
// If the returned value is the load itself, replace with an undef. This can
// only happen in dead loops.
- if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
- if (AvailableVal->getType() != LI->getType())
- AvailableVal =
- CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI);
- LI->replaceAllUsesWith(AvailableVal);
- LI->eraseFromParent();
+ if (AvailableVal == LoadI)
+ AvailableVal = UndefValue::get(LoadI->getType());
+ if (AvailableVal->getType() != LoadI->getType())
+ AvailableVal = CastInst::CreateBitOrPointerCast(
+ AvailableVal, LoadI->getType(), "", LoadI);
+ LoadI->replaceAllUsesWith(AvailableVal);
+ LoadI->eraseFromParent();
return true;
}
@@ -1263,7 +1320,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// If all of the loads and stores that feed the value have the same AA tags,
// then we can propagate them onto any newly inserted loads.
AAMDNodes AATags;
- LI->getAAMetadata(AATags);
+ LoadI->getAAMetadata(AATags);
SmallPtrSet<BasicBlock*, 8> PredsScanned;
@@ -1285,16 +1342,17 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
Value *PredAvailable = nullptr;
 // NOTE: We don't CSE loads that are volatile or anything stronger than
// unordered, that should have been checked when we entered the function.
- assert(LI->isUnordered() && "Attempting to CSE volatile or atomic loads");
+ assert(LoadI->isUnordered() &&
+ "Attempting to CSE volatile or atomic loads");
// If this is a load on a phi pointer, phi-translate it and search
// for available load/store to the pointer in predecessors.
Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
PredAvailable = FindAvailablePtrLoadStore(
- Ptr, LI->getType(), LI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan,
- AA, &IsLoadCSE, &NumScanedInst);
+ Ptr, LoadI->getType(), LoadI->isAtomic(), PredBB, BBIt,
+ DefMaxInstsToScan, AA, &IsLoadCSE, &NumScanedInst);
// If PredBB has a single predecessor, continue scanning through the
- // single precessor.
+ // single predecessor.
BasicBlock *SinglePredBB = PredBB;
while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
NumScanedInst < DefMaxInstsToScan) {
@@ -1302,7 +1360,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (SinglePredBB) {
BBIt = SinglePredBB->end();
PredAvailable = FindAvailablePtrLoadStore(
- Ptr, LI->getType(), LI->isAtomic(), SinglePredBB, BBIt,
+ Ptr, LoadI->getType(), LoadI->isAtomic(), SinglePredBB, BBIt,
(DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
&NumScanedInst);
}
@@ -1334,15 +1392,15 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// If the value is unavailable in one of predecessors, we will end up
// inserting a new instruction into them. It is only valid if all the
- // instructions before LI are guaranteed to pass execution to its successor,
- // or if LI is safe to speculate.
+ // instructions before LoadI are guaranteed to pass execution to its
+ // successor, or if LoadI is safe to speculate.
// TODO: If this logic becomes more complex, and we will perform PRE insertion
// farther than to a predecessor, we need to reuse the code from GVN's PRE.
// It requires dominator tree analysis, so for this simple case it is
// overkill.
if (PredsScanned.size() != AvailablePreds.size() &&
- !isSafeToSpeculativelyExecute(LI))
- for (auto I = LoadBB->begin(); &*I != LI; ++I)
+ !isSafeToSpeculativelyExecute(LoadI))
+ for (auto I = LoadBB->begin(); &*I != LoadI; ++I)
if (!isGuaranteedToTransferExecutionToSuccessor(&*I))
return false;
@@ -1381,11 +1439,12 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (UnavailablePred) {
assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
"Can't handle critical edge here!");
- LoadInst *NewVal = new LoadInst(
- LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
- LI->getName() + ".pr", false, LI->getAlignment(), LI->getOrdering(),
- LI->getSyncScopeID(), UnavailablePred->getTerminator());
- NewVal->setDebugLoc(LI->getDebugLoc());
+ LoadInst *NewVal =
+ new LoadInst(LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
+ LoadI->getName() + ".pr", false, LoadI->getAlignment(),
+ LoadI->getOrdering(), LoadI->getSyncScopeID(),
+ UnavailablePred->getTerminator());
+ NewVal->setDebugLoc(LoadI->getDebugLoc());
if (AATags)
NewVal->setAAMetadata(AATags);
@@ -1398,10 +1457,10 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// Create a PHI node at the start of the block for the PRE'd load value.
pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
- PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "",
+ PHINode *PN = PHINode::Create(LoadI->getType(), std::distance(PB, PE), "",
&LoadBB->front());
- PN->takeName(LI);
- PN->setDebugLoc(LI->getDebugLoc());
+ PN->takeName(LoadI);
+ PN->setDebugLoc(LoadI->getDebugLoc());
// Insert new entries into the PHI for each predecessor. A single block may
// have multiple entries here.
@@ -1419,19 +1478,19 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// AvailablePreds vector as we go so that all of the PHI entries for this
// predecessor use the same bitcast.
Value *&PredV = I->second;
- if (PredV->getType() != LI->getType())
- PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "",
+ if (PredV->getType() != LoadI->getType())
+ PredV = CastInst::CreateBitOrPointerCast(PredV, LoadI->getType(), "",
P->getTerminator());
PN->addIncoming(PredV, I->first);
}
- for (LoadInst *PredLI : CSELoads) {
- combineMetadataForCSE(PredLI, LI);
+ for (LoadInst *PredLoadI : CSELoads) {
+ combineMetadataForCSE(PredLoadI, LoadI);
}
- LI->replaceAllUsesWith(PN);
- LI->eraseFromParent();
+ LoadI->replaceAllUsesWith(PN);
+ LoadI->eraseFromParent();
return true;
}
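The hunk above is the whole load-PRE shape of SimplifyPartiallyRedundantLoad: re-issue the load in the one predecessor where no value was found, then merge every predecessor's value with a PHI at the head of LoadBB. A minimal sketch of that shape, assuming the era's pre-opaque-pointer LoadInst constructor, and leaving out the PHI translation, AA-tag and CSE-metadata bookkeeping the real code does (the helper name is invented for illustration):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    static Value *mergePartiallyAvailableLoad(LoadInst *OrigLoad,
                                              BasicBlock *LoadBB,
                                              BasicBlock *AvailPred,
                                              Value *AvailVal,
                                              BasicBlock *MissingPred) {
      // Re-issue the load at the end of the predecessor that lacks the value.
      LoadInst *NewLoad = new LoadInst(OrigLoad->getPointerOperand(),
                                       OrigLoad->getName() + ".pr",
                                       MissingPred->getTerminator());
      // Merge the available value and the new load at the head of LoadBB.
      PHINode *PN = PHINode::Create(OrigLoad->getType(), 2, "", &LoadBB->front());
      PN->addIncoming(AvailVal, AvailPred);
      PN->addIncoming(NewLoad, MissingPred);
      OrigLoad->replaceAllUsesWith(PN);
      OrigLoad->eraseFromParent();
      return PN;
    }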
@@ -1516,12 +1575,12 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
assert(!PredValues.empty() &&
"ComputeValueKnownInPredecessors returned true with no values");
- DEBUG(dbgs() << "IN BB: " << *BB;
- for (const auto &PredValue : PredValues) {
- dbgs() << " BB '" << BB->getName() << "': FOUND condition = "
- << *PredValue.first
- << " for pred '" << PredValue.second->getName() << "'.\n";
- });
+ LLVM_DEBUG(dbgs() << "IN BB: " << *BB;
+ for (const auto &PredValue : PredValues) {
+ dbgs() << " BB '" << BB->getName()
+ << "': FOUND condition = " << *PredValue.first
+ << " for pred '" << PredValue.second->getName() << "'.\n";
+ });
// Decide what we want to thread through. Convert our list of known values to
// a list of known destinations for each pred. This also discards duplicate
@@ -1591,20 +1650,24 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
// not thread. By doing so, we do not need to duplicate the current block, nor
// do we miss potential opportunities when we don't/can't duplicate.
if (OnlyDest && OnlyDest != MultipleDestSentinel) {
- if (PredWithKnownDest ==
- (size_t)std::distance(pred_begin(BB), pred_end(BB))) {
+ if (PredWithKnownDest == (size_t)pred_size(BB)) {
bool SeenFirstBranchToOnlyDest = false;
+ std::vector<DominatorTree::UpdateType> Updates;
+ Updates.reserve(BB->getTerminator()->getNumSuccessors() - 1);
for (BasicBlock *SuccBB : successors(BB)) {
- if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest)
+ if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest) {
SeenFirstBranchToOnlyDest = true; // Don't modify the first branch.
- else
+ } else {
SuccBB->removePredecessor(BB, true); // This is unreachable successor.
+ Updates.push_back({DominatorTree::Delete, BB, SuccBB});
+ }
}
// Finally update the terminator.
TerminatorInst *Term = BB->getTerminator();
BranchInst::Create(OnlyDest, Term);
Term->eraseFromParent();
+ DDT->applyUpdates(Updates);
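This is the first of many places where the patch routes CFG edits through DDT (DeferredDominance) instead of touching the DominatorTree eagerly: each edit is recorded as an {Insert, Delete} edge update and the whole batch is applied at once. A distilled sketch of the same shape against a plain DominatorTree, assuming the applyUpdates API of LLVM in this era (DDT additionally deduplicates and defers the flush):

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/IR/CFG.h"
    #include "llvm/IR/Dominators.h"
    #include <vector>

    using namespace llvm;

    // Record one Delete per edge that becomes dead, then apply the batch.
    static void dropDeadSuccessorEdges(BasicBlock *BB, BasicBlock *OnlyDest,
                                       DominatorTree &DT) {
      std::vector<DominatorTree::UpdateType> Updates;
      SmallPtrSet<BasicBlock *, 8> Seen; // applyUpdates forbids duplicates
      for (BasicBlock *SuccBB : successors(BB))
        if (SuccBB != OnlyDest && Seen.insert(SuccBB).second)
          Updates.push_back({DominatorTree::Delete, BB, SuccBB});
      // ... rewrite BB's terminator to branch only to OnlyDest here ...
      DT.applyUpdates(Updates); // one incremental batch, no full recompute
    }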
// If the condition is now dead due to the removal of the old terminator,
// erase it.
@@ -1839,15 +1902,15 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
BasicBlock *SuccBB) {
// If threading to the same block as we come from, we would infinite loop.
if (SuccBB == BB) {
- DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
- << "' - would thread to self!\n");
+ LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
+ << "' - would thread to self!\n");
return false;
}
// If threading this would thread across a loop header, don't thread the edge.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
- DEBUG({
+ LLVM_DEBUG({
bool BBIsHeader = LoopHeaders.count(BB);
bool SuccIsHeader = LoopHeaders.count(SuccBB);
dbgs() << " Not threading across "
@@ -1861,8 +1924,8 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
unsigned JumpThreadCost =
getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (JumpThreadCost > BBDupThreshold) {
- DEBUG(dbgs() << " Not threading BB '" << BB->getName()
- << "' - Cost is too high: " << JumpThreadCost << "\n");
+ LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
+ << "' - Cost is too high: " << JumpThreadCost << "\n");
return false;
}
@@ -1871,17 +1934,21 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
if (PredBBs.size() == 1)
PredBB = PredBBs[0];
else {
- DEBUG(dbgs() << " Factoring out " << PredBBs.size()
- << " common predecessors.\n");
+ LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
}
// And finally, do it!
- DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '"
- << SuccBB->getName() << "' with cost: " << JumpThreadCost
- << ", across block:\n "
- << *BB << "\n");
-
+ LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName()
+ << "' to '" << SuccBB->getName()
+ << "' with cost: " << JumpThreadCost
+ << ", across block:\n " << *BB << "\n");
+
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
LVI->threadEdge(PredBB, BB, SuccBB);
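The pending()/disableDT() pairing just above is the contract this patch adds between the deferred updater and LazyValueInfo: while updates are queued, the real tree is stale, so LVI must not consult it. The same guard recurs before the LVI queries in TryToUnfoldSelect below; annotated:

    // Annotated form of the guard above (DDT and LVI as in this patch):
    if (DDT->pending())  // CFG edits queued but not yet flushed to the tree
      LVI->disableDT();  // answer without the (stale) dominator tree
    else
      LVI->enableDT();   // tree is current; dominance queries are safe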
// We are going to have to map operands from the original BB block to the new
@@ -1931,15 +1998,32 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
// PHI nodes for NewBB now.
AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping);
+ // Update the terminator of PredBB to jump to NewBB instead of BB. This
+ // eliminates predecessors from BB, which requires us to simplify any PHI
+ // nodes in BB.
+ TerminatorInst *PredTerm = PredBB->getTerminator();
+ for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
+ if (PredTerm->getSuccessor(i) == BB) {
+ BB->removePredecessor(PredBB, true);
+ PredTerm->setSuccessor(i, NewBB);
+ }
+
+ // Enqueue required DT updates.
+ DDT->applyUpdates({{DominatorTree::Insert, NewBB, SuccBB},
+ {DominatorTree::Insert, PredBB, NewBB},
+ {DominatorTree::Delete, PredBB, BB}});
+
// If there were values defined in BB that are used outside the block, then we
// now have to update all uses of the value to use either the original value,
// the cloned value, or some PHI derived value. This can require arbitrary
// PHI insertion, which we are prepared to do; clean these up now.
SSAUpdater SSAUpdate;
SmallVector<Use*, 16> UsesToRename;
+
for (Instruction &I : *BB) {
- // Scan all uses of this instruction to see if it is used outside of its
- // block, and if so, record them in UsesToRename.
+ // Scan all uses of this instruction to see if they are no longer
+ // dominated by the previous def and if so, record them in UsesToRename.
+ // Also, skip phi operands from PredBB - we'll remove them anyway.
for (Use &U : I.uses()) {
Instruction *User = cast<Instruction>(U.getUser());
if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
@@ -1954,8 +2038,7 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
// If there are no uses outside the block, we're done with this instruction.
if (UsesToRename.empty())
continue;
-
- DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
// We found a use of I outside of BB. Rename all uses of I that are outside
// its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
@@ -1966,19 +2049,9 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
while (!UsesToRename.empty())
SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
}
- // Ok, NewBB is good to go. Update the terminator of PredBB to jump to
- // NewBB instead of BB. This eliminates predecessors from BB, which requires
- // us to simplify any PHI nodes in BB.
- TerminatorInst *PredTerm = PredBB->getTerminator();
- for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
- if (PredTerm->getSuccessor(i) == BB) {
- BB->removePredecessor(PredBB, true);
- PredTerm->setSuccessor(i, NewBB);
- }
-
// At this point, the IR is fully up to date and consistent. Do a quick scan
// over the new instructions and zap any that are constants or dead. This
// frequently happens because of phi translation.
@@ -1998,20 +2071,42 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB,
ArrayRef<BasicBlock *> Preds,
const char *Suffix) {
+ SmallVector<BasicBlock *, 2> NewBBs;
+
// Collect the frequencies of all predecessors of BB, which will be used to
- // update the edge weight on BB->SuccBB.
- BlockFrequency PredBBFreq(0);
+ // update the edge weight of the result of splitting predecessors.
+ DenseMap<BasicBlock *, BlockFrequency> FreqMap;
if (HasProfileData)
for (auto Pred : Preds)
- PredBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB);
+ FreqMap.insert(std::make_pair(
+ Pred, BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB)));
+
+ // In the case when BB is a LandingPad block we create 2 new predecessors
+ // instead of just one.
+ if (BB->isLandingPad()) {
+ std::string NewName = std::string(Suffix) + ".split-lp";
+ SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs);
+ } else {
+ NewBBs.push_back(SplitBlockPredecessors(BB, Preds, Suffix));
+ }
- BasicBlock *PredBB = SplitBlockPredecessors(BB, Preds, Suffix);
+ std::vector<DominatorTree::UpdateType> Updates;
+ Updates.reserve((2 * Preds.size()) + NewBBs.size());
+ for (auto NewBB : NewBBs) {
+ BlockFrequency NewBBFreq(0);
+ Updates.push_back({DominatorTree::Insert, NewBB, BB});
+ for (auto Pred : predecessors(NewBB)) {
+ Updates.push_back({DominatorTree::Delete, Pred, BB});
+ Updates.push_back({DominatorTree::Insert, Pred, NewBB});
+ if (HasProfileData) // Update frequencies between Pred -> NewBB.
+ NewBBFreq += FreqMap.lookup(Pred);
+ }
+ if (HasProfileData) // Apply the summed frequency to NewBB.
+ BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+ }
- // Set the block frequency of the newly created PredBB, which is the sum of
- // frequencies of Preds.
- if (HasProfileData)
- BFI->setBlockFreq(PredBB, PredBBFreq.getFrequency());
- return PredBB;
+ DDT->applyUpdates(Updates);
+ return NewBBs[0];
}
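The rewritten SplitBlockPreds sums, for each new block, the frequencies of the predecessors routed into it, where each contribution is the predecessor's frequency scaled by the probability of its original edge into BB. A sketch of that bookkeeping; note the patch snapshots these products into FreqMap before splitting, since BPI is not updated for the new blocks, while the helper below queries afterwards only for brevity:

    #include "llvm/Analysis/BlockFrequencyInfo.h"
    #include "llvm/Analysis/BranchProbabilityInfo.h"
    #include "llvm/IR/CFG.h"

    using namespace llvm;

    static void setSplitBlockFreq(BasicBlock *NewBB, BasicBlock *BB,
                                  BlockFrequencyInfo &BFI,
                                  BranchProbabilityInfo &BPI) {
      BlockFrequency NewBBFreq(0);
      // freq(NewBB) = sum over its preds of freq(Pred) * prob(Pred -> BB).
      for (BasicBlock *Pred : predecessors(NewBB))
        NewBBFreq += BFI.getBlockFreq(Pred) * BPI.getEdgeProbability(Pred, BB);
      BFI.setBlockFreq(NewBB, NewBBFreq.getFrequency());
    }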
bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
@@ -2140,42 +2235,49 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
// cause us to transform this into an irreducible loop, don't do this.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB)) {
- DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
- << "' into predecessor block '" << PredBBs[0]->getName()
- << "' - it might create an irreducible loop!\n");
+ LLVM_DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
+ << "' into predecessor block '" << PredBBs[0]->getName()
+ << "' - it might create an irreducible loop!\n");
return false;
}
unsigned DuplicationCost =
getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (DuplicationCost > BBDupThreshold) {
- DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
- << "' - Cost is too high: " << DuplicationCost << "\n");
+ LLVM_DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
+ << "' - Cost is too high: " << DuplicationCost << "\n");
return false;
}
// And finally, do it! Start by factoring the predecessors if needed.
+ std::vector<DominatorTree::UpdateType> Updates;
BasicBlock *PredBB;
if (PredBBs.size() == 1)
PredBB = PredBBs[0];
else {
- DEBUG(dbgs() << " Factoring out " << PredBBs.size()
- << " common predecessors.\n");
+ LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
}
+ Updates.push_back({DominatorTree::Delete, PredBB, BB});
// Okay, we decided to do this! Clone all the instructions in BB onto the end
// of PredBB.
- DEBUG(dbgs() << " Duplicating block '" << BB->getName() << "' into end of '"
- << PredBB->getName() << "' to eliminate branch on phi. Cost: "
- << DuplicationCost << " block is:" << *BB << "\n");
+ LLVM_DEBUG(dbgs() << " Duplicating block '" << BB->getName()
+ << "' into end of '" << PredBB->getName()
+ << "' to eliminate branch on phi. Cost: "
+ << DuplicationCost << " block is:" << *BB << "\n");
// Unless PredBB ends with an unconditional branch, split the edge so that we
// can just clone the bits from BB into the end of the new PredBB.
BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
- PredBB = SplitEdge(PredBB, BB);
+ BasicBlock *OldPredBB = PredBB;
+ PredBB = SplitEdge(OldPredBB, BB);
+ Updates.push_back({DominatorTree::Insert, OldPredBB, PredBB});
+ Updates.push_back({DominatorTree::Insert, PredBB, BB});
+ Updates.push_back({DominatorTree::Delete, OldPredBB, BB});
OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
}
@@ -2217,6 +2319,10 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
// Otherwise, insert the new instruction into the block.
New->setName(BI->getName());
PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
+ // Update Dominance from simplified New instruction operands.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (BasicBlock *SuccBB = dyn_cast<BasicBlock>(New->getOperand(i)))
+ Updates.push_back({DominatorTree::Insert, PredBB, SuccBB});
}
}
@@ -2252,7 +2358,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
if (UsesToRename.empty())
continue;
- DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
// We found a use of I outside of BB. Rename all uses of I that are outside
// its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
@@ -2263,7 +2369,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
while (!UsesToRename.empty())
SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
}
// PredBB no longer jumps to BB, remove entries in the PHI node for the edge
@@ -2272,6 +2378,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
// Remove the unconditional branch at the end of the PredBB block.
OldPredBranch->eraseFromParent();
+ DDT->applyUpdates(Updates);
++NumDupes;
return true;
@@ -2314,6 +2421,10 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
// Now check if one of the select values would allow us to constant fold the
// terminator in BB. We don't do the transform if both sides fold, those
// cases will be threaded in any case.
+ if (DDT->pending())
+ LVI->disableDT();
+ else
+ LVI->enableDT();
LazyValueInfo::Tristate LHSFolds =
LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
CondRHS, Pred, BB, CondCmp);
@@ -2344,6 +2455,8 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
// The select is now dead.
SI->eraseFromParent();
+ DDT->applyUpdates({{DominatorTree::Insert, NewBB, BB},
+ {DominatorTree::Insert, Pred, NewBB}});
// Update any other PHI nodes in BB.
for (BasicBlock::iterator BI = BB->begin();
PHINode *Phi = dyn_cast<PHINode>(BI); ++BI)
@@ -2409,7 +2522,7 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
break;
}
} else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) {
- // Look for a Select in BB that uses PN as condtion.
+ // Look for a Select in BB that uses PN as condition.
if (isUnfoldCandidate(SelectI, U.get())) {
SI = SelectI;
break;
@@ -2422,11 +2535,25 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
// Expand the select.
TerminatorInst *Term =
SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
+ BasicBlock *SplitBB = SI->getParent();
+ BasicBlock *NewBB = Term->getParent();
PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
NewPN->addIncoming(SI->getFalseValue(), BB);
SI->replaceAllUsesWith(NewPN);
SI->eraseFromParent();
+ // NewBB and SplitBB are newly created blocks which require insertion.
+ std::vector<DominatorTree::UpdateType> Updates;
+ Updates.reserve((2 * SplitBB->getTerminator()->getNumSuccessors()) + 3);
+ Updates.push_back({DominatorTree::Insert, BB, SplitBB});
+ Updates.push_back({DominatorTree::Insert, BB, NewBB});
+ Updates.push_back({DominatorTree::Insert, NewBB, SplitBB});
+ // BB's successors were moved to SplitBB, update DDT accordingly.
+ for (auto *Succ : successors(SplitBB)) {
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
+ Updates.push_back({DominatorTree::Insert, SplitBB, Succ});
+ }
+ DDT->applyUpdates(Updates);
return true;
}
return false;
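The expansion above turns a select into explicit control flow so jump threading has a real edge to work with. The same steps, minus the DDT bookkeeping, with SplitBlockAndInsertIfThen returning TerminatorInst* as it does in this era:

    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/BasicBlockUtils.h"

    using namespace llvm;

    static PHINode *unfoldSelect(SelectInst *SI) {
      BasicBlock *Head = SI->getParent(); // keeps the code before the select
      TerminatorInst *Term =
          SplitBlockAndInsertIfThen(SI->getCondition(), SI,
                                    /*Unreachable=*/false);
      // Now Head branches on the condition either to Term's "then" block or
      // directly to the tail block that still contains SI.
      PHINode *PN = PHINode::Create(SI->getType(), 2, "", SI);
      PN->addIncoming(SI->getTrueValue(), Term->getParent()); // then-edge
      PN->addIncoming(SI->getFalseValue(), Head);             // fallthrough
      SI->replaceAllUsesWith(PN);
      SI->eraseFromParent();
      return PN;
    }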
@@ -2513,8 +2640,8 @@ bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
if (!TrueDestIsSafe && !FalseDestIsSafe)
return false;
- BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
- BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
+ BasicBlock *PredUnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
+ BasicBlock *PredGuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
ValueToValueMapTy UnguardedMapping, GuardedMapping;
Instruction *AfterGuard = Guard->getNextNode();
@@ -2523,18 +2650,29 @@ bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
return false;
// Duplicate all instructions before the guard and the guard itself to the
// branch where implication is not proved.
- GuardedBlock = DuplicateInstructionsInSplitBetween(
- BB, GuardedBlock, AfterGuard, GuardedMapping);
+ BasicBlock *GuardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, PredGuardedBlock, AfterGuard, GuardedMapping);
assert(GuardedBlock && "Could not create the guarded block?");
// Duplicate all instructions before the guard in the unguarded branch.
// Since we have successfully duplicated the guarded block and this block
// has fewer instructions, we expect it to succeed.
- UnguardedBlock = DuplicateInstructionsInSplitBetween(BB, UnguardedBlock,
- Guard, UnguardedMapping);
+ BasicBlock *UnguardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, PredUnguardedBlock, Guard, UnguardedMapping);
assert(UnguardedBlock && "Could not create the unguarded block?");
- DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
- << GuardedBlock->getName() << "\n");
-
+ LLVM_DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
+ << GuardedBlock->getName() << "\n");
+ // DuplicateInstructionsInSplitBetween inserts a new block "BB.split" between
+ // PredBB and BB. We need to perform two inserts and one delete for each of
+ // the above calls to update Dominators.
+ DDT->applyUpdates(
+ {// Guarded block split.
+ {DominatorTree::Delete, PredGuardedBlock, BB},
+ {DominatorTree::Insert, PredGuardedBlock, GuardedBlock},
+ {DominatorTree::Insert, GuardedBlock, BB},
+ // Unguarded block split.
+ {DominatorTree::Delete, PredUnguardedBlock, BB},
+ {DominatorTree::Insert, PredUnguardedBlock, UnguardedBlock},
+ {DominatorTree::Insert, UnguardedBlock, BB}});
// Some instructions before the guard may still have uses. For them, we need
// to create Phi nodes merging their copies in both guarded and unguarded
// branches. Those instructions that have no uses can be just removed.
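For the instructions that still have uses, the merge described in this comment is a two-way PHI over the guarded and unguarded copies recorded in the two ValueToValueMapTy maps. A sketch with an invented helper name; the real code also erases the original instruction and handles the no-use case:

    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/ValueMapper.h"

    using namespace llvm;

    static void mergeDuplicatedDef(Instruction *I, BasicBlock *MergeBB,
                                   BasicBlock *GuardedBlock,
                                   BasicBlock *UnguardedBlock,
                                   ValueToValueMapTy &GuardedMapping,
                                   ValueToValueMapTy &UnguardedMapping) {
      PHINode *PN = PHINode::Create(I->getType(), 2, I->getName() + ".merge",
                                    &MergeBB->front());
      PN->addIncoming(GuardedMapping[I], GuardedBlock);     // guarded copy
      PN->addIncoming(UnguardedMapping[I], UnguardedBlock); // unguarded copy
      I->replaceAllUsesWith(PN);
    }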
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
index 946474fef062..ff66632f0391 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -47,6 +47,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -64,7 +65,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
@@ -170,7 +170,8 @@ struct LegacyLICMPass : public LoopPass {
/// loop preheaders be inserted into the CFG...
///
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
if (EnableMSSALoopDependency)
AU.addRequired<MemorySSAWrapperPass>();
@@ -220,7 +221,10 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
return PreservedAnalyses::all();
auto PA = getLoopPassPreservedAnalyses();
- PA.preserveSet<CFGAnalyses>();
+
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+
return PA;
}
@@ -392,7 +396,8 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
// If the instruction is dead, we would try to sink it because it isn't
// used in the loop, instead, just delete it.
if (isInstructionTriviallyDead(&I, TLI)) {
- DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
+ LLVM_DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
+ salvageDebugInfo(I);
++II;
CurAST->deleteValue(&I);
I.eraseFromParent();
@@ -445,101 +450,78 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
BasicBlock *BB = DTN->getBlock();
// Only need to process the contents of this block if it is not part of a
// subloop (which would already have been processed).
- if (!inSubLoop(BB, CurLoop, LI))
- for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
- Instruction &I = *II++;
- // Try constant folding this instruction. If all the operands are
- // constants, it is technically hoistable, but it would be better to
- // just fold it.
- if (Constant *C = ConstantFoldInstruction(
- &I, I.getModule()->getDataLayout(), TLI)) {
- DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n');
- CurAST->copyValue(&I, C);
- I.replaceAllUsesWith(C);
- if (isInstructionTriviallyDead(&I, TLI)) {
- CurAST->deleteValue(&I);
- I.eraseFromParent();
- }
- Changed = true;
- continue;
- }
+ if (inSubLoop(BB, CurLoop, LI))
+ continue;
- // Attempt to remove floating point division out of the loop by
- // converting it to a reciprocal multiplication.
- if (I.getOpcode() == Instruction::FDiv &&
- CurLoop->isLoopInvariant(I.getOperand(1)) &&
- I.hasAllowReciprocal()) {
- auto Divisor = I.getOperand(1);
- auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
- auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
- ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
- ReciprocalDivisor->insertBefore(&I);
-
- auto Product =
- BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
- Product->setFastMathFlags(I.getFastMathFlags());
- Product->insertAfter(&I);
- I.replaceAllUsesWith(Product);
+ // Keep track of whether the prefix of instructions visited so far are such
+ // that the next instruction visited is guaranteed to execute if the loop
+ // is entered.
+ bool IsMustExecute = CurLoop->getHeader() == BB;
+
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
+ Instruction &I = *II++;
+ // Try constant folding this instruction. If all the operands are
+ // constants, it is technically hoistable, but it would be better to
+ // just fold it.
+ if (Constant *C = ConstantFoldInstruction(
+ &I, I.getModule()->getDataLayout(), TLI)) {
+ LLVM_DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C
+ << '\n');
+ CurAST->copyValue(&I, C);
+ I.replaceAllUsesWith(C);
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ CurAST->deleteValue(&I);
I.eraseFromParent();
-
- hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
- Changed = true;
- continue;
}
+ Changed = true;
+ continue;
+ }
+
+ // Try hoisting the instruction out to the preheader. We can only do
+ // this if all of the operands of the instruction are loop invariant and
+ // if it is safe to hoist the instruction.
+ //
+ if (CurLoop->hasLoopInvariantOperands(&I) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) &&
+ (IsMustExecute ||
+ isSafeToExecuteUnconditionally(
+ I, DT, CurLoop, SafetyInfo, ORE,
+ CurLoop->getLoopPreheader()->getTerminator()))) {
+ Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE);
+ continue;
+ }
+
+ // Attempt to remove floating point division out of the loop by
+ // converting it to a reciprocal multiplication.
+ if (I.getOpcode() == Instruction::FDiv &&
+ CurLoop->isLoopInvariant(I.getOperand(1)) &&
+ I.hasAllowReciprocal()) {
+ auto Divisor = I.getOperand(1);
+ auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
+ auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
+ ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+ ReciprocalDivisor->insertBefore(&I);
+
+ auto Product =
+ BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
+ Product->setFastMathFlags(I.getFastMathFlags());
+ Product->insertAfter(&I);
+ I.replaceAllUsesWith(Product);
+ I.eraseFromParent();
- // Try hoisting the instruction out to the preheader. We can only do
- // this if all of the operands of the instruction are loop invariant and
- // if it is safe to hoist the instruction.
- //
- if (CurLoop->hasLoopInvariantOperands(&I) &&
- canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) &&
- isSafeToExecuteUnconditionally(
- I, DT, CurLoop, SafetyInfo, ORE,
- CurLoop->getLoopPreheader()->getTerminator()))
- Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE);
+ hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
+ Changed = true;
+ continue;
}
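The reciprocal block keeps its old logic under the new layout: with the arcp fast-math flag and a loop-invariant divisor, x/d becomes x * (1.0/d), leaving a loop-invariant division to hoist. A sketch using IRBuilder; note the pass constructs the FDiv directly rather than through IRBuilder, which could constant-fold and leave no instruction to hoist, a caveat this brevity-first sketch inherits:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/IRBuilder.h"

    using namespace llvm;

    // Rewrite x/d as x * (1.0/d); the caller hoists the reciprocal.
    static Value *rewriteFDivAsReciprocal(BinaryOperator *FDiv) {
      IRBuilder<> B(FDiv);
      B.setFastMathFlags(FDiv->getFastMathFlags()); // propagate arcp etc.
      Value *Recip = B.CreateFDiv(ConstantFP::get(FDiv->getType(), 1.0),
                                  FDiv->getOperand(1));
      Value *Product = B.CreateFMul(FDiv->getOperand(0), Recip);
      FDiv->replaceAllUsesWith(Product);
      FDiv->eraseFromParent();
      return Recip; // loop-invariant; hoistable to the preheader
    }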
+
+ if (IsMustExecute)
+ IsMustExecute = isGuaranteedToTransferExecutionToSuccessor(&I);
+ }
}
return Changed;
}
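The new IsMustExecute flag implements a cheap guaranteed-to-execute test: an instruction in the header is known to run whenever the loop is entered, provided everything before it transfers control to its successor (nothing throws or diverges). A standalone sketch of the same predicate:

    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/BasicBlock.h"

    using namespace llvm;

    static bool prefixMustExecute(BasicBlock *Header, Instruction *Upto) {
      for (Instruction &I : *Header) {
        if (&I == Upto)
          return true; // every earlier instruction falls through to here
        if (!isGuaranteedToTransferExecutionToSuccessor(&I))
          return false;
      }
      return false; // Upto is not in the header
    }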
-/// Computes loop safety information, checks loop body & header
-/// for the possibility of may throw exception.
-///
-void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
- assert(CurLoop != nullptr && "CurLoop cant be null");
- BasicBlock *Header = CurLoop->getHeader();
- // Setting default safety values.
- SafetyInfo->MayThrow = false;
- SafetyInfo->HeaderMayThrow = false;
- // Iterate over header and compute safety info.
- for (BasicBlock::iterator I = Header->begin(), E = Header->end();
- (I != E) && !SafetyInfo->HeaderMayThrow; ++I)
- SafetyInfo->HeaderMayThrow |=
- !isGuaranteedToTransferExecutionToSuccessor(&*I);
-
- SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
- // Iterate over loop instructions and compute safety info.
- // Skip header as it has been computed and stored in HeaderMayThrow.
- // The first block in loopinfo.Blocks is guaranteed to be the header.
- assert(Header == *CurLoop->getBlocks().begin() &&
- "First block must be header");
- for (Loop::block_iterator BB = std::next(CurLoop->block_begin()),
- BBE = CurLoop->block_end();
- (BB != BBE) && !SafetyInfo->MayThrow; ++BB)
- for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
- (I != E) && !SafetyInfo->MayThrow; ++I)
- SafetyInfo->MayThrow |= !isGuaranteedToTransferExecutionToSuccessor(&*I);
-
- // Compute funclet colors if we might sink/hoist in a function with a funclet
- // personality routine.
- Function *Fn = CurLoop->getHeader()->getParent();
- if (Fn->hasPersonalityFn())
- if (Constant *PersonalityFn = Fn->getPersonalityFn())
- if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
- SafetyInfo->BlockColors = colorEHFunclets(*Fn);
-}
-
// Return true if LI is invariant within scope of the loop. LI is invariant if
// CurLoop is dominated by an invariant.start representing the same memory
// location and size as the memory location LI loads from, and also the
@@ -708,7 +690,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
/// This is true when all incoming values are that instruction.
/// This pattern occurs most often with LCSSA PHI nodes.
///
-static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) {
+static bool isTriviallyReplaceablePHI(const PHINode &PN, const Instruction &I) {
for (const Value *IncValue : PN.incoming_values())
if (IncValue != &I)
return false;
@@ -838,12 +820,12 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
return New;
}
-static Instruction *sinkThroughTriviallyReplacablePHI(
+static Instruction *sinkThroughTriviallyReplaceablePHI(
PHINode *TPN, Instruction *I, LoopInfo *LI,
SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop) {
- assert(isTriviallyReplacablePHI(*TPN, *I) &&
- "Expect only trivially replacalbe PHI");
+ assert(isTriviallyReplaceablePHI(*TPN, *I) &&
+ "Expect only trivially replaceable PHI");
BasicBlock *ExitBlock = TPN->getParent();
Instruction *New;
auto It = SunkCopies.find(ExitBlock);
@@ -886,7 +868,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block.");
// Split predecessors of the loop exit so that instructions in the loop are
- // exposed to exit blocks through trivially replacable PHIs while keeping the
+ // exposed to exit blocks through trivially replaceable PHIs while keeping the
// loop in the canonical form where each predecessor of each exit block should
// be contained within the loop. For example, this will convert the loop below
// from
@@ -898,7 +880,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
// %v2 =
// br %LE, %LB1
// LE:
- // %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replacable
+ // %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replaceable
//
// to
//
@@ -909,10 +891,10 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
// %v2 =
// br %LE.split2, %LB1
// LE.split:
- // %p1 = phi [%v1, %LB1] <-- trivially replacable
+ // %p1 = phi [%v1, %LB1] <-- trivially replaceable
// br %LE
// LE.split2:
- // %p2 = phi [%v2, %LB2] <-- trivially replacable
+ // %p2 = phi [%v2, %LB2] <-- trivially replaceable
// br %LE
// LE:
// %p = phi [%p1, %LE.split], [%p2, %LE.split2]
@@ -929,8 +911,14 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
// Since we do not allow splitting EH-block with BlockColors in
// canSplitPredecessors(), we can simply assign predecessor's color to
// the new block.
- if (!BlockColors.empty())
- BlockColors[NewPred] = BlockColors[PredBB];
+ if (!BlockColors.empty()) {
+ // Grab a reference to the ColorVector to be inserted before getting the
+ // reference to the vector we are copying because inserting the new
+ // element in BlockColors might cause the map to be reallocated.
+ ColorVector &ColorsForNewBlock = BlockColors[NewPred];
+ ColorVector &ColorsForOldBlock = BlockColors[PredBB];
+ ColorsForNewBlock = ColorsForOldBlock;
+ }
}
PredBBs.remove(PredBB);
}
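The hunk above fixes a classic DenseMap aliasing bug: operator[] on the left-hand side may grow the table and invalidate the reference produced for the right-hand side. A self-contained illustration of the bug and the patch's fix of forcing the inserting lookup first:

    #include "llvm/ADT/DenseMap.h"

    // 'Old' is assumed to be in the map already (as PredBB is in BlockColors).
    void copyEntry(llvm::DenseMap<int, int> &Map, int Old, int New) {
      // Buggy shape:  Map[New] = Map[Old];
      // Map[New] may insert and rehash, stranding the reference that
      // Map[Old] produced for the right-hand side, whichever order the
      // two lookups are evaluated in.
      int &NewSlot = Map[New]; // do the possibly-reallocating insert first
      NewSlot = Map[Old];      // plain lookup now; no further reallocation
    }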
@@ -944,7 +932,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
const Loop *CurLoop, LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE, bool FreeInLoop) {
- DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
<< "sinking " << ore::NV("Inst", &I);
@@ -987,14 +975,14 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
}
VisitedUsers.insert(PN);
- if (isTriviallyReplacablePHI(*PN, I))
+ if (isTriviallyReplaceablePHI(*PN, I))
continue;
if (!canSplitPredecessors(PN, SafetyInfo))
return Changed;
// Split predecessors of the PHI so that we can make users trivially
- // replacable.
+ // replaceable.
splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop, SafetyInfo);
// Should rebuild the iterators, as they may be invalidated by
@@ -1029,9 +1017,9 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
PHINode *PN = cast<PHINode>(User);
assert(ExitBlockSet.count(PN->getParent()) &&
"The LCSSA PHI is not in an exit block!");
- // The PHI must be trivially replacable.
- Instruction *New = sinkThroughTriviallyReplacablePHI(PN, &I, LI, SunkCopies,
- SafetyInfo, CurLoop);
+ // The PHI must be trivially replaceable.
+ Instruction *New = sinkThroughTriviallyReplaceablePHI(PN, &I, LI, SunkCopies,
+ SafetyInfo, CurLoop);
PN->replaceAllUsesWith(New);
PN->eraseFromParent();
Changed = true;
@@ -1046,8 +1034,8 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE) {
auto *Preheader = CurLoop->getLoopPreheader();
- DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
- << "\n");
+ LLVM_DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
+ << "\n");
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) << "hoisting "
<< ore::NV("Inst", &I);
@@ -1236,7 +1224,7 @@ bool llvm::promoteLoopAccessesToScalars(
Value *SomePtr = *PointerMustAliases.begin();
BasicBlock *Preheader = CurLoop->getLoopPreheader();
- // It isn't safe to promote a load/store from the loop if the load/store is
+ // It is not safe to promote a load/store from the loop if the load/store is
// conditional. For example, turning:
//
// for () { if (c) *P += 1; }
@@ -1365,7 +1353,7 @@ bool llvm::promoteLoopAccessesToScalars(
// If a store dominates all exit blocks, it is safe to sink.
// As explained above, if an exit block was executed, a dominating
- // store must have been been executed at least once, so we are not
+ // store must have been executed at least once, so we are not
// introducing stores on paths that did not have them.
// Note that this only looks at explicit exit blocks. If we ever
// start sinking stores into unwind edges (see above), this will break.
@@ -1427,8 +1415,8 @@ bool llvm::promoteLoopAccessesToScalars(
return false;
// Otherwise, this is safe to promote, lets do it!
- DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
- << '\n');
+ LLVM_DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
+ << '\n');
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar",
LoopUses[0])
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index 7f7c6de76450..3b41b5d96c86 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -71,7 +71,7 @@ public:
private:
bool runOnLoop(Loop *L);
- /// \brief Check if the the stride of the accesses is large enough to
+ /// Check if the stride of the accesses is large enough to
/// warrant a prefetch.
bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
@@ -244,9 +244,9 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
if (ItersAhead > getMaxPrefetchIterationsAhead())
return MadeChange;
- DEBUG(dbgs() << "Prefetching " << ItersAhead
- << " iterations ahead (loop size: " << LoopSize << ") in "
- << L->getHeader()->getParent()->getName() << ": " << *L);
+ LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
+ << " iterations ahead (loop size: " << LoopSize << ") in "
+ << L->getHeader()->getParent()->getName() << ": " << *L);
SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads;
for (const auto BB : L->blocks()) {
@@ -275,7 +275,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
if (!LSCEVAddRec)
continue;
- // Check if the the stride of the accesses is large enough to warrant a
+ // Check if the stride of the accesses is large enough to warrant a
// prefetch.
if (!isStrideLargeEnough(LSCEVAddRec))
continue;
@@ -320,8 +320,8 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
++NumPrefetches;
- DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
- << "\n");
+ LLVM_DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
+ << "\n");
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI)
<< "prefetched memory access";
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
index 15cd1086f209..d412025d7e94 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -142,14 +142,15 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
// of trouble.
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader || !L->hasDedicatedExits()) {
- DEBUG(dbgs()
- << "Deletion requires Loop with preheader and dedicated exits.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "Deletion requires Loop with preheader and dedicated exits.\n");
return LoopDeletionResult::Unmodified;
}
// We can't remove loops that contain subloops. If the subloops were dead,
// they would already have been removed in earlier executions of this pass.
if (L->begin() != L->end()) {
- DEBUG(dbgs() << "Loop contains subloops.\n");
+ LLVM_DEBUG(dbgs() << "Loop contains subloops.\n");
return LoopDeletionResult::Unmodified;
}
@@ -157,7 +158,7 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
BasicBlock *ExitBlock = L->getUniqueExitBlock();
if (ExitBlock && isLoopNeverExecuted(L)) {
- DEBUG(dbgs() << "Loop is proven to never execute, delete it!");
+ LLVM_DEBUG(dbgs() << "Loop is proven to never execute, delete it!");
// Set incoming value to undef for phi nodes in the exit block.
for (PHINode &P : ExitBlock->phis()) {
std::fill(P.incoming_values().begin(), P.incoming_values().end(),
@@ -178,13 +179,13 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
// block will be branched to, or trying to preserve the branching logic in
// a loop invariant manner.
if (!ExitBlock) {
- DEBUG(dbgs() << "Deletion requires single exit block\n");
+ LLVM_DEBUG(dbgs() << "Deletion requires single exit block\n");
return LoopDeletionResult::Unmodified;
}
// Finally, we have to check that the loop really is dead.
bool Changed = false;
if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) {
- DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n");
+ LLVM_DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n");
return Changed ? LoopDeletionResult::Modified
: LoopDeletionResult::Unmodified;
}
@@ -193,12 +194,12 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
// They could be infinite, in which case we'd be changing program behavior.
const SCEV *S = SE.getMaxBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(S)) {
- DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n");
+ LLVM_DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n");
return Changed ? LoopDeletionResult::Modified
: LoopDeletionResult::Unmodified;
}
- DEBUG(dbgs() << "Loop is invariant, delete it!");
+ LLVM_DEBUG(dbgs() << "Loop is invariant, delete it!");
deleteDeadLoop(L, &DT, &SE, &LI);
++NumDeleted;
@@ -209,8 +210,8 @@ PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &Updater) {
- DEBUG(dbgs() << "Analyzing Loop for deletion: ");
- DEBUG(L.dump());
+ LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ LLVM_DEBUG(L.dump());
std::string LoopName = L.getName();
auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI);
if (Result == LoopDeletionResult::Unmodified)
@@ -255,8 +256,8 @@ bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DEBUG(dbgs() << "Analyzing Loop for deletion: ");
- DEBUG(L->dump());
+ LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ LLVM_DEBUG(L->dump());
LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI);
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
index 0d7e3db901cb..06083a4f5086 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -111,7 +111,7 @@ STATISTIC(NumLoopsDistributed, "Number of loops distributed");
namespace {
-/// \brief Maintains the set of instructions of the loop for a partition before
+/// Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
class InstPartition {
using InstructionSet = SmallPtrSet<Instruction *, 8>;
@@ -122,20 +122,20 @@ public:
Set.insert(I);
}
- /// \brief Returns whether this partition contains a dependence cycle.
+ /// Returns whether this partition contains a dependence cycle.
bool hasDepCycle() const { return DepCycle; }
- /// \brief Adds an instruction to this partition.
+ /// Adds an instruction to this partition.
void add(Instruction *I) { Set.insert(I); }
- /// \brief Collection accessors.
+ /// Collection accessors.
InstructionSet::iterator begin() { return Set.begin(); }
InstructionSet::iterator end() { return Set.end(); }
InstructionSet::const_iterator begin() const { return Set.begin(); }
InstructionSet::const_iterator end() const { return Set.end(); }
bool empty() const { return Set.empty(); }
- /// \brief Moves this partition into \p Other. This partition becomes empty
+ /// Moves this partition into \p Other. This partition becomes empty
/// after this.
void moveTo(InstPartition &Other) {
Other.Set.insert(Set.begin(), Set.end());
@@ -143,7 +143,7 @@ public:
Other.DepCycle |= DepCycle;
}
- /// \brief Populates the partition with a transitive closure of all the
+ /// Populates the partition with a transitive closure of all the
/// instructions that the seeded instructions depend on.
void populateUsedSet() {
// FIXME: We currently don't use control-dependence but simply include all
@@ -166,7 +166,7 @@ public:
}
}
- /// \brief Clones the original loop.
+ /// Clones the original loop.
///
/// Updates LoopInfo and DominatorTree using the information that block \p
/// LoopDomBB dominates the loop.
@@ -179,27 +179,27 @@ public:
return ClonedLoop;
}
- /// \brief The cloned loop. If this partition is mapped to the original loop,
+ /// The cloned loop. If this partition is mapped to the original loop,
/// this is null.
const Loop *getClonedLoop() const { return ClonedLoop; }
- /// \brief Returns the loop where this partition ends up after distribution.
+ /// Returns the loop where this partition ends up after distribution.
/// If this partition is mapped to the original loop then use the block from
/// the loop.
const Loop *getDistributedLoop() const {
return ClonedLoop ? ClonedLoop : OrigLoop;
}
- /// \brief The VMap that is populated by cloning and then used in
+ /// The VMap that is populated by cloning and then used in
/// remapInstructions to remap the cloned instructions.
ValueToValueMapTy &getVMap() { return VMap; }
- /// \brief Remaps the cloned instructions using VMap.
+ /// Remaps the cloned instructions using VMap.
void remapInstructions() {
remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
}
- /// \brief Based on the set of instructions selected for this partition,
+ /// Based on the set of instructions selected for this partition,
/// removes the unnecessary ones.
void removeUnusedInsts() {
SmallVector<Instruction *, 8> Unused;
@@ -239,30 +239,30 @@ public:
}
private:
- /// \brief Instructions from OrigLoop selected for this partition.
+ /// Instructions from OrigLoop selected for this partition.
InstructionSet Set;
- /// \brief Whether this partition contains a dependence cycle.
+ /// Whether this partition contains a dependence cycle.
bool DepCycle;
- /// \brief The original loop.
+ /// The original loop.
Loop *OrigLoop;
- /// \brief The cloned loop. If this partition is mapped to the original loop,
+ /// The cloned loop. If this partition is mapped to the original loop,
/// this is null.
Loop *ClonedLoop = nullptr;
- /// \brief The blocks of ClonedLoop including the preheader. If this
+ /// The blocks of ClonedLoop including the preheader. If this
/// partition is mapped to the original loop, this is empty.
SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
- /// \brief These gets populated once the set of instructions have been
+ /// These get populated once the set of instructions has been
/// finalized. If this partition is mapped to the original loop, these are not
/// set.
ValueToValueMapTy VMap;
};
-/// \brief Holds the set of Partitions. It populates them, merges them and then
+/// Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
using InstToPartitionIdT = DenseMap<Instruction *, int>;
@@ -271,10 +271,10 @@ public:
InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
: L(L), LI(LI), DT(DT) {}
- /// \brief Returns the number of partitions.
+ /// Returns the number of partitions.
unsigned getSize() const { return PartitionContainer.size(); }
- /// \brief Adds \p Inst into the current partition if that is marked to
+ /// Adds \p Inst into the current partition if that is marked to
/// contain cycles. Otherwise start a new partition for it.
void addToCyclicPartition(Instruction *Inst) {
// If the current partition is non-cyclic. Start a new one.
@@ -284,7 +284,7 @@ public:
PartitionContainer.back().add(Inst);
}
- /// \brief Adds \p Inst into a partition that is not marked to contain
+ /// Adds \p Inst into a partition that is not marked to contain
/// dependence cycles.
///
// Initially we isolate memory instructions into as many partitions as
@@ -293,7 +293,7 @@ public:
PartitionContainer.emplace_back(Inst, L);
}
- /// \brief Merges adjacent non-cyclic partitions.
+ /// Merges adjacent non-cyclic partitions.
///
/// The idea is that we currently only want to isolate the non-vectorizable
/// partition. We could later allow more distribution among these partitions
@@ -303,7 +303,7 @@ public:
[](const InstPartition *P) { return !P->hasDepCycle(); });
}
- /// \brief If a partition contains only conditional stores, we won't vectorize
+ /// If a partition contains only conditional stores, we won't vectorize
/// it. Try to merge it with a previous cyclic partition.
void mergeNonIfConvertible() {
mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
@@ -323,14 +323,14 @@ public:
});
}
- /// \brief Merges the partitions according to various heuristics.
+ /// Merges the partitions according to various heuristics.
void mergeBeforePopulating() {
mergeAdjacentNonCyclic();
if (!DistributeNonIfConvertible)
mergeNonIfConvertible();
}
- /// \brief Merges partitions in order to ensure that no loads are duplicated.
+ /// Merges partitions in order to ensure that no loads are duplicated.
///
/// We can't duplicate loads because that could potentially reorder them.
/// LoopAccessAnalysis provides dependency information with the context that
@@ -362,9 +362,11 @@ public:
std::tie(LoadToPart, NewElt) =
LoadToPartition.insert(std::make_pair(Inst, PartI));
if (!NewElt) {
- DEBUG(dbgs() << "Merging partitions due to this load in multiple "
- << "partitions: " << PartI << ", "
- << LoadToPart->second << "\n" << *Inst << "\n");
+ LLVM_DEBUG(dbgs()
+ << "Merging partitions due to this load in multiple "
+ << "partitions: " << PartI << ", " << LoadToPart->second
+ << "\n"
+ << *Inst << "\n");
auto PartJ = I;
do {
@@ -398,7 +400,7 @@ public:
return true;
}
- /// \brief Sets up the mapping between instructions to partitions. If the
+ /// Sets up the mapping from instructions to partitions. If the
/// instruction is duplicated across multiple partitions, set the entry to -1.
void setupPartitionIdOnInstructions() {
int PartitionID = 0;
@@ -416,14 +418,14 @@ public:
}
}
- /// \brief Populates the partition with everything that the seeding
+ /// Populates the partition with everything that the seeding
/// instructions require.
void populateUsedSet() {
for (auto &P : PartitionContainer)
P.populateUsedSet();
}
- /// \brief This performs the main chunk of the work of cloning the loops for
+ /// This performs the main chunk of the work of cloning the loops for
/// the partitions.
void cloneLoops() {
BasicBlock *OrigPH = L->getLoopPreheader();
@@ -470,13 +472,13 @@ public:
Curr->getDistributedLoop()->getExitingBlock());
}
- /// \brief Removes the dead instructions from the cloned loops.
+ /// Removes the dead instructions from the cloned loops.
void removeUnusedInsts() {
for (auto &Partition : PartitionContainer)
Partition.removeUnusedInsts();
}
- /// \brief For each memory pointer, it computes the partitionId the pointer is
+ /// For each memory pointer, it computes the partitionId the pointer is
/// used in.
///
/// This returns an array of int where the I-th entry corresponds to I-th
@@ -543,10 +545,10 @@ public:
private:
using PartitionContainerT = std::list<InstPartition>;
- /// \brief List of partitions.
+ /// List of partitions.
PartitionContainerT PartitionContainer;
- /// \brief Mapping from Instruction to partition Id. If the instruction
+ /// Mapping from Instruction to partition Id. If the instruction
/// belongs to multiple partitions the entry contains -1.
InstToPartitionIdT InstToPartitionId;
@@ -554,7 +556,7 @@ private:
LoopInfo *LI;
DominatorTree *DT;
- /// \brief The control structure to merge adjacent partitions if both satisfy
+ /// The control structure to merge adjacent partitions if both satisfy
/// the \p Predicate.
template <class UnaryPredicate>
void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
@@ -575,7 +577,7 @@ private:
}
};
-/// \brief For each memory instruction, this class maintains difference of the
+/// For each memory instruction, this class maintains the difference between
/// the number of unsafe dependences that start at this instruction and the
/// number that end here.
///
@@ -602,7 +604,7 @@ public:
const SmallVectorImpl<Dependence> &Dependences) {
Accesses.append(Instructions.begin(), Instructions.end());
- DEBUG(dbgs() << "Backward dependences:\n");
+ LLVM_DEBUG(dbgs() << "Backward dependences:\n");
for (auto &Dep : Dependences)
if (Dep.isPossiblyBackward()) {
// Note that the designations source and destination follow the program
@@ -611,7 +613,7 @@ public:
++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
--Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
- DEBUG(Dep.print(dbgs(), 2, Instructions));
+ LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions));
}
}
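The NumUnsafeDependencesStartOrEnd counters accumulated here are consumed, outside this hunk, as a running sum over the accesses in program order: while the sum is positive some unsafe dependence is still open, and wherever it returns to zero a partition boundary is safe. A self-contained sketch of that scan:

    #include <cstdio>
    #include <vector>

    int main() {
      // +1: an unsafe dependence starts at this access; -1: one ends here.
      std::vector<int> StartOrEnd = {+1, 0, -1, +1, -1, 0};
      int Open = 0; // number of unsafe dependences spanning this point
      for (size_t I = 0; I < StartOrEnd.size(); ++I) {
        Open += StartOrEnd[I];
        if (Open == 0)
          std::printf("safe partition boundary after access %zu\n", I);
      }
    }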
@@ -619,7 +621,7 @@ private:
AccessesType Accesses;
};
-/// \brief The actual class performing the per-loop work.
+/// The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
@@ -628,12 +630,13 @@ public:
setForced();
}
- /// \brief Try to distribute an inner-most loop.
+ /// Try to distribute an inner-most loop.
bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
assert(L->empty() && "Only process inner loops.");
- DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
- << "\" checking " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "\nLDist: In \""
+ << L->getHeader()->getParent()->getName()
+ << "\" checking " << *L << "\n");
if (!L->getExitBlock())
return fail("MultipleExitBlocks", "multiple exit blocks");
@@ -705,7 +708,7 @@ public:
for (auto *Inst : DefsUsedOutside)
Partitions.addToNewNonCyclicPartition(Inst);
- DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
+ LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
if (Partitions.getSize() < 2)
return fail("CantIsolateUnsafeDeps",
"cannot isolate unsafe dependencies");
@@ -713,20 +716,20 @@ public:
// Run the merge heuristics: Merge non-cyclic adjacent partitions since we
// should be able to vectorize these together.
Partitions.mergeBeforePopulating();
- DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
+ LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
if (Partitions.getSize() < 2)
return fail("CantIsolateUnsafeDeps",
"cannot isolate unsafe dependencies");
// Now, populate the partitions with non-memory operations.
Partitions.populateUsedSet();
- DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
+ LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
// In order to preserve original lexical order for loads, keep them in the
// partition that we set up in the MemoryInstructionDependences loop.
if (Partitions.mergeToAvoidDuplicatedLoads()) {
- DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
- << Partitions);
+ LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
+ << Partitions);
if (Partitions.getSize() < 2)
return fail("CantIsolateUnsafeDeps",
"cannot isolate unsafe dependencies");
@@ -740,7 +743,7 @@ public:
return fail("TooManySCEVRuntimeChecks",
"too many SCEV run-time checks needed.\n");
- DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
// We're done forming the partitions; set up the reverse mapping from
// instructions to partitions.
Partitions.setupPartitionIdOnInstructions();
@@ -759,8 +762,8 @@ public:
RtPtrChecking);
if (!Pred.isAlwaysTrue() || !Checks.empty()) {
- DEBUG(dbgs() << "\nPointers:\n");
- DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+ LLVM_DEBUG(dbgs() << "\nPointers:\n");
+ LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
LVer.setAliasChecks(std::move(Checks));
LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
@@ -775,12 +778,12 @@ public:
// Now, we remove the instructions from each loop that don't belong to that
// partition.
Partitions.removeUnusedInsts();
- DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
- DEBUG(Partitions.printBlocks());
+ LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
+ LLVM_DEBUG(Partitions.printBlocks());
if (LDistVerify) {
LI->verify(*DT);
- DT->verifyDomTree();
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
}
++NumLoopsDistributed;
@@ -793,12 +796,12 @@ public:
return true;
}
- /// \brief Provide diagnostics then \return with false.
+ /// Provide diagnostics then \return with false.
bool fail(StringRef RemarkName, StringRef Message) {
LLVMContext &Ctx = F->getContext();
bool Forced = isForced().getValueOr(false);
- DEBUG(dbgs() << "Skipping; " << Message << "\n");
+ LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n");
// With Rpass-missed report that distribution failed.
ORE->emit([&]() {
@@ -826,7 +829,7 @@ public:
return false;
}
- /// \brief Return if distribution forced to be enabled/disabled for the loop.
+ /// Return if distribution forced to be enabled/disabled for the loop.
///
/// If the optional has a value, it indicates whether distribution was forced
/// to be enabled (true) or disabled (false). If the optional has no value
@@ -834,7 +837,7 @@ public:
const Optional<bool> &isForced() const { return IsForced; }
private:
- /// \brief Filter out checks between pointers from the same partition.
+ /// Filter out checks between pointers from the same partition.
///
/// \p PtrToPartition contains the partition number for pointers. Partition
/// number -1 means that the pointer is used in multiple partitions. In this
@@ -873,7 +876,7 @@ private:
return Checks;
}
- /// \brief Check whether the loop metadata is forcing distribution to be
+ /// Check whether the loop metadata is forcing distribution to be
/// enabled/disabled.
void setForced() {
Optional<const MDOperand *> Value =
@@ -896,7 +899,7 @@ private:
ScalarEvolution *SE;
OptimizationRemarkEmitter *ORE;
- /// \brief Indicates whether distribution is forced to be enabled/disabled for
+ /// Indicates whether distribution is forced to be enabled/disabled for
/// the loop.
///
/// If the optional has a value, it indicates whether distribution was forced
@@ -939,7 +942,7 @@ static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
namespace {
-/// \brief The pass class.
+/// The pass class.
class LoopDistributeLegacy : public FunctionPass {
public:
static char ID;
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 21551f0a0825..d8692198f7a3 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -37,7 +37,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -57,6 +56,7 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -87,8 +87,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
@@ -188,8 +188,9 @@ private:
PHINode *CntPhi, Value *Var);
bool recognizeAndInsertCTLZ();
void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
- PHINode *CntPhi, Value *Var, const DebugLoc DL,
- bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
+ PHINode *CntPhi, Value *Var, Instruction *DefX,
+ const DebugLoc &DL, bool ZeroCheck,
+ bool IsCntPhiUsedOutsideLoop);
/// @}
};
@@ -310,9 +311,9 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
SmallVector<BasicBlock *, 8> ExitBlocks;
CurLoop->getUniqueExitBlocks(ExitBlocks);
- DEBUG(dbgs() << "loop-idiom Scanning: F["
- << CurLoop->getHeader()->getParent()->getName() << "] Loop %"
- << CurLoop->getHeader()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "loop-idiom Scanning: F["
+ << CurLoop->getHeader()->getParent()->getName()
+ << "] Loop %" << CurLoop->getHeader()->getName() << "\n");
bool MadeChange = false;
@@ -756,8 +757,8 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
MSIs.insert(MSI);
bool NegStride = SizeInBytes == -Stride;
return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
- MSI->getAlignment(), SplatValue, MSI, MSIs, Ev,
- BECount, NegStride, /*IsLoopMemset=*/true);
+ MSI->getDestAlignment(), SplatValue, MSI, MSIs,
+ Ev, BECount, NegStride, /*IsLoopMemset=*/true);
}
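processLoopMemSet feeds processLoopStridedStore, which rewrites a loop that stores a splat value at a constant stride into a single memset call; the change above passes the destination alignment explicitly. The source-level shape being targeted is roughly this sketch:

    #include <cstddef>

    void zero(unsigned char *a, size_t n) {
      for (size_t i = 0; i < n; ++i)
        a[i] = 0; // recognized and rewritten as: memset(a, 0, n)
    }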
/// mayLoopAccessLocation - Return true if the specified loop might access the
@@ -936,8 +937,9 @@ bool LoopIdiomRecognize::processLoopStridedStore(
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
}
- DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
- << " from store to: " << *Ev << " at: " << *TheStore << "\n");
+ LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
+ << " from store to: " << *Ev << " at: " << *TheStore
+ << "\n");
NewCall->setDebugLoc(TheStore->getDebugLoc());
// Okay, the memset has been formed. Zap the original store and anything that
@@ -1037,16 +1039,17 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
- unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
CallInst *NewCall = nullptr;
// Check whether to generate an unordered atomic memcpy:
- // If the load or store are atomic, then they must neccessarily be unordered
+ // If the load or store are atomic, then they must necessarily be unordered
// by previous checks.
if (!SI->isAtomic() && !LI->isAtomic())
- NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align);
+ NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(),
+ LoadBasePtr, LI->getAlignment(), NumBytes);
else {
// We cannot allow unaligned ops for unordered load/store, so reject
// anything where the alignment isn't at least the element size.
+ unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
if (Align < StoreSize)
return false;
@@ -1066,9 +1069,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
}
NewCall->setDebugLoc(SI->getDebugLoc());
- DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
- << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
- << " from store ptr=" << *StoreEv << " at: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
+ << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+ << " from store ptr=" << *StoreEv << " at: " << *SI
+ << "\n");
// Okay, the memcpy has been formed. Zap the original store and anything that
// feeds into it.
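The memcpy case is the analogous load/store pairing. Note that the updated CreateMemCpy signature above takes separate destination and source alignments; the combined minimum is now computed only on the unordered-atomic path, where it still gates the transform. A sketch of the input pattern:

    #include <cstddef>

    void copy(int *dst, const int *src, size_t n) {
      for (size_t i = 0; i < n; ++i)
        dst[i] = src[i]; // becomes: memcpy(dst, src, n * sizeof(int))
    }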
@@ -1084,9 +1088,9 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
bool IsLoopMemset) {
if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
- DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
- << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
- << " avoided: multi-block top-level loop\n");
+ LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
+ << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
+ << " avoided: multi-block top-level loop\n");
return true;
}
}
@@ -1195,14 +1199,13 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
VarX1 = DefX2->getOperand(0);
SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
}
- if (!SubOneOp)
+ if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
return false;
- Instruction *SubInst = cast<Instruction>(SubOneOp);
- ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
+ ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
if (!Dec ||
- !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
- (SubInst->getOpcode() == Instruction::Add &&
+ !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
+ (SubOneOp->getOpcode() == Instruction::Add &&
Dec->isMinusOne()))) {
return false;
}
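The shape detectPopcountIdiom matches is the classic Kernighan bit-clearing loop; the Sub/Add check above accepts both the x - 1 and x + (-1) spellings of the decrement. A sketch:

    int popcount(unsigned x) {
      int cnt = 0;
      while (x) {
        x &= x - 1; // clears the lowest set bit
        ++cnt;
      }
      return cnt;   // rewritten to use llvm.ctpop
    }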
@@ -1314,7 +1317,8 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
return false;
// step 2: detect instructions corresponding to "x.next = x >> 1"
- if (!DefX || DefX->getOpcode() != Instruction::AShr)
+ if (!DefX || (DefX->getOpcode() != Instruction::AShr &&
+ DefX->getOpcode() != Instruction::LShr))
return false;
ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
if (!Shft || !Shft->isOne())
@@ -1372,13 +1376,13 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
bool IsCntPhiUsedOutsideLoop = false;
for (User *U : CntPhi->users())
- if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ if (!CurLoop->contains(cast<Instruction>(U))) {
IsCntPhiUsedOutsideLoop = true;
break;
}
bool IsCntInstUsedOutsideLoop = false;
for (User *U : CntInst->users())
- if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ if (!CurLoop->contains(cast<Instruction>(U))) {
IsCntInstUsedOutsideLoop = true;
break;
}
@@ -1395,16 +1399,27 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
// parent function RunOnLoop.
BasicBlock *PH = CurLoop->getLoopPreheader();
Value *InitX = PhiX->getIncomingValueForBlock(PH);
- // If we check X != 0 before entering the loop we don't need a zero
- // check in CTLZ intrinsic, but only if Cnt Phi is not used outside of the
- // loop (if it is used we count CTLZ(X >> 1)).
- if (!IsCntPhiUsedOutsideLoop)
- if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
- if (BranchInst *PreCondBr =
- dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
- if (matchCondition(PreCondBr, PH) == InitX)
- ZeroCheck = true;
- }
+
+ // Make sure the initial value can't be negative; otherwise the ashr in the
+ // loop might never reach zero, which would make the loop infinite.
+ if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, *DL))
+ return false;
+
+ // If we are using the count instruction outside the loop, make sure we
+ // have a zero check as a precondition. Without the check the loop would run
+ // one iteration before any check of the input value. This means 0 and 1
+ // would have identical behavior in the original loop, which the countable
+ // form cannot match, so bail out unless the zero check is present.
+ if (!IsCntPhiUsedOutsideLoop) {
+ auto *PreCondBB = PH->getSinglePredecessor();
+ if (!PreCondBB)
+ return false;
+ auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ if (!PreCondBI)
+ return false;
+ if (matchCondition(PreCondBI, PH) != InitX)
+ return false;
+ ZeroCheck = true;
+ }
// Check if CTLZ intrinsic is profitable. Assume it is always profitable
// if we delete the loop (the loop has only 6 instructions):
@@ -1415,17 +1430,16 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
// %inc = add nsw %i.0, 1
// br i1 %tobool
- IRBuilder<> Builder(PH->getTerminator());
- SmallVector<const Value *, 2> Ops =
- {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
- ArrayRef<const Value *> Args(Ops);
+ const Value *Args[] =
+ {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
+ : ConstantInt::getFalse(InitX->getContext())};
if (CurLoop->getHeader()->size() != 6 &&
TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
TargetTransformInfo::TCC_Basic)
return false;
- const DebugLoc DL = DefX->getDebugLoc();
- transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
+ transformLoopToCountable(PH, CntInst, CntPhi, InitX, DefX,
+ DefX->getDebugLoc(), ZeroCheck,
IsCntPhiUsedOutsideLoop);
return true;
}
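Concretely, the recognizer above targets a shift-until-zero counting loop, whose trip count is bitwidth - ctlz(x). The zero-check requirement discussed earlier exists because a do-while body runs once even for a zero input, so 0 and 1 would both produce a count of 1. A sketch, assuming a 32-bit unsigned and a preceding x != 0 guard:

    int shiftCount(unsigned x) { // assume the caller guarded: if (x != 0)
      int cnt = 0;
      do {
        x >>= 1;  // lshr; ashr is also accepted when x is known non-negative
        ++cnt;
      } while (x != 0);
      return cnt; // rewritten to: 32 - llvm.ctlz(x)
    }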
@@ -1461,7 +1475,7 @@ bool LoopIdiomRecognize::recognizePopcount() {
if (!EntryBI || EntryBI->isConditional())
return false;
- // It should have a precondition block where the generated popcount instrinsic
+ // It should have a precondition block where the generated popcount intrinsic
// function can be inserted.
auto *PreCondBB = PH->getSinglePredecessor();
if (!PreCondBB)
@@ -1539,8 +1553,9 @@ static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
void LoopIdiomRecognize::transformLoopToCountable(
BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
- const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
- BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator());
+ Instruction *DefX, const DebugLoc &DL, bool ZeroCheck,
+ bool IsCntPhiUsedOutsideLoop) {
+ BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
// Step 1: Insert the CTLZ instruction at the end of the preheader block
// Count = BitWidth - CTLZ(InitX);
@@ -1550,10 +1565,16 @@ void LoopIdiomRecognize::transformLoopToCountable(
Builder.SetCurrentDebugLocation(DL);
Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
- if (IsCntPhiUsedOutsideLoop)
- InitXNext = Builder.CreateAShr(InitX,
- ConstantInt::get(InitX->getType(), 1));
- else
+ if (IsCntPhiUsedOutsideLoop) {
+ if (DefX->getOpcode() == Instruction::AShr)
+ InitXNext =
+ Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
+ else if (DefX->getOpcode() == Instruction::LShr)
+ InitXNext =
+ Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
+ else
+ llvm_unreachable("Unexpected opcode!");
+ } else
InitXNext = InitX;
CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
Count = Builder.CreateSub(
@@ -1588,7 +1609,7 @@ void LoopIdiomRecognize::transformLoopToCountable(
// ...
// Br: loop if (Dec != 0)
BasicBlock *Body = *(CurLoop->block_begin());
- auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ auto *LbBr = cast<BranchInst>(Body->getTerminator());
ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
Type *Ty = Count->getType();
@@ -1625,8 +1646,8 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
Instruction *CntInst,
PHINode *CntPhi, Value *Var) {
BasicBlock *PreHead = CurLoop->getLoopPreheader();
- auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
- const DebugLoc DL = CntInst->getDebugLoc();
+ auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
+ const DebugLoc &DL = CntInst->getDebugLoc();
// Assuming before transformation, the loop is following:
// if (x) // the precondition
@@ -1675,7 +1696,7 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
}
// Step 3: Note that the population count is exactly the trip count of the
- // loop in question, which enable us to to convert the loop from noncountable
+ // loop in question, which enables us to convert the loop from noncountable
// loop into a countable one. The benefit is twofold:
//
// - If the loop only counts population, the entire loop becomes dead after
@@ -1696,7 +1717,7 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
// do { cnt++; x &= x-1; t--) } while (t > 0);
BasicBlock *Body = *(CurLoop->block_begin());
{
- auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ auto *LbBr = cast<BranchInst>(Body->getTerminator());
ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
Type *Ty = TripCnt->getType();
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 40d468a084d4..71859efbf4bd 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -20,8 +20,10 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h"
@@ -34,7 +36,6 @@
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <utility>
@@ -45,118 +46,116 @@ using namespace llvm;
STATISTIC(NumSimplified, "Number of redundant instructions simplified");
-static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI,
- AssumptionCache *AC,
- const TargetLibraryInfo *TLI) {
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L->getUniqueExitBlocks(ExitBlocks);
- array_pod_sort(ExitBlocks.begin(), ExitBlocks.end());
-
+static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC,
+ const TargetLibraryInfo &TLI) {
+ const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
+ SimplifyQuery SQ(DL, &TLI, &DT, &AC);
+
+ // On the first pass over the loop body we try to simplify every instruction.
+ // On subsequent passes, we can restrict this to only simplifying instructions
+ // where the inputs have been updated. We end up needing two sets: one
+ // containing the instructions we are simplifying in *this* pass, and one for
+ // the instructions we will want to simplify in the *next* pass. We use
+ // pointers so we can swap between two stably allocated sets.
SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
- // The bit we are stealing from the pointer represents whether this basic
- // block is the header of a subloop, in which case we only process its phis.
- using WorklistItem = PointerIntPair<BasicBlock *, 1>;
- SmallVector<WorklistItem, 16> VisitStack;
- SmallPtrSet<BasicBlock *, 32> Visited;
-
- bool Changed = false;
- bool LocalChanged;
- do {
- LocalChanged = false;
-
- VisitStack.clear();
- Visited.clear();
+ // Track the PHI nodes that have already been visited during each iteration so
+ // that we can identify when it is necessary to iterate.
+ SmallPtrSet<PHINode *, 4> VisitedPHIs;
- VisitStack.push_back(WorklistItem(L->getHeader(), false));
+ // While simplifying we may discover dead code or cause code to become dead.
+ // Keep track of all such instructions; we will delete them at the end.
+ SmallVector<Instruction *, 8> DeadInsts;
- while (!VisitStack.empty()) {
- WorklistItem Item = VisitStack.pop_back_val();
- BasicBlock *BB = Item.getPointer();
- bool IsSubloopHeader = Item.getInt();
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ // First we want to create an RPO traversal of the loop body. By processing in
+ // RPO we can ensure that definitions are processed prior to uses (for non-PHI
+ // uses) in all cases. This maximizes the simplifications in each iteration
+ // over the loop and minimizes the possible causes for continuing to iterate.
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(&LI);
- // Simplify instructions in the current basic block.
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
- Instruction *I = &*BI++;
-
- // The first time through the loop ToSimplify is empty and we try to
- // simplify all instructions. On later iterations ToSimplify is not
- // empty and we only bother simplifying instructions that are in it.
- if (!ToSimplify->empty() && !ToSimplify->count(I))
+ bool Changed = false;
+ for (;;) {
+ for (BasicBlock *BB : RPOT) {
+ for (Instruction &I : *BB) {
+ if (auto *PI = dyn_cast<PHINode>(&I))
+ VisitedPHIs.insert(PI);
+
+ if (I.use_empty()) {
+ if (isInstructionTriviallyDead(&I, &TLI))
+ DeadInsts.push_back(&I);
continue;
-
- // Don't bother simplifying unused instructions.
- if (!I->use_empty()) {
- Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC});
- if (V && LI->replacementPreservesLCSSAForm(I, V)) {
- // Mark all uses for resimplification next time round the loop.
- for (User *U : I->users())
- Next->insert(cast<Instruction>(U));
-
- I->replaceAllUsesWith(V);
- LocalChanged = true;
- ++NumSimplified;
- }
- }
- if (RecursivelyDeleteTriviallyDeadInstructions(I, TLI)) {
- // RecursivelyDeleteTriviallyDeadInstruction can remove more than one
- // instruction, so simply incrementing the iterator does not work.
- // When instructions get deleted re-iterate instead.
- BI = BB->begin();
- BE = BB->end();
- LocalChanged = true;
}
- if (IsSubloopHeader && !isa<PHINode>(I))
- break;
- }
+ // We special-case the first iteration, which we detect by the empty
+ // `ToSimplify` set.
+ bool IsFirstIteration = ToSimplify->empty();
- // Add all successors to the worklist, except for loop exit blocks and the
- // bodies of subloops. We visit the headers of loops so that we can
- // process
- // their phis, but we contract the rest of the subloop body and only
- // follow
- // edges leading back to the original loop.
- for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE;
- ++SI) {
- BasicBlock *SuccBB = *SI;
- if (!Visited.insert(SuccBB).second)
+ if (!IsFirstIteration && !ToSimplify->count(&I))
continue;
- const Loop *SuccLoop = LI->getLoopFor(SuccBB);
- if (SuccLoop && SuccLoop->getHeader() == SuccBB &&
- L->contains(SuccLoop)) {
- VisitStack.push_back(WorklistItem(SuccBB, true));
-
- SmallVector<BasicBlock *, 8> SubLoopExitBlocks;
- SuccLoop->getExitBlocks(SubLoopExitBlocks);
-
- for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) {
- BasicBlock *ExitBB = SubLoopExitBlocks[i];
- if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB).second)
- VisitStack.push_back(WorklistItem(ExitBB, false));
- }
-
+ Value *V = SimplifyInstruction(&I, SQ.getWithInstruction(&I));
+ if (!V || !LI.replacementPreservesLCSSAForm(&I, V))
continue;
- }
- bool IsExitBlock =
- std::binary_search(ExitBlocks.begin(), ExitBlocks.end(), SuccBB);
- if (IsExitBlock)
- continue;
+ for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ auto *UserI = cast<Instruction>(U.getUser());
+ U.set(V);
+
+ // If the instruction is used by a PHI node we have already processed,
+ // we'll need to iterate on the loop body to converge, so add it to
+ // the next set.
+ if (auto *UserPI = dyn_cast<PHINode>(UserI))
+ if (VisitedPHIs.count(UserPI)) {
+ Next->insert(UserPI);
+ continue;
+ }
+
+ // If we are only simplifying targeted instructions and the user is an
+ // instruction in the loop body, add it to our set of targeted
+ // instructions. Because we process defs before uses (outside of PHIs)
+ // we won't have visited it yet.
+ //
+ // We also skip any uses outside of the loop being simplified. Those
+ // should always be PHI nodes due to LCSSA form, and we don't want to
+ // try to simplify those away.
+ assert((L.contains(UserI) || isa<PHINode>(UserI)) &&
+ "Uses outside the loop should be PHI nodes due to LCSSA!");
+ if (!IsFirstIteration && L.contains(UserI))
+ ToSimplify->insert(UserI);
+ }
- VisitStack.push_back(WorklistItem(SuccBB, false));
+ assert(I.use_empty() && "Should always have replaced all uses!");
+ if (isInstructionTriviallyDead(&I, &TLI))
+ DeadInsts.push_back(&I);
+ ++NumSimplified;
+ Changed = true;
}
}
- // Place the list of instructions to simplify on the next loop iteration
- // into ToSimplify.
- std::swap(ToSimplify, Next);
- Next->clear();
+ // Delete any dead instructions found thus far now that we've finished an
+ // iteration over all instructions in all the loop blocks.
+ if (!DeadInsts.empty()) {
+ Changed = true;
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, &TLI);
+ }
+
+ // If we never found a PHI that needs to be simplified in the next
+ // iteration, we're done.
+ if (Next->empty())
+ break;
- Changed |= LocalChanged;
- } while (LocalChanged);
+ // Otherwise, put the next set in place for the next iteration and reset it
+ // and the visited PHIs for that iteration.
+ std::swap(Next, ToSimplify);
+ Next->clear();
+ VisitedPHIs.clear();
+ DeadInsts.clear();
+ }
return Changed;
}
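The primitive this rewritten walk leans on is SimplifyInstruction, which returns an existing, simpler value (or null) without creating new IR. A minimal sketch of the API as configured above; the helper name is illustrative, and it omits the dead-code cleanup and LCSSA check the pass performs:

    #include "llvm/Analysis/InstructionSimplify.h"
    #include "llvm/IR/InstIterator.h"
    using namespace llvm;

    static void foldRedundant(Function &F, const TargetLibraryInfo &TLI,
                              DominatorTree &DT, AssumptionCache &AC) {
      const DataLayout &DL = F.getParent()->getDataLayout();
      SimplifyQuery SQ(DL, &TLI, &DT, &AC);
      for (Instruction &I : instructions(F))
        if (Value *V = SimplifyInstruction(&I, SQ.getWithInstruction(&I)))
          I.replaceAllUsesWith(V); // e.g. 'add %x, 0' folds to '%x'
    }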
@@ -174,21 +173,20 @@ public:
bool runOnLoop(Loop *L, LPPassManager &LPM) override {
if (skipLoop(L))
return false;
- DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- AssumptionCache *AC =
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
*L->getHeader()->getParent());
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- return SimplifyLoopInst(L, DT, LI, AC, TLI);
+ return simplifyLoopInst(*L, DT, LI, AC, TLI);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.setPreservesCFG();
getLoopAnalysisUsage(AU);
@@ -200,7 +198,7 @@ public:
PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
- if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI))
+ if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI))
return PreservedAnalyses::all();
auto PA = getLoopPassPreservedAnalyses();
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 4f8dafef230a..2978165ed8a9 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/DependenceAnalysis.h"
@@ -40,6 +41,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <cassert>
@@ -50,6 +52,8 @@ using namespace llvm;
#define DEBUG_TYPE "loop-interchange"
+STATISTIC(LoopsInterchanged, "Number of loops interchanged");
+
static cl::opt<int> LoopInterchangeCostThreshold(
"loop-interchange-threshold", cl::init(0), cl::Hidden,
cl::desc("Interchange if you gain more than this number"));
@@ -73,8 +77,8 @@ static const unsigned MaxLoopNestDepth = 10;
static void printDepMatrix(CharMatrix &DepMatrix) {
for (auto &Row : DepMatrix) {
for (auto D : Row)
- DEBUG(dbgs() << D << " ");
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << D << " ");
+ LLVM_DEBUG(dbgs() << "\n");
}
}
#endif
@@ -103,8 +107,8 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
}
}
- DEBUG(dbgs() << "Found " << MemInstr.size()
- << " Loads and Stores to analyze\n");
+ LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
+ << " Loads and Stores to analyze\n");
ValueVector::iterator I, IE, J, JE;
@@ -121,11 +125,11 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
// Track Output, Flow, and Anti dependencies.
if (auto D = DI->depends(Src, Dst, true)) {
assert(D->isOrdered() && "Expected an output, flow or anti dep.");
- DEBUG(StringRef DepType =
- D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
- dbgs() << "Found " << DepType
- << " dependency between Src and Dst\n"
- << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
+ LLVM_DEBUG(StringRef DepType =
+ D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
+ dbgs() << "Found " << DepType
+ << " dependency between Src and Dst\n"
+ << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
unsigned Levels = D->getLevels();
char Direction;
for (unsigned II = 1; II <= Levels; ++II) {
@@ -165,17 +169,14 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
DepMatrix.push_back(Dep);
if (DepMatrix.size() > MaxMemInstrCount) {
- DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
- << " dependencies inside loop\n");
+ LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
+ << " dependencies inside loop\n");
return false;
}
}
}
}
- // We don't have a DepMatrix to check legality return false.
- if (DepMatrix.empty())
- return false;
return true;
}
@@ -271,9 +272,9 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
}
static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
- DEBUG(dbgs() << "Calling populateWorklist on Func: "
- << L.getHeader()->getParent()->getName() << " Loop: %"
- << L.getHeader()->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
+ << L.getHeader()->getParent()->getName() << " Loop: %"
+ << L.getHeader()->getName() << '\n');
LoopVector LoopList;
Loop *CurrentLoop = &L;
const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
@@ -404,7 +405,9 @@ public:
/// Interchange OuterLoop and InnerLoop.
bool transform();
- void restructureLoops(Loop *InnerLoop, Loop *OuterLoop);
+ void restructureLoops(Loop *NewInner, Loop *NewOuter,
+ BasicBlock *OrigInnerPreHeader,
+ BasicBlock *OrigOuterPreHeader);
void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
private:
@@ -453,6 +456,9 @@ struct LoopInterchange : public FunctionPass {
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
}
bool runOnFunction(Function &F) override {
@@ -462,8 +468,7 @@ struct LoopInterchange : public FunctionPass {
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
@@ -473,7 +478,7 @@ struct LoopInterchange : public FunctionPass {
for (Loop *L : *LI)
populateWorklist(*L, Worklist);
- DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
+ LLVM_DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
bool Changed = true;
while (!Worklist.empty()) {
LoopVector LoopList = Worklist.pop_back_val();
@@ -486,15 +491,15 @@ struct LoopInterchange : public FunctionPass {
for (Loop *L : LoopList) {
const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
if (ExitCountOuter == SE->getCouldNotCompute()) {
- DEBUG(dbgs() << "Couldn't compute backedge count\n");
+ LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
return false;
}
if (L->getNumBackEdges() != 1) {
- DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
+ LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
return false;
}
if (!L->getExitingBlock()) {
- DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
+ LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
return false;
}
}
@@ -511,53 +516,38 @@ struct LoopInterchange : public FunctionPass {
bool Changed = false;
unsigned LoopNestDepth = LoopList.size();
if (LoopNestDepth < 2) {
- DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
+ LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
return false;
}
if (LoopNestDepth > MaxLoopNestDepth) {
- DEBUG(dbgs() << "Cannot handle loops of depth greater than "
- << MaxLoopNestDepth << "\n");
+ LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
+ << MaxLoopNestDepth << "\n");
return false;
}
if (!isComputableLoopNest(LoopList)) {
- DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
+ LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
return false;
}
- DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth << "\n");
+ LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
+ << "\n");
CharMatrix DependencyMatrix;
Loop *OuterMostLoop = *(LoopList.begin());
if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
OuterMostLoop, DI)) {
- DEBUG(dbgs() << "Populating dependency matrix failed\n");
+ LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
return false;
}
#ifdef DUMP_DEP_MATRICIES
- DEBUG(dbgs() << "Dependence before interchange\n");
+ LLVM_DEBUG(dbgs() << "Dependence before interchange\n");
printDepMatrix(DependencyMatrix);
#endif
- BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch();
- BranchInst *OuterMostLoopLatchBI =
- dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator());
- if (!OuterMostLoopLatchBI)
- return false;
-
- // Since we currently do not handle LCSSA PHI's any failure in loop
- // condition will now branch to LoopNestExit.
- // TODO: This should be removed once we handle LCSSA PHI nodes.
-
// Get the Outermost loop exit.
- BasicBlock *LoopNestExit;
- if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader())
- LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1);
- else
- LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0);
-
- if (isa<PHINode>(LoopNestExit->begin())) {
- DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now "
- "since on failure all loops branch to loop nest exit.\n");
+ BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
+ if (!LoopNestExit) {
+ LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
return false;
}
@@ -573,9 +563,8 @@ struct LoopInterchange : public FunctionPass {
// Update the DependencyMatrix
interChangeDependencies(DependencyMatrix, i, i - 1);
- DT->recalculate(F);
#ifdef DUMP_DEP_MATRICIES
- DEBUG(dbgs() << "Dependence after interchange\n");
+ LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
printDepMatrix(DependencyMatrix);
#endif
Changed |= Interchanged;
@@ -586,21 +575,21 @@ struct LoopInterchange : public FunctionPass {
bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
unsigned OuterLoopId, BasicBlock *LoopNestExit,
std::vector<std::vector<char>> &DependencyMatrix) {
- DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
- << " and OuterLoopId = " << OuterLoopId << "\n");
+ LLVM_DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId << "\n");
Loop *InnerLoop = LoopList[InnerLoopId];
Loop *OuterLoop = LoopList[OuterLoopId];
LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT,
PreserveLCSSA, ORE);
if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
- DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
+ LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
return false;
}
- DEBUG(dbgs() << "Loops are legal to interchange\n");
+ LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
- DEBUG(dbgs() << "Interchanging loops not profitable\n");
+ LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
return false;
}
@@ -614,7 +603,8 @@ struct LoopInterchange : public FunctionPass {
LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
LoopNestExit, LIL.hasInnerLoopReduction());
LIT.transform();
- DEBUG(dbgs() << "Loops interchanged\n");
+ LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
+ LoopsInterchanged++;
return true;
}
};
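As context for the legality and profitability machinery below: the usual payoff of interchange is cache locality, or making the new inner loop vectorizable. An illustrative sketch with a row-major int A[N][M]:

    // Before: the inner loop strides by M elements per step (column order).
    for (int j = 0; j < M; ++j)
      for (int i = 0; i < N; ++i)
        A[i][j] += 1;
    // After interchange: each row is walked contiguously.
    for (int i = 0; i < N; ++i)
      for (int j = 0; j < M; ++j)
        A[i][j] += 1;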
@@ -631,13 +621,13 @@ bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
BasicBlock *BB) {
- for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+ for (Instruction &I : *BB) {
// Loads corresponding to reduction PHIs are considered safe when deciding
// whether the loops are tightly nested.
- if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+ if (LoadInst *L = dyn_cast<LoadInst>(&I)) {
if (!areAllUsesReductions(L, InnerLoop))
return true;
- } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ } else if (I.mayHaveSideEffects() || I.mayReadFromMemory())
return true;
}
return false;
@@ -645,13 +635,13 @@ bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch(
BasicBlock *BB) {
- for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+ for (Instruction &I : *BB) {
// Stores corresponding to reductions are considered safe when deciding
// whether the loops are tightly nested.
- if (StoreInst *L = dyn_cast<StoreInst>(I)) {
+ if (StoreInst *L = dyn_cast<StoreInst>(&I)) {
if (!isa<PHINode>(L->getOperand(0)))
return true;
- } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ } else if (I.mayHaveSideEffects() || I.mayReadFromMemory())
return true;
}
return false;
@@ -662,7 +652,7 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
- DEBUG(dbgs() << "Checking if loops are tightly nested\n");
+ LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
// A perfectly nested loop will not have any branch in between the outer and
// inner block i.e. outer header will branch to either inner preheader and
@@ -676,14 +666,14 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
if (Succ != InnerLoopPreHeader && Succ != OuterLoopLatch)
return false;
- DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
+ LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
// We do not have any basic block in between; now make sure the outer header
// and outer loop latch don't contain any unsafe instructions.
if (containsUnsafeInstructionsInHeader(OuterLoopHeader) ||
containsUnsafeInstructionsInLatch(OuterLoopLatch))
return false;
- DEBUG(dbgs() << "Loops are perfectly nested\n");
+ LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
// We have a perfect loop nest.
return true;
}
@@ -717,16 +707,15 @@ bool LoopInterchangeLegality::findInductionAndReductions(
SmallVector<PHINode *, 8> &Reductions) {
if (!L->getLoopLatch() || !L->getLoopPredecessor())
return false;
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ for (PHINode &PHI : L->getHeader()->phis()) {
RecurrenceDescriptor RD;
InductionDescriptor ID;
- PHINode *PHI = cast<PHINode>(I);
- if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID))
- Inductions.push_back(PHI);
- else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
- Reductions.push_back(PHI);
+ if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
+ Inductions.push_back(&PHI);
+ else if (RecurrenceDescriptor::isReductionPHI(&PHI, L, RD))
+ Reductions.push_back(&PHI);
else {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << "Failed to recognize PHI as an induction or reduction.\n");
return false;
}
@@ -735,12 +724,11 @@ bool LoopInterchangeLegality::findInductionAndReductions(
}
static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
- for (auto I = Block->begin(); isa<PHINode>(I); ++I) {
- PHINode *PHI = cast<PHINode>(I);
+ for (PHINode &PHI : Block->phis()) {
// A reduction LCSSA PHI will have only 1 incoming block, which is the loop latch.
- if (PHI->getNumIncomingValues() > 1)
+ if (PHI.getNumIncomingValues() > 1)
return false;
- Instruction *Ins = dyn_cast<Instruction>(PHI->getIncomingValue(0));
+ Instruction *Ins = dyn_cast<Instruction>(PHI.getIncomingValue(0));
if (!Ins)
return false;
// Incoming value for lcssa phi's in outer loop exit can only be inner loop
@@ -751,35 +739,38 @@ static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
return true;
}
-static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock,
- BasicBlock *LoopHeader) {
- if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) {
- assert(BI->getNumSuccessors() == 2 &&
- "Branch leaving loop latch must have 2 successors");
- for (BasicBlock *Succ : BI->successors()) {
- if (Succ == LoopHeader)
- continue;
- return Succ;
- }
- }
- return nullptr;
-}
-
// This function reports the transform's current limitations, because of which
// we do not proceed.
bool LoopInterchangeLegality::currentLimitations() {
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
- BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
- BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+
+ // The transform currently expects the loop latches to also be the
+ // exiting blocks.
+ if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
+ OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
+ !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
+ !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
+ LLVM_DEBUG(
+ dbgs() << "Loops where the latch is not the exiting block are not"
+ << " supported currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Loops where the latch is not the exiting block cannot be"
+ " interchange currently.";
+ });
+ return true;
+ }
PHINode *InnerInductionVar;
SmallVector<PHINode *, 8> Inductions;
SmallVector<PHINode *, 8> Reductions;
if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
- DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
- << "are supported currently.\n");
+ LLVM_DEBUG(
+ dbgs() << "Only inner loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
InnerLoop->getStartLoc(),
@@ -792,8 +783,9 @@ bool LoopInterchangeLegality::currentLimitations() {
// TODO: Currently we handle only loops with 1 induction variable.
if (Inductions.size() != 1) {
- DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
- << "Failed to interchange due to current limitation\n");
+ LLVM_DEBUG(
+ dbgs() << "We currently only support loops with 1 induction variable."
+ << "Failed to interchange due to current limitation\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
InnerLoop->getStartLoc(),
@@ -809,8 +801,9 @@ bool LoopInterchangeLegality::currentLimitations() {
InnerInductionVar = Inductions.pop_back_val();
Reductions.clear();
if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
- DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
- << "are supported currently.\n");
+ LLVM_DEBUG(
+ dbgs() << "Only outer loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
OuterLoop->getStartLoc(),
@@ -824,8 +817,8 @@ bool LoopInterchangeLegality::currentLimitations() {
// Outer loop cannot have reduction because then loops will not be tightly
// nested.
if (!Reductions.empty()) {
- DEBUG(dbgs() << "Outer loops with reductions are not supported "
- << "currently.\n");
+ LLVM_DEBUG(dbgs() << "Outer loops with reductions are not supported "
+ << "currently.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter",
OuterLoop->getStartLoc(),
@@ -837,8 +830,8 @@ bool LoopInterchangeLegality::currentLimitations() {
}
// TODO: Currently we handle only loops with 1 induction variable.
if (Inductions.size() != 1) {
- DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
- << "supported currently.\n");
+ LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
+ << "supported currently.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
OuterLoop->getStartLoc(),
@@ -851,7 +844,7 @@ bool LoopInterchangeLegality::currentLimitations() {
// TODO: Triangular loops are not handled for now.
if (!isLoopStructureUnderstood(InnerInductionVar)) {
- DEBUG(dbgs() << "Loop structure not understood by pass\n");
+ LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
InnerLoop->getStartLoc(),
@@ -862,23 +855,10 @@ bool LoopInterchangeLegality::currentLimitations() {
}
// TODO: We only handle LCSSA PHI's corresponding to reduction for now.
- BasicBlock *LoopExitBlock =
- getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
- if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) {
- DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuter",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Only outer loops with LCSSA PHIs can be interchange "
- "currently.";
- });
- return true;
- }
-
- LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
- if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) {
- DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
+ BasicBlock *InnerExit = InnerLoop->getExitBlock();
+ if (!containsSafePHI(InnerExit, false)) {
+ LLVM_DEBUG(
+ dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner",
InnerLoop->getStartLoc(),
@@ -908,8 +888,9 @@ bool LoopInterchangeLegality::currentLimitations() {
dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
if (!InnerIndexVarInc) {
- DEBUG(dbgs() << "Did not find an instruction to increment the induction "
- << "variable.\n");
+ LLVM_DEBUG(
+ dbgs() << "Did not find an instruction to increment the induction "
+ << "variable.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
InnerLoop->getStartLoc(),
@@ -924,7 +905,8 @@ bool LoopInterchangeLegality::currentLimitations() {
// instruction.
bool FoundInduction = false;
- for (const Instruction &I : llvm::reverse(*InnerLoopLatch)) {
+ for (const Instruction &I :
+ llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) {
if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
isa<ZExtInst>(I))
continue;
@@ -932,8 +914,8 @@ bool LoopInterchangeLegality::currentLimitations() {
// We found an instruction. If this is not induction variable then it is not
// safe to split this loop latch.
if (!I.isIdenticalTo(InnerIndexVarInc)) {
- DEBUG(dbgs() << "Found unsupported instructions between induction "
- << "variable increment and branch.\n");
+ LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction "
+ << "variable increment and branch.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(
DEBUG_TYPE, "UnsupportedInsBetweenInduction",
@@ -950,7 +932,7 @@ bool LoopInterchangeLegality::currentLimitations() {
// The loop latch ended and we didn't find the induction variable; return as
// a current limitation.
if (!FoundInduction) {
- DEBUG(dbgs() << "Did not find the induction variable.\n");
+ LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable",
InnerLoop->getStartLoc(),
@@ -962,13 +944,50 @@ bool LoopInterchangeLegality::currentLimitations() {
return false;
}
+// We currently support LCSSA PHI nodes in the outer loop exit, if their
+// incoming values do not come from the outer loop latch or if the
+// outer loop latch has a single predecessor. In that case, the value will
+// be available if both the inner and outer loop conditions are true, which
+// will still be true after interchanging. If we have multiple predecessors,
+// that may not be the case, e.g. because the outer loop latch may be executed
+// if the inner loop is not executed.
+static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
+ BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
+ for (PHINode &PHI : LoopNestExit->phis()) {
+ // FIXME: We currently are not able to detect floating point reductions
+ // and have to use floating point PHIs as a proxy to prevent
+ // interchanging in the presence of floating point reductions.
+ if (PHI.getType()->isFloatingPointTy())
+ return false;
+ for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
+ Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
+ if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
+ continue;
+
+ // The incoming value is defined in the outer loop latch. Currently we
+ // only support that in case the outer loop latch has a single predecessor.
+ // This guarantees that the outer loop latch is executed if and only if
+ // the inner loop is executed (because tightlyNested() guarantees that the
+ // outer loop header only branches to the inner loop or the outer loop
+ // latch).
+ // FIXME: We could weaken this logic and allow multiple predecessors,
+ // if the values are produced outside the loop latch. We would need
+ // additional logic to update the PHI nodes in the exit block as
+ // well.
+ if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
+ return false;
+ }
+ }
+ return true;
+}
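The values this function reasons about are results that escape the loop nest; under LCSSA each one reaches the outside world through a PHI in the exit block. A source-level sketch of the guarded case, an integer reduction leaving the nest (use() stands in for any consumer):

    int sum = 0;
    for (int i = 0; i < N; ++i)
      for (int j = 0; j < M; ++j)
        sum += A[i][j];
    use(sum); // in LCSSA form, 'sum' arrives here via a PHI in the nest's exit block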
+
bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
unsigned OuterLoopId,
CharMatrix &DepMatrix) {
if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
- DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
- << " and OuterLoopId = " << OuterLoopId
- << " due to dependence\n");
+ LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId
+ << " due to dependence\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
InnerLoop->getStartLoc(),
@@ -977,16 +996,23 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
});
return false;
}
-
// Check if outer and inner loop contain legal instructions only.
for (auto *BB : OuterLoop->blocks())
- for (Instruction &I : *BB)
+ for (Instruction &I : BB->instructionsWithoutDebug())
if (CallInst *CI = dyn_cast<CallInst>(&I)) {
// readnone functions do not prevent interchanging.
if (CI->doesNotReadMemory())
continue;
- DEBUG(dbgs() << "Loops with call instructions cannot be interchanged "
- << "safely.");
+ LLVM_DEBUG(
+ dbgs() << "Loops with call instructions cannot be interchanged "
+ << "safely.");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
+ CI->getDebugLoc(),
+ CI->getParent())
+ << "Cannot interchange loops due to call instruction.";
+ });
+
return false;
}
@@ -1015,13 +1041,13 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
// TODO: The loops could not be interchanged due to current limitations in the
// transform module.
if (currentLimitations()) {
- DEBUG(dbgs() << "Not legal because of current transform limitation\n");
+ LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
return false;
}
// Check if the loops are tightly nested.
if (!tightlyNested(OuterLoop, InnerLoop)) {
- DEBUG(dbgs() << "Loops not tightly nested\n");
+ LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
InnerLoop->getStartLoc(),
@@ -1032,6 +1058,17 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
return false;
}
+ if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
+ LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Found unsupported PHI node in loop exit.";
+ });
+ return false;
+ }
+
return true;
}
@@ -1100,7 +1137,8 @@ static bool isProfitableForVectorization(unsigned InnerLoopId,
}
// If outer loop has dependence and inner loop is loop independent then it is
// profitable to interchange to enable parallelism.
- return true;
+ // If there are no dependences, interchanging will not improve anything.
+ return !DepMatrix.empty();
}
bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
@@ -1115,7 +1153,7 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
// of induction variables in the instruction and allows reordering if number
// of bad orders is more than good.
int Cost = getInstrOrderCost();
- DEBUG(dbgs() << "Cost = " << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
if (Cost < -LoopInterchangeCostThreshold)
return true;
@@ -1138,33 +1176,88 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
Loop *InnerLoop) {
- for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E;
- ++I) {
- if (*I == InnerLoop) {
- OuterLoop->removeChildLoop(I);
+ for (Loop *L : *OuterLoop)
+ if (L == InnerLoop) {
+ OuterLoop->removeChildLoop(L);
return;
}
- }
llvm_unreachable("Couldn't find loop");
}
-void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop,
- Loop *OuterLoop) {
+/// Update LoopInfo after interchanging. NewInner and NewOuter refer to the
+/// new inner and outer loop after interchanging: NewInner is the original
+/// outer loop and NewOuter is the original inner loop.
+///
+/// Before interchanging, we have the following structure:
+///   Outer preheader
+///   Outer header
+///     Inner preheader
+///     Inner header
+///       Inner body
+///     Inner latch
+///     outer bbs
+///   Outer latch
+///
+/// After interchanging:
+///   Inner preheader
+///   Inner header
+///     Outer preheader
+///     Outer header
+///       Inner body
+///     outer bbs
+///     Outer latch
+///   Inner latch
+void LoopInterchangeTransform::restructureLoops(
+ Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
+ BasicBlock *OrigOuterPreHeader) {
Loop *OuterLoopParent = OuterLoop->getParentLoop();
+ // The original inner loop preheader moves from the new inner loop to
+ // the parent loop, if there is one.
+ NewInner->removeBlockFromLoop(OrigInnerPreHeader);
+ LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
+
+ // Switch the loop levels.
if (OuterLoopParent) {
// Remove the loop from its parent loop.
- removeChildLoop(OuterLoopParent, OuterLoop);
- removeChildLoop(OuterLoop, InnerLoop);
- OuterLoopParent->addChildLoop(InnerLoop);
+ removeChildLoop(OuterLoopParent, NewInner);
+ removeChildLoop(NewInner, NewOuter);
+ OuterLoopParent->addChildLoop(NewOuter);
} else {
- removeChildLoop(OuterLoop, InnerLoop);
- LI->changeTopLevelLoop(OuterLoop, InnerLoop);
+ removeChildLoop(NewInner, NewOuter);
+ LI->changeTopLevelLoop(NewInner, NewOuter);
+ }
+ while (!NewOuter->empty())
+ NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
+ NewOuter->addChildLoop(NewInner);
+
+ // BBs from the original inner loop.
+ SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
+
+ // Add BBs from the original outer loop to the original inner loop (excluding
+ // BBs already in the inner loop).
+ for (BasicBlock *BB : NewInner->blocks())
+ if (LI->getLoopFor(BB) == NewInner)
+ NewOuter->addBlockEntry(BB);
+
+ // Now remove the original inner loop's header and latch (they are the new
+ // outer loop's header and latch now) from the new inner loop, and move the
+ // remaining BBs (the loop body) to the new inner loop.
+ BasicBlock *OuterHeader = NewOuter->getHeader();
+ BasicBlock *OuterLatch = NewOuter->getLoopLatch();
+ for (BasicBlock *BB : OrigInnerBBs) {
+ // Nothing will change for BBs in child loops.
+ if (LI->getLoopFor(BB) != NewOuter)
+ continue;
+ // Remove the new outer loop header and latch from the new inner loop.
+ if (BB == OuterHeader || BB == OuterLatch)
+ NewInner->removeBlockFromLoop(BB);
+ else
+ LI->changeLoopFor(BB, NewInner);
}
- while (!InnerLoop->empty())
- OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin()));
-
- InnerLoop->addChildLoop(OuterLoop);
+ // The preheader of the original outer loop becomes part of the new
+ // outer loop.
+ NewOuter->addBlockEntry(OrigOuterPreHeader);
+ LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
}
bool LoopInterchangeTransform::transform() {
@@ -1173,10 +1266,10 @@ bool LoopInterchangeTransform::transform() {
if (InnerLoop->getSubLoops().empty()) {
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- DEBUG(dbgs() << "Calling Split Inner Loop\n");
+ LLVM_DEBUG(dbgs() << "Calling Split Inner Loop\n");
PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
if (!InductionPHI) {
- DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
+ LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
return false;
}
@@ -1185,8 +1278,7 @@ bool LoopInterchangeTransform::transform() {
else
InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
- // Ensure that InductionPHI is the first Phi node as required by
- // splitInnerLoopHeader
+ // Ensure that InductionPHI is the first Phi node.
if (&InductionPHI->getParent()->front() != InductionPHI)
InductionPHI->moveBefore(&InductionPHI->getParent()->front());
@@ -1194,20 +1286,20 @@ bool LoopInterchangeTransform::transform() {
// incremented/decremented.
// TODO: This splitting logic may not always work. Fix this.
splitInnerLoopLatch(InnerIndexVar);
- DEBUG(dbgs() << "splitInnerLoopLatch done\n");
+ LLVM_DEBUG(dbgs() << "splitInnerLoopLatch done\n");
// Splits the inner loops phi nodes out into a separate basic block.
- splitInnerLoopHeader();
- DEBUG(dbgs() << "splitInnerLoopHeader done\n");
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
+ LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
}
Transformed |= adjustLoopLinks();
if (!Transformed) {
- DEBUG(dbgs() << "adjustLoopLinks failed\n");
+ LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
return false;
}
- restructureLoops(InnerLoop, OuterLoop);
return true;
}
@@ -1217,38 +1309,6 @@ void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI);
}
-void LoopInterchangeTransform::splitInnerLoopHeader() {
- // Split the inner loop header out. Here make sure that the reduction PHI's
- // stay in the innerloop body.
- BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- if (InnerLoopHasReduction) {
- // Note: The induction PHI must be the first PHI for this to work
- BasicBlock *New = InnerLoopHeader->splitBasicBlock(
- ++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split");
- if (LI)
- if (Loop *L = LI->getLoopFor(InnerLoopHeader))
- L->addBasicBlockToLoop(New, *LI);
-
- // Adjust Reduction PHI's in the block.
- SmallVector<PHINode *, 8> PHIVec;
- for (auto I = New->begin(); isa<PHINode>(I); ++I) {
- PHINode *PHI = dyn_cast<PHINode>(I);
- Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader);
- PHI->replaceAllUsesWith(V);
- PHIVec.push_back((PHI));
- }
- for (PHINode *P : PHIVec) {
- P->eraseFromParent();
- }
- } else {
- SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
- }
-
- DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
- "InnerLoopHeader\n");
-}
-
/// \brief Move all instructions except the terminator from FromBB right before
/// InsertBefore
static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
@@ -1262,18 +1322,40 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
BasicBlock *OldPred,
BasicBlock *NewPred) {
- for (auto I = CurrBlock->begin(); isa<PHINode>(I); ++I) {
- PHINode *PHI = cast<PHINode>(I);
- unsigned Num = PHI->getNumIncomingValues();
+ for (PHINode &PHI : CurrBlock->phis()) {
+ unsigned Num = PHI.getNumIncomingValues();
for (unsigned i = 0; i < Num; ++i) {
- if (PHI->getIncomingBlock(i) == OldPred)
- PHI->setIncomingBlock(i, NewPred);
+ if (PHI.getIncomingBlock(i) == OldPred)
+ PHI.setIncomingBlock(i, NewPred);
+ }
+ }
+}
+
+/// Update BI to jump to NewBB instead of OldBB. Records updates to
+/// the dominator tree in DTUpdates, if DT should be preserved.
+static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
+ BasicBlock *NewBB,
+ std::vector<DominatorTree::UpdateType> &DTUpdates) {
+ assert(llvm::count_if(BI->successors(),
+ [OldBB](BasicBlock *BB) { return BB == OldBB; }) < 2 &&
+ "BI must jump to OldBB at most once.");
+ for (unsigned i = 0, e = BI->getNumSuccessors(); i < e; ++i) {
+ if (BI->getSuccessor(i) == OldBB) {
+ BI->setSuccessor(i, NewBB);
+
+ DTUpdates.push_back(
+ {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
+ DTUpdates.push_back(
+ {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
+ break;
}
}
}
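
A minimal sketch of the batched dominator-tree update pattern this helper relies on, using the same applyUpdates API invoked later in adjustLoopBranches(); From, OldSucc, and NewSucc are hypothetical basic blocks:

// Record edge changes instead of recomputing dominators after each one.
std::vector<DominatorTree::UpdateType> Updates;
Updates.push_back({DominatorTree::UpdateKind::Insert, From, NewSucc});
Updates.push_back({DominatorTree::UpdateKind::Delete, From, OldSucc});
// A single batched update is cheaper than per-edge recalculation.
DT->applyUpdates(Updates);
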
bool LoopInterchangeTransform::adjustLoopBranches() {
- DEBUG(dbgs() << "adjustLoopBranches called\n");
+ LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
+ std::vector<DominatorTree::UpdateType> DTUpdates;
+
// Adjust the loop preheader
BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
@@ -1313,27 +1395,18 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
return false;
// Adjust Loop Preheader and headers
-
- unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors();
- for (unsigned i = 0; i < NumSucc; ++i) {
- if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader)
- OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader);
- }
-
- NumSucc = OuterLoopHeaderBI->getNumSuccessors();
- for (unsigned i = 0; i < NumSucc; ++i) {
- if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)
- OuterLoopHeaderBI->setSuccessor(i, LoopExit);
- else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader)
- OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor);
- }
+ updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
+ InnerLoopPreHeader, DTUpdates);
+ updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates);
+ updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
+ InnerLoopHeaderSuccessor, DTUpdates);
// Adjust reduction PHI's now that the incoming block has changed.
updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader,
OuterLoopHeader);
- BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI);
- InnerLoopHeaderBI->eraseFromParent();
+ updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
+ OuterLoopPreHeader, DTUpdates);
// -------------Adjust loop latches-----------
if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
@@ -1341,19 +1414,15 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
else
InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
- NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors();
- for (unsigned i = 0; i < NumSucc; ++i) {
- if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch)
- InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor);
- }
+ updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
+ InnerLoopLatchSuccessor, DTUpdates);
// Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with
// the value and remove this PHI node from inner loop.
SmallVector<PHINode *, 8> LcssaVec;
- for (auto I = InnerLoopLatchSuccessor->begin(); isa<PHINode>(I); ++I) {
- PHINode *LcssaPhi = cast<PHINode>(I);
- LcssaVec.push_back(LcssaPhi);
- }
+ for (PHINode &P : InnerLoopLatchSuccessor->phis())
+ LcssaVec.push_back(&P);
+
for (PHINode *P : LcssaVec) {
Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch);
P->replaceAllUsesWith(Incoming);
@@ -1365,19 +1434,52 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
else
OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
- if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor)
- InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor);
- else
- InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor);
+ updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
+ OuterLoopLatchSuccessor, DTUpdates);
+ updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
+ DTUpdates);
updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch);
- if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor) {
- OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch);
- } else {
- OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch);
+ DT->applyUpdates(DTUpdates);
+ restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
+ OuterLoopPreHeader);
+
+ // Now update the reduction PHIs in the inner and outer loop headers.
+ SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
+ for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1))
+ InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
+ for (PHINode &PHI : drop_begin(OuterLoopHeader->phis(), 1))
+ OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
+
+ for (PHINode *PHI : OuterLoopPHIs)
+ PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
+
+ // Move the PHI nodes from the inner loop header to the outer loop header.
+  // We have to deal with one kind of PHI node:
+ // 1) PHI nodes that are part of inner loop-only reductions.
+ // We only have to move the PHI node and update the incoming blocks.
+ for (PHINode *PHI : InnerLoopPHIs) {
+ PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
+ for (BasicBlock *InBB : PHI->blocks()) {
+ if (InnerLoop->contains(InBB))
+ continue;
+
+ assert(!isa<PHINode>(PHI->getIncomingValueForBlock(InBB)) &&
+ "Unexpected incoming PHI node, reductions in outer loop are not "
+ "supported yet");
+ PHI->replaceAllUsesWith(PHI->getIncomingValueForBlock(InBB));
+ PHI->eraseFromParent();
+ break;
+ }
}
+ // Update the incoming blocks for moved PHI nodes.
+ updateIncomingBlock(OuterLoopHeader, InnerLoopPreHeader, OuterLoopPreHeader);
+ updateIncomingBlock(OuterLoopHeader, InnerLoopLatch, OuterLoopLatch);
+ updateIncomingBlock(InnerLoopHeader, OuterLoopPreHeader, InnerLoopPreHeader);
+ updateIncomingBlock(InnerLoopHeader, OuterLoopLatch, InnerLoopLatch);
+
return true;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index dfa5ec1f354d..19bd9ebcc15b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -25,7 +25,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -52,6 +52,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <algorithm>
#include <cassert>
@@ -79,7 +80,7 @@ STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
namespace {
-/// \brief Represent a store-to-forwarding candidate.
+/// Represent a store-to-forwarding candidate.
struct StoreToLoadForwardingCandidate {
LoadInst *Load;
StoreInst *Store;
@@ -87,7 +88,7 @@ struct StoreToLoadForwardingCandidate {
StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
: Load(Load), Store(Store) {}
- /// \brief Return true if the dependence from the store to the load has a
+ /// Return true if the dependence from the store to the load has a
/// distance of one. E.g. A[i+1] = A[i]
bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
Loop *L) const {
@@ -136,7 +137,7 @@ struct StoreToLoadForwardingCandidate {
} // end anonymous namespace
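
A hedged sketch of the pattern this file targets; the arrays, bound, and constant are invented:

int demo(int *A, const int *B, int N) {
  int Sum = 0;
  for (int i = 0; i < N; ++i) {
    A[i + 1] = B[i] + 2; // forwarding store, executed on iteration i
    Sum += A[i];         // load reads the value stored on iteration i - 1
  }
  // After the transformation, A[0] is loaded once in the preheader and
  // B[i] + 2 is carried across the backedge in a PHI, so the in-loop load
  // of A[i] disappears.
  return Sum;
}
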
-/// \brief Check if the store dominates all latches, so as long as there is no
+/// Check if the store dominates all latches, so as long as there is no
/// intervening store this value will be loaded in the next iteration.
static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
DominatorTree *DT) {
@@ -147,21 +148,21 @@ static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
});
}
-/// \brief Return true if the load is not executed on all paths in the loop.
+/// Return true if the load is not executed on all paths in the loop.
static bool isLoadConditional(LoadInst *Load, Loop *L) {
return Load->getParent() != L->getHeader();
}
namespace {
-/// \brief The per-loop class that does most of the work.
+/// The per-loop class that does most of the work.
class LoadEliminationForLoop {
public:
LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
DominatorTree *DT)
: L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.getPSE()) {}
- /// \brief Look through the loop-carried and loop-independent dependences in
+ /// Look through the loop-carried and loop-independent dependences in
/// this loop and find store->load dependences.
///
/// Note that no candidate is returned if LAA has failed to analyze the loop
@@ -178,7 +179,7 @@ public:
// forward and backward dependences qualify. Disqualify loads that have
// other unknown dependences.
- SmallSet<Instruction *, 4> LoadsWithUnknownDepedence;
+ SmallPtrSet<Instruction *, 4> LoadsWithUnknownDepedence;
for (const auto &Dep : *Deps) {
Instruction *Source = Dep.getSource(LAI);
@@ -222,14 +223,14 @@ public:
return Candidates;
}
- /// \brief Return the index of the instruction according to program order.
+ /// Return the index of the instruction according to program order.
unsigned getInstrIndex(Instruction *Inst) {
auto I = InstOrder.find(Inst);
assert(I != InstOrder.end() && "No index for instruction");
return I->second;
}
- /// \brief If a load has multiple candidates associated (i.e. different
+ /// If a load has multiple candidates associated (i.e. different
/// stores), it means that it could be forwarding from multiple stores
/// depending on control flow. Remove these candidates.
///
@@ -284,22 +285,24 @@ public:
Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) {
if (LoadToSingleCand[Cand.Load] != &Cand) {
- DEBUG(dbgs() << "Removing from candidates: \n" << Cand
- << " The load may have multiple stores forwarding to "
- << "it\n");
+ LLVM_DEBUG(
+ dbgs() << "Removing from candidates: \n"
+ << Cand
+ << " The load may have multiple stores forwarding to "
+ << "it\n");
return true;
}
return false;
});
}
- /// \brief Given two pointers operations by their RuntimePointerChecking
+  /// Given two pointer operations identified by their RuntimePointerChecking
/// indices, return true if they require an alias check.
///
/// We need a check if one is a pointer for a candidate load and the other is
/// a pointer for a possibly intervening store.
bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
- const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath,
+ const SmallPtrSet<Value *, 4> &PtrsWrittenOnFwdingPath,
const std::set<Value *> &CandLoadPtrs) {
Value *Ptr1 =
LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
@@ -309,11 +312,11 @@ public:
(PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1)));
}
- /// \brief Return pointers that are possibly written to on the path from a
+ /// Return pointers that are possibly written to on the path from a
/// forwarding store to a load.
///
/// These pointers need to be alias-checked against the forwarding candidates.
- SmallSet<Value *, 4> findPointersWrittenOnForwardingPath(
+ SmallPtrSet<Value *, 4> findPointersWrittenOnForwardingPath(
const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
// From FirstStore to LastLoad neither of the elimination candidate loads
// should overlap with any of the stores.
@@ -351,7 +354,7 @@ public:
// We're looking for stores after the first forwarding store until the end
// of the loop, then from the beginning of the loop until the last
 // forwarded-to load. Collect the pointers of those stores.
- SmallSet<Value *, 4> PtrsWrittenOnFwdingPath;
+ SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath;
auto InsertStorePtr = [&](Instruction *I) {
if (auto *S = dyn_cast<StoreInst>(I))
@@ -366,16 +369,16 @@ public:
return PtrsWrittenOnFwdingPath;
}
- /// \brief Determine the pointer alias checks to prove that there are no
+ /// Determine the pointer alias checks to prove that there are no
/// intervening stores.
SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks(
const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
- SmallSet<Value *, 4> PtrsWrittenOnFwdingPath =
+ SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath =
findPointersWrittenOnForwardingPath(Candidates);
// Collect the pointers of the candidate loads.
- // FIXME: SmallSet does not work with std::inserter.
+ // FIXME: SmallPtrSet does not work with std::inserter.
std::set<Value *> CandLoadPtrs;
transform(Candidates,
std::inserter(CandLoadPtrs, CandLoadPtrs.begin()),
@@ -394,13 +397,14 @@ public:
return false;
});
- DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n");
- DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+ LLVM_DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size()
+ << "):\n");
+ LLVM_DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
return Checks;
}
- /// \brief Perform the transformation for a candidate.
+ /// Perform the transformation for a candidate.
void
propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
SCEVExpander &SEE) {
@@ -436,11 +440,11 @@ public:
Cand.Load->replaceAllUsesWith(PHI);
}
- /// \brief Top-level driver for each loop: find store->load forwarding
+ /// Top-level driver for each loop: find store->load forwarding
/// candidates, add run-time checks and perform transformation.
bool processLoop() {
- DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
- << "\" checking " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
+ << "\" checking " << *L << "\n");
// Look for store-to-load forwarding cases across the
// backedge. E.g.:
@@ -479,7 +483,7 @@ public:
SmallVector<StoreToLoadForwardingCandidate, 4> Candidates;
unsigned NumForwarding = 0;
for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) {
- DEBUG(dbgs() << "Candidate " << Cand);
+ LLVM_DEBUG(dbgs() << "Candidate " << Cand);
 // Make sure that the stored value is available everywhere in the loop in
// the next iteration.
@@ -498,9 +502,10 @@ public:
continue;
++NumForwarding;
- DEBUG(dbgs()
- << NumForwarding
- << ". Valid store-to-load forwarding across the loop backedge\n");
+ LLVM_DEBUG(
+ dbgs()
+ << NumForwarding
+ << ". Valid store-to-load forwarding across the loop backedge\n");
Candidates.push_back(Cand);
}
if (Candidates.empty())
@@ -513,25 +518,26 @@ public:
// Too many checks are likely to outweigh the benefits of forwarding.
if (Checks.size() > Candidates.size() * CheckPerElim) {
- DEBUG(dbgs() << "Too many run-time checks needed.\n");
+ LLVM_DEBUG(dbgs() << "Too many run-time checks needed.\n");
return false;
}
if (LAI.getPSE().getUnionPredicate().getComplexity() >
LoadElimSCEVCheckThreshold) {
- DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
+ LLVM_DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
return false;
}
if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
if (L->getHeader()->getParent()->optForSize()) {
- DEBUG(dbgs() << "Versioning is needed but not allowed when optimizing "
- "for size.\n");
+ LLVM_DEBUG(
+ dbgs() << "Versioning is needed but not allowed when optimizing "
+ "for size.\n");
return false;
}
if (!L->isLoopSimplifyForm()) {
- DEBUG(dbgs() << "Loop is not is loop-simplify form");
+ LLVM_DEBUG(dbgs() << "Loop is not is loop-simplify form");
return false;
}
@@ -558,7 +564,7 @@ public:
private:
Loop *L;
- /// \brief Maps the load/store instructions to their index according to
+ /// Maps the load/store instructions to their index according to
/// program order.
DenseMap<Instruction *, unsigned> InstOrder;
@@ -599,7 +605,7 @@ eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
namespace {
-/// \brief The pass. Most of the work is delegated to the per-loop
+/// The pass. Most of the work is delegated to the per-loop
/// LoadEliminationForLoop class.
class LoopLoadElimination : public FunctionPass {
public:
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
index 2e4c7b19e476..561ceea1d880 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -155,7 +155,7 @@
// When S = -1 (i.e. reverse iterating loop), the transformation is supported
// when:
// * The loop has a single latch with the condition of the form:
-// B(X) = X <pred> latchLimit, where <pred> is u> or s>.
+// B(X) = X <pred> latchLimit, where <pred> is u>, u>=, s>, or s>=.
// * The guard condition is of the form
// G(X) = X - 1 u< guardLimit
//
@@ -171,9 +171,14 @@
// guardStart u< guardLimit && latchLimit u>= 1.
// Similarly for sgt condition the widened condition is:
// guardStart u< guardLimit && latchLimit s>= 1.
+// For uge condition the widened condition is:
+// guardStart u< guardLimit && latchLimit u> 1.
+// For sge condition the widened condition is:
+// guardStart u< guardLimit && latchLimit s> 1.
//===----------------------------------------------------------------------===//
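
As a hedged numeric instance of the new uge rule (latchLimit fixed to 3 purely for illustration):

// Count-down loop with step S = -1 and a uge latch:
//   while (X u>= 3) {               // B(X) = X u>= latchLimit, latchLimit = 3
//     guard((X - 1) u< guardLimit); // G(X)
//     --X;
//   }
// The widened, loop-invariant condition placed in the preheader is
//   guardStart u< guardLimit && 3 u> 1
// i.e. the "latchLimit u> 1" clause above with latchLimit = 3.
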
#include "llvm/Transforms/Scalar/LoopPredication.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -198,6 +203,20 @@ static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation",
static cl::opt<bool> EnableCountDownLoop("loop-predication-enable-count-down-loop",
cl::Hidden, cl::init(true));
+
+static cl::opt<bool>
+ SkipProfitabilityChecks("loop-predication-skip-profitability-checks",
+ cl::Hidden, cl::init(false));
+
+// This is the scale factor for the latch probability. We use this during
+// profitability analysis to find other exiting blocks that have a much higher
+// probability of exiting the loop than the latch does.
+// This value should be greater than 1 for a sane profitability check.
+static cl::opt<float> LatchExitProbabilityScale(
+ "loop-predication-latch-probability-scale", cl::Hidden, cl::init(2.0),
+ cl::desc("scale factor for the latch probability. Value should be greater "
+ "than 1. Lower values are ignored"));
+
namespace {
class LoopPredication {
/// Represents an induction variable check:
@@ -217,6 +236,7 @@ class LoopPredication {
};
ScalarEvolution *SE;
+ BranchProbabilityInfo *BPI;
Loop *L;
const DataLayout *DL;
@@ -250,6 +270,12 @@ class LoopPredication {
IRBuilder<> &Builder);
bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
+ // If the loop always exits through another block in the loop, we should not
+ // predicate based on the latch check. For example, the latch check can be a
+  // very coarse-grained check and there can be more fine-grained exit checks
+ // within the loop. We identify such unprofitable loops through BPI.
+ bool isLoopProfitableToPredicate();
+
// When the IV type is wider than the range operand type, we can still do loop
// predication, by generating SCEVs for the range and latch that are of the
// same type. We achieve this by generating a SCEV truncate expression for the
@@ -266,8 +292,10 @@ class LoopPredication {
// Return the loopLatchCheck corresponding to the RangeCheckType if safe to do
// so.
Optional<LoopICmp> generateLoopLatchCheck(Type *RangeCheckType);
+
public:
- LoopPredication(ScalarEvolution *SE) : SE(SE){};
+ LoopPredication(ScalarEvolution *SE, BranchProbabilityInfo *BPI)
+ : SE(SE), BPI(BPI){};
bool runOnLoop(Loop *L);
};
@@ -279,6 +307,7 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
getLoopAnalysisUsage(AU);
}
@@ -286,7 +315,9 @@ public:
if (skipLoop(L))
return false;
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- LoopPredication LP(SE);
+ BranchProbabilityInfo &BPI =
+ getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ LoopPredication LP(SE, &BPI);
return LP.runOnLoop(L);
}
};
@@ -296,6 +327,7 @@ char LoopPredicationLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication",
"Loop predication", false, false)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_END(LoopPredicationLegacyPass, "loop-predication",
"Loop predication", false, false)
@@ -307,7 +339,11 @@ Pass *llvm::createLoopPredicationPass() {
PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &U) {
- LoopPredication LP(&AR.SE);
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+ Function *F = L.getHeader()->getParent();
+ auto *BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(*F);
+ LoopPredication LP(&AR.SE, BPI);
if (!LP.runOnLoop(&L))
return PreservedAnalyses::all();
@@ -375,11 +411,11 @@ LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) {
if (!NewLatchCheck.IV)
return None;
NewLatchCheck.Limit = SE->getTruncateExpr(LatchCheck.Limit, RangeCheckType);
- DEBUG(dbgs() << "IV of type: " << *LatchType
- << "can be represented as range check type:" << *RangeCheckType
- << "\n");
- DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
- DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
+ LLVM_DEBUG(dbgs() << "IV of type: " << *LatchType
+ << "can be represented as range check type:"
+ << *RangeCheckType << "\n");
+ LLVM_DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
+ LLVM_DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
return NewLatchCheck;
}
@@ -412,30 +448,15 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));
if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
!CanExpand(LatchLimit) || !CanExpand(RHS)) {
- DEBUG(dbgs() << "Can't expand limit check!\n");
+ LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
return None;
}
- ICmpInst::Predicate LimitCheckPred;
- switch (LatchCheck.Pred) {
- case ICmpInst::ICMP_ULT:
- LimitCheckPred = ICmpInst::ICMP_ULE;
- break;
- case ICmpInst::ICMP_ULE:
- LimitCheckPred = ICmpInst::ICMP_ULT;
- break;
- case ICmpInst::ICMP_SLT:
- LimitCheckPred = ICmpInst::ICMP_SLE;
- break;
- case ICmpInst::ICMP_SLE:
- LimitCheckPred = ICmpInst::ICMP_SLT;
- break;
- default:
- llvm_unreachable("Unsupported loop latch!");
- }
+ auto LimitCheckPred =
+ ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred);
- DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
- DEBUG(dbgs() << "RHS: " << *RHS << "\n");
- DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
+ LLVM_DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
+ LLVM_DEBUG(dbgs() << "RHS: " << *RHS << "\n");
+ LLVM_DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
Instruction *InsertAt = Preheader->getTerminator();
auto *LimitCheck =
@@ -454,16 +475,16 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
const SCEV *LatchLimit = LatchCheck.Limit;
if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
!CanExpand(LatchLimit)) {
- DEBUG(dbgs() << "Can't expand limit check!\n");
+ LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
return None;
}
// The decrement of the latch check IV should be the same as the
// rangeCheckIV.
auto *PostDecLatchCheckIV = LatchCheck.IV->getPostIncExpr(*SE);
if (RangeCheck.IV != PostDecLatchCheckIV) {
- DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: "
- << *PostDecLatchCheckIV
- << " and RangeCheckIV: " << *RangeCheck.IV << "\n");
+ LLVM_DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: "
+ << *PostDecLatchCheckIV
+ << " and RangeCheckIV: " << *RangeCheck.IV << "\n");
return None;
}
@@ -472,9 +493,8 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
// latchLimit <pred> 1.
// See the header comment for reasoning of the checks.
Instruction *InsertAt = Preheader->getTerminator();
- auto LimitCheckPred = ICmpInst::isSigned(LatchCheck.Pred)
- ? ICmpInst::ICMP_SGE
- : ICmpInst::ICMP_UGE;
+ auto LimitCheckPred =
+ ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred);
auto *FirstIterationCheck = expandCheck(Expander, Builder, ICmpInst::ICMP_ULT,
GuardStart, GuardLimit, InsertAt);
auto *LimitCheck = expandCheck(Expander, Builder, LimitCheckPred, LatchLimit,
@@ -488,8 +508,8 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
SCEVExpander &Expander,
IRBuilder<> &Builder) {
- DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
- DEBUG(ICI->dump());
+ LLVM_DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
+ LLVM_DEBUG(ICI->dump());
// parseLoopStructure guarantees that the latch condition is:
// ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=.
@@ -497,34 +517,34 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
// i u< guardLimit
auto RangeCheck = parseLoopICmp(ICI);
if (!RangeCheck) {
- DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
return None;
}
- DEBUG(dbgs() << "Guard check:\n");
- DEBUG(RangeCheck->dump());
+ LLVM_DEBUG(dbgs() << "Guard check:\n");
+ LLVM_DEBUG(RangeCheck->dump());
if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
- DEBUG(dbgs() << "Unsupported range check predicate(" << RangeCheck->Pred
- << ")!\n");
+ LLVM_DEBUG(dbgs() << "Unsupported range check predicate("
+ << RangeCheck->Pred << ")!\n");
return None;
}
auto *RangeCheckIV = RangeCheck->IV;
if (!RangeCheckIV->isAffine()) {
- DEBUG(dbgs() << "Range check IV is not affine!\n");
+ LLVM_DEBUG(dbgs() << "Range check IV is not affine!\n");
return None;
}
auto *Step = RangeCheckIV->getStepRecurrence(*SE);
// We cannot just compare with latch IV step because the latch and range IVs
// may have different types.
if (!isSupportedStep(Step)) {
- DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
+ LLVM_DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
return None;
}
auto *Ty = RangeCheckIV->getType();
auto CurrLatchCheckOpt = generateLoopLatchCheck(Ty);
if (!CurrLatchCheckOpt) {
- DEBUG(dbgs() << "Failed to generate a loop latch check "
- "corresponding to range type: "
- << *Ty << "\n");
+ LLVM_DEBUG(dbgs() << "Failed to generate a loop latch check "
+ "corresponding to range type: "
+ << *Ty << "\n");
return None;
}
@@ -535,7 +555,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
CurrLatchCheck.IV->getStepRecurrence(*SE)->getType() &&
"Range and latch steps should be of same type!");
if (Step != CurrLatchCheck.IV->getStepRecurrence(*SE)) {
- DEBUG(dbgs() << "Range and latch have different step values!\n");
+ LLVM_DEBUG(dbgs() << "Range and latch have different step values!\n");
return None;
}
@@ -551,14 +571,14 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
SCEVExpander &Expander) {
- DEBUG(dbgs() << "Processing guard:\n");
- DEBUG(Guard->dump());
+ LLVM_DEBUG(dbgs() << "Processing guard:\n");
+ LLVM_DEBUG(Guard->dump());
IRBuilder<> Builder(cast<Instruction>(Preheader->getTerminator()));
// The guard condition is expected to be in form of:
// cond1 && cond2 && cond3 ...
- // Iterate over subconditions looking for for icmp conditions which can be
+ // Iterate over subconditions looking for icmp conditions which can be
// widened across loop iterations. Widening these conditions remember the
// resulting list of subconditions in Checks vector.
SmallVector<Value *, 4> Worklist(1, Guard->getOperand(0));
@@ -605,7 +625,7 @@ bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
LastCheck = Builder.CreateAnd(LastCheck, Check);
Guard->setOperand(0, LastCheck);
- DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
+ LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
return true;
}
@@ -614,7 +634,7 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
BasicBlock *LoopLatch = L->getLoopLatch();
if (!LoopLatch) {
- DEBUG(dbgs() << "The loop doesn't have a single latch!\n");
+ LLVM_DEBUG(dbgs() << "The loop doesn't have a single latch!\n");
return None;
}
@@ -625,7 +645,7 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
if (!match(LoopLatch->getTerminator(),
m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), TrueDest,
FalseDest))) {
- DEBUG(dbgs() << "Failed to match the latch terminator!\n");
+ LLVM_DEBUG(dbgs() << "Failed to match the latch terminator!\n");
return None;
}
assert((TrueDest == L->getHeader() || FalseDest == L->getHeader()) &&
@@ -635,20 +655,20 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
auto Result = parseLoopICmp(Pred, LHS, RHS);
if (!Result) {
- DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
return None;
}
// Check affine first, so if it's not we don't try to compute the step
// recurrence.
if (!Result->IV->isAffine()) {
- DEBUG(dbgs() << "The induction variable is not affine!\n");
+ LLVM_DEBUG(dbgs() << "The induction variable is not affine!\n");
return None;
}
auto *Step = Result->IV->getStepRecurrence(*SE);
if (!isSupportedStep(Step)) {
- DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
+ LLVM_DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
return None;
}
@@ -658,13 +678,14 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE;
} else {
assert(Step->isAllOnesValue() && "Step should be -1!");
- return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT;
+ return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT &&
+ Pred != ICmpInst::ICMP_UGE && Pred != ICmpInst::ICMP_SGE;
}
};
if (IsUnsupportedPredicate(Step, Result->Pred)) {
- DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
- << ")!\n");
+ LLVM_DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
+ << ")!\n");
return None;
}
return Result;
@@ -700,11 +721,65 @@ bool LoopPredication::isSafeToTruncateWideIVType(Type *RangeCheckType) {
Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize;
}
+bool LoopPredication::isLoopProfitableToPredicate() {
+ if (SkipProfitabilityChecks || !BPI)
+ return true;
+
+ SmallVector<std::pair<const BasicBlock *, const BasicBlock *>, 8> ExitEdges;
+ L->getExitEdges(ExitEdges);
+ // If there is only one exiting edge in the loop, it is always profitable to
+ // predicate the loop.
+ if (ExitEdges.size() == 1)
+ return true;
+
+ // Calculate the exiting probabilities of all exiting edges from the loop,
+ // starting with the LatchExitProbability.
+  // Heuristic for profitability: if any exiting block's probability of
+  // exiting the loop is larger than that of exiting through the latch block,
+  // it's not profitable to predicate the loop.
+ auto *LatchBlock = L->getLoopLatch();
+ assert(LatchBlock && "Should have a single latch at this point!");
+ auto *LatchTerm = LatchBlock->getTerminator();
+ assert(LatchTerm->getNumSuccessors() == 2 &&
+ "expected to be an exiting block with 2 succs!");
+ unsigned LatchBrExitIdx =
+ LatchTerm->getSuccessor(0) == L->getHeader() ? 1 : 0;
+ BranchProbability LatchExitProbability =
+ BPI->getEdgeProbability(LatchBlock, LatchBrExitIdx);
+
+ // Protect against degenerate inputs provided by the user. Providing a value
+  // less than one can invert the definition of profitable loop predication.
+ float ScaleFactor = LatchExitProbabilityScale;
+ if (ScaleFactor < 1) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Ignored user setting for loop-predication-latch-probability-scale: "
+ << LatchExitProbabilityScale << "\n");
+ LLVM_DEBUG(dbgs() << "The value is set to 1.0\n");
+ ScaleFactor = 1.0;
+ }
+ const auto LatchProbabilityThreshold =
+ LatchExitProbability * ScaleFactor;
+
+ for (const auto &ExitEdge : ExitEdges) {
+ BranchProbability ExitingBlockProbability =
+ BPI->getEdgeProbability(ExitEdge.first, ExitEdge.second);
+ // Some exiting edge has higher probability than the latch exiting edge.
+ // No longer profitable to predicate.
+ if (ExitingBlockProbability > LatchProbabilityThreshold)
+ return false;
+ }
+ // Using BPI, we have concluded that the most probable way to exit from the
+ // loop is through the latch (or there's no profile information and all
+ // exits are equally likely).
+ return true;
+}
+
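
A standalone sketch of the heuristic above, using plain floats in place of BranchProbability; the function name and all values are invented:

#include <vector>

bool profitableToPredicate(float LatchExitProb,
                           const std::vector<float> &ExitProbs,
                           float Scale = 2.0f) {
  // Clamp degenerate user input, mirroring the ScaleFactor check above.
  if (Scale < 1.0f)
    Scale = 1.0f;
  const float Threshold = LatchExitProb * Scale;
  for (float P : ExitProbs)
    if (P > Threshold)
      return false; // another exit dominates; the latch check is too coarse
  return true;      // the latch is the most likely way out of the loop
}

With the default scale of 2.0, a latch-exit probability of 0.2 yields a threshold of 0.4, so an early exit taken with probability 0.5 makes predication unprofitable.
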
bool LoopPredication::runOnLoop(Loop *Loop) {
L = Loop;
- DEBUG(dbgs() << "Analyzing ");
- DEBUG(L->dump());
+ LLVM_DEBUG(dbgs() << "Analyzing ");
+ LLVM_DEBUG(L->dump());
Module *M = L->getHeader()->getModule();
@@ -725,9 +800,13 @@ bool LoopPredication::runOnLoop(Loop *Loop) {
return false;
LatchCheck = *LatchCheckOpt;
- DEBUG(dbgs() << "Latch check:\n");
- DEBUG(LatchCheck.dump());
+ LLVM_DEBUG(dbgs() << "Latch check:\n");
+ LLVM_DEBUG(LatchCheck.dump());
+ if (!isLoopProfitableToPredicate()) {
+ LLVM_DEBUG(dbgs() << "Loop not profitable to predicate!\n");
+ return false;
+ }
// Collect all the guards into a vector and process later, so as not
// to invalidate the instruction iterator.
SmallVector<IntrinsicInst *, 4> Guards;
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
index d1a54b877950..9a99e5925572 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -17,7 +17,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -28,6 +28,7 @@
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -51,8 +52,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <cassert>
#include <cstddef>
@@ -69,10 +70,6 @@ using namespace llvm;
STATISTIC(NumRerolledLoops, "Number of rerolled loops");
static cl::opt<unsigned>
-MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
- cl::desc("The maximum increment for loop rerolling"));
-
-static cl::opt<unsigned>
NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
cl::Hidden,
cl::desc("The maximum number of failures to tolerate"
@@ -188,7 +185,7 @@ namespace {
bool PreserveLCSSA;
using SmallInstructionVector = SmallVector<Instruction *, 16>;
- using SmallInstructionSet = SmallSet<Instruction *, 16>;
+ using SmallInstructionSet = SmallPtrSet<Instruction *, 16>;
// Map between induction variable and its increment
DenseMap<Instruction *, int64_t> IVToIncMap;
@@ -397,8 +394,8 @@ namespace {
/// Stage 3: Assuming validate() returned true, perform the
/// replacement.
- /// @param IterCount The maximum iteration count of L.
- void replace(const SCEV *IterCount);
+ /// @param BackedgeTakenCount The backedge-taken count of L.
+ void replace(const SCEV *BackedgeTakenCount);
protected:
using UsesTy = MapVector<Instruction *, BitVector>;
@@ -428,8 +425,7 @@ namespace {
bool instrDependsOn(Instruction *I,
UsesTy::iterator Start,
UsesTy::iterator End);
- void replaceIV(Instruction *Inst, Instruction *IV, const SCEV *IterCount);
- void updateNonLoopCtrlIncr();
+ void replaceIV(DAGRootSet &DRS, const SCEV *Start, const SCEV *IncrExpr);
LoopReroll *Parent;
@@ -482,8 +478,8 @@ namespace {
void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
void collectPossibleReductions(Loop *L,
ReductionTracker &Reductions);
- bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
- ReductionTracker &Reductions);
+ bool reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+ const SCEV *BackedgeTakenCount, ReductionTracker &Reductions);
};
} // end anonymous namespace
@@ -510,48 +506,6 @@ static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
return false;
}
-static const SCEVConstant *getIncrmentFactorSCEV(ScalarEvolution *SE,
- const SCEV *SCEVExpr,
- Instruction &IV) {
- const SCEVMulExpr *MulSCEV = dyn_cast<SCEVMulExpr>(SCEVExpr);
-
- // If StepRecurrence of a SCEVExpr is a constant (c1 * c2, c2 = sizeof(ptr)),
- // Return c1.
- if (!MulSCEV && IV.getType()->isPointerTy())
- if (const SCEVConstant *IncSCEV = dyn_cast<SCEVConstant>(SCEVExpr)) {
- const PointerType *PTy = cast<PointerType>(IV.getType());
- Type *ElTy = PTy->getElementType();
- const SCEV *SizeOfExpr =
- SE->getSizeOfExpr(SE->getEffectiveSCEVType(IV.getType()), ElTy);
- if (IncSCEV->getValue()->getValue().isNegative()) {
- const SCEV *NewSCEV =
- SE->getUDivExpr(SE->getNegativeSCEV(SCEVExpr), SizeOfExpr);
- return dyn_cast<SCEVConstant>(SE->getNegativeSCEV(NewSCEV));
- } else {
- return dyn_cast<SCEVConstant>(SE->getUDivExpr(SCEVExpr, SizeOfExpr));
- }
- }
-
- if (!MulSCEV)
- return nullptr;
-
- // If StepRecurrence of a SCEVExpr is a c * sizeof(x), where c is constant,
- // Return c.
- const SCEVConstant *CIncSCEV = nullptr;
- for (const SCEV *Operand : MulSCEV->operands()) {
- if (const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Operand)) {
- CIncSCEV = Constant;
- } else if (const SCEVUnknown *Unknown = dyn_cast<SCEVUnknown>(Operand)) {
- Type *AllocTy;
- if (!Unknown->isSizeOf(AllocTy))
- break;
- } else {
- return nullptr;
- }
- }
- return CIncSCEV;
-}
-
// Check if an IV is only used to control the loop. There are two cases:
 // 1. It only has one use, which is the loop increment, and the increment is only
 // used by the comparison and the PHI (could have a sext with nsw in between), and the
@@ -632,25 +586,17 @@ void LoopReroll::collectPossibleIVs(Loop *L,
continue;
if (!PHISCEV->isAffine())
continue;
- const SCEVConstant *IncSCEV = nullptr;
- if (I->getType()->isPointerTy())
- IncSCEV =
- getIncrmentFactorSCEV(SE, PHISCEV->getStepRecurrence(*SE), *I);
- else
- IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
+ auto IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
if (IncSCEV) {
- const APInt &AInt = IncSCEV->getValue()->getValue().abs();
- if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc))
- continue;
IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue();
- DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
- << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
+ << "\n");
if (isLoopControlIV(L, &*I)) {
assert(!LoopControlIV && "Found two loop control only IV");
LoopControlIV = &(*I);
- DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I << " = "
- << *PHISCEV << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I
+ << " = " << *PHISCEV << "\n");
} else
PossibleIVs.push_back(&*I);
}
@@ -717,8 +663,8 @@ void LoopReroll::collectPossibleReductions(Loop *L,
if (!SLR.valid())
continue;
- DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " <<
- SLR.size() << " chained instructions)\n");
+ LLVM_DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with "
+ << SLR.size() << " chained instructions)\n");
Reductions.addSLR(SLR);
}
}
@@ -856,7 +802,8 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
BaseUsers.push_back(II);
continue;
} else {
- DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I
+ << "\n");
return false;
}
}
@@ -878,7 +825,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
// away.
if (BaseUsers.size()) {
if (Roots.find(0) != Roots.end()) {
- DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
+ LLVM_DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
return false;
}
Roots[0] = Base;
@@ -894,9 +841,9 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
if (KV.first == 0)
continue;
if (!KV.second->hasNUses(NumBaseUses)) {
- DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
- << "#Base=" << NumBaseUses << ", #Root=" <<
- KV.second->getNumUses() << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
+ << "#Base=" << NumBaseUses
+ << ", #Root=" << KV.second->getNumUses() << "\n");
return false;
}
}
@@ -1024,13 +971,14 @@ bool LoopReroll::DAGRootTracker::findRoots() {
// Ensure all sets have the same size.
if (RootSets.empty()) {
- DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
+ LLVM_DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
return false;
}
for (auto &V : RootSets) {
if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
- DEBUG(dbgs()
- << "LRR: Aborting because not all root sets have the same size\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "LRR: Aborting because not all root sets have the same size\n");
return false;
}
}
@@ -1038,13 +986,14 @@ bool LoopReroll::DAGRootTracker::findRoots() {
Scale = RootSets[0].Roots.size() + 1;
if (Scale > IL_MaxRerollIterations) {
- DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
- << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations
- << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
+ << "#Found=" << Scale
+ << ", #Max=" << IL_MaxRerollIterations << "\n");
return false;
}
- DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale
+ << "\n");
return true;
}
@@ -1078,7 +1027,7 @@ bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &Po
// While we're here, check the use sets are the same size.
if (V.size() != VBase.size()) {
- DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
+ LLVM_DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
return false;
}
@@ -1235,17 +1184,17 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
// set.
for (auto &KV : Uses) {
if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {
- DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
- << *KV.first << " (#uses=" << KV.second.count() << ")\n");
+ LLVM_DEBUG(
+ dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
+ << *KV.first << " (#uses=" << KV.second.count() << ")\n");
return false;
}
}
- DEBUG(
- for (auto &KV : Uses) {
- dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
- }
- );
+ LLVM_DEBUG(for (auto &KV
+ : Uses) {
+ dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
+ });
for (unsigned Iter = 1; Iter < Scale; ++Iter) {
// In addition to regular aliasing information, we need to look for
@@ -1304,8 +1253,8 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
if (TryIt == Uses.end() || TryIt == RootIt ||
instrDependsOn(TryIt->first, RootIt, TryIt)) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
- " vs. " << *RootInst << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
+ << *BaseInst << " vs. " << *RootInst << "\n");
return false;
}
@@ -1341,8 +1290,8 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
// root instruction, does not also belong to the base set or the set of
// some other root instruction.
if (RootIt->second.count() > 1) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
- " vs. " << *RootInst << " (prev. case overlap)\n");
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (prev. case overlap)\n");
return false;
}
@@ -1352,8 +1301,9 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
if (RootInst->mayReadFromMemory())
for (auto &K : AST) {
if (K.aliasesUnknownInst(RootInst, *AA)) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
- " vs. " << *RootInst << " (depends on future store)\n");
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
+ << *BaseInst << " vs. " << *RootInst
+ << " (depends on future store)\n");
return false;
}
}
@@ -1366,9 +1316,9 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
!isSafeToSpeculativelyExecute(BaseInst)) ||
(!isUnorderedLoadStore(RootInst) &&
!isSafeToSpeculativelyExecute(RootInst)))) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
- " vs. " << *RootInst <<
- " (side effects prevent reordering)\n");
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst
+ << " (side effects prevent reordering)\n");
return false;
}
@@ -1419,8 +1369,9 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
BaseInst->getOperand(!j) == Op2) {
Swapped = true;
} else {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
- << " vs. " << *RootInst << " (operand " << j << ")\n");
+ LLVM_DEBUG(dbgs()
+ << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (operand " << j << ")\n");
return false;
}
}
@@ -1433,8 +1384,8 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
hasUsesOutsideLoop(BaseInst, L)) ||
(!PossibleRedLastSet.count(RootInst) &&
hasUsesOutsideLoop(RootInst, L))) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
- " vs. " << *RootInst << " (uses outside loop)\n");
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (uses outside loop)\n");
return false;
}
@@ -1451,20 +1402,32 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
"Mismatched set sizes!");
}
- DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
- *IV << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Matched all iteration increments for " << *IV
+ << "\n");
return true;
}
-void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
+void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) {
BasicBlock *Header = L->getHeader();
+
+ // Compute the start and increment for each BaseInst before we start erasing
+ // instructions.
+ SmallVector<const SCEV *, 8> StartExprs;
+ SmallVector<const SCEV *, 8> IncrExprs;
+ for (auto &DRS : RootSets) {
+ const SCEVAddRecExpr *IVSCEV =
+ cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
+ StartExprs.push_back(IVSCEV->getStart());
+ IncrExprs.push_back(SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), IVSCEV));
+ }
+
// Remove instructions associated with non-base iterations.
for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend();
J != JE;) {
unsigned I = Uses[&*J].find_first();
if (I > 0 && I < IL_All) {
- DEBUG(dbgs() << "LRR: removing: " << *J << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: removing: " << *J << "\n");
J++->eraseFromParent();
continue;
}
@@ -1472,74 +1435,47 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
++J;
}
- bool HasTwoIVs = LoopControlIV && LoopControlIV != IV;
+ // Rewrite each BaseInst using SCEV.
+ for (size_t i = 0, e = RootSets.size(); i != e; ++i)
+ // Insert the new induction variable.
+ replaceIV(RootSets[i], StartExprs[i], IncrExprs[i]);
- if (HasTwoIVs) {
- updateNonLoopCtrlIncr();
- replaceIV(LoopControlIV, LoopControlIV, IterCount);
- } else
- // We need to create a new induction variable for each different BaseInst.
- for (auto &DRS : RootSets)
- // Insert the new induction variable.
- replaceIV(DRS.BaseInst, IV, IterCount);
+ { // Limit the lifetime of SCEVExpander.
+ BranchInst *BI = cast<BranchInst>(Header->getTerminator());
+ const DataLayout &DL = Header->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "reroll");
+ auto Zero = SE->getZero(BackedgeTakenCount->getType());
+ auto One = SE->getOne(BackedgeTakenCount->getType());
+ auto NewIVSCEV = SE->getAddRecExpr(Zero, One, L, SCEV::FlagAnyWrap);
+ Value *NewIV =
+ Expander.expandCodeFor(NewIVSCEV, BackedgeTakenCount->getType(),
+ Header->getFirstNonPHIOrDbg());
+ // FIXME: This arithmetic can overflow.
+ auto TripCount = SE->getAddExpr(BackedgeTakenCount, One);
+ auto ScaledTripCount = SE->getMulExpr(
+ TripCount, SE->getConstant(BackedgeTakenCount->getType(), Scale));
+ auto ScaledBECount = SE->getMinusSCEV(ScaledTripCount, One);
+ Value *TakenCount =
+ Expander.expandCodeFor(ScaledBECount, BackedgeTakenCount->getType(),
+ Header->getFirstNonPHIOrDbg());
+ Value *Cond =
+ new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, TakenCount, "exitcond");
+ BI->setCondition(Cond);
+
+ if (BI->getSuccessor(1) != Header)
+ BI->swapSuccessors();
+ }
SimplifyInstructionsInBlock(Header, TLI);
DeleteDeadPHIs(Header, TLI);
}
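
A worked instance of the exit-condition arithmetic in replace(), with invented counts:

constexpr unsigned BackedgeTakenCount = 5; // original loop: 6 trips
constexpr unsigned Scale = 3;              // 3 unrolled copies per trip
constexpr unsigned TripCount = BackedgeTakenCount + 1;  // 6
constexpr unsigned ScaledTripCount = TripCount * Scale; // 18 rerolled trips
constexpr unsigned ScaledBECount = ScaledTripCount - 1; // 17
static_assert(ScaledBECount == 17, "rewritten exit test: NewIV == 17");
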
-// For non-loop-control IVs, we only need to update the last increment
-// with right amount, then we are done.
-void LoopReroll::DAGRootTracker::updateNonLoopCtrlIncr() {
- const SCEV *NewInc = nullptr;
- for (auto *LoopInc : LoopIncs) {
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LoopInc);
- const SCEVConstant *COp = nullptr;
- if (GEP && LoopInc->getOperand(0)->getType()->isPointerTy()) {
- COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1)));
- } else {
- COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(0)));
- if (!COp)
- COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1)));
- }
-
- assert(COp && "Didn't find constant operand of LoopInc!\n");
-
- const APInt &AInt = COp->getValue()->getValue();
- const SCEV *ScaleSCEV = SE->getConstant(COp->getType(), Scale);
- if (AInt.isNegative()) {
- NewInc = SE->getNegativeSCEV(COp);
- NewInc = SE->getUDivExpr(NewInc, ScaleSCEV);
- NewInc = SE->getNegativeSCEV(NewInc);
- } else
- NewInc = SE->getUDivExpr(COp, ScaleSCEV);
-
- LoopInc->setOperand(1, dyn_cast<SCEVConstant>(NewInc)->getValue());
- }
-}
-
-void LoopReroll::DAGRootTracker::replaceIV(Instruction *Inst,
- Instruction *InstIV,
- const SCEV *IterCount) {
+void LoopReroll::DAGRootTracker::replaceIV(DAGRootSet &DRS,
+ const SCEV *Start,
+ const SCEV *IncrExpr) {
BasicBlock *Header = L->getHeader();
- int64_t Inc = IVToIncMap[InstIV];
- bool NeedNewIV = InstIV == LoopControlIV;
- bool Negative = !NeedNewIV && Inc < 0;
-
- const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(Inst));
- const SCEV *Start = RealIVSCEV->getStart();
-
- if (NeedNewIV)
- Start = SE->getConstant(Start->getType(), 0);
-
- const SCEV *SizeOfExpr = nullptr;
- const SCEV *IncrExpr =
- SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1);
- if (auto *PTy = dyn_cast<PointerType>(Inst->getType())) {
- Type *ElTy = PTy->getElementType();
- SizeOfExpr =
- SE->getSizeOfExpr(SE->getEffectiveSCEVType(Inst->getType()), ElTy);
- IncrExpr = SE->getMulExpr(IncrExpr, SizeOfExpr);
- }
+ Instruction *Inst = DRS.BaseInst;
+
const SCEV *NewIVSCEV =
SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap);
@@ -1552,54 +1488,6 @@ void LoopReroll::DAGRootTracker::replaceIV(Instruction *Inst,
for (auto &KV : Uses)
if (KV.second.find_first() == 0)
KV.first->replaceUsesOfWith(Inst, NewIV);
-
- if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
- // FIXME: Why do we need this check?
- if (Uses[BI].find_first() == IL_All) {
- const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
-
- if (NeedNewIV)
- ICSCEV = SE->getMulExpr(IterCount,
- SE->getConstant(IterCount->getType(), Scale));
-
- // Iteration count SCEV minus or plus 1
- const SCEV *MinusPlus1SCEV =
- SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1);
- if (Inst->getType()->isPointerTy()) {
- assert(SizeOfExpr && "SizeOfExpr is not initialized");
- MinusPlus1SCEV = SE->getMulExpr(MinusPlus1SCEV, SizeOfExpr);
- }
-
- const SCEV *ICMinusPlus1SCEV = SE->getMinusSCEV(ICSCEV, MinusPlus1SCEV);
- // Iteration count minus 1
- Instruction *InsertPtr = nullptr;
- if (isa<SCEVConstant>(ICMinusPlus1SCEV)) {
- InsertPtr = BI;
- } else {
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader)
- Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
- InsertPtr = Preheader->getTerminator();
- }
-
- if (!isa<PointerType>(NewIV->getType()) && NeedNewIV &&
- (SE->getTypeSizeInBits(NewIV->getType()) <
- SE->getTypeSizeInBits(ICMinusPlus1SCEV->getType()))) {
- IRBuilder<> Builder(BI);
- Builder.SetCurrentDebugLocation(BI->getDebugLoc());
- NewIV = Builder.CreateSExt(NewIV, ICMinusPlus1SCEV->getType());
- }
- Value *ICMinusPlus1 = Expander.expandCodeFor(
- ICMinusPlus1SCEV, NewIV->getType(), InsertPtr);
-
- Value *Cond =
- new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinusPlus1, "exitcond");
- BI->setCondition(Cond);
-
- if (BI->getSuccessor(1) != Header)
- BI->swapSuccessors();
- }
- }
}
}
@@ -1617,17 +1505,17 @@ bool LoopReroll::ReductionTracker::validateSelected() {
int Iter = PossibleRedIter[J];
if (Iter != PrevIter && Iter != PrevIter + 1 &&
!PossibleReds[i].getReducedValue()->isAssociative()) {
- DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
- J << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: "
+ << J << "\n");
return false;
}
if (Iter != PrevIter) {
if (Count != BaseCount) {
- DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
- " reduction use count " << Count <<
- " is not equal to the base use count " <<
- BaseCount << "\n");
+ LLVM_DEBUG(dbgs()
+ << "LRR: Iteration " << PrevIter << " reduction use count "
+ << Count << " is not equal to the base use count "
+ << BaseCount << "\n");
return false;
}
@@ -1716,15 +1604,15 @@ void LoopReroll::ReductionTracker::replaceSelected() {
// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
// have been validated), then we reroll the loop.
bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
- const SCEV *IterCount,
+ const SCEV *BackedgeTakenCount,
ReductionTracker &Reductions) {
DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
IVToIncMap, LoopControlIV);
if (!DAGRoots.findRoots())
return false;
- DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
- *IV << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV
+ << "\n");
if (!DAGRoots.validate(Reductions))
return false;
@@ -1734,7 +1622,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
// making changes!
Reductions.replaceSelected();
- DAGRoots.replace(IterCount);
+ DAGRoots.replace(BackedgeTakenCount);
++NumRerolledLoops;
return true;
@@ -1752,9 +1640,9 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
BasicBlock *Header = L->getHeader();
- DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() <<
- "] Loop %" << Header->getName() << " (" <<
- L->getNumBlocks() << " block(s))\n");
+ LLVM_DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << "] Loop %"
+ << Header->getName() << " (" << L->getNumBlocks()
+ << " block(s))\n");
// For now, we'll handle only single BB loops.
if (L->getNumBlocks() > 1)
@@ -1763,10 +1651,10 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
if (!SE->hasLoopInvariantBackedgeTakenCount(L))
return false;
- const SCEV *LIBETC = SE->getBackedgeTakenCount(L);
- const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType()));
- DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n");
- DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n");
+ const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ LLVM_DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: backedge-taken count = " << *BackedgeTakenCount
+ << "\n");
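
The rename also removes a round trip: ScalarEvolution hands back the backedge-taken count, which is one less than the trip count, and the old code immediately added one to recover IterCount. A standalone illustration of the two quantities (plain C++, no LLVM API):

    #include <cstdio>

    int main() {
      int TripCount = 0, BackedgesTaken = 0;
      for (int i = 0; i < 5; ++i) {
        ++TripCount;        // the body executes 5 times
        if (i + 1 < 5)
          ++BackedgesTaken; // the backedge is taken only 4 times
      }
      std::printf("trip=%d backedge-taken=%d\n", TripCount, BackedgesTaken);
    }
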
// First, we need to find the induction variable with respect to which we can
// reroll (there may be several possible options).
@@ -1776,7 +1664,7 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
collectPossibleIVs(L, PossibleIVs);
if (PossibleIVs.empty()) {
- DEBUG(dbgs() << "LRR: No possible IVs found\n");
+ LLVM_DEBUG(dbgs() << "LRR: No possible IVs found\n");
return false;
}
@@ -1787,11 +1675,11 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
// For each possible IV, collect the associated possible set of 'root' nodes
// (i+1, i+2, etc.).
for (Instruction *PossibleIV : PossibleIVs)
- if (reroll(PossibleIV, L, Header, IterCount, Reductions)) {
+ if (reroll(PossibleIV, L, Header, BackedgeTakenCount, Reductions)) {
Changed = true;
break;
}
- DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n");
+ LLVM_DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n");
// Trip count of L has changed so SE must be re-evaluated.
if (Changed)
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
index a91f53ba663f..eeaad39dc1d1 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -13,33 +13,15 @@
#include "llvm/Transforms/Scalar/LoopRotation.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopRotationUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;
#define DEBUG_TYPE "loop-rotate"
@@ -48,595 +30,6 @@ static cl::opt<unsigned> DefaultRotationThreshold(
"rotation-max-header-size", cl::init(16), cl::Hidden,
cl::desc("The default maximum header size for automatic loop rotation"));
-STATISTIC(NumRotated, "Number of loops rotated");
-
-namespace {
-/// A simple loop rotation transformation.
-class LoopRotate {
- const unsigned MaxHeaderSize;
- LoopInfo *LI;
- const TargetTransformInfo *TTI;
- AssumptionCache *AC;
- DominatorTree *DT;
- ScalarEvolution *SE;
- const SimplifyQuery &SQ;
-
-public:
- LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
- const TargetTransformInfo *TTI, AssumptionCache *AC,
- DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ)
- : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
- SQ(SQ) {}
- bool processLoop(Loop *L);
-
-private:
- bool rotateLoop(Loop *L, bool SimplifiedLatch);
- bool simplifyLoopLatch(Loop *L);
-};
-} // end anonymous namespace
-
-/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
-/// old header into the preheader. If there were uses of the values produced by
-/// these instruction that were outside of the loop, we have to insert PHI nodes
-/// to merge the two values. Do this now.
-static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
- BasicBlock *OrigPreheader,
- ValueToValueMapTy &ValueMap,
- SmallVectorImpl<PHINode*> *InsertedPHIs) {
- // Remove PHI node entries that are no longer live.
- BasicBlock::iterator I, E = OrigHeader->end();
- for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
- PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
-
- // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
- // as necessary.
- SSAUpdater SSA(InsertedPHIs);
- for (I = OrigHeader->begin(); I != E; ++I) {
- Value *OrigHeaderVal = &*I;
-
- // If there are no uses of the value (e.g. because it returns void), there
- // is nothing to rewrite.
- if (OrigHeaderVal->use_empty())
- continue;
-
- Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);
-
-    // The value now exists in two versions: the initial value in the preheader
- // and the loop "next" value in the original header.
- SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
- SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
- SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
-
- // Visit each use of the OrigHeader instruction.
- for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
- UE = OrigHeaderVal->use_end();
- UI != UE;) {
- // Grab the use before incrementing the iterator.
- Use &U = *UI;
-
- // Increment the iterator before removing the use from the list.
- ++UI;
-
- // SSAUpdater can't handle a non-PHI use in the same block as an
- // earlier def. We can easily handle those cases manually.
- Instruction *UserInst = cast<Instruction>(U.getUser());
- if (!isa<PHINode>(UserInst)) {
- BasicBlock *UserBB = UserInst->getParent();
-
- // The original users in the OrigHeader are already using the
- // original definitions.
- if (UserBB == OrigHeader)
- continue;
-
- // Users in the OrigPreHeader need to use the value to which the
- // original definitions are mapped.
- if (UserBB == OrigPreheader) {
- U = OrigPreHeaderVal;
- continue;
- }
- }
-
- // Anything else can be handled by SSAUpdater.
- SSA.RewriteUse(U);
- }
-
- // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
- // intrinsics.
- SmallVector<DbgValueInst *, 1> DbgValues;
- llvm::findDbgValues(DbgValues, OrigHeaderVal);
- for (auto &DbgValue : DbgValues) {
- // The original users in the OrigHeader are already using the original
- // definitions.
- BasicBlock *UserBB = DbgValue->getParent();
- if (UserBB == OrigHeader)
- continue;
-
- // Users in the OrigPreHeader need to use the value to which the
- // original definitions are mapped and anything else can be handled by
- // the SSAUpdater. To avoid adding PHINodes, check if the value is
- // available in UserBB, if not substitute undef.
- Value *NewVal;
- if (UserBB == OrigPreheader)
- NewVal = OrigPreHeaderVal;
- else if (SSA.HasValueForBlock(UserBB))
- NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
- else
- NewVal = UndefValue::get(OrigHeaderVal->getType());
- DbgValue->setOperand(0,
- MetadataAsValue::get(OrigHeaderVal->getContext(),
- ValueAsMetadata::get(NewVal)));
- }
- }
-}
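
The function above follows the standard SSAUpdater recipe: declare one available definition per block, then let RewriteUse materialize PHIs wherever both definitions are visible. A stripped-down sketch of that usage, with a hypothetical helper name and block roles standing in for OrigHeader/OrigPreheader:

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Value.h"
    #include "llvm/Transforms/Utils/SSAUpdater.h"
    using namespace llvm;

    // OldV (defined in DefBB) and NewV (its clone in CloneBB) are two
    // versions of one logical value; rewrite OldV's uses so SSAUpdater can
    // insert PHIs at merge points. As the deleted code notes, a non-PHI use
    // in the same block as an earlier def must still be patched by hand
    // before calling RewriteUse.
    static void rewriteTwoVersions(Value *OldV, BasicBlock *DefBB,
                                   Value *NewV, BasicBlock *CloneBB) {
      SSAUpdater SSA;
      SSA.Initialize(OldV->getType(), OldV->getName());
      SSA.AddAvailableValue(DefBB, OldV);
      SSA.AddAvailableValue(CloneBB, NewV);
      for (Value::use_iterator UI = OldV->use_begin(), UE = OldV->use_end();
           UI != UE;) {
        Use &U = *UI;
        ++UI; // advance first: RewriteUse may touch the use list
        SSA.RewriteUse(U);
      }
    }
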
-
-/// Propagate dbg.value intrinsics through the newly inserted Phis.
-static void insertDebugValues(BasicBlock *OrigHeader,
- SmallVectorImpl<PHINode*> &InsertedPHIs) {
- ValueToValueMapTy DbgValueMap;
-
- // Map existing PHI nodes to their dbg.values.
- for (auto &I : *OrigHeader) {
- if (auto DbgII = dyn_cast<DbgInfoIntrinsic>(&I)) {
- if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
- DbgValueMap.insert({Loc, DbgII});
- }
- }
-
- // Then iterate through the new PHIs and look to see if they use one of the
- // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
- // propagate the info through the new PHI.
- LLVMContext &C = OrigHeader->getContext();
- for (auto PHI : InsertedPHIs) {
- for (auto VI : PHI->operand_values()) {
- auto V = DbgValueMap.find(VI);
- if (V != DbgValueMap.end()) {
- auto *DbgII = cast<DbgInfoIntrinsic>(V->second);
- Instruction *NewDbgII = DbgII->clone();
- auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
- NewDbgII->setOperand(0, PhiMAV);
- BasicBlock *Parent = PHI->getParent();
- NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime());
- }
- }
- }
-}
-
-/// Rotate loop LP. Return true if the loop is rotated.
-///
-/// \param SimplifiedLatch is true if the latch was just folded into the final
-/// loop exit. In this case we may want to rotate even though the new latch is
-/// now an exiting branch. This rotation would have happened had the latch not
-/// been simplified. However, if SimplifiedLatch is false, then we avoid
-/// rotating loops in which the latch exits to avoid excessive or endless
-/// rotation. LoopRotate should be repeatable and converge to a canonical
-/// form. This property is satisfied because simplifying the loop latch can only
-/// happen once across multiple invocations of the LoopRotate pass.
-bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
- // If the loop has only one block then there is not much to rotate.
- if (L->getBlocks().size() == 1)
- return false;
-
- BasicBlock *OrigHeader = L->getHeader();
- BasicBlock *OrigLatch = L->getLoopLatch();
-
- BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
- if (!BI || BI->isUnconditional())
- return false;
-
- // If the loop header is not one of the loop exiting blocks then
- // either this loop is already rotated or it is not
- // suitable for loop rotation transformations.
- if (!L->isLoopExiting(OrigHeader))
- return false;
-
- // If the loop latch already contains a branch that leaves the loop then the
- // loop is already rotated.
- if (!OrigLatch)
- return false;
-
- // Rotate if either the loop latch does *not* exit the loop, or if the loop
- // latch was just simplified.
- if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch)
- return false;
-
-  // Check the size of the original header and reject the loop if it is very
-  // big or we can't duplicate blocks inside it.
- {
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
- CodeMetrics Metrics;
- Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
- if (Metrics.notDuplicatable) {
- DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
- << " instructions: ";
- L->dump());
- return false;
- }
- if (Metrics.convergent) {
- DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
- "instructions: ";
- L->dump());
- return false;
- }
- if (Metrics.NumInsts > MaxHeaderSize)
- return false;
- }
-
- // Now, this loop is suitable for rotation.
- BasicBlock *OrigPreheader = L->getLoopPreheader();
-
- // If the loop could not be converted to canonical form, it must have an
- // indirectbr in it, just give up.
- if (!OrigPreheader)
- return false;
-
- // Anything ScalarEvolution may know about this loop or the PHI nodes
- // in its header will soon be invalidated.
- if (SE)
- SE->forgetLoop(L);
-
- DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
-
-  // Find the new loop header. NewHeader is the header's one and only
-  // successor that is inside the loop; the header's other successor is
-  // outside the loop. Otherwise the loop is not suitable for rotation.
- BasicBlock *Exit = BI->getSuccessor(0);
- BasicBlock *NewHeader = BI->getSuccessor(1);
- if (L->contains(Exit))
- std::swap(Exit, NewHeader);
- assert(NewHeader && "Unable to determine new loop header");
- assert(L->contains(NewHeader) && !L->contains(Exit) &&
- "Unable to determine loop header and exit blocks");
-
- // This code assumes that the new header has exactly one predecessor.
- // Remove any single-entry PHI nodes in it.
- assert(NewHeader->getSinglePredecessor() &&
- "New header doesn't have one pred!");
- FoldSingleEntryPHINodes(NewHeader);
-
- // Begin by walking OrigHeader and populating ValueMap with an entry for
- // each Instruction.
- BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
- ValueToValueMapTy ValueMap;
-
- // For PHI nodes, the value available in OldPreHeader is just the
- // incoming value from OldPreHeader.
- for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
- ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
-
- // For the rest of the instructions, either hoist to the OrigPreheader if
- // possible or create a clone in the OldPreHeader if not.
- TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
-
- // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
- using DbgIntrinsicHash =
- std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>;
- auto makeHash = [](DbgInfoIntrinsic *D) -> DbgIntrinsicHash {
- return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()};
- };
- SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
- for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
- I != E; ++I) {
- if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&*I))
- DbgIntrinsics.insert(makeHash(DII));
- else
- break;
- }
-
- while (I != E) {
- Instruction *Inst = &*I++;
-
- // If the instruction's operands are invariant and it doesn't read or write
- // memory, then it is safe to hoist. Doing this doesn't change the order of
- // execution in the preheader, but does prevent the instruction from
- // executing in each iteration of the loop. This means it is safe to hoist
- // something that might trap, but isn't safe to hoist something that reads
- // memory (without proving that the loop doesn't write).
- if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
- !Inst->mayWriteToMemory() && !isa<TerminatorInst>(Inst) &&
- !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
- Inst->moveBefore(LoopEntryBranch);
- continue;
- }
-
- // Otherwise, create a duplicate of the instruction.
- Instruction *C = Inst->clone();
-
- // Eagerly remap the operands of the instruction.
- RemapInstruction(C, ValueMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
- // Avoid inserting the same intrinsic twice.
- if (auto *DII = dyn_cast<DbgInfoIntrinsic>(C))
- if (DbgIntrinsics.count(makeHash(DII))) {
- C->deleteValue();
- continue;
- }
-
- // With the operands remapped, see if the instruction constant folds or is
- // otherwise simplifyable. This commonly occurs because the entry from PHI
- // nodes allows icmps and other instructions to fold.
- Value *V = SimplifyInstruction(C, SQ);
- if (V && LI->replacementPreservesLCSSAForm(C, V)) {
- // If so, then delete the temporary instruction and stick the folded value
- // in the map.
- ValueMap[Inst] = V;
- if (!C->mayHaveSideEffects()) {
- C->deleteValue();
- C = nullptr;
- }
- } else {
- ValueMap[Inst] = C;
- }
- if (C) {
- // Otherwise, stick the new instruction into the new block!
- C->setName(Inst->getName());
- C->insertBefore(LoopEntryBranch);
-
- if (auto *II = dyn_cast<IntrinsicInst>(C))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
- }
- }
-
- // Along with all the other instructions, we just cloned OrigHeader's
- // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
- // successors by duplicating their incoming values for OrigHeader.
- TerminatorInst *TI = OrigHeader->getTerminator();
- for (BasicBlock *SuccBB : TI->successors())
- for (BasicBlock::iterator BI = SuccBB->begin();
- PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
- PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
-
- // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
- // OrigPreHeader's old terminator (the original branch into the loop), and
- // remove the corresponding incoming values from the PHI nodes in OrigHeader.
- LoopEntryBranch->eraseFromParent();
-
-
- SmallVector<PHINode*, 2> InsertedPHIs;
- // If there were any uses of instructions in the duplicated block outside the
- // loop, update them, inserting PHI nodes as required
- RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
- &InsertedPHIs);
-
- // Attach dbg.value intrinsics to the new phis if that phi uses a value that
- // previously had debug metadata attached. This keeps the debug info
- // up-to-date in the loop body.
- if (!InsertedPHIs.empty())
- insertDebugValues(OrigHeader, InsertedPHIs);
-
- // NewHeader is now the header of the loop.
- L->moveToHeader(NewHeader);
- assert(L->getHeader() == NewHeader && "Latch block is our new header");
-
- // Inform DT about changes to the CFG.
- if (DT) {
-    // The OrigPreheader now branches to the NewHeader and Exit. Also inform
-    // the DT about the deleted edge from OrigPreheader to OrigHeader.
- SmallVector<DominatorTree::UpdateType, 3> Updates;
- Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
- Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
- Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
- DT->applyUpdates(Updates);
- }
-
- // At this point, we've finished our major CFG changes. As part of cloning
- // the loop into the preheader we've simplified instructions and the
- // duplicated conditional branch may now be branching on a constant. If it is
- // branching on a constant and if that constant means that we enter the loop,
-  // then we fold the cond branch into an uncond branch. This simplifies the
- // loop in cases important for nested loops, and it also means we don't have
- // to split as many edges.
- BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
- assert(PHBI->isConditional() && "Should be clone of BI condbr!");
- if (!isa<ConstantInt>(PHBI->getCondition()) ||
- PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
- NewHeader) {
- // The conditional branch can't be folded, handle the general case.
- // Split edges as necessary to preserve LoopSimplify form.
-
- // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
- // thus is not a preheader anymore.
- // Split the edge to form a real preheader.
- BasicBlock *NewPH = SplitCriticalEdge(
- OrigPreheader, NewHeader,
- CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
- NewPH->setName(NewHeader->getName() + ".lr.ph");
-
- // Preserve canonical loop form, which means that 'Exit' should have only
- // one predecessor. Note that Exit could be an exit block for multiple
- // nested loops, causing both of the edges to now be critical and need to
- // be split.
- SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
- bool SplitLatchEdge = false;
- for (BasicBlock *ExitPred : ExitPreds) {
- // We only need to split loop exit edges.
- Loop *PredLoop = LI->getLoopFor(ExitPred);
- if (!PredLoop || PredLoop->contains(Exit))
- continue;
- if (isa<IndirectBrInst>(ExitPred->getTerminator()))
- continue;
- SplitLatchEdge |= L->getLoopLatch() == ExitPred;
- BasicBlock *ExitSplit = SplitCriticalEdge(
- ExitPred, Exit,
- CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
- ExitSplit->moveBefore(Exit);
- }
- assert(SplitLatchEdge &&
- "Despite splitting all preds, failed to split latch exit?");
- } else {
- // We can fold the conditional branch in the preheader, this makes things
- // simpler. The first step is to remove the extra edge to the Exit block.
- Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
- BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
- NewBI->setDebugLoc(PHBI->getDebugLoc());
- PHBI->eraseFromParent();
-
- // With our CFG finalized, update DomTree if it is available.
- if (DT) DT->deleteEdge(OrigPreheader, Exit);
- }
-
- assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
- assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
-
- // Now that the CFG and DomTree are in a consistent state again, try to merge
- // the OrigHeader block into OrigLatch. This will succeed if they are
- // connected by an unconditional branch. This is just a cleanup so the
- // emitted code isn't too gross in this common case.
- MergeBlockIntoPredecessor(OrigHeader, DT, LI);
-
- DEBUG(dbgs() << "LoopRotation: into "; L->dump());
-
- ++NumRotated;
- return true;
-}
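
Stepping back from the bookkeeping, the whole routine performs the classic while-to-guarded-do-while conversion: the cloned header left in the old preheader acts as the guard, and the loop afterwards exits from its latch. In source form (schematic only; the real work is the PHI/SSA repair above):

    extern bool cond();
    extern void body();

    void beforeRotation() {
      while (cond())      // the header both tests and exits
        body();
    }

    void afterRotation() {
      if (cond()) {       // cloned header: the guard in the old preheader
        do {
          body();
        } while (cond()); // the latch now carries the exiting branch
      }
    }
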
-
-/// Determine whether the instructions in this range may be safely and cheaply
-/// speculated. This is not an important enough situation to develop complex
-/// heuristics. We handle a single arithmetic instruction along with any type
-/// conversions.
-static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
- BasicBlock::iterator End, Loop *L) {
- bool seenIncrement = false;
- bool MultiExitLoop = false;
-
- if (!L->getExitingBlock())
- MultiExitLoop = true;
-
- for (BasicBlock::iterator I = Begin; I != End; ++I) {
-
- if (!isSafeToSpeculativelyExecute(&*I))
- return false;
-
- if (isa<DbgInfoIntrinsic>(I))
- continue;
-
- switch (I->getOpcode()) {
- default:
- return false;
- case Instruction::GetElementPtr:
- // GEPs are cheap if all indices are constant.
- if (!cast<GEPOperator>(I)->hasAllConstantIndices())
- return false;
- // fall-thru to increment case
- LLVM_FALLTHROUGH;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr: {
- Value *IVOpnd =
- !isa<Constant>(I->getOperand(0))
- ? I->getOperand(0)
- : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
- if (!IVOpnd)
- return false;
-
- // If increment operand is used outside of the loop, this speculation
- // could cause extra live range interference.
- if (MultiExitLoop) {
- for (User *UseI : IVOpnd->users()) {
- auto *UserInst = cast<Instruction>(UseI);
- if (!L->contains(UserInst))
- return false;
- }
- }
-
- if (seenIncrement)
- return false;
- seenIncrement = true;
- break;
- }
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- // ignore type conversions
- break;
- }
- }
- return true;
-}
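
In practice this accepts a latch containing one IV-style update, optionally fed by a constant-index GEP, plus free type conversions, and nothing else. Two illustrative shapes (hypothetical functions, chosen to mirror the checks above):

    // Accepted: a single arithmetic update plus an ignored conversion.
    long latch_ok(int i) {
      int next = i + 1;  // the one permitted "increment"
      return (long)next; // Trunc/ZExt/SExt are skipped over
    }

    // Rejected: a second update trips the seenIncrement check.
    int latch_bad(int i) {
      int a = i + 1; // first increment is fine
      int b = a + 2; // second one causes shouldSpeculateInstrs to bail
      return b;
    }
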
-
-/// Fold the loop tail into the loop exit by speculating the loop tail
-/// instructions. Typically, this is a single post-increment. In the case of a
-/// simple 2-block loop, hoisting the increment can be much better than
-/// duplicating the entire loop header. In the case of loops with early exits,
-/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
-/// canonical form so downstream passes can handle it.
-///
-/// I don't believe this invalidates SCEV.
-bool LoopRotate::simplifyLoopLatch(Loop *L) {
- BasicBlock *Latch = L->getLoopLatch();
- if (!Latch || Latch->hasAddressTaken())
- return false;
-
- BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!Jmp || !Jmp->isUnconditional())
- return false;
-
- BasicBlock *LastExit = Latch->getSinglePredecessor();
- if (!LastExit || !L->isLoopExiting(LastExit))
- return false;
-
- BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
- if (!BI)
- return false;
-
- if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
- return false;
-
- DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
- << LastExit->getName() << "\n");
-
- // Hoist the instructions from Latch into LastExit.
- LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
- Latch->begin(), Jmp->getIterator());
-
- unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
- BasicBlock *Header = Jmp->getSuccessor(0);
- assert(Header == L->getHeader() && "expected a backward branch");
-
- // Remove Latch from the CFG so that LastExit becomes the new Latch.
- BI->setSuccessor(FallThruPath, Header);
- Latch->replaceSuccessorsPhiUsesWith(LastExit);
- Jmp->eraseFromParent();
-
- // Nuke the Latch block.
- assert(Latch->empty() && "unable to evacuate Latch");
- LI->removeBlock(Latch);
- if (DT)
- DT->eraseNode(Latch);
- Latch->eraseFromParent();
- return true;
-}
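
The payoff is easiest to see on the simple 2-block loop named in the comment: the post-increment migrates from the dedicated latch into the exiting block, which then becomes the latch itself. A schematic source-level view, under the same constraints the code checks (single-predecessor latch, speculatable body):

    extern bool cond(int);
    extern void body(int);

    // Before: a 2-block loop whose latch holds only the post-increment.
    void beforeFold() {
      int i = 0;
      for (;;) {
        body(i);
        if (!cond(i)) // exiting block
          break;
        i = i + 1;    // latch block: just the increment, then the back edge
      }
    }

    // After simplifyLoopLatch: the increment runs ahead of the exit test
    // (harmless on the final trip since the value dies at the exit), the
    // empty latch is erased, and the exiting block becomes the new latch.
    void afterFold() {
      int i = 0;
      for (;;) {
        body(i);
        int next = i + 1; // hoisted from the old latch
        if (!cond(i))
          break;
        i = next;
      }
    }
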
-
-/// Rotate \c L, and return true if any modification was made.
-bool LoopRotate::processLoop(Loop *L) {
- // Save the loop metadata.
- MDNode *LoopMD = L->getLoopID();
-
- // Simplify the loop latch before attempting to rotate the header
- // upward. Rotation may not be needed if the loop tail can be folded into the
- // loop exit.
- bool SimplifiedLatch = simplifyLoopLatch(L);
-
- bool MadeChange = rotateLoop(L, SimplifiedLatch);
- assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
- "Loop latch should be exiting after loop-rotate.");
-
- // Restore the loop metadata.
- // NB! We presume LoopRotation DOESN'T ADD its own metadata.
- if ((MadeChange || SimplifiedLatch) && LoopMD)
- L->setLoopID(LoopMD);
-
- return MadeChange || SimplifiedLatch;
-}
-
LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication)
: EnableHeaderDuplication(EnableHeaderDuplication) {}
@@ -646,10 +39,10 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
int Threshold = EnableHeaderDuplication ? DefaultRotationThreshold : 0;
const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL);
- LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
- SQ);
- bool Changed = LR.processLoop(&L);
+ bool Changed = LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, SQ,
+ false, Threshold, false);
+
if (!Changed)
return PreservedAnalyses::all();
@@ -691,8 +84,8 @@ public:
auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
auto *SE = SEWP ? &SEWP->getSE() : nullptr;
const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
- LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE, SQ);
- return LR.processLoop(L);
+ return LoopRotation(L, LI, TTI, AC, DT, SE, SQ, false, MaxHeaderSize,
+ false);
}
};
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index 35c05e84fd68..2b83d3dc5f1b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -30,13 +30,16 @@
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
#define DEBUG_TYPE "loop-simplifycfg"
-static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
+static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ ScalarEvolution &SE) {
bool Changed = false;
// Copy blocks into a temporary array to avoid iterator invalidation issues
// as we remove them.
@@ -53,11 +56,10 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
if (!Pred || !Pred->getSingleSuccessor() || LI.getLoopFor(Pred) != &L)
continue;
- // Pred is going to disappear, so we need to update the loop info.
- if (L.getHeader() == Pred)
- L.moveToHeader(Succ);
- LI.removeBlock(Pred);
- MergeBasicBlockIntoOnlyPred(Succ, &DT);
+    // Merge Succ into Pred and delete Succ.
+ MergeBlockIntoPredecessor(Succ, &DT, &LI);
+
+ SE.forgetLoop(&L);
Changed = true;
}
@@ -67,7 +69,7 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
- if (!simplifyLoopCFG(L, AR.DT, AR.LI))
+ if (!simplifyLoopCFG(L, AR.DT, AR.LI, AR.SE))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
@@ -87,7 +89,8 @@ public:
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- return simplifyLoopCFG(*L, DT, LI);
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ return simplifyLoopCFG(*L, DT, LI, SE);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
index 430a7085d93f..760177c9c5e9 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
@@ -42,6 +42,7 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
@@ -49,7 +50,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
@@ -200,17 +200,19 @@ static bool sinkInstruction(Loop &L, Instruction &I,
SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
SortedBBsToSinkInto.insert(SortedBBsToSinkInto.begin(), BBsToSinkInto.begin(),
BBsToSinkInto.end());
- std::sort(SortedBBsToSinkInto.begin(), SortedBBsToSinkInto.end(),
- [&](BasicBlock *A, BasicBlock *B) {
- return *LoopBlockNumber.find(A) < *LoopBlockNumber.find(B);
- });
+ llvm::sort(SortedBBsToSinkInto.begin(), SortedBBsToSinkInto.end(),
+ [&](BasicBlock *A, BasicBlock *B) {
+ return LoopBlockNumber.find(A)->second <
+ LoopBlockNumber.find(B)->second;
+ });
BasicBlock *MoveBB = *SortedBBsToSinkInto.begin();
// FIXME: Optimize the efficiency for cloned value replacement. The current
// implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
- for (BasicBlock *N : SortedBBsToSinkInto) {
- if (N == MoveBB)
- continue;
+ for (BasicBlock *N : makeArrayRef(SortedBBsToSinkInto).drop_front(1)) {
+ assert(LoopBlockNumber.find(N)->second >
+ LoopBlockNumber.find(MoveBB)->second &&
+ "BBs not sorted!");
// Clone I and replace its uses.
Instruction *IC = I.clone();
IC->setName(I.getName());
@@ -224,11 +226,11 @@ static bool sinkInstruction(Loop &L, Instruction &I,
}
// Replaces uses of I with IC in blocks dominated by N
replaceDominatedUsesWith(&I, IC, DT, N);
- DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
- << '\n');
+ LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
+ << '\n');
NumLoopSunkCloned++;
}
- DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
NumLoopSunk++;
I.moveBefore(&*MoveBB->getFirstInsertionPt());
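
Besides switching to llvm::sort, the new comparator fixes a subtle ordering bug: dereferencing the map iterator compares the whole (key, value) pair, so the old code ordered blocks primarily by their BasicBlock pointer, an address-dependent and therefore nondeterministic key. A distilled reproduction with standard containers (hypothetical stand-ins for LoopBlockNumber):

    #include <algorithm>
    #include <cstdio>
    #include <map>
    #include <vector>

    int main() {
      int bb1, bb2; // two dummy "blocks"; only their addresses matter
      std::map<int *, int> Number{{&bb1, 2}, {&bb2, 1}};
      std::vector<int *> BBs{&bb1, &bb2};

      // Buggy shape: *find(X) yields the (key, value) pair, so this sorts
      // by the pointer key first -- nondeterministic across runs.
      std::sort(BBs.begin(), BBs.end(), [&](int *A, int *B) {
        return *Number.find(A) < *Number.find(B);
      });

      // Fixed shape: compare the mapped numbers themselves.
      std::sort(BBs.begin(), BBs.end(), [&](int *A, int *B) {
        return Number.find(A)->second < Number.find(B)->second;
      });
      std::printf("first = %d\n", Number.find(BBs.front())->second);
    }
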
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index ff3e9eef16d9..fa83b48210bc 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -75,6 +75,8 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionNormalization.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -105,8 +107,8 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -121,7 +123,7 @@ using namespace llvm;
#define DEBUG_TYPE "loop-reduce"
-/// MaxIVUsers is an arbitrary threshold that provides an early opportunitiy for
+/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
/// bail out. This threshold is far beyond the number of users that LSR can
/// conceivably solve, so it should not affect generated code, but catches the
/// worst cases before LSR burns too much compile time and stack space.
@@ -185,6 +187,8 @@ struct MemAccessTy {
unsigned AS = UnknownAddressSpace) {
return MemAccessTy(Type::getVoidTy(Ctx), AS);
}
+
+ Type *getType() { return MemTy; }
};
/// This class holds data which is used to order reuse candidates.
@@ -327,7 +331,7 @@ struct Formula {
/// #2 enforces that 1 * reg is reg.
/// #3 ensures invariant regs with respect to current loop can be combined
/// together in LSR codegen.
- /// This invariant can be temporarly broken while building a formula.
+ /// This invariant can be temporarily broken while building a formula.
/// However, every formula inserted into the LSRInstance must be in canonical
/// form.
SmallVector<const SCEV *, 4> BaseRegs;
@@ -442,7 +446,7 @@ void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
canonicalize(*L);
}
-/// \brief Check whether or not this formula statisfies the canonical
+/// Check whether or not this formula satisfies the canonical
/// representation.
/// \see Formula::BaseRegs.
bool Formula::isCanonical(const Loop &L) const {
@@ -470,7 +474,7 @@ bool Formula::isCanonical(const Loop &L) const {
return I == BaseRegs.end();
}
-/// \brief Helper method to morph a formula into its canonical representation.
+/// Helper method to morph a formula into its canonical representation.
/// \see Formula::BaseRegs.
/// Every formula having more than one base register must use the ScaledReg
/// field. Otherwise, we would have to do special cases everywhere in LSR
@@ -505,7 +509,7 @@ void Formula::canonicalize(const Loop &L) {
}
}
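
The invariant these helpers maintain can be paraphrased: once a formula has more than one base register, one of them must be carried in ScaledReg, with Scale set to 1 when no genuine scale exists, so reg1 + reg2 is stored as reg1 + 1*reg2. A toy rendition of that rule (simplified stand-in, not the real Formula class):

    #include <vector>

    struct ToyFormula {
      std::vector<const char *> BaseRegs; // symbolic register names
      const char *ScaledReg = nullptr;
      int Scale = 0;

      // Invariant #1: more than one base register requires ScaledReg.
      bool isCanonical() const {
        return BaseRegs.size() <= 1 || ScaledReg != nullptr;
      }

      // Morph reg1 + reg2 into reg1 + 1*reg2.
      void canonicalize() {
        if (isCanonical())
          return;
        ScaledReg = BaseRegs.back();
        BaseRegs.pop_back();
        Scale = 1;
      }
    };
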
-/// \brief Get rid of the scale in the formula.
+/// Get rid of the scale in the formula.
/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
/// \return true if it was possible to get rid of the scale, false otherwise.
/// \note After this operation the formula may not be in the canonical form.
@@ -818,7 +822,7 @@ static bool isAddressUse(const TargetTransformInfo &TTI,
/// Return the type of the memory being accessed.
static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
- Instruction *Inst) {
+ Instruction *Inst, Value *OperandVal) {
MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
AccessTy.MemTy = SI->getOperand(0)->getType();
@@ -832,7 +836,14 @@ static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
switch (II->getIntrinsicID()) {
case Intrinsic::prefetch:
+ case Intrinsic::memset:
AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
+ AccessTy.MemTy = OperandVal->getType();
+ break;
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
+ AccessTy.MemTy = OperandVal->getType();
break;
default: {
MemIntrinsicInfo IntrInfo;
@@ -937,7 +948,7 @@ static bool isHighCostExpansion(const SCEV *S,
return true;
}
-/// If any of the instructions is the specified set are trivially dead, delete
+/// If any of the instructions in the specified set are trivially dead, delete
/// them and see if this makes any of their operands subsequently dead.
static bool
DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
@@ -970,7 +981,7 @@ class LSRUse;
} // end anonymous namespace
-/// \brief Check if the addressing mode defined by \p F is completely
+/// Check if the addressing mode defined by \p F is completely
/// folded in \p LU at isel time.
/// This includes address-mode folding and special icmp tricks.
/// This function returns true if \p LU can accommodate what \p F
@@ -1040,12 +1051,14 @@ private:
void RateRegister(const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
- ScalarEvolution &SE, DominatorTree &DT);
+ ScalarEvolution &SE, DominatorTree &DT,
+ const TargetTransformInfo &TTI);
void RatePrimaryRegister(const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
- SmallPtrSetImpl<const SCEV *> *LoserRegs);
+ SmallPtrSetImpl<const SCEV *> *LoserRegs,
+ const TargetTransformInfo &TTI);
};
/// An operand value in an instruction which is to be replaced with some
@@ -1194,7 +1207,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
void Cost::RateRegister(const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
- ScalarEvolution &SE, DominatorTree &DT) {
+ ScalarEvolution &SE, DominatorTree &DT,
+ const TargetTransformInfo &TTI) {
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
// If this is an addrec for another loop, it should be an invariant
// with respect to L since L is the innermost loop (at least
@@ -1215,13 +1229,28 @@ void Cost::RateRegister(const SCEV *Reg,
++C.NumRegs;
return;
}
- C.AddRecCost += 1; /// TODO: This should be a function of the stride.
+
+ unsigned LoopCost = 1;
+ if (TTI.shouldFavorPostInc()) {
+ const SCEV *LoopStep = AR->getStepRecurrence(SE);
+ if (isa<SCEVConstant>(LoopStep)) {
+ // Check if a post-indexed load/store can be used.
+ if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
+ TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+ const SCEV *LoopStart = AR->getStart();
+ if (!isa<SCEVConstant>(LoopStart) &&
+ SE.isLoopInvariant(LoopStart, L))
+ LoopCost = 0;
+ }
+ }
+ }
+ C.AddRecCost += LoopCost;
// Add the step value register, if it needs one.
// TODO: The non-affine case isn't precisely modeled here.
if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
if (!Regs.count(AR->getOperand(1))) {
- RateRegister(AR->getOperand(1), Regs, L, SE, DT);
+ RateRegister(AR->getOperand(1), Regs, L, SE, DT, TTI);
if (isLoser())
return;
}
@@ -1249,13 +1278,14 @@ void Cost::RatePrimaryRegister(const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
- SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+ SmallPtrSetImpl<const SCEV *> *LoserRegs,
+ const TargetTransformInfo &TTI) {
if (LoserRegs && LoserRegs->count(Reg)) {
Lose();
return;
}
if (Regs.insert(Reg).second) {
- RateRegister(Reg, Regs, L, SE, DT);
+ RateRegister(Reg, Regs, L, SE, DT, TTI);
if (LoserRegs && isLoser())
LoserRegs->insert(Reg);
}
@@ -1279,7 +1309,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
Lose();
return;
}
- RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
+ RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs, TTI);
if (isLoser())
return;
}
@@ -1288,7 +1318,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
Lose();
return;
}
- RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
+ RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs, TTI);
if (isLoser())
return;
}
@@ -1343,14 +1373,15 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
// If ICmpZero formula ends with not 0, it could not be replaced by
// just add or sub. We'll need to compare final result of AddRec.
- // That means we'll need an additional instruction.
+ // That means we'll need an additional instruction. But if the target can
+ // macro-fuse a compare with a branch, don't count this extra instruction.
// For -10 + {0, +, 1}:
// i = i + 1;
// cmp i, 10
//
// For {-10, +, 1}:
// i = i + 1;
- if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+ if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() && !TTI.canMacroFuseCmp())
C.Insns++;
// Each new AddRec adds 1 instruction to calculation.
C.Insns += (C.AddRecCost - PrevAddRecCost);
@@ -1456,7 +1487,7 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
SmallVector<const SCEV *, 4> Key = F.BaseRegs;
if (F.ScaledReg) Key.push_back(F.ScaledReg);
// Unstable sort by host order ok, because this is only used for uniquifying.
- std::sort(Key.begin(), Key.end());
+ llvm::sort(Key.begin(), Key.end());
return Uniquifier.count(Key);
}
@@ -1480,7 +1511,7 @@ bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
SmallVector<const SCEV *, 4> Key = F.BaseRegs;
if (F.ScaledReg) Key.push_back(F.ScaledReg);
// Unstable sort by host order ok, because this is only used for uniquifying.
- std::sort(Key.begin(), Key.end());
+ llvm::sort(Key.begin(), Key.end());
if (!Uniquifier.insert(Key).second)
return false;
@@ -2384,24 +2415,27 @@ LSRInstance::OptimizeLoopTermCond() {
C->getValue().isMinSignedValue())
goto decline_post_inc;
// Check for possible scaled-address reuse.
- MemAccessTy AccessTy = getAccessType(TTI, UI->getUser());
- int64_t Scale = C->getSExtValue();
- if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
- /*BaseOffset=*/0,
- /*HasBaseReg=*/false, Scale,
- AccessTy.AddrSpace))
- goto decline_post_inc;
- Scale = -Scale;
- if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
- /*BaseOffset=*/0,
- /*HasBaseReg=*/false, Scale,
- AccessTy.AddrSpace))
- goto decline_post_inc;
+ if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
+ MemAccessTy AccessTy = getAccessType(
+ TTI, UI->getUser(), UI->getOperandValToReplace());
+ int64_t Scale = C->getSExtValue();
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ Scale = -Scale;
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ }
}
}
- DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
- << *Cond << '\n');
+ LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
+ << *Cond << '\n');
// It's possible for the setcc instruction to be anywhere in the loop, and
// possible for it to have multiple users. If it is not immediately before
@@ -2642,7 +2676,7 @@ void LSRInstance::CollectInterestingTypesAndFactors() {
if (Types.size() == 1)
Types.clear();
- DEBUG(print_factors_and_types(dbgs()));
+ LLVM_DEBUG(print_factors_and_types(dbgs()));
}
/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
@@ -2666,7 +2700,7 @@ findIVOperand(User::op_iterator OI, User::op_iterator OE,
return OI;
}
-/// IVChain logic must consistenctly peek base TruncInst operands, so wrap it in
+/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
/// a convenient helper.
static Value *getWideOperand(Value *Oper) {
if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
@@ -2773,10 +2807,9 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
return false;
if (!Users.empty()) {
- DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
- for (Instruction *Inst : Users) {
- dbgs() << " " << *Inst << "\n";
- });
+ LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
+ for (Instruction *Inst
+ : Users) { dbgs() << " " << *Inst << "\n"; });
return false;
}
assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
@@ -2829,8 +2862,8 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
// the stride.
cost -= NumReusedIncrements;
- DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
- << "\n");
+ LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
+ << "\n");
return cost < 0;
}
@@ -2883,7 +2916,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
if (isa<PHINode>(UserInst))
return;
if (NChains >= MaxChains && !StressIVChain) {
- DEBUG(dbgs() << "IV Chain Limit\n");
+ LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
return;
}
LastIncExpr = OperExpr;
@@ -2896,11 +2929,11 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
OperExprBase));
ChainUsersVec.resize(NChains);
- DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
- << ") IV=" << *LastIncExpr << "\n");
+ LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
+ << ") IV=" << *LastIncExpr << "\n");
} else {
- DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
- << ") IV+" << *LastIncExpr << "\n");
+ LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
+ << ") IV+" << *LastIncExpr << "\n");
// Add this IV user to the end of the chain.
IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
}
@@ -2970,7 +3003,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
/// loop latch. This will discover chains on side paths, but requires
/// maintaining multiple copies of the Chains state.
void LSRInstance::CollectChains() {
- DEBUG(dbgs() << "Collecting IV Chains.\n");
+ LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
SmallVector<ChainUsers, 8> ChainUsersVec;
SmallVector<BasicBlock *,8> LatchPath;
@@ -3039,10 +3072,10 @@ void LSRInstance::CollectChains() {
void LSRInstance::FinalizeChain(IVChain &Chain) {
assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
- DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
+ LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
for (const IVInc &Inc : Chain) {
- DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
+ LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
IVIncSet.insert(UseI);
@@ -3059,7 +3092,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
if (IncConst->getAPInt().getMinSignedBits() > 64)
return false;
- MemAccessTy AccessTy = getAccessType(TTI, UserInst);
+ MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
int64_t IncOffset = IncConst->getValue()->getSExtValue();
if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
IncOffset, /*HasBaseReg=*/false))
@@ -3099,11 +3132,11 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
}
if (IVOpIter == IVOpEnd) {
// Gracefully give up on this chain.
- DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
+ LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
return;
}
- DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
+ LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
Type *IVTy = IVSrc->getType();
Type *IntTy = SE.getEffectiveSCEVType(IVTy);
const SCEV *LeftOverExpr = nullptr;
@@ -3179,7 +3212,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
find(UserInst->operands(), U.getOperandValToReplace());
assert(UseI != UserInst->op_end() && "cannot find IV operand");
if (IVIncSet.count(UseI)) {
- DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
+ LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
continue;
}
@@ -3187,7 +3220,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
MemAccessTy AccessTy;
if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
Kind = LSRUse::Address;
- AccessTy = getAccessType(TTI, UserInst);
+ AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
}
const SCEV *S = IU.getExpr(U);
@@ -3255,7 +3288,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
}
}
- DEBUG(print_fixups(dbgs()));
+ LLVM_DEBUG(print_fixups(dbgs()));
}
/// Insert a formula for the given expression into the given use, separating out
@@ -3464,12 +3497,45 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
return S;
}
-/// \brief Helper function for LSRInstance::GenerateReassociations.
+/// Return true if the SCEV represents a value that may end up as a
+/// post-increment operation.
+static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
+ LSRUse &LU, const SCEV *S, const Loop *L,
+ ScalarEvolution &SE) {
+ if (LU.Kind != LSRUse::Address ||
+ !LU.AccessTy.getType()->isIntOrIntVectorTy())
+ return false;
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
+ if (!AR)
+ return false;
+ const SCEV *LoopStep = AR->getStepRecurrence(SE);
+ if (!isa<SCEVConstant>(LoopStep))
+ return false;
+ if (LU.AccessTy.getType()->getScalarSizeInBits() !=
+ LoopStep->getType()->getScalarSizeInBits())
+ return false;
+ // Check if a post-indexed load/store can be used.
+ if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
+ TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+ const SCEV *LoopStart = AR->getStart();
+ if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
+ return true;
+ }
+ return false;
+}
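
The shape being probed for is the classic post-indexed access: use the address, then bump it by a constant stride, which a target reporting MIM_PostInc legality can fold into a single load or store (for example ARM's ldr r0, [r1], #4). In source form (illustrative only):

    // Each trip loads through p and then advances it by one element, the
    // constant step an AddRec with a SCEVConstant stride represents.
    int sum(const int *p, int n) {
      int s = 0;
      for (int i = 0; i < n; ++i)
        s += *p++; // load, then post-increment the pointer
      return s;
    }
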
+
+/// Helper function for LSRInstance::GenerateReassociations.
void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
const Formula &Base,
unsigned Depth, size_t Idx,
bool IsScaledReg) {
const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+ // Don't generate reassociations for the base register of a value that
+ // may generate a post-increment operator. The reason is that the
+ // reassociations cause extra base+register formula to be created,
+ // and possibly chosen, but the post-increment is more efficient.
+ if (TTI.shouldFavorPostInc() && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
+ return;
SmallVector<const SCEV *, 8> AddOps;
const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
if (Remainder)
@@ -3542,7 +3608,12 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
if (InsertFormula(LU, LUIdx, F))
// If that formula hadn't been seen before, recurse to find more like
// it.
- GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1);
+      // Also add Log16(AddOps.size()), i.e. Log2_32(AddOps.size()) >> 2, to
+      // the recursion depth, because Depth alone is not enough to bound
+      // compile time: every time AddOps.size() exceeds 16^x we add x to the
+      // depth.
+ GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
+ Depth + 1 + (Log2_32(AddOps.size()) >> 2));
}
}
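
The added term is floor(log16 of the operand count): Log2_32 yields floor(log2 n), and the right shift by two divides that by four, which converts the base. A quick check of the arithmetic, using a hand-rolled floor-log2 in place of llvm::Log2_32 (assumed to have the same semantics):

    #include <cassert>

    // floor(log2(N)) for N > 0.
    static unsigned log2Floor(unsigned N) {
      unsigned L = 0;
      while (N >>= 1)
        ++L;
      return L;
    }

    int main() {
      // (Log2_32(n) >> 2) == floor(log16(n)): each 16x growth in the
      // operand count deepens the recursion bound by exactly one.
      assert((log2Floor(15) >> 2) == 0);
      assert((log2Floor(16) >> 2) == 1);
      assert((log2Floor(255) >> 2) == 1);
      assert((log2Floor(256) >> 2) == 2);
      return 0;
    }
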
@@ -3596,7 +3667,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
}
}
-/// \brief Helper function for LSRInstance::GenerateSymbolicOffsets.
+/// Helper function for LSRInstance::GenerateSymbolicOffsets.
void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
const Formula &Base, size_t Idx,
bool IsScaledReg) {
@@ -3628,7 +3699,7 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
/* IsScaledReg */ true);
}
-/// \brief Helper function for LSRInstance::GenerateConstantOffsets.
+/// Helper function for LSRInstance::GenerateConstantOffsets.
void LSRInstance::GenerateConstantOffsetsImpl(
LSRUse &LU, unsigned LUIdx, const Formula &Base,
const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
@@ -3938,10 +4009,11 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
if (Imms.size() == 1)
continue;
- DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
- for (const auto &Entry : Imms)
- dbgs() << ' ' << Entry.first;
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
+ for (const auto &Entry
+ : Imms) dbgs()
+ << ' ' << Entry.first;
+ dbgs() << '\n');
// Examine each offset.
for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
@@ -3953,7 +4025,8 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
if (!isa<SCEVConstant>(OrigReg) &&
UsedByIndicesMap[Reg].count() == 1) {
- DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
+ LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
+ << '\n');
continue;
}
@@ -4038,6 +4111,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
LU.Kind, LU.AccessTy, NewF)) {
+ if (TTI.shouldFavorPostInc() &&
+ mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
+ continue;
if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
continue;
NewF = F;
@@ -4099,9 +4175,9 @@ LSRInstance::GenerateAllReuseFormulae() {
GenerateCrossUseConstantOffsets();
- DEBUG(dbgs() << "\n"
- "After generating reuse formulae:\n";
- print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n"
+ "After generating reuse formulae:\n";
+ print_uses(dbgs()));
}
/// If there are multiple formulae with the same set of registers used
@@ -4123,7 +4199,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
- DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
+ dbgs() << '\n');
bool Any = false;
for (size_t FIdx = 0, NumForms = LU.Formulae.size();
@@ -4147,8 +4224,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
// as the basis of rediscovering the desired formula that uses an AddRec
// corresponding to the existing phi. Once all formulae have been
// generated, these initial losers may be pruned.
- DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
- dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
+ dbgs() << "\n");
}
else {
SmallVector<const SCEV *, 4> Key;
@@ -4161,7 +4238,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
Key.push_back(F.ScaledReg);
// Unstable sort by host order ok, because this is only used for
// uniquifying.
- std::sort(Key.begin(), Key.end());
+ llvm::sort(Key.begin(), Key.end());
std::pair<BestFormulaeTy::const_iterator, bool> P =
BestFormulae.insert(std::make_pair(Key, FIdx));
@@ -4175,10 +4252,10 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
if (CostF.isLess(CostBest, TTI))
std::swap(F, Best);
- DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
- dbgs() << "\n"
- " in favor of formula "; Best.print(dbgs());
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula ";
+ Best.print(dbgs()); dbgs() << '\n');
}
#ifndef NDEBUG
ChangedFormulae = true;
@@ -4197,11 +4274,11 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
BestFormulae.clear();
}
- DEBUG(if (ChangedFormulae) {
- dbgs() << "\n"
- "After filtering out undesirable candidates:\n";
- print_uses(dbgs());
- });
+ LLVM_DEBUG(if (ChangedFormulae) {
+ dbgs() << "\n"
+ "After filtering out undesirable candidates:\n";
+ print_uses(dbgs());
+ });
}
// This is a rough guess that seems to work fairly well.
@@ -4230,11 +4307,11 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const {
/// register pressure); remove it to simplify the system.
void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
- DEBUG(dbgs() << "The search space is too complex.\n");
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
- DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
- "which use a superset of registers used by other "
- "formulae.\n");
+ LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
+ "which use a superset of registers used by other "
+ "formulae.\n");
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
@@ -4252,7 +4329,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
(I - F.BaseRegs.begin()));
if (LU.HasFormulaWithSameRegs(NewF)) {
- DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
LU.DeleteFormula(F);
--i;
--e;
@@ -4267,8 +4345,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
(I - F.BaseRegs.begin()));
if (LU.HasFormulaWithSameRegs(NewF)) {
- DEBUG(dbgs() << " Deleting "; F.print(dbgs());
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
LU.DeleteFormula(F);
--i;
--e;
@@ -4283,8 +4361,7 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
LU.RecomputeRegs(LUIdx, RegUses);
}
- DEBUG(dbgs() << "After pre-selection:\n";
- print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
}
@@ -4294,9 +4371,10 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
if (EstimateSearchSpaceComplexity() < ComplexityLimit)
return;
- DEBUG(dbgs() << "The search space is too complex.\n"
- "Narrowing the search space by assuming that uses separated "
- "by a constant offset will use the same registers.\n");
+ LLVM_DEBUG(
+ dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by assuming that uses separated "
+ "by a constant offset will use the same registers.\n");
// This is especially useful for unrolled loops.
@@ -4314,7 +4392,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
LU.Kind, LU.AccessTy))
continue;
- DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
@@ -4322,7 +4400,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
for (LSRFixup &Fixup : LU.Fixups) {
Fixup.Offset += F.BaseOffset;
LUThatHas->pushFixup(Fixup);
- DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
+ LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
}
// Delete formulae from the new use which are no longer legal.
@@ -4331,8 +4409,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
Formula &F = LUThatHas->Formulae[i];
if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
LUThatHas->Kind, LUThatHas->AccessTy, F)) {
- DEBUG(dbgs() << " Deleting "; F.print(dbgs());
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
LUThatHas->DeleteFormula(F);
--i;
--e;
@@ -4351,7 +4428,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
}
}
- DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
@@ -4359,15 +4436,14 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
/// eliminate.
void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
- DEBUG(dbgs() << "The search space is too complex.\n");
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
- DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
- "undesirable dedicated registers.\n");
+ LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
+ "undesirable dedicated registers.\n");
FilterOutUndesirableDedicatedRegisters();
- DEBUG(dbgs() << "After pre-selection:\n";
- print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
}
@@ -4378,15 +4454,16 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
/// The benefit is that it is more likely to find out a better solution
/// from a formulae set with more Scale and ScaledReg variations than
/// a formulae set with the same Scale and ScaledReg. The picking winner
-/// reg heurstic will often keep the formulae with the same Scale and
+/// reg heuristic will often keep the formulae with the same Scale and
/// ScaledReg and filter others, and we want to avoid that if possible.
void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
if (EstimateSearchSpaceComplexity() < ComplexityLimit)
return;
- DEBUG(dbgs() << "The search space is too complex.\n"
- "Narrowing the search space by choosing the best Formula "
- "from the Formulae with the same Scale and ScaledReg.\n");
+ LLVM_DEBUG(
+ dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by choosing the best Formula "
+ "from the Formulae with the same Scale and ScaledReg.\n");
// Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
@@ -4400,7 +4477,8 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
- DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
+ dbgs() << '\n');
// Return true if Formula FA is better than Formula FB.
auto IsBetterThan = [&](Formula &FA, Formula &FB) {
@@ -4444,10 +4522,10 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
Formula &Best = LU.Formulae[P.first->second];
if (IsBetterThan(F, Best))
std::swap(F, Best);
- DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
- dbgs() << "\n"
- " in favor of formula ";
- Best.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula ";
+ Best.print(dbgs()); dbgs() << '\n');
#ifndef NDEBUG
ChangedFormulae = true;
#endif
@@ -4463,7 +4541,7 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
BestFormulae.clear();
}
- DEBUG(if (ChangedFormulae) {
+ LLVM_DEBUG(if (ChangedFormulae) {
dbgs() << "\n"
"After filtering out undesirable candidates:\n";
print_uses(dbgs());
@@ -4522,7 +4600,7 @@ void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
// Used in each formula of a solution (in example above this is reg(c)).
// We can skip them in calculations.
SmallPtrSet<const SCEV *, 4> UniqRegs;
- DEBUG(dbgs() << "The search space is too complex.\n");
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
// Map each register to probability of not selecting
DenseMap <const SCEV *, float> RegNumMap;
@@ -4542,7 +4620,8 @@ void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
RegNumMap.insert(std::make_pair(Reg, PNotSel));
}
- DEBUG(dbgs() << "Narrowing the search space by deleting costly formulas\n");
+ LLVM_DEBUG(
+ dbgs() << "Narrowing the search space by deleting costly formulas\n");
  // Delete formulas where the expected number of registers is high.
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
@@ -4584,26 +4663,25 @@ void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
MinIdx = i;
}
}
- DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
- dbgs() << " with min reg num " << FMinRegNum << '\n');
+ LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
+ dbgs() << " with min reg num " << FMinRegNum << '\n');
if (MinIdx != 0)
std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
while (LU.Formulae.size() != 1) {
- DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
+ dbgs() << '\n');
LU.Formulae.pop_back();
}
LU.RecomputeRegs(LUIdx, RegUses);
assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
Formula &F = LU.Formulae[0];
- DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
// When we choose the formula, the regs become unique.
UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
if (F.ScaledReg)
UniqRegs.insert(F.ScaledReg);
}
- DEBUG(dbgs() << "After pre-selection:\n";
- print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
/// Pick a register which seems likely to be profitable, and then in any use
@@ -4616,7 +4694,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
    // Ok, we have too many formulae on our hands to conveniently handle.
// Use a rough heuristic to thin out the list.
- DEBUG(dbgs() << "The search space is too complex.\n");
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
// Pick the register which is used by the most LSRUses, which is likely
// to be a good reuse register candidate.
@@ -4637,8 +4715,8 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
}
}
- DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
- << " will yield profitable reuse.\n");
+ LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
+ << " will yield profitable reuse.\n");
Taken.insert(Best);
    // In any use with formulae which reference this register, delete formulae
@@ -4651,7 +4729,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
Formula &F = LU.Formulae[i];
if (!F.referencesReg(Best)) {
- DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
LU.DeleteFormula(F);
--e;
--i;
@@ -4665,8 +4743,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
LU.RecomputeRegs(LUIdx, RegUses);
}
- DEBUG(dbgs() << "After pre-selection:\n";
- print_uses(dbgs()));
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
}
@@ -4748,11 +4825,11 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
if (F.getNumRegs() == 1 && Workspace.size() == 1)
VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
} else {
- DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
- dbgs() << ".\n Regs:";
- for (const SCEV *S : NewRegs)
- dbgs() << ' ' << *S;
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
+ dbgs() << ".\n Regs:"; for (const SCEV *S
+ : NewRegs) dbgs()
+ << ' ' << *S;
+ dbgs() << '\n');
SolutionCost = NewCost;
Solution = Workspace;
@@ -4777,22 +4854,22 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
CurRegs, VisitedRegs);
if (Solution.empty()) {
- DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
+ LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
return;
}
// Ok, we've now made all our decisions.
- DEBUG(dbgs() << "\n"
- "The chosen solution requires "; SolutionCost.print(dbgs());
- dbgs() << ":\n";
- for (size_t i = 0, e = Uses.size(); i != e; ++i) {
- dbgs() << " ";
- Uses[i].print(dbgs());
- dbgs() << "\n"
- " ";
- Solution[i]->print(dbgs());
- dbgs() << '\n';
- });
+ LLVM_DEBUG(dbgs() << "\n"
+ "The chosen solution requires ";
+ SolutionCost.print(dbgs()); dbgs() << ":\n";
+ for (size_t i = 0, e = Uses.size(); i != e; ++i) {
+ dbgs() << " ";
+ Uses[i].print(dbgs());
+ dbgs() << "\n"
+ " ";
+ Solution[i]->print(dbgs());
+ dbgs() << '\n';
+ });
assert(Solution.size() == Uses.size() && "Malformed solution!");
}
@@ -4993,7 +5070,7 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
// Unless the addressing mode will not be folded.
if (!Ops.empty() && LU.Kind == LSRUse::Address &&
isAMCompletelyFolded(TTI, LU, F)) {
- Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
Ops.clear();
Ops.push_back(SE.getUnknown(FullV));
}
@@ -5266,7 +5343,8 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
for (const IVStrideUse &U : IU) {
if (++NumUsers > MaxIVUsers) {
(void)U;
- DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U << "\n");
+ LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
+ << "\n");
return;
}
// Bail out if we have a PHI on an EHPad that gets a value from a
@@ -5299,9 +5377,9 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
}
#endif // DEBUG
- DEBUG(dbgs() << "\nLSR on loop ";
- L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
- dbgs() << ":\n");
+ LLVM_DEBUG(dbgs() << "\nLSR on loop ";
+ L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
+ dbgs() << ":\n");
// First, perform some low-level loop optimizations.
OptimizeShadowIV();
@@ -5312,7 +5390,7 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
// Skip nested loops until we can model them better with formulae.
if (!L->empty()) {
- DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
return;
}
@@ -5322,9 +5400,11 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
CollectFixupsAndInitialFormulae();
CollectLoopInvariantFixupsAndFormulae();
- assert(!Uses.empty() && "IVUsers reported at least one use");
- DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
- print_uses(dbgs()));
+ if (Uses.empty())
+ return;
+
+ LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
+ print_uses(dbgs()));
// Now use the reuse data to generate a bunch of interesting ways
// to formulate the values needed for the uses.
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
new file mode 100644
index 000000000000..86c99aed4417
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -0,0 +1,447 @@
+//===- LoopUnrollAndJam.cpp - Loop unroll and jam pass --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an unroll-and-jam pass. Most of the work is done by
+// Utils/UnrollLoopAndJam.cpp.
+//===----------------------------------------------------------------------===//
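+//
+// Conceptually, for an unroll count of 2 (and assuming an even outer trip
+// count, so no remainder loop is needed), unroll and jam rewrites
+//
+//   for (i = 0; i < N; ++i)
+//     for (j = 0; j < M; ++j)
+//       body(i, j);
+//
+// into
+//
+//   for (i = 0; i < N; i += 2)
+//     for (j = 0; j < M; ++j) {
+//       body(i, j);
+//       body(i + 1, j);
+//     }
+//
+// so that loads which are invariant in the outer loop can be shared between
+// the fused copies of the inner body.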
+
+#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll-and-jam"
+
+static cl::opt<bool>
+ AllowUnrollAndJam("allow-unroll-and-jam", cl::Hidden,
+ cl::desc("Allows loops to be unroll-and-jammed."));
+
+static cl::opt<unsigned> UnrollAndJamCount(
+ "unroll-and-jam-count", cl::Hidden,
+ cl::desc("Use this unroll count for all loops including those with "
+ "unroll_and_jam_count pragma values, for testing purposes"));
+
+static cl::opt<unsigned> UnrollAndJamThreshold(
+ "unroll-and-jam-threshold", cl::init(60), cl::Hidden,
+ cl::desc("Threshold to use for inner loop when doing unroll and jam."));
+
+static cl::opt<unsigned> PragmaUnrollAndJamThreshold(
+ "pragma-unroll-and-jam-threshold", cl::init(1024), cl::Hidden,
+ cl::desc("Unrolled size limit for loops with an unroll_and_jam(full) or "
+ "unroll_count pragma."));
+
+// Returns the loop hint metadata node with the given name (for example,
+// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
+// returned.
+static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+ if (MDNode *LoopID = L->getLoopID())
+ return GetUnrollMetadata(LoopID, Name);
+ return nullptr;
+}
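+
+// The loop ID inspected above is the usual self-referential loop metadata; a
+// loop requesting an unroll-and-jam count of 4 would look roughly like:
+//   br i1 %cond, label %header, label %exit, !llvm.loop !0
+//   !0 = distinct !{!0, !1}
+//   !1 = !{!"llvm.loop.unroll_and_jam.count", i32 4}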
+
+// Returns true if the loop has any metadata starting with Prefix. For example,
+// a Prefix of "llvm.loop.unroll." returns true if we have any unroll metadata.
+static bool HasAnyUnrollPragma(const Loop *L, StringRef Prefix) {
+ if (MDNode *LoopID = L->getLoopID()) {
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (!MD)
+ continue;
+
+ MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ if (!S)
+ continue;
+
+ if (S->getString().startswith(Prefix))
+ return true;
+ }
+ }
+ return false;
+}
+
+// Returns true if the loop has an unroll_and_jam(enable) pragma.
+static bool HasUnrollAndJamEnablePragma(const Loop *L) {
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable");
+}
+
+// Returns true if the loop has an unroll_and_jam(disable) pragma.
+static bool HasUnrollAndJamDisablePragma(const Loop *L) {
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.disable");
+}
+
+// If the loop has an unroll_and_jam_count pragma, return the (necessarily
+// positive) value from the pragma. Otherwise return 0.
+static unsigned UnrollAndJamCountPragmaValue(const Loop *L) {
+ MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.count");
+ if (MD) {
+ assert(MD->getNumOperands() == 2 &&
+ "Unroll count hint metadata should have two operands.");
+ unsigned Count =
+ mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
+ assert(Count >= 1 && "Unroll count must be positive.");
+ return Count;
+ }
+ return 0;
+}
+
+// Returns the estimated loop size after unroll and jam.
+static uint64_t
+getUnrollAndJammedLoopSize(unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
+ return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
+}
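+
+// For example, with UP.BEInsns == 2 and UP.Count == 4, a loop of LoopSize 10
+// is estimated as (10 - 2) * 4 + 2 == 34 instructions: the backedge overhead
+// is paid once by the unrolled loop rather than once per copy of the body.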
+
+// Calculates unroll and jam count and writes it to UP.Count. Returns true if
+// unroll count was set explicitly.
+static bool computeUnrollAndJamCount(
+ Loop *L, Loop *SubLoop, const TargetTransformInfo &TTI, DominatorTree &DT,
+ LoopInfo *LI, ScalarEvolution &SE,
+ const SmallPtrSetImpl<const Value *> &EphValues,
+ OptimizationRemarkEmitter *ORE, unsigned OuterTripCount,
+ unsigned OuterTripMultiple, unsigned OuterLoopSize, unsigned InnerTripCount,
+ unsigned InnerLoopSize, TargetTransformInfo::UnrollingPreferences &UP) {
+ // Check for explicit Count from the "unroll-and-jam-count" option.
+ bool UserUnrollCount = UnrollAndJamCount.getNumOccurrences() > 0;
+ if (UserUnrollCount) {
+ UP.Count = UnrollAndJamCount;
+ UP.Force = true;
+ if (UP.AllowRemainder &&
+ getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
+ getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
+ UP.UnrollAndJamInnerLoopThreshold)
+ return true;
+ }
+
+ // Check for unroll_and_jam pragmas
+ unsigned PragmaCount = UnrollAndJamCountPragmaValue(L);
+ if (PragmaCount > 0) {
+ UP.Count = PragmaCount;
+ UP.Runtime = true;
+ UP.Force = true;
+ if ((UP.AllowRemainder || (OuterTripMultiple % PragmaCount == 0)) &&
+ getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
+ getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
+ UP.UnrollAndJamInnerLoopThreshold)
+ return true;
+ }
+
+  // Use computeUnrollCount from the loop unroller to get a sensible count
+  // for unrolling the outer loop. This uses UP.Threshold /
+ // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
+ // We have already checked that the loop has no unroll.* pragmas.
+ unsigned MaxTripCount = 0;
+ bool UseUpperBound = false;
+ bool ExplicitUnroll = computeUnrollCount(
+ L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
+ OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
+ if (ExplicitUnroll || UseUpperBound) {
+    // If the user explicitly set the loop as unrolled, don't unroll and jam
+    // it; leave it for the unroller instead.
+ UP.Count = 0;
+ return false;
+ }
+
+ bool PragmaEnableUnroll = HasUnrollAndJamEnablePragma(L);
+ ExplicitUnroll = PragmaCount > 0 || PragmaEnableUnroll || UserUnrollCount;
+
+ // If the loop has an unrolling pragma, we want to be more aggressive with
+ // unrolling limits.
+ if (ExplicitUnroll && OuterTripCount != 0)
+ UP.UnrollAndJamInnerLoopThreshold = PragmaUnrollAndJamThreshold;
+
+ if (!UP.AllowRemainder && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+ UP.UnrollAndJamInnerLoopThreshold) {
+ UP.Count = 0;
+ return false;
+ }
+
+  // If the inner loop count is known and small, leave the entire loop nest to
+  // be handled by the unroller.
+ if (!ExplicitUnroll && InnerTripCount &&
+ InnerLoopSize * InnerTripCount < UP.Threshold) {
+ UP.Count = 0;
+ return false;
+ }
+
+ // We have a sensible limit for the outer loop, now adjust it for the inner
+ // loop and UP.UnrollAndJamInnerLoopThreshold.
+ while (UP.Count != 0 && UP.AllowRemainder &&
+ getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+ UP.UnrollAndJamInnerLoopThreshold)
+ UP.Count--;
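+  // For example, with InnerLoopSize == 10, UP.BEInsns == 2 and an inner loop
+  // threshold of 60, the count is reduced until (10 - 2) * Count + 2 < 60,
+  // i.e. to at most 7.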
+
+ if (!ExplicitUnroll) {
+    // Check for situations where UnJ is likely to be unprofitable, including
+    // subloops with more than one block.
+ if (SubLoop->getBlocks().size() != 1) {
+ UP.Count = 0;
+ return false;
+ }
+
+ // Limit to loops where there is something to gain from unrolling and
+ // jamming the loop. In this case, look for loads that are invariant in the
+ // outer loop and can become shared.
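+    // For example, in
+    //   for (i ...) for (j ...) sum += A[j];
+    // the address of A[j] does not vary with the outer loop, so after unroll
+    // and jam the fused inner body can share one load of A[j] across the
+    // unrolled outer iterations.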
+ unsigned NumInvariant = 0;
+ for (BasicBlock *BB : SubLoop->getBlocks()) {
+ for (Instruction &I : *BB) {
+ if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+ Value *V = Ld->getPointerOperand();
+ const SCEV *LSCEV = SE.getSCEVAtScope(V, L);
+ if (SE.isLoopInvariant(LSCEV, L))
+ NumInvariant++;
+ }
+ }
+ }
+ if (NumInvariant == 0) {
+ UP.Count = 0;
+ return false;
+ }
+ }
+
+ return ExplicitUnroll;
+}
+
+static LoopUnrollResult
+tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
+ ScalarEvolution &SE, const TargetTransformInfo &TTI,
+ AssumptionCache &AC, DependenceInfo &DI,
+ OptimizationRemarkEmitter &ORE, int OptLevel) {
+ // Quick checks of the correct loop form
+ if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
+ return LoopUnrollResult::Unmodified;
+ Loop *SubLoop = L->getSubLoops()[0];
+ if (!SubLoop->isLoopSimplifyForm())
+ return LoopUnrollResult::Unmodified;
+
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Exit = L->getExitingBlock();
+ BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
+ BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
+
+ if (Latch != Exit || SubLoopLatch != SubLoopExit)
+ return LoopUnrollResult::Unmodified;
+
+ TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
+ L, SE, TTI, OptLevel, None, None, None, None, None, None);
+ if (AllowUnrollAndJam.getNumOccurrences() > 0)
+ UP.UnrollAndJam = AllowUnrollAndJam;
+ if (UnrollAndJamThreshold.getNumOccurrences() > 0)
+ UP.UnrollAndJamInnerLoopThreshold = UnrollAndJamThreshold;
+ // Exit early if unrolling is disabled.
+ if (!UP.UnrollAndJam || UP.UnrollAndJamInnerLoopThreshold == 0)
+ return LoopUnrollResult::Unmodified;
+
+ LLVM_DEBUG(dbgs() << "Loop Unroll and Jam: F["
+ << L->getHeader()->getParent()->getName() << "] Loop %"
+ << L->getHeader()->getName() << "\n");
+
+  // A loop with any unroll pragma (enabling/disabling/count/etc) is left for
+  // the unroller, so long as it does not explicitly have unroll_and_jam
+  // metadata. This means #pragma nounroll will disable unroll and jam as well
+  // as unrolling.
+ if (HasUnrollAndJamDisablePragma(L) ||
+ (HasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
+ !HasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam."))) {
+ LLVM_DEBUG(dbgs() << " Disabled due to pragma.\n");
+ return LoopUnrollResult::Unmodified;
+ }
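+
+  // For example (assuming the usual Clang loop-pragma spellings; the pragmas
+  // themselves are handled by the frontend, not by this file):
+  //   #pragma clang loop unroll(disable)         // also blocks unroll and jam
+  //   #pragma clang loop unroll_and_jam(enable)  // re-enables it explicitly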
+
+ if (!isSafeToUnrollAndJam(L, SE, DT, DI)) {
+ LLVM_DEBUG(dbgs() << " Disabled due to not being safe.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // Approximate the loop size and collect useful info
+ unsigned NumInlineCandidates;
+ bool NotDuplicatable;
+ bool Convergent;
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+ unsigned InnerLoopSize =
+ ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
+ Convergent, TTI, EphValues, UP.BEInsns);
+ unsigned OuterLoopSize =
+ ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
+ TTI, EphValues, UP.BEInsns);
+ LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterLoopSize << "\n");
+ LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSize << "\n");
+ if (NotDuplicatable) {
+ LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable "
+ "instructions.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ if (NumInlineCandidates != 0) {
+ LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ if (Convergent) {
+ LLVM_DEBUG(
+ dbgs() << " Not unrolling loop with convergent instructions.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // Find trip count and trip multiple
+ unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
+ unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
+ unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);
+
+ // Decide if, and by how much, to unroll
+ bool IsCountSetExplicitly = computeUnrollAndJamCount(
+ L, SubLoop, TTI, DT, LI, SE, EphValues, &ORE, OuterTripCount,
+ OuterTripMultiple, OuterLoopSize, InnerTripCount, InnerLoopSize, UP);
+ if (UP.Count <= 1)
+ return LoopUnrollResult::Unmodified;
+  // Unroll factor (Count) must be less than or equal to TripCount.
+ if (OuterTripCount && UP.Count > OuterTripCount)
+ UP.Count = OuterTripCount;
+
+ LoopUnrollResult UnrollResult =
+ UnrollAndJamLoop(L, UP.Count, OuterTripCount, OuterTripMultiple,
+ UP.UnrollRemainder, LI, &SE, &DT, &AC, &ORE);
+
+  // If the loop has an unroll count pragma or was unrolled with an explicitly
+  // set count, mark it as unrolled so it is not unrolled again beyond what
+  // was requested.
+ if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
+ L->setLoopAlreadyUnrolled();
+
+ return UnrollResult;
+}
+
+namespace {
+
+class LoopUnrollAndJam : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ unsigned OptLevel;
+
+ LoopUnrollAndJam(int OptLevel = 2) : LoopPass(ID), OptLevel(OptLevel) {
+ initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ Function &F = *L->getHeader()->getParent();
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(&F);
+
+ LoopUnrollResult Result =
+ tryToUnrollAndJamLoop(L, DT, LI, SE, TTI, AC, DI, ORE, OptLevel);
+
+ if (Result == LoopUnrollResult::FullyUnrolled)
+ LPM.markLoopAsDeleted(*L);
+
+ return Result != LoopUnrollResult::Unmodified;
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<DependenceAnalysisWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char LoopUnrollAndJam::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam",
+ "Unroll and Jam loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
+INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam",
+ "Unroll and Jam loops", false, false)
+
+Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) {
+ return new LoopUnrollAndJam(OptLevel);
+}
+
+PreservedAnalyses LoopUnrollAndJamPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+ Function *F = L.getHeader()->getParent();
+
+ auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
+ // FIXME: This should probably be optional rather than required.
+ if (!ORE)
+ report_fatal_error(
+ "LoopUnrollAndJamPass: OptimizationRemarkEmitterAnalysis not cached at "
+ "a higher level");
+
+ DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI);
+
+ LoopUnrollResult Result = tryToUnrollAndJamLoop(
+ &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, DI, *ORE, OptLevel);
+
+ if (Result == LoopUnrollResult::Unmodified)
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 15e7da5e1a7a..634215c9770f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -53,6 +53,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
@@ -164,7 +165,7 @@ static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
-static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
+TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
@@ -191,6 +192,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.Force = false;
UP.UpperBound = false;
UP.AllowPeeling = true;
+ UP.UnrollAndJam = false;
+ UP.UnrollAndJamInnerLoopThreshold = 60;
// Override with any target specific settings
TTI.getUnrollingPreferences(L, SE, UP);
@@ -285,17 +288,17 @@ struct UnrolledInstStateKeyInfo {
};
struct EstimatedUnrollCost {
- /// \brief The estimated cost after unrolling.
+ /// The estimated cost after unrolling.
unsigned UnrolledCost;
- /// \brief The estimated dynamic cost of executing the instructions in the
+ /// The estimated dynamic cost of executing the instructions in the
/// rolled form.
unsigned RolledDynamicCost;
};
} // end anonymous namespace
-/// \brief Figure out if the loop is worth full unrolling.
+/// Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities. This routine
@@ -308,10 +311,10 @@ struct EstimatedUnrollCost {
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop is
/// too big to analyze), the returned value is None.
-static Optional<EstimatedUnrollCost>
-analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
- ScalarEvolution &SE, const TargetTransformInfo &TTI,
- unsigned MaxUnrolledLoopSize) {
+static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
+ const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
+ const SmallPtrSetImpl<const Value *> &EphValues,
+ const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize) {
// We want to be able to scale offsets by the trip count and add more offsets
// to them without checking for overflows, and we already don't want to
// analyze *massive* trip counts, so we force the max to be reasonably small.
@@ -405,9 +408,9 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// First accumulate the cost of this instruction.
if (!Cost.IsFree) {
UnrolledCost += TTI.getUserCost(I);
- DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration
- << "): ");
- DEBUG(I->dump());
+ LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
+ << Iteration << "): ");
+ LLVM_DEBUG(I->dump());
}
// We must count the cost of every operand which is not free,
@@ -442,14 +445,14 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
assert(L->isLCSSAForm(DT) &&
"Must have loops in LCSSA form to track live-out values.");
- DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
+ LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
// Simulate execution of each iteration of the loop counting instructions,
// which would be simplified.
// Since the same load will take different values on different iterations,
// we literally have to go through all loop's iterations.
for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
- DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
+ LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
// Prepare for the iteration by collecting any simplified entry or backedge
// inputs.
@@ -490,7 +493,9 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// it. We don't change the actual IR, just count optimization
// opportunities.
for (Instruction &I : *BB) {
- if (isa<DbgInfoIntrinsic>(I))
+ // These won't get into the final code - don't even try calculating the
+ // cost for them.
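+      // (Ephemeral values are, roughly, values used only by llvm.assume and
+      // similar annotation intrinsics, so they never reach the final code.)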
+ if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
continue;
// Track this instruction's expected baseline cost when executing the
@@ -512,8 +517,13 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// Can't properly model a cost of a call.
// FIXME: With a proper cost model we should be able to do it.
- if(isa<CallInst>(&I))
- return None;
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ const Function *Callee = CI->getCalledFunction();
+ if (!Callee || TTI.isLoweredToCall(Callee)) {
+ LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
+ return None;
+ }
+ }
// If the instruction might have a side-effect recursively account for
// the cost of it and all the instructions leading up to it.
@@ -522,10 +532,10 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// If unrolled body turns out to be too big, bail out.
if (UnrolledCost > MaxUnrolledLoopSize) {
- DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
- << " UnrolledCost: " << UnrolledCost
- << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
- << "\n");
+ LLVM_DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost
+ << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
+ << "\n");
return None;
}
}
@@ -578,8 +588,8 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// If we found no optimization opportunities on the first iteration, we
// won't find them on later ones too.
if (UnrolledCost == RolledDynamicCost) {
- DEBUG(dbgs() << " No opportunities found.. exiting.\n"
- << " UnrolledCost: " << UnrolledCost << "\n");
+ LLVM_DEBUG(dbgs() << " No opportunities found.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost << "\n");
return None;
}
}
@@ -600,20 +610,17 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
}
}
- DEBUG(dbgs() << "Analysis finished:\n"
- << "UnrolledCost: " << UnrolledCost << ", "
- << "RolledDynamicCost: " << RolledDynamicCost << "\n");
+ LLVM_DEBUG(dbgs() << "Analysis finished:\n"
+ << "UnrolledCost: " << UnrolledCost << ", "
+ << "RolledDynamicCost: " << RolledDynamicCost << "\n");
return {{UnrolledCost, RolledDynamicCost}};
}
/// ApproximateLoopSize - Approximate the size of the loop.
-static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
- bool &NotDuplicatable, bool &Convergent,
- const TargetTransformInfo &TTI,
- AssumptionCache *AC, unsigned BEInsns) {
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
+unsigned llvm::ApproximateLoopSize(
+ const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
+ const TargetTransformInfo &TTI,
+ const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
CodeMetrics Metrics;
for (BasicBlock *BB : L->blocks())
Metrics.analyzeBasicBlock(BB, TTI, EphValues);
@@ -706,10 +713,11 @@ static uint64_t getUnrolledLoopSize(
// Returns true if unroll count was set explicitly.
// Calculates unroll count and writes it to UP.Count.
-static bool computeUnrollCount(
+bool llvm::computeUnrollCount(
Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution &SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
- unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize,
+ ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
+ OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
+ unsigned &TripMultiple, unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
// Check for explicit Count.
// 1st priority is unroll count set by "unroll-count" option.
@@ -729,7 +737,7 @@ static bool computeUnrollCount(
UP.Runtime = true;
UP.AllowExpensiveTripCount = true;
UP.Force = true;
- if (UP.AllowRemainder &&
+ if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
return true;
}
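+  // For example, TripMultiple == 8 with PragmaCount == 4 divides evenly, so
+  // no remainder loop is needed and the size check alone decides, even when
+  // remainder loops are disallowed.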
@@ -746,8 +754,8 @@ static bool computeUnrollCount(
if (ExplicitUnroll && TripCount != 0) {
// If the loop has an unrolling pragma, we want to be more aggressive with
- // unrolling limits. Set thresholds to at least the PragmaThreshold value
- // which is larger than the default limits.
+ // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
+ // value which is larger than the default limits.
UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
UP.PartialThreshold =
std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
@@ -763,7 +771,7 @@ static bool computeUnrollCount(
// compute the former when the latter is zero.
unsigned ExactTripCount = TripCount;
assert((ExactTripCount == 0 || MaxTripCount == 0) &&
- "ExtractTripCound and MaxTripCount cannot both be non zero.");
+ "ExtractTripCount and MaxTripCount cannot both be non zero.");
unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount;
UP.Count = FullUnrollTripCount;
if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
@@ -779,7 +787,7 @@ static bool computeUnrollCount(
// helps to remove a significant number of instructions.
// To check that, run additional analysis on the loop.
if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
- L, FullUnrollTripCount, DT, SE, TTI,
+ L, FullUnrollTripCount, DT, SE, EphValues, TTI,
UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
unsigned Boost =
getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
@@ -794,7 +802,7 @@ static bool computeUnrollCount(
}
// 4th priority is loop peeling
- computePeelCount(L, LoopSize, UP, TripCount);
+ computePeelCount(L, LoopSize, UP, TripCount, SE);
if (UP.PeelCount) {
UP.Runtime = false;
UP.Count = 1;
@@ -802,12 +810,12 @@ static bool computeUnrollCount(
}
// 5th priority is partial unrolling.
- // Try partial unroll only when TripCount could be staticaly calculated.
+ // Try partial unroll only when TripCount could be statically calculated.
if (TripCount) {
UP.Partial |= ExplicitUnroll;
if (!UP.Partial) {
- DEBUG(dbgs() << " will not try to unroll partially because "
- << "-unroll-allow-partial not given\n");
+ LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
+ << "-unroll-allow-partial not given\n");
UP.Count = 0;
return false;
}
@@ -894,8 +902,9 @@ static bool computeUnrollCount(
// Reduce count based on the type of unrolling and the threshold values.
UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
if (!UP.Runtime) {
- DEBUG(dbgs() << " will not try to unroll loop with runtime trip count "
- << "-unroll-runtime not given\n");
+ LLVM_DEBUG(
+ dbgs() << " will not try to unroll loop with runtime trip count "
+ << "-unroll-runtime not given\n");
UP.Count = 0;
return false;
}
@@ -915,12 +924,13 @@ static bool computeUnrollCount(
if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
while (UP.Count != 0 && TripMultiple % UP.Count != 0)
UP.Count >>= 1;
- DEBUG(dbgs() << "Remainder loop is restricted (that could architecture "
- "specific or because the loop contains a convergent "
- "instruction), so unroll count must divide the trip "
- "multiple, "
- << TripMultiple << ". Reducing unroll count from "
- << OrigCount << " to " << UP.Count << ".\n");
+    LLVM_DEBUG(
+        dbgs() << "Remainder loop is restricted (that could be architecture "
+                  "specific or because the loop contains a convergent "
+                  "instruction), so unroll count must divide the trip "
+                  "multiple, "
+               << TripMultiple << ". Reducing unroll count from " << OrigCount
+               << " to " << UP.Count << ".\n");
using namespace ore;
@@ -942,7 +952,8 @@ static bool computeUnrollCount(
if (UP.Count > UP.MaxCount)
UP.Count = UP.MaxCount;
- DEBUG(dbgs() << " partially unrolling with count: " << UP.Count << "\n");
+ LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
+ << "\n");
if (UP.Count < 2)
UP.Count = 0;
return ExplicitUnroll;
@@ -955,12 +966,13 @@ static LoopUnrollResult tryToUnrollLoop(
Optional<unsigned> ProvidedCount, Optional<unsigned> ProvidedThreshold,
Optional<bool> ProvidedAllowPartial, Optional<bool> ProvidedRuntime,
Optional<bool> ProvidedUpperBound, Optional<bool> ProvidedAllowPeeling) {
- DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName()
- << "] Loop %" << L->getHeader()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Loop Unroll: F["
+ << L->getHeader()->getParent()->getName() << "] Loop %"
+ << L->getHeader()->getName() << "\n");
if (HasUnrollDisablePragma(L))
return LoopUnrollResult::Unmodified;
if (!L->isLoopSimplifyForm()) {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
return LoopUnrollResult::Unmodified;
}
@@ -975,16 +987,21 @@ static LoopUnrollResult tryToUnrollLoop(
// Exit early if unrolling is disabled.
if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
return LoopUnrollResult::Unmodified;
- unsigned LoopSize = ApproximateLoopSize(
- L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC, UP.BEInsns);
- DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
+
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+
+ unsigned LoopSize =
+ ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
+ TTI, EphValues, UP.BEInsns);
+ LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
if (NotDuplicatable) {
- DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
- << " instructions.\n");
+ LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
+ << " instructions.\n");
return LoopUnrollResult::Unmodified;
}
if (NumInlineCandidates != 0) {
- DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+ LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
return LoopUnrollResult::Unmodified;
}
@@ -1030,7 +1047,7 @@ static LoopUnrollResult tryToUnrollLoop(
// loop tests remains the same compared to the non-unrolled version, whereas
// the generic upper bound unrolling keeps all but the last loop test so the
// number of loop tests goes up which may end up being worse on targets with
- // constriained branch predictor resources so is controlled by an option.)
+ // constrained branch predictor resources so is controlled by an option.)
// In addition we only unroll small upper bounds.
if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) {
MaxTripCount = 0;
@@ -1040,9 +1057,9 @@ static LoopUnrollResult tryToUnrollLoop(
// computeUnrollCount() decides whether it is beneficial to use upper bound to
// fully unroll the loop.
bool UseUpperBound = false;
- bool IsCountSetExplicitly =
- computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount,
- TripMultiple, LoopSize, UP, UseUpperBound);
+ bool IsCountSetExplicitly = computeUnrollCount(
+ L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount,
+ TripMultiple, LoopSize, UP, UseUpperBound);
if (!UP.Count)
return LoopUnrollResult::Unmodified;
  // Unroll factor (Count) must be less than or equal to TripCount.
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
index f2405d9b0c03..b12586758925 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -28,7 +28,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
@@ -39,6 +39,7 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
@@ -66,7 +67,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
@@ -298,9 +298,9 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount;
if (Metrics.notDuplicatable) {
- DEBUG(dbgs() << "NOT unswitching loop %"
- << L->getHeader()->getName() << ", contents cannot be "
- << "duplicated!\n");
+ LLVM_DEBUG(dbgs() << "NOT unswitching loop %" << L->getHeader()->getName()
+ << ", contents cannot be "
+ << "duplicated!\n");
return false;
}
}
@@ -635,6 +635,12 @@ bool LoopUnswitch::processCurrentLoop() {
return true;
}
+ // Do not do non-trivial unswitch while optimizing for size.
+ // FIXME: Use Function::optForSize().
+ if (OptimizeForSize ||
+ loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
+ return false;
+
// Run through the instructions in the loop, keeping track of three things:
//
// - That we do not unswitch loops containing convergent operations, as we
@@ -666,12 +672,6 @@ bool LoopUnswitch::processCurrentLoop() {
}
}
- // Do not do non-trivial unswitch while optimizing for size.
- // FIXME: Use Function::optForSize().
- if (OptimizeForSize ||
- loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
- return false;
-
for (IntrinsicInst *Guard : Guards) {
Value *LoopCond =
FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first;
@@ -856,20 +856,20 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
TerminatorInst *TI) {
// Check to see if it would be profitable to unswitch current loop.
if (!BranchesInfo.CostAllowsUnswitching()) {
- DEBUG(dbgs() << "NOT unswitching loop %"
- << currentLoop->getHeader()->getName()
- << " at non-trivial condition '" << *Val
- << "' == " << *LoopCond << "\n"
- << ". Cost too high.\n");
+ LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
+ << currentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Cost too high.\n");
return false;
}
if (hasBranchDivergence &&
getAnalysis<DivergenceAnalysis>().isDivergent(LoopCond)) {
- DEBUG(dbgs() << "NOT unswitching loop %"
- << currentLoop->getHeader()->getName()
- << " at non-trivial condition '" << *Val
- << "' == " << *LoopCond << "\n"
- << ". Condition is divergent.\n");
+ LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
+ << currentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Condition is divergent.\n");
return false;
}
@@ -910,6 +910,7 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
BranchInst *OldBranch,
TerminatorInst *TI) {
assert(OldBranch->isUnconditional() && "Preheader is not split correctly");
+ assert(TrueDest != FalseDest && "Branch targets should be different");
// Insert a conditional branch on LIC to the two preheaders. The original
// code is the true version and the new code is the false version.
Value *BranchVal = LIC;
@@ -942,9 +943,9 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
if (DT) {
// First, add both successors.
SmallVector<DominatorTree::UpdateType, 3> Updates;
- if (TrueDest != OldBranchParent)
+ if (TrueDest != OldBranchSucc)
Updates.push_back({DominatorTree::Insert, OldBranchParent, TrueDest});
- if (FalseDest != OldBranchParent)
+ if (FalseDest != OldBranchSucc)
Updates.push_back({DominatorTree::Insert, OldBranchParent, FalseDest});
// If both of the new successors are different from the old one, inform the
// DT that the edge was deleted.
@@ -970,11 +971,15 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
BasicBlock *ExitBlock,
TerminatorInst *TI) {
- DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
- << loopHeader->getName() << " [" << L->getBlocks().size()
- << " blocks] in Function "
- << L->getHeader()->getParent()->getName() << " on cond: " << *Val
- << " == " << *Cond << "\n");
+ LLVM_DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
+ << loopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function "
+ << L->getHeader()->getParent()->getName()
+ << " on cond: " << *Val << " == " << *Cond << "\n");
+ // We are going to make essential changes to CFG. This may invalidate cached
+ // information for L or one of its parent loops in SCEV.
+ if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
+ SEWP->getSE().forgetTopmostLoop(L);
// First step, split the preheader, so that we know that there is a safe place
// to insert the conditional branch. We will change loopPreheader to have a
@@ -1038,7 +1043,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
// until it finds the trivial condition candidate (condition that is not a
// constant). Since unswitching generates branches with constant conditions,
// this scenario could be very common in practice.
- SmallSet<BasicBlock*, 8> Visited;
+ SmallPtrSet<BasicBlock*, 8> Visited;
while (true) {
// If we exit loop or reach a previous visited block, then
@@ -1196,13 +1201,15 @@ void LoopUnswitch::SplitExitEdges(Loop *L,
void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
Loop *L, TerminatorInst *TI) {
Function *F = loopHeader->getParent();
- DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
- << loopHeader->getName() << " [" << L->getBlocks().size()
- << " blocks] in Function " << F->getName()
- << " when '" << *Val << "' == " << *LIC << "\n");
+ LLVM_DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
+ << loopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function " << F->getName() << " when '"
+ << *Val << "' == " << *LIC << "\n");
+ // We are going to make essential changes to CFG. This may invalidate cached
+ // information for L or one of its parent loops in SCEV.
if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
- SEWP->getSE().forgetLoop(L);
+ SEWP->getSE().forgetTopmostLoop(L);
LoopBlocks.clear();
NewBlocks.clear();
@@ -1355,7 +1362,7 @@ static void RemoveFromWorklist(Instruction *I,
static void ReplaceUsesOfWith(Instruction *I, Value *V,
std::vector<Instruction*> &Worklist,
Loop *L, LPPassManager *LPM) {
- DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
// Add uses to the worklist, which may be dead now.
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
@@ -1524,7 +1531,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
// Simple DCE.
if (isInstructionTriviallyDead(I)) {
- DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n");
// Add uses to the worklist, which may be dead now.
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
@@ -1557,8 +1564,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
if (!SinglePred) continue; // Nothing to do.
assert(SinglePred == Pred && "CFG broken");
- DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- "
- << Succ->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- "
+ << Succ->getName() << "\n");
// Resolve any single entry PHI nodes in Succ.
while (PHINode *PN = dyn_cast<PHINode>(Succ->begin()))
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 53b25e688e82..06e86081e8a0 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -68,6 +68,7 @@
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
@@ -85,6 +86,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <cassert>
@@ -111,7 +113,7 @@ static cl::opt<unsigned> LVLoopDepthThreshold(
"LoopVersioningLICM's threshold for maximum allowed loop nest/depth"),
cl::init(2), cl::Hidden);
-/// \brief Create MDNode for input string.
+/// Create MDNode for input string.
static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
LLVMContext &Context = TheLoop->getHeader()->getContext();
Metadata *MDs[] = {
@@ -120,7 +122,7 @@ static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
return MDNode::get(Context, MDs);
}
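
// For example, createStringMetadata(L, "llvm.loop.licm.versioning.disable", 1)
// produces a node equivalent to !{!"llvm.loop.licm.versioning.disable", i32 1}
// (the metadata name here is just an illustration).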
-/// \brief Set input string into loop metadata by keeping other values intact.
+/// Set input string into loop metadata by keeping other values intact.
void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
unsigned V) {
SmallVector<Metadata *, 4> MDs(1);
@@ -166,6 +168,7 @@ struct LoopVersioningLICM : public LoopPass {
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
StringRef getPassName() const override { return "Loop Versioning for LICM"; }
@@ -178,6 +181,7 @@ struct LoopVersioningLICM : public LoopPass {
LoadAndStoreCounter = 0;
InvariantCounter = 0;
IsReadOnlyLoop = true;
+ ORE = nullptr;
CurAST.reset();
}
@@ -207,7 +211,7 @@ private:
Loop *CurLoop = nullptr;
// AliasSet information for the current loop.
- std::unique_ptr<AliasSetTracker> CurAST;
+ std::unique_ptr<AliasSetTracker> CurAST;
// Maximum loop nest threshold
unsigned LoopDepthThreshold;
@@ -224,6 +228,9 @@ private:
// Read only loop marker.
bool IsReadOnlyLoop = true;
+ // OptimizationRemarkEmitter
+ OptimizationRemarkEmitter *ORE;
+
bool isLegalForVersioning();
bool legalLoopStructure();
bool legalLoopInstructions();
@@ -235,58 +242,57 @@ private:
} // end anonymous namespace
-/// \brief Check loop structure and confirms it's good for LoopVersioningLICM.
+/// Check loop structure and confirms it's good for LoopVersioningLICM.
bool LoopVersioningLICM::legalLoopStructure() {
// Loop must be in loop simplify form.
if (!CurLoop->isLoopSimplifyForm()) {
- DEBUG(
- dbgs() << " loop is not in loop-simplify form.\n");
+ LLVM_DEBUG(dbgs() << " loop is not in loop-simplify form.\n");
return false;
}
// Loop should be the innermost loop; if not, return false.
if (!CurLoop->getSubLoops().empty()) {
- DEBUG(dbgs() << " loop is not innermost\n");
+ LLVM_DEBUG(dbgs() << " loop is not innermost\n");
return false;
}
// Loop should have a single backedge, if not return false.
if (CurLoop->getNumBackEdges() != 1) {
- DEBUG(dbgs() << " loop has multiple backedges\n");
+ LLVM_DEBUG(dbgs() << " loop has multiple backedges\n");
return false;
}
// Loop must have a single exiting block, if not return false.
if (!CurLoop->getExitingBlock()) {
- DEBUG(dbgs() << " loop has multiple exiting block\n");
+ LLVM_DEBUG(dbgs() << " loop has multiple exiting blocks\n");
return false;
}
// We only handle bottom-tested loops, i.e. loops in which the condition is
// checked at the end of each iteration. With that we can assume that all
// instructions in the loop are executed the same number of times.
if (CurLoop->getExitingBlock() != CurLoop->getLoopLatch()) {
- DEBUG(dbgs() << " loop is not bottom tested\n");
+ LLVM_DEBUG(dbgs() << " loop is not bottom tested\n");
return false;
}
// Parallel loops must not have aliasing loop-invariant memory accesses.
// Hence we don't need to version anything in this case.
if (CurLoop->isAnnotatedParallel()) {
- DEBUG(dbgs() << " Parallel loop is not worth versioning\n");
+ LLVM_DEBUG(dbgs() << " Parallel loop is not worth versioning\n");
return false;
}
// Loop depth greater than LoopDepthThreshold is not allowed.
if (CurLoop->getLoopDepth() > LoopDepthThreshold) {
- DEBUG(dbgs() << " loop depth is more then threshold\n");
+ LLVM_DEBUG(dbgs() << " loop depth is more than the threshold\n");
return false;
}
// We need to be able to compute the loop trip count in order
// to generate the bound checks.
const SCEV *ExitCount = SE->getBackedgeTakenCount(CurLoop);
if (ExitCount == SE->getCouldNotCompute()) {
- DEBUG(dbgs() << " loop does not has trip count\n");
+ LLVM_DEBUG(dbgs() << " loop does not have a trip count\n");
return false;
}
return true;
}
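Taken together, these checks select a simple rotated loop: innermost, a single backedge, a single exiting block that is also the latch, and a SCEV-computable trip count. A source-level sketch of a loop that satisfies all of them (illustrative, not taken from the patch):

  void saxpy(float *y, const float *x, float a, int n) {
    // After loop rotation this is bottom-tested: the exit condition sits
    // in the latch, so every body instruction executes the same number of
    // times, and SCEV can compute the backedge-taken count from i < n.
    for (int i = 0; i < n; ++i)
      y[i] += a * x[i];
  }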
-/// \brief Check memory accesses in loop and confirms it's good for
+/// Check memory accesses in loop and confirms it's good for
/// LoopVersioningLICM.
bool LoopVersioningLICM::legalLoopMemoryAccesses() {
bool HasMayAlias = false;
@@ -328,24 +334,24 @@ bool LoopVersioningLICM::legalLoopMemoryAccesses() {
}
// Ensure all tracked accesses are of the same type.
if (!TypeSafety) {
- DEBUG(dbgs() << " Alias tracker type safety failed!\n");
+ LLVM_DEBUG(dbgs() << " Alias tracker type safety failed!\n");
return false;
}
// Ensure the loop body is not read-only.
if (!HasMod) {
- DEBUG(dbgs() << " No memory modified in loop body\n");
+ LLVM_DEBUG(dbgs() << " No memory modified in loop body\n");
return false;
}
// Make sure the alias set has a may-alias case.
// If there is no memory-access ambiguity, return false.
if (!HasMayAlias) {
- DEBUG(dbgs() << " No ambiguity in memory access.\n");
+ LLVM_DEBUG(dbgs() << " No ambiguity in memory access.\n");
return false;
}
return true;
}
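The memory legality test therefore demands at least one store and at least one may-alias pair; a read-only or fully disambiguated loop gains nothing from versioning. A hypothetical loop of the kind that passes:

  void scale(int *a, const int *b, int n) {
    for (int i = 0; i < n; ++i)
      a[i] = *b + i; // *b is loop-invariant only if 'a' and 'b' do not
                     // alias; versioning adds that runtime check so LICM
                     // can hoist the load of *b in the no-alias version.
  }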
-/// \brief Check loop instructions safe for Loop versioning.
+/// Check whether a loop instruction is safe for loop versioning.
/// Returns true if the instruction is safe, false otherwise.
/// Consider the following:
/// 1) Check that all loads and stores in the loop body are non-atomic and
///    non-volatile.
@@ -355,12 +361,12 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
assert(I != nullptr && "Null instruction found!");
// Check function call safety
if (isa<CallInst>(I) && !AA->doesNotAccessMemory(CallSite(I))) {
- DEBUG(dbgs() << " Unsafe call site found.\n");
+ LLVM_DEBUG(dbgs() << " Unsafe call site found.\n");
return false;
}
// Avoid loops with a possibility of throwing.
if (I->mayThrow()) {
- DEBUG(dbgs() << " May throw instruction found in loop body\n");
+ LLVM_DEBUG(dbgs() << " May throw instruction found in loop body\n");
return false;
}
// If the current instruction is a load instruction
@@ -368,7 +374,7 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
if (I->mayReadFromMemory()) {
LoadInst *Ld = dyn_cast<LoadInst>(I);
if (!Ld || !Ld->isSimple()) {
- DEBUG(dbgs() << " Found a non-simple load.\n");
+ LLVM_DEBUG(dbgs() << " Found a non-simple load.\n");
return false;
}
LoadAndStoreCounter++;
@@ -382,7 +388,7 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
else if (I->mayWriteToMemory()) {
StoreInst *St = dyn_cast<StoreInst>(I);
if (!St || !St->isSimple()) {
- DEBUG(dbgs() << " Found a non-simple store.\n");
+ LLVM_DEBUG(dbgs() << " Found a non-simple store.\n");
return false;
}
LoadAndStoreCounter++;
@@ -396,59 +402,87 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
return true;
}
-/// \brief Check loop instructions and confirms it's good for
+/// Check loop instructions and confirms it's good for
/// LoopVersioningLICM.
bool LoopVersioningLICM::legalLoopInstructions() {
// Resetting counters.
LoadAndStoreCounter = 0;
InvariantCounter = 0;
IsReadOnlyLoop = true;
+ using namespace ore;
// Iterate over loop blocks and instructions of each block and check
// instruction safety.
for (auto *Block : CurLoop->getBlocks())
for (auto &Inst : *Block) {
// If instruction is unsafe just return false.
- if (!instructionSafeForVersioning(&Inst))
+ if (!instructionSafeForVersioning(&Inst)) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopInst", &Inst)
+ << " Unsafe Loop Instruction";
+ });
return false;
+ }
}
// Get LoopAccessInfo from current loop.
LAI = &LAA->getInfo(CurLoop);
// Check LoopAccessInfo for need of runtime check.
if (LAI->getRuntimePointerChecking()->getChecks().empty()) {
- DEBUG(dbgs() << " LAA: Runtime check not found !!\n");
+ LLVM_DEBUG(dbgs() << " LAA: Runtime check not found !!\n");
return false;
}
// Number of runtime checks should be less than RuntimeMemoryCheckThreshold
if (LAI->getNumRuntimePointerChecks() >
VectorizerParams::RuntimeMemoryCheckThreshold) {
- DEBUG(dbgs() << " LAA: Runtime checks are more than threshold !!\n");
+ LLVM_DEBUG(
+ dbgs() << " LAA: Runtime checks are more than threshold !!\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "RuntimeCheck",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << "Number of runtime checks "
+ << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks())
+ << " exceeds threshold "
+ << NV("Threshold", VectorizerParams::RuntimeMemoryCheckThreshold);
+ });
return false;
}
// Loop should have at least one invariant load or store instruction.
if (!InvariantCounter) {
- DEBUG(dbgs() << " Invariant not found !!\n");
+ LLVM_DEBUG(dbgs() << " Invariant not found !!\n");
return false;
}
// Read-only loops are not allowed.
if (IsReadOnlyLoop) {
- DEBUG(dbgs() << " Found a read-only loop!\n");
+ LLVM_DEBUG(dbgs() << " Found a read-only loop!\n");
return false;
}
// Profitability check:
// Check the invariant threshold; it should be within the limit.
if (InvariantCounter * 100 < InvariantThreshold * LoadAndStoreCounter) {
- DEBUG(dbgs()
- << " Invariant load & store are less then defined threshold\n");
- DEBUG(dbgs() << " Invariant loads & stores: "
- << ((InvariantCounter * 100) / LoadAndStoreCounter) << "%\n");
- DEBUG(dbgs() << " Invariant loads & store threshold: "
- << InvariantThreshold << "%\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Invariant load & store are less than the defined threshold\n");
+ LLVM_DEBUG(dbgs() << " Invariant loads & stores: "
+ << ((InvariantCounter * 100) / LoadAndStoreCounter)
+ << "%\n");
+ LLVM_DEBUG(dbgs() << " Invariant loads & store threshold: "
+ << InvariantThreshold << "%\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "InvariantThreshold",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << "Invariant load & store "
+ << NV("LoadAndStoreCounter",
+ ((InvariantCounter * 100) / LoadAndStoreCounter))
+ << " are less than the defined threshold "
+ << NV("Threshold", InvariantThreshold);
+ });
return false;
}
return true;
}
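The profitability test above is an integer rendering of a percentage comparison: version only when invariant accesses make up at least InvariantThreshold percent of all counted loads and stores. The equivalent predicate, written out as a sketch with a worked example:

  // (InvariantCounter / LoadAndStoreCounter) * 100 >= InvariantThreshold,
  // kept in multiplied form to avoid integer division. For example, 3
  // invariant accesses out of 12 with a 25% threshold gives 3 * 100 == 300
  // and 25 * 12 == 300, so the loop is (just barely) worth versioning.
  bool Profitable =
      InvariantCounter * 100 >= InvariantThreshold * LoadAndStoreCounter;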
-/// \brief It checks loop is already visited or not.
+/// Checks whether the loop has already been visited.
/// Check the loop metadata: if the loop was revisited, return true;
/// otherwise return false.
bool LoopVersioningLICM::isLoopAlreadyVisited() {
@@ -459,42 +493,64 @@ bool LoopVersioningLICM::isLoopAlreadyVisited() {
return false;
}
-/// \brief Checks legality for LoopVersioningLICM by considering following:
+/// Checks legality for LoopVersioningLICM by considering following:
/// a) loop structure legality b) loop instruction legality
/// c) loop memory access legality.
/// Return true if legal else returns false.
bool LoopVersioningLICM::isLegalForVersioning() {
- DEBUG(dbgs() << "Loop: " << *CurLoop);
+ using namespace ore;
+ LLVM_DEBUG(dbgs() << "Loop: " << *CurLoop);
// Make sure not re-visiting same loop again.
if (isLoopAlreadyVisited()) {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << " Revisiting loop in LoopVersioningLICM not allowed.\n\n");
return false;
}
// Check loop structure legality.
if (!legalLoopStructure()) {
- DEBUG(
+ LLVM_DEBUG(
dbgs() << " Loop structure not suitable for LoopVersioningLICM\n\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopStruct",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << " Unsafe Loop structure";
+ });
return false;
}
// Check loop instruction legality.
if (!legalLoopInstructions()) {
- DEBUG(dbgs()
- << " Loop instructions not suitable for LoopVersioningLICM\n\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Loop instructions not suitable for LoopVersioningLICM\n\n");
return false;
}
// Check loop memory access legality.
if (!legalLoopMemoryAccesses()) {
- DEBUG(dbgs()
- << " Loop memory access not suitable for LoopVersioningLICM\n\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Loop memory access not suitable for LoopVersioningLICM\n\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopMemoryAccess",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << " Unsafe Loop memory access";
+ });
return false;
}
// Loop versioning is feasible, return true.
- DEBUG(dbgs() << " Loop Versioning found to be beneficial\n\n");
+ LLVM_DEBUG(dbgs() << " Loop Versioning found to be beneficial\n\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "IsLegalForVersioning",
+ CurLoop->getStartLoc(), CurLoop->getHeader())
+ << " Versioned loop for LICM."
+ << " Number of runtime checks we had to insert "
+ << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks());
+ });
return true;
}
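All the ORE->emit calls introduced in this hunk use the callable overload: the remark object is only constructed when remarks are actually enabled for the function, so the common case pays almost nothing for the instrumentation. The shape of the pattern, with placeholder names for the remark and its argument:

  ORE->emit([&]() {
    // Built lazily; "MyRemark" and "Key" are illustrative names only.
    return OptimizationRemark(DEBUG_TYPE, "MyRemark", I)
           << "explanatory text " << ore::NV("Key", SomeValue);
  });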
-/// \brief Update loop with aggressive aliasing assumptions.
+/// Update loop with aggressive aliasing assumptions.
/// It marks no-alias to any pairs of memory operations by assuming
/// loop should not have any must-alias memory access pairs.
/// During LoopVersioningLICM legality we ignore loops having must
@@ -542,6 +598,7 @@ bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) {
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
LAI = nullptr;
// Set Current Loop
CurLoop = L;
@@ -592,6 +649,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopVersioningLICM, "loop-versioning-licm",
"Loop Versioning For LICM", false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 46f8a3564265..68bfa0030395 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -357,7 +357,7 @@ PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F,
}
namespace {
-/// \brief Legacy pass for lowering expect intrinsics out of the IR.
+/// Legacy pass for lowering expect intrinsics out of the IR.
///
/// When this pass is run over a function it uses expect intrinsics which feed
/// branches and switches to provide branch weight metadata for those
diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 9c870b42a747..3b74421a47a0 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -25,6 +25,7 @@
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -55,7 +56,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -263,7 +263,7 @@ public:
void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
- addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
+ addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlignment(), MSI);
}
void addRange(int64_t Start, int64_t Size, Value *Ptr,
@@ -479,10 +479,10 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
AMemSet =
Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
- DEBUG(dbgs() << "Replace stores:\n";
- for (Instruction *SI : Range.TheStores)
- dbgs() << *SI << '\n';
- dbgs() << "With: " << *AMemSet << '\n');
+ LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI
+ : Range.TheStores) dbgs()
+ << *SI << '\n';
+ dbgs() << "With: " << *AMemSet << '\n');
if (!Range.TheStores.empty())
AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
@@ -498,16 +498,25 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
return AMemSet;
}
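For context, tryMergingIntoMemset collapses a run of adjacent stores of the same byte value into a single memset; the change above merely switches to the destination-specific alignment accessor. The store pattern it targets looks like this (illustrative):

  void clear4(char *p) {
    // Four adjacent byte stores of the same value fold to memset(p, 0, 4).
    p[0] = 0; p[1] = 0; p[2] = 0; p[3] = 0;
  }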
-static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
- const LoadInst *LI) {
+static unsigned findStoreAlignment(const DataLayout &DL, const StoreInst *SI) {
unsigned StoreAlign = SI->getAlignment();
if (!StoreAlign)
StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
+ return StoreAlign;
+}
+
+static unsigned findLoadAlignment(const DataLayout &DL, const LoadInst *LI) {
unsigned LoadAlign = LI->getAlignment();
if (!LoadAlign)
LoadAlign = DL.getABITypeAlignment(LI->getType());
+ return LoadAlign;
+}
- return std::min(StoreAlign, LoadAlign);
+static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
+ const LoadInst *LI) {
+ unsigned StoreAlign = findStoreAlignment(DL, SI);
+ unsigned LoadAlign = findLoadAlignment(DL, LI);
+ return MinAlign(StoreAlign, LoadAlign);
}
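Switching from std::min to MinAlign is a correctness refinement, not just style: MinAlign(A, B), from llvm/Support/MathExtras.h, returns the largest power of two that divides both values, i.e. the strongest alignment both operands actually guarantee. For powers of two the two functions agree, but for mixed inputs std::min can over-promise:

  // MinAlign(16, 4) == 4  -- matches std::min for powers of two.
  // MinAlign(12, 8) == 4  -- std::min would report 8, yet only 4-byte
  //                          alignment is common to both operands.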
// This method tries to lift a store instruction before position P.
@@ -522,7 +531,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
return false;
// Keep track of the arguments of all instruction we plan to lift
- // so we can make sure to lift them as well if apropriate.
+ // so we can make sure to lift them as well if appropriate.
DenseSet<Instruction*> Args;
if (auto *Ptr = dyn_cast<Instruction>(SI->getPointerOperand()))
if (Ptr->getParent() == SI->getParent())
@@ -594,7 +603,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
// We made it, we need to lift
for (auto *I : llvm::reverse(ToLift)) {
- DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
+ LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
I->moveBefore(P);
}
@@ -656,22 +665,23 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc))
UseMemMove = true;
- unsigned Align = findCommonAlignment(DL, SI, LI);
uint64_t Size = DL.getTypeStoreSize(T);
IRBuilder<> Builder(P);
Instruction *M;
if (UseMemMove)
- M = Builder.CreateMemMove(SI->getPointerOperand(),
- LI->getPointerOperand(), Size,
- Align, SI->isVolatile());
+ M = Builder.CreateMemMove(
+ SI->getPointerOperand(), findStoreAlignment(DL, SI),
+ LI->getPointerOperand(), findLoadAlignment(DL, LI), Size,
+ SI->isVolatile());
else
- M = Builder.CreateMemCpy(SI->getPointerOperand(),
- LI->getPointerOperand(), Size,
- Align, SI->isVolatile());
+ M = Builder.CreateMemCpy(
+ SI->getPointerOperand(), findStoreAlignment(DL, SI),
+ LI->getPointerOperand(), findLoadAlignment(DL, LI), Size,
+ SI->isVolatile());
- DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI
- << " => " << *M << "\n");
+ LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => "
+ << *M << "\n");
MD->removeInstruction(SI);
SI->eraseFromParent();
@@ -760,7 +770,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal,
Size, Align, SI->isVolatile());
- DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
+ LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
MD->removeInstruction(SI);
SI->eraseFromParent();
@@ -1047,20 +1057,17 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
// If all checks passed, then we can transform M.
- // Make sure to use the lesser of the alignment of the source and the dest
- // since we're changing where we're reading from, but don't want to increase
- // the alignment past what can be read from or written to.
// TODO: Is this worth it if we're creating a less aligned memcpy? For
// example we could be moving from movaps -> movq on x86.
- unsigned Align = std::min(MDep->getAlignment(), M->getAlignment());
-
IRBuilder<> Builder(M);
if (UseMemMove)
- Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(),
- Align, M->isVolatile());
+ Builder.CreateMemMove(M->getRawDest(), M->getDestAlignment(),
+ MDep->getRawSource(), MDep->getSourceAlignment(),
+ M->getLength(), M->isVolatile());
else
- Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(),
- Align, M->isVolatile());
+ Builder.CreateMemCpy(M->getRawDest(), M->getDestAlignment(),
+ MDep->getRawSource(), MDep->getSourceAlignment(),
+ M->getLength(), M->isVolatile());
// Remove the instruction we're replacing.
MD->removeInstruction(M);
@@ -1106,7 +1113,7 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
// If Dest is aligned, and SrcSize is constant, use the minimum alignment
// of the sum.
const unsigned DestAlign =
- std::max(MemSet->getAlignment(), MemCpy->getAlignment());
+ std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment());
if (DestAlign > 1)
if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
@@ -1166,7 +1173,7 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
IRBuilder<> Builder(MemCpy);
Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
- CopySize, MemCpy->getAlignment());
+ CopySize, MemCpy->getDestAlignment());
return true;
}
@@ -1192,7 +1199,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
IRBuilder<> Builder(M);
Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
- M->getAlignment(), false);
+ M->getDestAlignment(), false);
MD->removeInstruction(M);
M->eraseFromParent();
++NumCpyToSet;
@@ -1221,8 +1228,11 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
// d) memcpy from a just-memset'd source can be turned into memset.
if (DepInfo.isClobber()) {
if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
+ // FIXME: Can we pass in either of dest/src alignment here instead
+ // of conservatively taking the minimum?
+ unsigned Align = MinAlign(M->getDestAlignment(), M->getSourceAlignment());
if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
- CopySize->getZExtValue(), M->getAlignment(),
+ CopySize->getZExtValue(), Align,
C)) {
MD->removeInstruction(M);
M->eraseFromParent();
@@ -1284,8 +1294,8 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
MemoryLocation::getForSource(M)))
return false;
- DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
- << "\n");
+ LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
+ << "\n");
// If not, then we know we can transform this.
Type *ArgTys[3] = { M->getRawDest()->getType(),
@@ -1337,7 +1347,7 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
// source of the memcpy to the alignment we need. If we fail, we bail out.
AssumptionCache &AC = LookupAssumptionCache();
DominatorTree &DT = LookupDomTree();
- if (MDep->getAlignment() < ByValAlign &&
+ if (MDep->getSourceAlignment() < ByValAlign &&
getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL,
CS.getInstruction(), &AC, &DT) < ByValAlign)
return false;
@@ -1367,9 +1377,9 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
"tmpcast", CS.getInstruction());
- DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
- << " " << *MDep << "\n"
- << " " << *CS.getInstruction() << "\n");
+ LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
+ << " " << *MDep << "\n"
+ << " " << *CS.getInstruction() << "\n");
// Otherwise we're good! Update the byval argument.
CS.setArgument(ArgNo, TmpCast);
@@ -1381,10 +1391,19 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
bool MemCpyOptPass::iterateOnFunction(Function &F) {
bool MadeChange = false;
+ DominatorTree &DT = LookupDomTree();
+
// Walk all instructions in the function.
for (BasicBlock &BB : F) {
+ // Skip unreachable blocks. For example processStore assumes that an
+ // instruction in a BB can't be dominated by a later instruction in the
+ // same BB (which is a scenario that can happen for an unreachable BB that
+ // has itself as a predecessor).
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+
for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
- // Avoid invalidating the iterator.
+ // Avoid invalidating the iterator.
Instruction *I = &*BI++;
bool RepeatInstruction = false;
diff --git a/contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp
index 9869a3fb96fa..ff0183a8ea2d 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp
@@ -71,30 +71,30 @@ struct BCEAtom {
};
// If this value is a load from a constant offset w.r.t. a base address, and
-// there are no othe rusers of the load or address, returns the base address and
+// there are no other users of the load or address, returns the base address and
// the offset.
BCEAtom visitICmpLoadOperand(Value *const Val) {
BCEAtom Result;
if (auto *const LoadI = dyn_cast<LoadInst>(Val)) {
- DEBUG(dbgs() << "load\n");
+ LLVM_DEBUG(dbgs() << "load\n");
if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
- DEBUG(dbgs() << "used outside of block\n");
+ LLVM_DEBUG(dbgs() << "used outside of block\n");
return {};
}
if (LoadI->isVolatile()) {
- DEBUG(dbgs() << "volatile\n");
+ LLVM_DEBUG(dbgs() << "volatile\n");
return {};
}
Value *const Addr = LoadI->getOperand(0);
if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) {
- DEBUG(dbgs() << "GEP\n");
+ LLVM_DEBUG(dbgs() << "GEP\n");
if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
- DEBUG(dbgs() << "used outside of block\n");
+ LLVM_DEBUG(dbgs() << "used outside of block\n");
return {};
}
const auto &DL = GEP->getModule()->getDataLayout();
if (!isDereferenceablePointer(GEP, DL)) {
- DEBUG(dbgs() << "not dereferenceable\n");
+ LLVM_DEBUG(dbgs() << "not dereferenceable\n");
// We need to make sure that we can do comparisons in any order, so we
// require memory to be unconditionally dereferenceable.
return {};
@@ -110,6 +110,10 @@ BCEAtom visitICmpLoadOperand(Value *const Val) {
}
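A BCEAtom is one side of such a comparison: a load at a constant offset from a base pointer, with no other users of the load or its address. Chains of blocks comparing consecutive atoms are what the pass folds into memcmp; the canonical source pattern, as an illustration:

  struct S { int a, b, c; };
  bool eq(const S &x, const S &y) {
    // Three compares over contiguous, dereferenceable memory; MergeICmps
    // can replace the whole chain with memcmp(&x, &y, 12) == 0.
    return x.a == y.a && x.b == y.b && x.c == y.c;
  }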
// A basic block with a comparison between two BCE atoms.
+// The block might do extra work besides the atom comparison, in which case
+// doesOtherWork() returns true. Under some conditions, the block can be
+// split into the atom comparison part and the "other work" part
+// (see canSplit()).
// Note: the terminology is misleading: the comparison is symmetric, so there
// is no real {l/r}hs. What we want though is to have the same base on the
// left (resp. right), so that we can detect consecutive loads. To ensure this
@@ -127,7 +131,7 @@ class BCECmpBlock {
return Lhs_.Base() != nullptr && Rhs_.Base() != nullptr;
}
- // Assert the the block is consistent: If valid, it should also have
+ // Assert the block is consistent: If valid, it should also have
// non-null members besides Lhs_ and Rhs_.
void AssertConsistent() const {
if (IsValid()) {
@@ -144,37 +148,95 @@ class BCECmpBlock {
// Returns true if the block does other work besides the comparison.
bool doesOtherWork() const;
+ // Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp
+ // instructions in the block.
+ bool canSplit() const;
+
+ // Return true if all the relevant instructions in the BCE-cmp-block can
+ // be sunk below this instruction. By doing this, we know we can separate the
+ // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the
+ // block.
+ bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &) const;
+
+ // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block
+ // instructions. Split the old block and move all non-BCE-cmp-insts into the
+ // new parent block.
+ void split(BasicBlock *NewParent) const;
+
// The basic block where this comparison happens.
BasicBlock *BB = nullptr;
// The ICMP for this comparison.
ICmpInst *CmpI = nullptr;
// The terminating branch.
BranchInst *BranchI = nullptr;
+ // The block requires splitting.
+ bool RequireSplit = false;
- private:
+private:
BCEAtom Lhs_;
BCEAtom Rhs_;
int SizeBits_ = 0;
};
+bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
+ DenseSet<Instruction *> &BlockInsts) const {
+ // If this instruction has side effects and it's in the middle of the BCE cmp block
+ // instructions, then bail for now.
+ // TODO: use alias analysis to tell whether there is real interference.
+ if (Inst->mayHaveSideEffects())
+ return false;
+ // Make sure this instruction does not use any of the BCE cmp block
+ // instructions as operand.
+ for (auto BI : BlockInsts) {
+ if (is_contained(Inst->operands(), BI))
+ return false;
+ }
+ return true;
+}
+
+void BCECmpBlock::split(BasicBlock *NewParent) const {
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+ llvm::SmallVector<Instruction *, 4> OtherInsts;
+ for (Instruction &Inst : *BB) {
+ if (BlockInsts.count(&Inst))
+ continue;
+ assert(canSinkBCECmpInst(&Inst, BlockInsts) && "Split unsplittable block");
+ // This is a non-BCE-cmp-block instruction, and it can be separated
+ // from the BCE-cmp-block instructions.
+ OtherInsts.push_back(&Inst);
+ }
+
+ // Do the actual splitting.
+ for (Instruction *Inst : reverse(OtherInsts)) {
+ Inst->moveBefore(&*NewParent->begin());
+ }
+}
+
+bool BCECmpBlock::canSplit() const {
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+ for (Instruction &Inst : *BB) {
+ if (!BlockInsts.count(&Inst)) {
+ if (!canSinkBCECmpInst(&Inst, BlockInsts))
+ return false;
+ }
+ }
+ return true;
+}
+
bool BCECmpBlock::doesOtherWork() const {
AssertConsistent();
+ // All the instructions we care about in the BCE cmp block.
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
// TODO(courbet): Can we allow some other things ? This is very conservative.
- // We might be able to get away with anything does does not have any side
+ // We might be able to get away with anything that does not have any side
// effects outside of the basic block.
// Note: The GEPs and/or loads are not necessarily in the same block.
for (const Instruction &Inst : *BB) {
- if (const auto *const GEP = dyn_cast<GetElementPtrInst>(&Inst)) {
- if (!(Lhs_.GEP == GEP || Rhs_.GEP == GEP)) return true;
- } else if (const auto *const L = dyn_cast<LoadInst>(&Inst)) {
- if (!(Lhs_.LoadI == L || Rhs_.LoadI == L)) return true;
- } else if (const auto *const C = dyn_cast<ICmpInst>(&Inst)) {
- if (C != CmpI) return true;
- } else if (const auto *const Br = dyn_cast<BranchInst>(&Inst)) {
- if (Br != BranchI) return true;
- } else {
+ if (!BlockInsts.count(&Inst))
return true;
- }
}
return false;
}
@@ -183,10 +245,19 @@ bool BCECmpBlock::doesOtherWork() const {
// BCE atoms, returns the comparison.
BCECmpBlock visitICmp(const ICmpInst *const CmpI,
const ICmpInst::Predicate ExpectedPredicate) {
+ // The comparison can only be used once:
+ // - For intermediate blocks, as a branch condition.
+ // - For the final block, as an incoming value for the Phi.
+ // If there are any other uses of the comparison, we cannot merge it with
+ // other comparisons as we would create an orphan use of the value.
+ if (!CmpI->hasOneUse()) {
+ LLVM_DEBUG(dbgs() << "cmp has several uses\n");
+ return {};
+ }
if (CmpI->getPredicate() == ExpectedPredicate) {
- DEBUG(dbgs() << "cmp "
- << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
- << "\n");
+ LLVM_DEBUG(dbgs() << "cmp "
+ << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
+ << "\n");
auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0));
if (!Lhs.Base()) return {};
auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1));
@@ -204,7 +275,7 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
if (Block->empty()) return {};
auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
if (!BranchI) return {};
- DEBUG(dbgs() << "branch\n");
+ LLVM_DEBUG(dbgs() << "branch\n");
if (BranchI->isUnconditional()) {
// In this case, we expect an incoming value which is the result of the
// comparison. This is the last link in the chain of comparisons (note
@@ -212,7 +283,7 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
// can be reordered).
auto *const CmpI = dyn_cast<ICmpInst>(Val);
if (!CmpI) return {};
- DEBUG(dbgs() << "icmp\n");
+ LLVM_DEBUG(dbgs() << "icmp\n");
auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ);
Result.CmpI = CmpI;
Result.BranchI = BranchI;
@@ -221,12 +292,12 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
// In this case, we expect a constant incoming value (the comparison is
// chained).
const auto *const Const = dyn_cast<ConstantInt>(Val);
- DEBUG(dbgs() << "const\n");
+ LLVM_DEBUG(dbgs() << "const\n");
if (!Const->isZero()) return {};
- DEBUG(dbgs() << "false\n");
+ LLVM_DEBUG(dbgs() << "false\n");
auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
if (!CmpI) return {};
- DEBUG(dbgs() << "icmp\n");
+ LLVM_DEBUG(dbgs() << "icmp\n");
assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
auto Result = visitICmp(
@@ -238,6 +309,18 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
return {};
}
+static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
+ BCECmpBlock &Comparison) {
+ LLVM_DEBUG(dbgs() << "Block '" << Comparison.BB->getName()
+ << "': Found cmp of " << Comparison.SizeBits()
+ << " bits between " << Comparison.Lhs().Base() << " + "
+ << Comparison.Lhs().Offset << " and "
+ << Comparison.Rhs().Base() << " + "
+ << Comparison.Rhs().Offset << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
+ Comparisons.push_back(Comparison);
+}
+
// A chain of comparisons.
class BCECmpChain {
public:
@@ -263,9 +346,9 @@ class BCECmpChain {
// Merges the given comparison blocks into one memcmp block and update
// branches. Comparisons are assumed to be contiguous. If NextBBInChain is
// null, the merged block will link to the phi block.
- static void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
- BasicBlock *const NextBBInChain, PHINode &Phi,
- const TargetLibraryInfo *const TLI);
+ void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+ BasicBlock *const NextBBInChain, PHINode &Phi,
+ const TargetLibraryInfo *const TLI);
PHINode &Phi_;
std::vector<BCECmpBlock> Comparisons_;
@@ -275,24 +358,47 @@ class BCECmpChain {
BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
: Phi_(Phi) {
+ assert(!Blocks.empty() && "a chain should have at least one block");
// Now look inside blocks to check for BCE comparisons.
std::vector<BCECmpBlock> Comparisons;
- for (BasicBlock *Block : Blocks) {
+ for (size_t BlockIdx = 0; BlockIdx < Blocks.size(); ++BlockIdx) {
+ BasicBlock *const Block = Blocks[BlockIdx];
+ assert(Block && "invalid block");
BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
Block, Phi.getParent());
Comparison.BB = Block;
if (!Comparison.IsValid()) {
- DEBUG(dbgs() << "skip: not a valid BCECmpBlock\n");
+ LLVM_DEBUG(dbgs() << "chain with invalid BCECmpBlock, no merge.\n");
return;
}
if (Comparison.doesOtherWork()) {
- DEBUG(dbgs() << "block does extra work besides compare\n");
- if (Comparisons.empty()) { // First block.
- // TODO(courbet): The first block can do other things, and we should
- // split them apart in a separate block before the comparison chain.
- // Right now we just discard it and make the chain shorter.
- DEBUG(dbgs()
- << "ignoring first block that does extra work besides compare\n");
+ LLVM_DEBUG(dbgs() << "block '" << Comparison.BB->getName()
+ << "' does extra work besides compare\n");
+ if (Comparisons.empty()) {
+ // This is the initial block in the chain; if this block does other
+ // work, we can try to split the block and move the irrelevant
+ // instructions into the predecessor.
+ //
+ // If this is not the initial block in the chain, splitting it won't
+ // work.
+ //
+ // Once split, there would still be instructions that do other work
+ // before the BCE cmp instructions in program order, i.e. within the
+ // chain before sorting, unless we could abort the chain at this
+ // point and start anew.
+ //
+ // NOTE: we only handle blocks with a single predecessor for now.
+ if (Comparison.canSplit()) {
+ LLVM_DEBUG(dbgs()
+ << "Split initial block '" << Comparison.BB->getName()
+ << "' that does extra work besides compare\n");
+ Comparison.RequireSplit = true;
+ enqueueBlock(Comparisons, Comparison);
+ } else {
+ LLVM_DEBUG(dbgs()
+ << "ignoring initial block '" << Comparison.BB->getName()
+ << "' that does extra work besides compare\n");
+ }
continue;
}
// TODO(courbet): Right now we abort the whole chain. We could be
@@ -320,13 +426,13 @@ BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
// We could still merge bb1 and bb2 though.
return;
}
- DEBUG(dbgs() << "*Found cmp of " << Comparison.SizeBits()
- << " bits between " << Comparison.Lhs().Base() << " + "
- << Comparison.Lhs().Offset << " and "
- << Comparison.Rhs().Base() << " + " << Comparison.Rhs().Offset
- << "\n");
- DEBUG(dbgs() << "\n");
- Comparisons.push_back(Comparison);
+ enqueueBlock(Comparisons, Comparison);
+ }
+
+ // It is possible we have no suitable comparison to merge.
+ if (Comparisons.empty()) {
+ LLVM_DEBUG(dbgs() << "chain with no BCE basic blocks, no merge\n");
+ return;
}
EntryBlock_ = Comparisons[0].BB;
Comparisons_ = std::move(Comparisons);
@@ -336,10 +442,10 @@ BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
#endif // MERGEICMPS_DOT_ON
// Reorder blocks by LHS. We can do that without changing the
// semantics because we are only accessing dereferencable memory.
- std::sort(Comparisons_.begin(), Comparisons_.end(),
- [](const BCECmpBlock &a, const BCECmpBlock &b) {
- return a.Lhs() < b.Lhs();
- });
+ llvm::sort(Comparisons_.begin(), Comparisons_.end(),
+ [](const BCECmpBlock &a, const BCECmpBlock &b) {
+ return a.Lhs() < b.Lhs();
+ });
#ifdef MERGEICMPS_DOT_ON
errs() << "AFTER REORDERING:\n\n";
dump();
@@ -389,10 +495,24 @@ bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) {
Phi_.removeIncomingValue(Comparison.BB, false);
}
+ // If entry block is part of the chain, we need to make the first block
+ // of the chain the new entry block of the function.
+ BasicBlock *Entry = &Comparisons_[0].BB->getParent()->getEntryBlock();
+ for (size_t I = 1; I < Comparisons_.size(); ++I) {
+ if (Entry == Comparisons_[I].BB) {
+ BasicBlock *NEntryBB = BasicBlock::Create(Entry->getContext(), "",
+ Entry->getParent(), Entry);
+ BranchInst::Create(Entry, NEntryBB);
+ break;
+ }
+ }
+
// Point the predecessors of the chain to the first comparison block (which is
- // the new entry point).
- if (EntryBlock_ != Comparisons_[0].BB)
+ // the new entry point) and update the entry block of the chain.
+ if (EntryBlock_ != Comparisons_[0].BB) {
EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB);
+ EntryBlock_ = Comparisons_[0].BB;
+ }
// Effectively merge blocks.
int NumMerged = 1;
@@ -424,7 +544,15 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
LLVMContext &Context = BB->getContext();
if (Comparisons.size() >= 2) {
- DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
+ // If there is one block that requires splitting, we do it now, i.e.
+ // just before we know we will collapse the chain. The instructions
+ // can be executed before any of the instructions in the chain.
+ auto C = std::find_if(Comparisons.begin(), Comparisons.end(),
+ [](const BCECmpBlock &B) { return B.RequireSplit; });
+ if (C != Comparisons.end())
+ C->split(EntryBlock_);
+
+ LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
const auto TotalSize =
std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
[](int Size, const BCECmpBlock &C) {
@@ -445,7 +573,8 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
IRBuilder<> Builder(BB);
const auto &DL = Phi.getModule()->getDataLayout();
Value *const MemCmpCall = emitMemCmp(
- FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP, ConstantInt::get(DL.getIntPtrType(Context), TotalSize),
+ FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP,
+ ConstantInt::get(DL.getIntPtrType(Context), TotalSize),
Builder, DL, TLI);
Value *const MemCmpIsZero = Builder.CreateICmpEQ(
MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));
@@ -468,17 +597,17 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
} else {
assert(Comparisons.size() == 1);
// There are no blocks to merge, but we still need to update the branches.
- DEBUG(dbgs() << "Only one comparison, updating branches\n");
+ LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n");
if (NextBBInChain) {
if (FirstComparison.BranchI->isConditional()) {
- DEBUG(dbgs() << "conditional -> conditional\n");
+ LLVM_DEBUG(dbgs() << "conditional -> conditional\n");
// Just update the "true" target, the "false" target should already be
// the phi block.
assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent());
FirstComparison.BranchI->setSuccessor(0, NextBBInChain);
Phi.addIncoming(ConstantInt::getFalse(Context), BB);
} else {
- DEBUG(dbgs() << "unconditional -> conditional\n");
+ LLVM_DEBUG(dbgs() << "unconditional -> conditional\n");
// Replace the unconditional branch by a conditional one.
FirstComparison.BranchI->eraseFromParent();
IRBuilder<> Builder(BB);
@@ -488,14 +617,14 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
}
} else {
if (FirstComparison.BranchI->isConditional()) {
- DEBUG(dbgs() << "conditional -> unconditional\n");
+ LLVM_DEBUG(dbgs() << "conditional -> unconditional\n");
// Replace the conditional branch by an unconditional one.
FirstComparison.BranchI->eraseFromParent();
IRBuilder<> Builder(BB);
Builder.CreateBr(Phi.getParent());
Phi.addIncoming(FirstComparison.CmpI, BB);
} else {
- DEBUG(dbgs() << "unconditional -> unconditional\n");
+ LLVM_DEBUG(dbgs() << "unconditional -> unconditional\n");
Phi.addIncoming(FirstComparison.CmpI, BB);
}
}
@@ -507,27 +636,28 @@ std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
int NumBlocks) {
// Walk up from the last block to find other blocks.
std::vector<BasicBlock *> Blocks(NumBlocks);
+ assert(LastBlock && "invalid last block");
BasicBlock *CurBlock = LastBlock;
for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
if (CurBlock->hasAddressTaken()) {
// Somebody is jumping to the block through an address, all bets are
// off.
- DEBUG(dbgs() << "skip: block " << BlockIndex
- << " has its address taken\n");
+ LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " has its address taken\n");
return {};
}
Blocks[BlockIndex] = CurBlock;
auto *SinglePredecessor = CurBlock->getSinglePredecessor();
if (!SinglePredecessor) {
// The block has two or more predecessors.
- DEBUG(dbgs() << "skip: block " << BlockIndex
- << " has two or more predecessors\n");
+ LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " has two or more predecessors\n");
return {};
}
if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
// The block does not link back to the phi.
- DEBUG(dbgs() << "skip: block " << BlockIndex
- << " does not link back to the phi\n");
+ LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " does not link back to the phi\n");
return {};
}
CurBlock = SinglePredecessor;
@@ -537,9 +667,9 @@ std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
}
bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
- DEBUG(dbgs() << "processPhi()\n");
+ LLVM_DEBUG(dbgs() << "processPhi()\n");
if (Phi.getNumIncomingValues() <= 1) {
- DEBUG(dbgs() << "skip: only one incoming value in phi\n");
+ LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n");
return false;
}
// We are looking for something that has the following structure:
@@ -552,7 +682,7 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
// - The last basic block (bb4 here) must branch unconditionally to bb_phi.
// It's the only block that contributes a non-constant value to the Phi.
// - All other blocks (b1, b2, b3) must have exactly two successors, one of
- // them being the the phi block.
+ // them being the phi block.
// - All intermediate blocks (bb2, bb3) must have only one predecessor.
// - Blocks cannot do other work besides the comparison, see doesOtherWork()
@@ -563,18 +693,31 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
if (isa<ConstantInt>(Phi.getIncomingValue(I))) continue;
if (LastBlock) {
// There are several non-constant values.
- DEBUG(dbgs() << "skip: several non-constant values\n");
+ LLVM_DEBUG(dbgs() << "skip: several non-constant values\n");
+ return false;
+ }
+ if (!isa<ICmpInst>(Phi.getIncomingValue(I)) ||
+ cast<ICmpInst>(Phi.getIncomingValue(I))->getParent() !=
+ Phi.getIncomingBlock(I)) {
+ // Non-constant incoming value is not from a cmp instruction or not
+ // produced by the last block. We could end up processing the value
+ // producing block more than once.
+ //
+ // This is an uncommon case, so we bail.
+ LLVM_DEBUG(
+ dbgs()
+ << "skip: non-constant value not from cmp or not from last block.\n");
return false;
}
LastBlock = Phi.getIncomingBlock(I);
}
if (!LastBlock) {
// There is no non-constant block.
- DEBUG(dbgs() << "skip: no non-constant block\n");
+ LLVM_DEBUG(dbgs() << "skip: no non-constant block\n");
return false;
}
if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
- DEBUG(dbgs() << "skip: last block non-phi successor\n");
+ LLVM_DEBUG(dbgs() << "skip: last block non-phi successor\n");
return false;
}
@@ -584,7 +727,7 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
BCECmpChain CmpChain(Blocks, Phi);
if (CmpChain.size() < 2) {
- DEBUG(dbgs() << "skip: only one compare block\n");
+ LLVM_DEBUG(dbgs() << "skip: only one compare block\n");
return false;
}
@@ -619,12 +762,16 @@ class MergeICmps : public FunctionPass {
PreservedAnalyses MergeICmps::runImpl(Function &F, const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI) {
- DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");
// We only try merging comparisons if the target wants to expand memcmp later.
// The rationale is to avoid turning small chains into memcmp calls.
if (!TTI->enableMemCmpExpansion(true)) return PreservedAnalyses::all();
+ // If we don't have memcmp available, we can't emit calls to it.
+ if (!TLI->has(LibFunc_memcmp))
+ return PreservedAnalyses::all();
+
bool MadeChange = false;
for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index f2f615cb9b0f..3464b759280f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
//! \file
-//! \brief This pass performs merges of loads and stores on both sides of a
+//! This pass performs merges of loads and stores on both sides of a
// diamond (hammock). It hoists the loads and sinks the stores.
//
// The algorithm iteratively hoists two loads to the same address out of a
@@ -80,7 +80,6 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Debug.h"
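The diamond (hammock) the pass operates on is a conditional whose two arms rejoin at a common tail block. A source-level sketch of the store-sinking case (hypothetical example):

  int f(bool c, int *p, int x) {
    if (c)
      *p = x + 1; // store in the 'then' arm
    else
      *p = x - 1; // matching store in the 'else' arm
    return *p;    // both stores sink here as one store of a phi
  }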
@@ -97,7 +96,6 @@ namespace {
// MergedLoadStoreMotion Pass
//===----------------------------------------------------------------------===//
class MergedLoadStoreMotion {
- MemoryDependenceResults *MD = nullptr;
AliasAnalysis *AA = nullptr;
// The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
@@ -107,14 +105,9 @@ class MergedLoadStoreMotion {
const int MagicCompileTimeControl = 250;
public:
- bool run(Function &F, MemoryDependenceResults *MD, AliasAnalysis &AA);
+ bool run(Function &F, AliasAnalysis &AA);
private:
- ///
- /// \brief Remove instruction from parent and update memory dependence
- /// analysis.
- ///
- void removeInstruction(Instruction *Inst);
BasicBlock *getDiamondTail(BasicBlock *BB);
bool isDiamondHead(BasicBlock *BB);
// Routines for sinking stores
@@ -128,23 +121,7 @@ private:
} // end anonymous namespace
///
-/// \brief Remove instruction from parent and update memory dependence analysis.
-///
-void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) {
- // Notify the memory dependence analysis.
- if (MD) {
- MD->removeInstruction(Inst);
- if (auto *LI = dyn_cast<LoadInst>(Inst))
- MD->invalidateCachedPointerInfo(LI->getPointerOperand());
- if (Inst->getType()->isPtrOrPtrVectorTy()) {
- MD->invalidateCachedPointerInfo(Inst);
- }
- }
- Inst->eraseFromParent();
-}
-
-///
-/// \brief Return tail block of a diamond.
+/// Return tail block of a diamond.
///
BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
assert(isDiamondHead(BB) && "Basic block is not head of a diamond");
@@ -152,7 +129,7 @@ BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
}
///
-/// \brief True when BB is the head of a diamond (hammock)
+/// True when BB is the head of a diamond (hammock)
///
bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
if (!BB)
@@ -179,7 +156,7 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
///
-/// \brief True when instruction is a sink barrier for a store
+/// True when instruction is a sink barrier for a store
/// located in Loc
///
/// Whenever an instruction could possibly read or modify the
@@ -197,13 +174,13 @@ bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start,
}
///
-/// \brief Check if \p BB contains a store to the same address as \p SI
+/// Check if \p BB contains a store to the same address as \p SI
///
/// \return The store in \p BB1 when it is safe to sink; otherwise return null.
///
StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
StoreInst *Store0) {
- DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
BasicBlock *BB0 = Store0->getParent();
for (Instruction &Inst : reverse(*BB1)) {
auto *Store1 = dyn_cast<StoreInst>(&Inst);
@@ -222,7 +199,7 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
}
///
-/// \brief Create a PHI node in BB for the operands of S0 and S1
+/// Create a PHI node in BB for the operands of S0 and S1
///
PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
StoreInst *S1) {
@@ -236,13 +213,11 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
&BB->front());
NewPN->addIncoming(Opd1, S0->getParent());
NewPN->addIncoming(Opd2, S1->getParent());
- if (MD && NewPN->getType()->isPtrOrPtrVectorTy())
- MD->invalidateCachedPointerInfo(NewPN);
return NewPN;
}
///
-/// \brief Merge two stores to same address and sink into \p BB
+/// Merge two stores to same address and sink into \p BB
///
/// Also sinks GEP instruction computing the store address
///
@@ -254,9 +229,9 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
(A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
(A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) {
- DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
+ dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
+ dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
// Hoist the instruction.
BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
// Intersect optional metadata.
@@ -275,19 +250,19 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
// New PHI operand? Use it.
if (PHINode *NewPN = getPHIOperand(BB, S0, S1))
SNew->setOperand(0, NewPN);
- removeInstruction(S0);
- removeInstruction(S1);
+ S0->eraseFromParent();
+ S1->eraseFromParent();
A0->replaceAllUsesWith(ANew);
- removeInstruction(A0);
+ A0->eraseFromParent();
A1->replaceAllUsesWith(ANew);
- removeInstruction(A1);
+ A1->eraseFromParent();
return true;
}
return false;
}
///
-/// \brief True when two stores are equivalent and can sink into the footer
+/// True when two stores are equivalent and can sink into the footer
///
/// Starting from a diamond tail block, iterate over the instructions in one
/// predecessor block and try to match a store in the second predecessor.
@@ -310,7 +285,8 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
return false; // No. More than 2 predecessors.
// #Instructions in Succ1 for Compile Time Control
- int Size1 = Pred1->size();
+ auto InstsNoDbg = Pred1->instructionsWithoutDebug();
+ int Size1 = std::distance(InstsNoDbg.begin(), InstsNoDbg.end());
int NStores = 0;
for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend();
@@ -338,19 +314,17 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
break;
RBI = Pred0->rbegin();
RBE = Pred0->rend();
- DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
+ LLVM_DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
}
}
return MergedStores;
}
-bool MergedLoadStoreMotion::run(Function &F, MemoryDependenceResults *MD,
- AliasAnalysis &AA) {
- this->MD = MD;
+bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) {
this->AA = &AA;
bool Changed = false;
- DEBUG(dbgs() << "Instruction Merger\n");
+ LLVM_DEBUG(dbgs() << "Instruction Merger\n");
// Merge unconditional branches, allowing PRE to catch more
// optimization opportunities.
@@ -376,15 +350,13 @@ public:
}
///
- /// \brief Run the transformation for each function
+ /// Run the transformation for each function
///
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
MergedLoadStoreMotion Impl;
- auto *MDWP = getAnalysisIfAvailable<MemoryDependenceWrapperPass>();
- return Impl.run(F, MDWP ? &MDWP->getMemDep() : nullptr,
- getAnalysis<AAResultsWrapperPass>().getAAResults());
+ return Impl.run(F, getAnalysis<AAResultsWrapperPass>().getAAResults());
}
private:
@@ -392,7 +364,6 @@ private:
AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<MemoryDependenceWrapperPass>();
}
};
@@ -400,7 +371,7 @@ char MergedLoadStoreMotionLegacyPass::ID = 0;
} // anonymous namespace
///
-/// \brief createMergedLoadStoreMotionPass - The public interface to this file.
+/// createMergedLoadStoreMotionPass - The public interface to this file.
///
FunctionPass *llvm::createMergedLoadStoreMotionPass() {
return new MergedLoadStoreMotionLegacyPass();
@@ -408,7 +379,6 @@ FunctionPass *llvm::createMergedLoadStoreMotionPass() {
INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion",
"MergedLoadStoreMotion", false, false)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion",
"MergedLoadStoreMotion", false, false)
@@ -416,14 +386,12 @@ INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion",
PreservedAnalyses
MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
MergedLoadStoreMotion Impl;
- auto *MD = AM.getCachedResult<MemoryDependenceAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
- if (!Impl.run(F, MD, AA))
+ if (!Impl.run(F, AA))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
- PA.preserve<MemoryDependenceAnalysis>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index b026c8d692c3..7106ea216ad6 100644
--- a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -83,6 +83,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -105,7 +106,6 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
@@ -240,10 +240,17 @@ bool NaryReassociatePass::doOneIteration(Function &F) {
Changed = true;
SE->forgetValue(&*I);
I->replaceAllUsesWith(NewI);
- // If SeenExprs constains I's WeakTrackingVH, that entry will be
- // replaced with
- // nullptr.
+ WeakVH NewIExist = NewI;
+ // If SeenExprs/NewIExist contains I's WeakTrackingVH/WeakVH, that
+ // entry will be replaced with nullptr if deleted.
RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI);
+ if (!NewIExist) {
+ // Rare occasion where the new instruction (NewI) has been removed,
+ // probably because parts of the input code were dead from the
+ // beginning: reset the iterator and start over from the beginning.
+ I = BB->begin();
+ continue;
+ }
I = NewI->getIterator();
}
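The guard added here is the standard weak-handle idiom: a WeakVH nulls itself when the instruction it tracks is deleted, so it can be consulted after a call that may erase arbitrary instructions. The pattern in isolation, as a sketch rather than the surrounding code verbatim:

  WeakVH Guard = NewI;
  RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI);
  if (!Guard) {
    // NewI itself was deleted, so any iterator derived from it is
    // invalid; restart the scan of the block from the beginning.
  }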
// Add the rewritten instruction to SeenExprs; the original instruction
@@ -429,6 +436,9 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
Instruction *NaryReassociatePass::tryReassociateBinaryOp(BinaryOperator *I) {
Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ // There is no need to reassociate 0.
+ if (SE->getSCEV(I)->isZero())
+ return nullptr;
if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I))
return NewI;
if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I))
diff --git a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 9ebf2d769356..2eb887c986be 100644
--- a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -77,6 +77,7 @@
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
@@ -105,7 +106,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVNExpression.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PredicateInfo.h"
#include "llvm/Transforms/Utils/VNCoercion.h"
#include <algorithm>
@@ -221,13 +221,13 @@ private:
Components.resize(Components.size() + 1);
auto &Component = Components.back();
Component.insert(I);
- DEBUG(dbgs() << "Component root is " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Component root is " << *I << "\n");
InComponent.insert(I);
ValueToComponent[I] = ComponentID;
// Pop a component off the stack and label it.
while (!Stack.empty() && Root.lookup(Stack.back()) >= OurDFS) {
auto *Member = Stack.back();
- DEBUG(dbgs() << "Component member is " << *Member << "\n");
+ LLVM_DEBUG(dbgs() << "Component member is " << *Member << "\n");
Component.insert(Member);
InComponent.insert(Member);
ValueToComponent[Member] = ComponentID;
@@ -366,9 +366,8 @@ public:
// True if this class has no memory members.
bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); }
- // Return true if two congruence classes are equivalent to each other. This
- // means
- // that every field but the ID number and the dead field are equivalent.
+ // Return true if two congruence classes are equivalent to each other. This
+ // means that every field but the ID number and the dead field are equivalent.
bool isEquivalentTo(const CongruenceClass *Other) const {
if (!Other)
return false;
@@ -383,10 +382,12 @@ public:
if (!DefiningExpr || !Other->DefiningExpr ||
*DefiningExpr != *Other->DefiningExpr)
return false;
- // We need some ordered set
- std::set<Value *> AMembers(Members.begin(), Members.end());
- std::set<Value *> BMembers(Members.begin(), Members.end());
- return AMembers == BMembers;
+
+ if (Members.size() != Other->Members.size())
+ return false;
+
+ return all_of(Members,
+ [&](const Value *V) { return Other->Members.count(V); });
}
private:
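
The deleted code built both AMembers and BMembers from this class's own Members, so it compared the member set against itself and never against Other; the rewrite fixes that and also avoids materializing two ordered copies. The size check followed by one-way containment is the standard hash-set equality idiom, e.g.:

    #include <algorithm>
    #include <cassert>
    #include <unordered_set>

    // Equal sizes plus one-way containment implies the sets are equal.
    bool sameMembers(const std::unordered_set<int> &A,
                     const std::unordered_set<int> &B) {
      if (A.size() != B.size())
        return false;
      return std::all_of(A.begin(), A.end(),
                         [&](int V) { return B.count(V) != 0; });
    }

    int main() {
      assert(sameMembers({1, 2, 3}, {3, 1, 2}));
      assert(!sameMembers({1, 2}, {1, 4}));
      return 0;
    }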
@@ -860,7 +861,7 @@ private:
// Debug counter info. When verifying, we have to reset the value numbering
// debug counter to the same state it started in to get the same results.
- std::pair<int, int> StartingVNCounter;
+ int64_t StartingVNCounter;
};
} // end anonymous namespace
@@ -958,7 +959,8 @@ static bool isCopyOfAPHI(const Value *V) {
// order. The BlockInstRange numbers are generated in an RPO walk of the basic
// blocks.
void NewGVN::sortPHIOps(MutableArrayRef<ValPair> Ops) const {
- std::sort(Ops.begin(), Ops.end(), [&](const ValPair &P1, const ValPair &P2) {
+ llvm::sort(Ops.begin(), Ops.end(),
+ [&](const ValPair &P1, const ValPair &P2) {
return BlockInstRange.lookup(P1.second).first <
BlockInstRange.lookup(P2.second).first;
});
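
The std::sort to llvm::sort switches scattered through this diff are behavioural, not cosmetic: in expensive-checks builds llvm::sort is meant to shuffle its input before sorting, so a comparator whose result depends on the incoming element order fails loudly instead of producing flaky output. A sketch of that strategy, assuming the shuffle-first behaviour:

    #include <algorithm>
    #include <cassert>
    #include <random>
    #include <vector>

    // Shuffle first, then sort: the result may depend only on the comparator,
    // never on the order in which the elements arrived.
    template <typename It, typename Cmp>
    void shufflingSort(It First, It Last, Cmp C) {
      std::mt19937 Rng(std::random_device{}());
      std::shuffle(First, Last, Rng);
      std::sort(First, Last, C);
    }

    int main() {
      std::vector<int> V = {3, 1, 2, 1};
      shufflingSort(V.begin(), V.end(), std::less<int>());
      assert(std::is_sorted(V.begin(), V.end()));
      return 0;
    }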
@@ -1067,8 +1069,8 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
return nullptr;
if (auto *C = dyn_cast<Constant>(V)) {
if (I)
- DEBUG(dbgs() << "Simplified " << *I << " to "
- << " constant " << *C << "\n");
+ LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " constant " << *C << "\n");
NumGVNOpsSimplified++;
assert(isa<BasicExpression>(E) &&
"We should always have had a basic expression here");
@@ -1076,8 +1078,8 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
return createConstantExpression(C);
} else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
if (I)
- DEBUG(dbgs() << "Simplified " << *I << " to "
- << " variable " << *V << "\n");
+ LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " variable " << *V << "\n");
deleteExpression(E);
return createVariableExpression(V);
}
@@ -1100,8 +1102,8 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
}
if (I)
- DEBUG(dbgs() << "Simplified " << *I << " to "
- << " expression " << *CC->getDefiningExpr() << "\n");
+ LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " expression " << *CC->getDefiningExpr() << "\n");
NumGVNOpsSimplified++;
deleteExpression(E);
return CC->getDefiningExpr();
@@ -1257,7 +1259,7 @@ bool NewGVN::someEquivalentDominates(const Instruction *Inst,
// This must be an instruction because we are only called from phi nodes
// in the case that the value it needs to check against is an instruction.
- // The most likely candiates for dominance are the leader and the next leader.
+ // The most likely candidates for dominance are the leader and the next leader.
// The leader or nextleader will dominate in all cases where there is an
// equivalent that is higher up in the dom tree.
// We can't *only* check them, however, because the
@@ -1421,8 +1423,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
if (Offset >= 0) {
if (auto *C = dyn_cast<Constant>(
lookupOperandLeader(DepSI->getValueOperand()))) {
- DEBUG(dbgs() << "Coercing load from store " << *DepSI << " to constant "
- << *C << "\n");
+ LLVM_DEBUG(dbgs() << "Coercing load from store " << *DepSI
+ << " to constant " << *C << "\n");
return createConstantExpression(
getConstantStoreValueForLoad(C, Offset, LoadType, DL));
}
@@ -1437,8 +1439,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
if (auto *PossibleConstant =
getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
- DEBUG(dbgs() << "Coercing load from load " << *LI << " to constant "
- << *PossibleConstant << "\n");
+ LLVM_DEBUG(dbgs() << "Coercing load from load " << *LI
+ << " to constant " << *PossibleConstant << "\n");
return createConstantExpression(PossibleConstant);
}
}
@@ -1447,8 +1449,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
if (Offset >= 0) {
if (auto *PossibleConstant =
getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) {
- DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
- << " to constant " << *PossibleConstant << "\n");
+ LLVM_DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
+ << " to constant " << *PossibleConstant << "\n");
return createConstantExpression(PossibleConstant);
}
}
@@ -1529,7 +1531,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
if (!PI)
return nullptr;
- DEBUG(dbgs() << "Found predicate info from instruction !\n");
+ LLVM_DEBUG(dbgs() << "Found predicate info from instruction !\n");
auto *PWC = dyn_cast<PredicateWithCondition>(PI);
if (!PWC)
@@ -1569,7 +1571,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
return nullptr;
if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) {
- DEBUG(dbgs() << "Copy is not of any condition operands!\n");
+ LLVM_DEBUG(dbgs() << "Copy is not of any condition operands!\n");
return nullptr;
}
Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
@@ -1584,11 +1586,11 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
SwappedOps ? Cmp->getSwappedPredicate() : Cmp->getPredicate();
if (isa<PredicateAssume>(PI)) {
- // If the comparison is true when the operands are equal, then we know the
- // operands are equal, because assumes must always be true.
- if (CmpInst::isTrueWhenEqual(Predicate)) {
+ // If we assume the operands are equal, then they are equal.
+ if (Predicate == CmpInst::ICMP_EQ) {
addPredicateUsers(PI, I);
- addAdditionalUsers(Cmp->getOperand(0), I);
+ addAdditionalUsers(SwappedOps ? Cmp->getOperand(1) : Cmp->getOperand(0),
+ I);
return createVariableOrConstant(FirstOp);
}
}
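
Narrowing the test from isTrueWhenEqual to ICMP_EQ closes a soundness hole: predicates such as ule are true when the operands are equal, but an assumed ule does not imply equality, so only an assumed eq licenses substituting one operand for the other. The distinction at source level:

    #include <cassert>

    int main() {
      unsigned A = 1, B = 2;
      bool Ule = A <= B; // an assume-style fact that happens to hold
      bool Eq = A == B;  // yet the operands are not equal
      assert(Ule && !Eq); // "true when equal" does not mean "implies equal"
      return 0;
    }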
@@ -1622,7 +1624,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
auto *CI = cast<CallInst>(I);
if (auto *II = dyn_cast<IntrinsicInst>(I)) {
- // Instrinsics with the returned attribute are copies of arguments.
+ // Intrinsics with the returned attribute are copies of arguments.
if (auto *ReturnedValue = II->getReturnedArgOperand()) {
if (II->getIntrinsicID() == Intrinsic::ssa_copy)
if (const auto *Result = performSymbolicPredicateInfoEvaluation(I))
@@ -1652,10 +1654,11 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
CongruenceClass *NewClass) {
assert(NewClass &&
"Every MemoryAccess should be getting mapped to a non-null class");
- DEBUG(dbgs() << "Setting " << *From);
- DEBUG(dbgs() << " equivalent to congruence class ");
- DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader ");
- DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
+ LLVM_DEBUG(dbgs() << "Setting " << *From);
+ LLVM_DEBUG(dbgs() << " equivalent to congruence class ");
+ LLVM_DEBUG(dbgs() << NewClass->getID()
+ << " with current MemoryAccess leader ");
+ LLVM_DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
auto LookupResult = MemoryAccessToClass.find(From);
bool Changed = false;
@@ -1673,11 +1676,11 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
OldClass->setMemoryLeader(nullptr);
} else {
OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
- DEBUG(dbgs() << "Memory class leader change for class "
- << OldClass->getID() << " to "
- << *OldClass->getMemoryLeader()
- << " due to removal of a memory member " << *From
- << "\n");
+ LLVM_DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of a memory member " << *From
+ << "\n");
markMemoryLeaderChangeTouched(OldClass);
}
}
@@ -1705,7 +1708,7 @@ bool NewGVN::isCycleFree(const Instruction *I) const {
if (ICS == ICS_Unknown) {
SCCFinder.Start(I);
auto &SCC = SCCFinder.getComponentFor(I);
- // It's cycle free if it's size 1 or or the SCC is *only* phi nodes.
+ // It's cycle free if it's size 1 or the SCC is *only* phi nodes.
if (SCC.size() == 1)
InstCycleState.insert({I, ICS_CycleFree});
else {
@@ -1753,12 +1756,13 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
  // If it has undef at this point, it means there are no non-undef arguments,
// and thus, the value of the phi node must be undef.
if (HasUndef) {
- DEBUG(dbgs() << "PHI Node " << *I
- << " has no non-undef arguments, valuing it as undef\n");
+ LLVM_DEBUG(
+ dbgs() << "PHI Node " << *I
+ << " has no non-undef arguments, valuing it as undef\n");
return createConstantExpression(UndefValue::get(I->getType()));
}
- DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
+ LLVM_DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
deleteExpression(E);
return createDeadExpression();
}
@@ -1797,8 +1801,8 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
InstrToDFSNum(AllSameValue) > InstrToDFSNum(I))
return E;
NumGVNPhisAllSame++;
- DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
- << "\n");
+ LLVM_DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
+ << "\n");
deleteExpression(E);
return createVariableOrConstant(AllSameValue);
}
@@ -2091,7 +2095,7 @@ void NewGVN::markUsersTouched(Value *V) {
}
void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
- DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
+ LLVM_DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
MemoryToUsers[To].insert(U);
}
@@ -2207,13 +2211,13 @@ Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const {
//
// - I must be moving to NewClass from OldClass
// - The StoreCount of OldClass and NewClass is expected to have been updated
-// for I already if it is is a store.
+// for I already if it is a store.
// - The OldClass memory leader has not been updated yet if I was the leader.
void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
MemoryAccess *InstMA,
CongruenceClass *OldClass,
CongruenceClass *NewClass) {
- // If the leader is I, and we had a represenative MemoryAccess, it should
+ // If the leader is I, and we had a representative MemoryAccess, it should
// be the MemoryAccess of OldClass.
assert((!InstMA || !OldClass->getMemoryLeader() ||
OldClass->getLeader() != I ||
@@ -2227,8 +2231,9 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
(isa<StoreInst>(I) && NewClass->getStoreCount() == 1));
NewClass->setMemoryLeader(InstMA);
// Mark it touched if we didn't just create a singleton
- DEBUG(dbgs() << "Memory class leader change for class " << NewClass->getID()
- << " due to new memory instruction becoming leader\n");
+ LLVM_DEBUG(dbgs() << "Memory class leader change for class "
+ << NewClass->getID()
+ << " due to new memory instruction becoming leader\n");
markMemoryLeaderChangeTouched(NewClass);
}
setMemoryClass(InstMA, NewClass);
@@ -2236,10 +2241,10 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
if (OldClass->getMemoryLeader() == InstMA) {
if (!OldClass->definesNoMemory()) {
OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
- DEBUG(dbgs() << "Memory class leader change for class "
- << OldClass->getID() << " to "
- << *OldClass->getMemoryLeader()
- << " due to removal of old leader " << *InstMA << "\n");
+ LLVM_DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of old leader " << *InstMA << "\n");
markMemoryLeaderChangeTouched(OldClass);
} else
OldClass->setMemoryLeader(nullptr);
@@ -2276,9 +2281,10 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
NewClass->setStoredValue(SE->getStoredValue());
markValueLeaderChangeTouched(NewClass);
// Shift the new class leader to be the store
- DEBUG(dbgs() << "Changing leader of congruence class "
- << NewClass->getID() << " from " << *NewClass->getLeader()
- << " to " << *SI << " because store joined class\n");
+ LLVM_DEBUG(dbgs() << "Changing leader of congruence class "
+ << NewClass->getID() << " from "
+ << *NewClass->getLeader() << " to " << *SI
+ << " because store joined class\n");
// If we changed the leader, we have to mark it changed because we don't
// know what it will do to symbolic evaluation.
NewClass->setLeader(SI);
@@ -2298,8 +2304,8 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// See if we destroyed the class or need to swap leaders.
if (OldClass->empty() && OldClass != TOPClass) {
if (OldClass->getDefiningExpr()) {
- DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
- << " from table\n");
+ LLVM_DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
+ << " from table\n");
// We erase it as an exact expression to make sure we don't just erase an
// equivalent one.
auto Iter = ExpressionToClass.find_as(
@@ -2316,8 +2322,8 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// When the leader changes, the value numbering of
// everything may change due to symbolization changes, so we need to
// reprocess.
- DEBUG(dbgs() << "Value class leader change for class " << OldClass->getID()
- << "\n");
+ LLVM_DEBUG(dbgs() << "Value class leader change for class "
+ << OldClass->getID() << "\n");
++NumGVNLeaderChanges;
// Destroy the stored value if there are no more stores to represent it.
// Note that this is basically clean up for the expression removal that
@@ -2380,12 +2386,14 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
"VariableExpression should have been handled already");
EClass = NewClass;
- DEBUG(dbgs() << "Created new congruence class for " << *I
- << " using expression " << *E << " at " << NewClass->getID()
- << " and leader " << *(NewClass->getLeader()));
+ LLVM_DEBUG(dbgs() << "Created new congruence class for " << *I
+ << " using expression " << *E << " at "
+ << NewClass->getID() << " and leader "
+ << *(NewClass->getLeader()));
if (NewClass->getStoredValue())
- DEBUG(dbgs() << " and stored value " << *(NewClass->getStoredValue()));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " and stored value "
+ << *(NewClass->getStoredValue()));
+ LLVM_DEBUG(dbgs() << "\n");
} else {
EClass = lookupResult.first->second;
if (isa<ConstantExpression>(E))
@@ -2403,8 +2411,8 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
bool ClassChanged = IClass != EClass;
bool LeaderChanged = LeaderChanges.erase(I);
if (ClassChanged || LeaderChanged) {
- DEBUG(dbgs() << "New class " << EClass->getID() << " for expression " << *E
- << "\n");
+ LLVM_DEBUG(dbgs() << "New class " << EClass->getID() << " for expression "
+ << *E << "\n");
if (ClassChanged) {
moveValueToNewCongruenceClass(I, E, IClass, EClass);
markPhiOfOpsChanged(E);
@@ -2442,13 +2450,15 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
if (ReachableEdges.insert({From, To}).second) {
// If this block wasn't reachable before, all instructions are touched.
if (ReachableBlocks.insert(To).second) {
- DEBUG(dbgs() << "Block " << getBlockName(To) << " marked reachable\n");
+ LLVM_DEBUG(dbgs() << "Block " << getBlockName(To)
+ << " marked reachable\n");
const auto &InstRange = BlockInstRange.lookup(To);
TouchedInstructions.set(InstRange.first, InstRange.second);
} else {
- DEBUG(dbgs() << "Block " << getBlockName(To)
- << " was reachable, but new edge {" << getBlockName(From)
- << "," << getBlockName(To) << "} to it found\n");
+ LLVM_DEBUG(dbgs() << "Block " << getBlockName(To)
+ << " was reachable, but new edge {"
+ << getBlockName(From) << "," << getBlockName(To)
+ << "} to it found\n");
// We've made an edge reachable to an existing block, which may
// impact predicates. Otherwise, only mark the phi nodes as touched, as
@@ -2495,12 +2505,12 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
BasicBlock *FalseSucc = BR->getSuccessor(1);
if (CondEvaluated && (CI = dyn_cast<ConstantInt>(CondEvaluated))) {
if (CI->isOne()) {
- DEBUG(dbgs() << "Condition for Terminator " << *TI
- << " evaluated to true\n");
+ LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to true\n");
updateReachableEdge(B, TrueSucc);
} else if (CI->isZero()) {
- DEBUG(dbgs() << "Condition for Terminator " << *TI
- << " evaluated to false\n");
+ LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to false\n");
updateReachableEdge(B, FalseSucc);
}
} else {
@@ -2685,8 +2695,8 @@ Value *NewGVN::findLeaderForInst(Instruction *TransInst,
auto *FoundVal = findPHIOfOpsLeader(E, OrigInst, PredBB);
if (!FoundVal) {
ExpressionToPhiOfOps[E].insert(OrigInst);
- DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst
- << " in block " << getBlockName(PredBB) << "\n");
+ LLVM_DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst
+ << " in block " << getBlockName(PredBB) << "\n");
return nullptr;
}
if (auto *SI = dyn_cast<StoreInst>(FoundVal))
@@ -2723,116 +2733,143 @@ NewGVN::makePossiblePHIOfOps(Instruction *I,
MemAccess->getDefiningAccess()->getBlock() == I->getParent())
return nullptr;
- SmallPtrSet<const Value *, 10> VisitedOps;
// Convert op of phis to phi of ops
- for (auto *Op : I->operand_values()) {
+ SmallPtrSet<const Value *, 10> VisitedOps;
+ SmallVector<Value *, 4> Ops(I->operand_values());
+ BasicBlock *SamePHIBlock = nullptr;
+ PHINode *OpPHI = nullptr;
+ if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
+ return nullptr;
+ for (auto *Op : Ops) {
if (!isa<PHINode>(Op)) {
auto *ValuePHI = RealToTemp.lookup(Op);
if (!ValuePHI)
continue;
- DEBUG(dbgs() << "Found possible dependent phi of ops\n");
+ LLVM_DEBUG(dbgs() << "Found possible dependent phi of ops\n");
Op = ValuePHI;
}
- auto *OpPHI = cast<PHINode>(Op);
+ OpPHI = cast<PHINode>(Op);
+ if (!SamePHIBlock) {
+ SamePHIBlock = getBlockForValue(OpPHI);
+ } else if (SamePHIBlock != getBlockForValue(OpPHI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "PHIs for operands are not all in the same block, aborting\n");
+ return nullptr;
+ }
// No point in doing this for one-operand phis.
- if (OpPHI->getNumOperands() == 1)
+ if (OpPHI->getNumOperands() == 1) {
+ OpPHI = nullptr;
continue;
- if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
- return nullptr;
- SmallVector<ValPair, 4> Ops;
- SmallPtrSet<Value *, 4> Deps;
- auto *PHIBlock = getBlockForValue(OpPHI);
- RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I));
- for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) {
- auto *PredBB = OpPHI->getIncomingBlock(PredNum);
- Value *FoundVal = nullptr;
- // We could just skip unreachable edges entirely but it's tricky to do
- // with rewriting existing phi nodes.
- if (ReachableEdges.count({PredBB, PHIBlock})) {
- // Clone the instruction, create an expression from it that is
- // translated back into the predecessor, and see if we have a leader.
- Instruction *ValueOp = I->clone();
- if (MemAccess)
- TempToMemory.insert({ValueOp, MemAccess});
- bool SafeForPHIOfOps = true;
- VisitedOps.clear();
- for (auto &Op : ValueOp->operands()) {
- auto *OrigOp = &*Op;
- // When these operand changes, it could change whether there is a
- // leader for us or not, so we have to add additional users.
- if (isa<PHINode>(Op)) {
- Op = Op->DoPHITranslation(PHIBlock, PredBB);
- if (Op != OrigOp && Op != I)
- Deps.insert(Op);
- } else if (auto *ValuePHI = RealToTemp.lookup(Op)) {
- if (getBlockForValue(ValuePHI) == PHIBlock)
- Op = ValuePHI->getIncomingValueForBlock(PredBB);
- }
- // If we phi-translated the op, it must be safe.
- SafeForPHIOfOps =
- SafeForPHIOfOps &&
- (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps));
+ }
+ }
+
+ if (!OpPHI)
+ return nullptr;
+
+ SmallVector<ValPair, 4> PHIOps;
+ SmallPtrSet<Value *, 4> Deps;
+ auto *PHIBlock = getBlockForValue(OpPHI);
+ RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I));
+ for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) {
+ auto *PredBB = OpPHI->getIncomingBlock(PredNum);
+ Value *FoundVal = nullptr;
+ SmallPtrSet<Value *, 4> CurrentDeps;
+ // We could just skip unreachable edges entirely but it's tricky to do
+ // with rewriting existing phi nodes.
+ if (ReachableEdges.count({PredBB, PHIBlock})) {
+ // Clone the instruction, create an expression from it that is
+ // translated back into the predecessor, and see if we have a leader.
+ Instruction *ValueOp = I->clone();
+ if (MemAccess)
+ TempToMemory.insert({ValueOp, MemAccess});
+ bool SafeForPHIOfOps = true;
+ VisitedOps.clear();
+ for (auto &Op : ValueOp->operands()) {
+ auto *OrigOp = &*Op;
+        // When these operands change, it could change whether there is a
+ // leader for us or not, so we have to add additional users.
+ if (isa<PHINode>(Op)) {
+ Op = Op->DoPHITranslation(PHIBlock, PredBB);
+ if (Op != OrigOp && Op != I)
+ CurrentDeps.insert(Op);
+ } else if (auto *ValuePHI = RealToTemp.lookup(Op)) {
+ if (getBlockForValue(ValuePHI) == PHIBlock)
+ Op = ValuePHI->getIncomingValueForBlock(PredBB);
}
- // FIXME: For those things that are not safe we could generate
- // expressions all the way down, and see if this comes out to a
- // constant. For anything where that is true, and unsafe, we should
- // have made a phi-of-ops (or value numbered it equivalent to something)
- // for the pieces already.
- FoundVal = !SafeForPHIOfOps ? nullptr
- : findLeaderForInst(ValueOp, Visited,
- MemAccess, I, PredBB);
- ValueOp->deleteValue();
- if (!FoundVal)
- return nullptr;
- } else {
- DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
- << getBlockName(PredBB)
- << " because the block is unreachable\n");
- FoundVal = UndefValue::get(I->getType());
- RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
+ // If we phi-translated the op, it must be safe.
+ SafeForPHIOfOps =
+ SafeForPHIOfOps &&
+ (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps));
}
-
- Ops.push_back({FoundVal, PredBB});
- DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
- << getBlockName(PredBB) << "\n");
- }
- for (auto Dep : Deps)
- addAdditionalUsers(Dep, I);
- sortPHIOps(Ops);
- auto *E = performSymbolicPHIEvaluation(Ops, I, PHIBlock);
- if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) {
- DEBUG(dbgs()
- << "Not creating real PHI of ops because it simplified to existing "
- "value or constant\n");
- return E;
- }
- auto *ValuePHI = RealToTemp.lookup(I);
- bool NewPHI = false;
- if (!ValuePHI) {
- ValuePHI =
- PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops");
- addPhiOfOps(ValuePHI, PHIBlock, I);
- NewPHI = true;
- NumGVNPHIOfOpsCreated++;
- }
- if (NewPHI) {
- for (auto PHIOp : Ops)
- ValuePHI->addIncoming(PHIOp.first, PHIOp.second);
- } else {
- unsigned int i = 0;
- for (auto PHIOp : Ops) {
- ValuePHI->setIncomingValue(i, PHIOp.first);
- ValuePHI->setIncomingBlock(i, PHIOp.second);
- ++i;
+ // FIXME: For those things that are not safe we could generate
+ // expressions all the way down, and see if this comes out to a
+ // constant. For anything where that is true, and unsafe, we should
+ // have made a phi-of-ops (or value numbered it equivalent to something)
+ // for the pieces already.
+ FoundVal = !SafeForPHIOfOps ? nullptr
+ : findLeaderForInst(ValueOp, Visited,
+ MemAccess, I, PredBB);
+ ValueOp->deleteValue();
+ if (!FoundVal) {
+ // We failed to find a leader for the current ValueOp, but this might
+        // change if the translated operands change.
+ if (SafeForPHIOfOps)
+ for (auto Dep : CurrentDeps)
+ addAdditionalUsers(Dep, I);
+
+ return nullptr;
}
+ Deps.insert(CurrentDeps.begin(), CurrentDeps.end());
+ } else {
+ LLVM_DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
+ << getBlockName(PredBB)
+ << " because the block is unreachable\n");
+ FoundVal = UndefValue::get(I->getType());
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
}
- RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
- DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
- << "\n");
+ PHIOps.push_back({FoundVal, PredBB});
+ LLVM_DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
+ << getBlockName(PredBB) << "\n");
+ }
+ for (auto Dep : Deps)
+ addAdditionalUsers(Dep, I);
+ sortPHIOps(PHIOps);
+ auto *E = performSymbolicPHIEvaluation(PHIOps, I, PHIBlock);
+ if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Not creating real PHI of ops because it simplified to existing "
+ "value or constant\n");
return E;
}
- return nullptr;
+ auto *ValuePHI = RealToTemp.lookup(I);
+ bool NewPHI = false;
+ if (!ValuePHI) {
+ ValuePHI =
+ PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops");
+ addPhiOfOps(ValuePHI, PHIBlock, I);
+ NewPHI = true;
+ NumGVNPHIOfOpsCreated++;
+ }
+ if (NewPHI) {
+ for (auto PHIOp : PHIOps)
+ ValuePHI->addIncoming(PHIOp.first, PHIOp.second);
+ } else {
+ TempToBlock[ValuePHI] = PHIBlock;
+ unsigned int i = 0;
+ for (auto PHIOp : PHIOps) {
+ ValuePHI->setIncomingValue(i, PHIOp.first);
+ ValuePHI->setIncomingBlock(i, PHIOp.second);
+ ++i;
+ }
+ }
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
+ LLVM_DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
+ << "\n");
+
+ return E;
}
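
For readers new to the transformation this function restructures, the op-of-phis to phi-of-ops rewrite is easiest to picture at source level: an operation applied to a phi is value-numbered as a phi of the operation translated into each predecessor, where a leader may already be available. A compilable illustration:

    #include <cassert>

    // Op of a phi: X is the phi, X + 1 is the op applied to it.
    int opOfPhi(bool C, int A, int B) {
      int X = C ? A : B;
      return X + 1;
    }

    // Phi of ops: the op is translated into each predecessor first.
    int phiOfOps(bool C, int A, int B) {
      return C ? (A + 1) : (B + 1);
    }

    int main() {
      for (int C = 0; C <= 1; ++C)
        assert(opOfPhi(C, 3, 5) == phiOfOps(C, 3, 5));
      return 0;
    }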
// The algorithm initially places the values of the routine in the TOP
@@ -2902,8 +2939,9 @@ void NewGVN::initializeCongruenceClasses(Function &F) {
void NewGVN::cleanupTables() {
for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) {
- DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
- << " has " << CongruenceClasses[i]->size() << " members\n");
+ LLVM_DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
+ << " has " << CongruenceClasses[i]->size()
+ << " members\n");
// Make sure we delete the congruence class (probably worth switching to
     // a unique_ptr at some point).
delete CongruenceClasses[i];
@@ -2973,7 +3011,7 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
// we change its DFS number so that it doesn't get value numbered.
if (isInstructionTriviallyDead(&I, TLI)) {
InstrDFS[&I] = 0;
- DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
+ LLVM_DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
markInstructionForDeletion(&I);
continue;
}
@@ -3039,9 +3077,10 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
[&AllSameValue](const MemoryAccess *V) { return V == AllSameValue; });
if (AllEqual)
- DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue << "\n");
+ LLVM_DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue
+ << "\n");
else
- DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
+ LLVM_DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
// If it's equal to something, it's in that class. Otherwise, it has to be in
// a class where it is the leader (other things may be equivalent to it, but
// it needs to start off in its own class, which means it must have been the
@@ -3060,7 +3099,7 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
// Value number a single instruction, symbolically evaluating, performing
// congruence finding, and updating mappings.
void NewGVN::valueNumberInstruction(Instruction *I) {
- DEBUG(dbgs() << "Processing instruction " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Processing instruction " << *I << "\n");
if (!I->isTerminator()) {
const Expression *Symbolized = nullptr;
SmallPtrSet<Value *, 2> Visited;
@@ -3246,7 +3285,7 @@ void NewGVN::verifyMemoryCongruency() const {
// and redoing the iteration to see if anything changed.
void NewGVN::verifyIterationSettled(Function &F) {
#ifndef NDEBUG
- DEBUG(dbgs() << "Beginning iteration verification\n");
+ LLVM_DEBUG(dbgs() << "Beginning iteration verification\n");
if (DebugCounter::isCounterSet(VNCounter))
DebugCounter::setCounterValue(VNCounter, StartingVNCounter);
@@ -3364,9 +3403,9 @@ void NewGVN::iterateTouchedInstructions() {
// If it's not reachable, erase any touched instructions and move on.
if (!BlockReachable) {
TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
- DEBUG(dbgs() << "Skipping instructions in block "
- << getBlockName(CurrBlock)
- << " because it is unreachable\n");
+ LLVM_DEBUG(dbgs() << "Skipping instructions in block "
+ << getBlockName(CurrBlock)
+ << " because it is unreachable\n");
continue;
}
updateProcessedCount(CurrBlock);
@@ -3376,7 +3415,7 @@ void NewGVN::iterateTouchedInstructions() {
TouchedInstructions.reset(InstrNum);
if (auto *MP = dyn_cast<MemoryPhi>(V)) {
- DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
+ LLVM_DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
valueNumberMemoryPhi(MP);
} else if (auto *I = dyn_cast<Instruction>(V)) {
valueNumberInstruction(I);
@@ -3422,10 +3461,10 @@ bool NewGVN::runGVN() {
for (auto &B : RPOT) {
auto *Node = DT->getNode(B);
if (Node->getChildren().size() > 1)
- std::sort(Node->begin(), Node->end(),
- [&](const DomTreeNode *A, const DomTreeNode *B) {
- return RPOOrdering[A] < RPOOrdering[B];
- });
+ llvm::sort(Node->begin(), Node->end(),
+ [&](const DomTreeNode *A, const DomTreeNode *B) {
+ return RPOOrdering[A] < RPOOrdering[B];
+ });
}
// Now a standard depth first ordering of the domtree is equivalent to RPO.
@@ -3446,8 +3485,8 @@ bool NewGVN::runGVN() {
// Initialize the touched instructions to include the entry block.
const auto &InstRange = BlockInstRange.lookup(&F.getEntryBlock());
TouchedInstructions.set(InstRange.first, InstRange.second);
- DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock())
- << " marked reachable\n");
+ LLVM_DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock())
+ << " marked reachable\n");
ReachableBlocks.insert(&F.getEntryBlock());
iterateTouchedInstructions();
@@ -3472,8 +3511,8 @@ bool NewGVN::runGVN() {
};
for (auto &BB : make_filter_range(F, UnreachableBlockPred)) {
- DEBUG(dbgs() << "We believe block " << getBlockName(&BB)
- << " is unreachable\n");
+ LLVM_DEBUG(dbgs() << "We believe block " << getBlockName(&BB)
+ << " is unreachable\n");
deleteInstructionsInBlock(&BB);
Changed = true;
}
@@ -3695,7 +3734,7 @@ static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
}
void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
- DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
++NumGVNBlocksDeleted;
// Delete the instructions backwards, as it has a reduced likelihood of having
@@ -3722,12 +3761,12 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
}
void NewGVN::markInstructionForDeletion(Instruction *I) {
- DEBUG(dbgs() << "Marking " << *I << " for deletion\n");
+ LLVM_DEBUG(dbgs() << "Marking " << *I << " for deletion\n");
InstructionsToErase.insert(I);
}
void NewGVN::replaceInstruction(Instruction *I, Value *V) {
- DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n");
+ LLVM_DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n");
patchAndReplaceAllUsesWith(I, V);
// We save the actual erasing to avoid invalidating memory
// dependencies until we are done with everything.
@@ -3853,9 +3892,10 @@ bool NewGVN::eliminateInstructions(Function &F) {
auto ReplaceUnreachablePHIArgs = [&](PHINode *PHI, BasicBlock *BB) {
for (auto &Operand : PHI->incoming_values())
if (!ReachableEdges.count({PHI->getIncomingBlock(Operand), BB})) {
- DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block "
- << getBlockName(PHI->getIncomingBlock(Operand))
- << " with undef due to it being unreachable\n");
+ LLVM_DEBUG(dbgs() << "Replacing incoming value of " << PHI
+ << " for block "
+ << getBlockName(PHI->getIncomingBlock(Operand))
+ << " with undef due to it being unreachable\n");
Operand.set(UndefValue::get(PHI->getType()));
}
};
@@ -3887,7 +3927,8 @@ bool NewGVN::eliminateInstructions(Function &F) {
// Map to store the use counts
DenseMap<const Value *, unsigned int> UseCounts;
for (auto *CC : reverse(CongruenceClasses)) {
- DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() << "\n");
+ LLVM_DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID()
+ << "\n");
// Track the equivalent store info so we can decide whether to try
// dead store elimination.
SmallVector<ValueDFS, 8> PossibleDeadStores;
@@ -3925,8 +3966,8 @@ bool NewGVN::eliminateInstructions(Function &F) {
MembersLeft.insert(Member);
continue;
}
- DEBUG(dbgs() << "Found replacement " << *(Leader) << " for " << *Member
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found replacement " << *(Leader) << " for "
+ << *Member << "\n");
auto *I = cast<Instruction>(Member);
assert(Leader != I && "About to accidentally remove our leader");
replaceInstruction(I, Leader);
@@ -3947,7 +3988,7 @@ bool NewGVN::eliminateInstructions(Function &F) {
convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead);
// Sort the whole thing.
- std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
+ llvm::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
for (auto &VD : DFSOrderedSet) {
int MemberDFSIn = VD.DFSIn;
int MemberDFSOut = VD.DFSOut;
@@ -3966,24 +4007,24 @@ bool NewGVN::eliminateInstructions(Function &F) {
// remove from temp instruction list.
AllTempInstructions.erase(PN);
auto *DefBlock = getBlockForValue(Def);
- DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def
- << " into block "
- << getBlockName(getBlockForValue(Def)) << "\n");
+ LLVM_DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def
+ << " into block "
+ << getBlockName(getBlockForValue(Def)) << "\n");
PN->insertBefore(&DefBlock->front());
Def = PN;
NumGVNPHIOfOpsEliminations++;
}
if (EliminationStack.empty()) {
- DEBUG(dbgs() << "Elimination Stack is empty\n");
+ LLVM_DEBUG(dbgs() << "Elimination Stack is empty\n");
} else {
- DEBUG(dbgs() << "Elimination Stack Top DFS numbers are ("
- << EliminationStack.dfs_back().first << ","
- << EliminationStack.dfs_back().second << ")\n");
+ LLVM_DEBUG(dbgs() << "Elimination Stack Top DFS numbers are ("
+ << EliminationStack.dfs_back().first << ","
+ << EliminationStack.dfs_back().second << ")\n");
}
- DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << ","
- << MemberDFSOut << ")\n");
+ LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << ","
+ << MemberDFSOut << ")\n");
// First, we see if we are out of scope or empty. If so,
         // and there are equivalences, we try to replace the top of
// stack with equivalences (if it's on the stack, it must
@@ -4058,14 +4099,16 @@ bool NewGVN::eliminateInstructions(Function &F) {
Value *DominatingLeader = EliminationStack.back();
auto *II = dyn_cast<IntrinsicInst>(DominatingLeader);
- if (II && II->getIntrinsicID() == Intrinsic::ssa_copy)
+ bool isSSACopy = II && II->getIntrinsicID() == Intrinsic::ssa_copy;
+ if (isSSACopy)
DominatingLeader = II->getOperand(0);
// Don't replace our existing users with ourselves.
if (U->get() == DominatingLeader)
continue;
- DEBUG(dbgs() << "Found replacement " << *DominatingLeader << " for "
- << *U->get() << " in " << *(U->getUser()) << "\n");
+ LLVM_DEBUG(dbgs()
+ << "Found replacement " << *DominatingLeader << " for "
+ << *U->get() << " in " << *(U->getUser()) << "\n");
// If we replaced something in an instruction, handle the patching of
// metadata. Skip this if we are replacing predicateinfo with its
@@ -4081,7 +4124,9 @@ bool NewGVN::eliminateInstructions(Function &F) {
// It's about to be alive again.
if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
ProbablyDead.erase(cast<Instruction>(DominatingLeader));
- if (LeaderUseCount == 0 && II)
+ // Copy instructions, however, are still dead because we use their
+ // operand as the leader.
+ if (LeaderUseCount == 0 && isSSACopy)
ProbablyDead.insert(II);
++LeaderUseCount;
AnythingReplaced = true;
@@ -4106,7 +4151,7 @@ bool NewGVN::eliminateInstructions(Function &F) {
// If we have possible dead stores to look at, try to eliminate them.
if (CC->getStoreCount() > 0) {
convertClassToLoadsAndStores(*CC, PossibleDeadStores);
- std::sort(PossibleDeadStores.begin(), PossibleDeadStores.end());
+ llvm::sort(PossibleDeadStores.begin(), PossibleDeadStores.end());
ValueDFSStack EliminationStack;
for (auto &VD : PossibleDeadStores) {
int MemberDFSIn = VD.DFSIn;
@@ -4129,8 +4174,8 @@ bool NewGVN::eliminateInstructions(Function &F) {
(void)Leader;
assert(DT->dominates(Leader->getParent(), Member->getParent()));
          // Member is dominated by Leader, and thus dead
- DEBUG(dbgs() << "Marking dead store " << *Member
- << " that is dominated by " << *Leader << "\n");
+ LLVM_DEBUG(dbgs() << "Marking dead store " << *Member
+ << " that is dominated by " << *Leader << "\n");
markInstructionForDeletion(Member);
CC->erase(Member);
++NumGVNDeadStores;
diff --git a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
index 2d0cb6fbf211..8f30bccf48f1 100644
--- a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -55,6 +55,7 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -65,7 +66,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
#define DEBUG_TYPE "safepoint-placement"
@@ -323,7 +323,7 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
// avoiding the runtime cost of the actual safepoint.
if (!AllBackedges) {
if (mustBeFiniteCountedLoop(L, SE, Pred)) {
- DEBUG(dbgs() << "skipping safepoint placement in finite loop\n");
+ LLVM_DEBUG(dbgs() << "skipping safepoint placement in finite loop\n");
FiniteExecution++;
continue;
}
@@ -332,7 +332,9 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
// Note: This is only semantically legal since we won't do any further
       // IPO or inlining before the actual call insertion. If we hadn't, we
       // might later lose this call safepoint.
- DEBUG(dbgs() << "skipping safepoint placement due to unconditional call\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "skipping safepoint placement due to unconditional call\n");
CallInLoop++;
continue;
}
@@ -348,7 +350,7 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
// variables) and branches to the true header
TerminatorInst *Term = Pred->getTerminator();
- DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term);
+ LLVM_DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term);
PollLocations.push_back(Term);
}
@@ -522,7 +524,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) {
};
  // We need the order of the list to be stable so that naming ends up stable
// when we split edges. This makes test cases much easier to write.
- std::sort(PollLocations.begin(), PollLocations.end(), OrderByBBName);
+ llvm::sort(PollLocations.begin(), PollLocations.end(), OrderByBBName);
// We can sometimes end up with duplicate poll locations. This happens if
// a single loop is visited more than once. The fact this happens seems
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
index 88dcaf0f8a36..c81ac70d99e6 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -31,6 +31,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -42,6 +43,7 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
@@ -55,7 +57,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <utility>
@@ -168,8 +169,8 @@ void ReassociatePass::BuildRankMap(Function &F,
// Assign distinct ranks to function arguments.
for (auto &Arg : F.args()) {
ValueRankMap[&Arg] = ++Rank;
- DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank
- << "\n");
+ LLVM_DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank
+ << "\n");
}
// Traverse basic blocks in ReversePostOrder
@@ -200,17 +201,17 @@ unsigned ReassociatePass::getRank(Value *V) {
// for PHI nodes, we cannot have infinite recursion here, because there
// cannot be loops in the value graph that do not go through PHI nodes.
unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
- for (unsigned i = 0, e = I->getNumOperands();
- i != e && Rank != MaxRank; ++i)
+ for (unsigned i = 0, e = I->getNumOperands(); i != e && Rank != MaxRank; ++i)
Rank = std::max(Rank, getRank(I->getOperand(i)));
// If this is a not or neg instruction, do not count it for rank. This
// assures us that X and ~X will have the same rank.
- if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
- !BinaryOperator::isFNeg(I))
+ if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
+ !BinaryOperator::isFNeg(I))
++Rank;
- DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n");
+ LLVM_DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank
+ << "\n");
return ValueRankMap[I] = Rank;
}
@@ -445,7 +446,7 @@ using RepeatedValue = std::pair<Value*, APInt>;
/// type and thus make the expression bigger.
static bool LinearizeExprTree(BinaryOperator *I,
SmallVectorImpl<RepeatedValue> &Ops) {
- DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
unsigned Opcode = I->getOpcode();
assert(I->isAssociative() && I->isCommutative() &&
@@ -494,14 +495,14 @@ static bool LinearizeExprTree(BinaryOperator *I,
for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands.
Value *Op = I->getOperand(OpIdx);
APInt Weight = P.second; // Number of paths to this operand.
- DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
+ LLVM_DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
assert(!Op->use_empty() && "No uses, so how did we get to it?!");
// If this is a binary operation of the right kind with only one use then
// add its operands to the expression.
if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
assert(Visited.insert(Op).second && "Not first visit!");
- DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
+ LLVM_DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
Worklist.push_back(std::make_pair(BO, Weight));
continue;
}
@@ -514,7 +515,8 @@ static bool LinearizeExprTree(BinaryOperator *I,
if (!Op->hasOneUse()) {
// This value has uses not accounted for by the expression, so it is
// not safe to modify. Mark it as being a leaf.
- DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
+ LLVM_DEBUG(dbgs()
+ << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
LeafOrder.push_back(Op);
Leaves[Op] = Weight;
continue;
@@ -540,7 +542,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
// to the expression, then no longer consider it to be a leaf and add
// its operands to the expression.
if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
- DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
+ LLVM_DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
Worklist.push_back(std::make_pair(BO, It->second));
Leaves.erase(It);
continue;
@@ -573,9 +575,10 @@ static bool LinearizeExprTree(BinaryOperator *I,
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op))
if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) ||
(Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) {
- DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
+ LLVM_DEBUG(dbgs()
+ << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
BO = LowerNegateToMultiply(BO);
- DEBUG(dbgs() << *BO << '\n');
+ LLVM_DEBUG(dbgs() << *BO << '\n');
Worklist.push_back(std::make_pair(BO, Weight));
Changed = true;
continue;
@@ -583,7 +586,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
// Failed to morph into an expression of the right type. This really is
// a leaf.
- DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
+ LLVM_DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
LeafOrder.push_back(Op);
Leaves[Op] = Weight;
@@ -675,9 +678,9 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
if (NewLHS == OldRHS && NewRHS == OldLHS) {
// The order of the operands was reversed. Swap them.
- DEBUG(dbgs() << "RA: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
Op->swapOperands();
- DEBUG(dbgs() << "TO: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
MadeChange = true;
++NumChanged;
break;
@@ -685,7 +688,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
// The new operation differs non-trivially from the original. Overwrite
// the old operands with the new ones.
- DEBUG(dbgs() << "RA: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
if (NewLHS != OldLHS) {
BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
if (BO && !NotRewritable.count(BO))
@@ -698,7 +701,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
NodesToRewrite.push_back(BO);
Op->setOperand(1, NewRHS);
}
- DEBUG(dbgs() << "TO: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
ExpressionChanged = Op;
MadeChange = true;
@@ -711,7 +714,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
// while the right-hand side will be the current element of Ops.
Value *NewRHS = Ops[i].Op;
if (NewRHS != Op->getOperand(1)) {
- DEBUG(dbgs() << "RA: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
if (NewRHS == Op->getOperand(0)) {
// The new right-hand side was already present as the left operand. If
// we are lucky then swapping the operands will sort out both of them.
@@ -724,7 +727,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
Op->setOperand(1, NewRHS);
ExpressionChanged = Op;
}
- DEBUG(dbgs() << "TO: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
MadeChange = true;
++NumChanged;
}
@@ -756,9 +759,9 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
NewOp = NodesToRewrite.pop_back_val();
}
- DEBUG(dbgs() << "RA: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
Op->setOperand(0, NewOp);
- DEBUG(dbgs() << "TO: " << *Op << '\n');
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
ExpressionChanged = Op;
MadeChange = true;
++NumChanged;
@@ -781,6 +784,18 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
if (ExpressionChanged == I)
break;
+
+  // Discard any debug info related to the expressions that have changed (we
+  // can leave debug info related to the root, since the result of the
+ // expression tree should be the same even after reassociation).
+ SmallVector<DbgInfoIntrinsic *, 1> DbgUsers;
+ findDbgUsers(DbgUsers, ExpressionChanged);
+ for (auto *DII : DbgUsers) {
+ Value *Undef = UndefValue::get(ExpressionChanged->getType());
+ DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
+ ValueAsMetadata::get(Undef)));
+ }
+
ExpressionChanged->moveBefore(I);
ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
} while (true);
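
The reason interior debug users must be invalidated while the root's may stay: reassociation preserves the value of the expression root but freely changes what each intermediate node computes, so a debugger watching an interior temporary would otherwise read a stale value. Concretely:

    #include <cassert>

    int main() {
      int A = 1, B = 2, C = 3;
      int T1 = A + B; // interior node before the rewrite
      int Root1 = T1 + C;
      int T2 = A + C; // the rewritten tree's interior node
      int Root2 = T2 + B;
      assert(Root1 == Root2); // the root's value is preserved...
      assert(T1 != T2);       // ...the watched temporary's is not
      return 0;
    }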
@@ -798,7 +813,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
/// pushing the negates through adds. These will be revisited to see if
/// additional opportunities have been exposed.
static Value *NegateValue(Value *V, Instruction *BI,
- SetVector<AssertingVH<Instruction>> &ToRedo) {
+ ReassociatePass::OrderedSet &ToRedo) {
if (auto *C = dyn_cast<Constant>(V))
return C->getType()->isFPOrFPVectorTy() ? ConstantExpr::getFNeg(C) :
ConstantExpr::getNeg(C);
@@ -912,8 +927,8 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) {
/// If we have (X-Y), and if either X is an add, or if this is only used by an
/// add, transform this into (X+(0-Y)) to promote better reassociation.
-static BinaryOperator *
-BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {
+static BinaryOperator *BreakUpSubtract(Instruction *Sub,
+ ReassociatePass::OrderedSet &ToRedo) {
// Convert a subtract into an add and a neg instruction. This allows sub
// instructions to be commuted with other add instructions.
//
@@ -929,7 +944,7 @@ BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {
Sub->replaceAllUsesWith(New);
New->setDebugLoc(Sub->getDebugLoc());
- DEBUG(dbgs() << "Negated: " << *New << '\n');
+ LLVM_DEBUG(dbgs() << "Negated: " << *New << '\n');
return New;
}
@@ -1415,7 +1430,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
++NumFound;
} while (i != Ops.size() && Ops[i].Op == TheOp);
- DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
+ LLVM_DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp
+ << '\n');
++NumFactor;
// Insert a new multiply.
@@ -1553,7 +1569,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
// If any factor occurred more than one time, we can pull it out.
if (MaxOcc > 1) {
- DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
+ LLVM_DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal
+ << '\n');
++NumFactor;
// Create a new instruction that uses the MaxOccVal twice. If we don't do
@@ -1622,7 +1639,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
return nullptr;
}
-/// \brief Build up a vector of value/power pairs factoring a product.
+/// Build up a vector of value/power pairs factoring a product.
///
/// Given a series of multiplication operands, build a vector of factors and
/// the powers each is raised to when forming the final product. Sort them in
@@ -1687,7 +1704,7 @@ static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
return true;
}
-/// \brief Build a tree of multiplies, computing the product of Ops.
+/// Build a tree of multiplies, computing the product of Ops.
static Value *buildMultiplyTree(IRBuilder<> &Builder,
SmallVectorImpl<Value*> &Ops) {
if (Ops.size() == 1)
@@ -1704,7 +1721,7 @@ static Value *buildMultiplyTree(IRBuilder<> &Builder,
return LHS;
}
-/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
+/// Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
///
/// Given a vector of values raised to various powers, where no two values are
/// equal and the powers are sorted in decreasing order, compute the minimal
@@ -1859,8 +1876,8 @@ Value *ReassociatePass::OptimizeExpression(BinaryOperator *I,
// Remove dead instructions and if any operands are trivially dead add them to
// Insts so they will be removed as well.
-void ReassociatePass::RecursivelyEraseDeadInsts(
- Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) {
+void ReassociatePass::RecursivelyEraseDeadInsts(Instruction *I,
+ OrderedSet &Insts) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end());
ValueRankMap.erase(I);
@@ -1876,7 +1893,7 @@ void ReassociatePass::RecursivelyEraseDeadInsts(
/// Zap the given instruction, adding interesting operands to the work list.
void ReassociatePass::EraseInst(Instruction *I) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
- DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
+ LLVM_DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
// Erase the dead instruction.
@@ -1893,7 +1910,14 @@ void ReassociatePass::EraseInst(Instruction *I) {
while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
Visited.insert(Op).second)
Op = Op->user_back();
- RedoInsts.insert(Op);
+
+ // The instruction we're going to push may be coming from a
+ // dead block, and Reassociate skips the processing of unreachable
+ // blocks because it's a waste of time and also because it can
+  // lead to an infinite loop due to LLVM's non-standard definition
+ // of dominance.
+ if (ValueRankMap.find(Op) != ValueRankMap.end())
+ RedoInsts.insert(Op);
}
MadeChange = true;
@@ -2120,7 +2144,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
ValueEntry(getRank(E.first), E.first));
}
- DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
// Now that we have linearized the tree to a list and have gathered all of
// the operands and their ranks, sort the operands by their rank. Use a
@@ -2138,7 +2162,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
return;
// This expression tree simplified to something that isn't a tree,
// eliminate it.
- DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
+ LLVM_DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
I->replaceAllUsesWith(V);
if (Instruction *VI = dyn_cast<Instruction>(V))
if (I->getDebugLoc())
@@ -2169,7 +2193,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
}
}
- DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
if (Ops.size() == 1) {
if (Ops[0].Op == I)
@@ -2321,7 +2345,7 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
// Make a copy of all the instructions to be redone so we can remove dead
// instructions.
- SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts);
+ OrderedSet ToRedo(RedoInsts);
// Iterate over all instructions to be reevaluated and remove trivially dead
// instructions. If any operand of the trivially dead instruction becomes
// dead mark it for deletion as well. Continue this process until all
@@ -2337,7 +2361,8 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
// Now that we have removed dead instructions, we can reoptimize the
// remaining instructions.
while (!RedoInsts.empty()) {
- Instruction *I = RedoInsts.pop_back_val();
+ Instruction *I = RedoInsts.front();
+ RedoInsts.erase(RedoInsts.begin());
if (isInstructionTriviallyDead(I))
EraseInst(I);
else
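
Note the draining-order change in the final hunk: taking RedoInsts from the front instead of popping the back turns the worklist from LIFO into FIFO, so instructions are revisited in the order they were queued. A minimal analogue of the deque-backed loop:

    #include <cassert>
    #include <deque>

    int main() {
      std::deque<int> Redo = {1, 2, 3}; // queued in this order
      int Expected = 1;
      while (!Redo.empty()) {
        int I = Redo.front(); // the old code used pop_back_val(): LIFO
        Redo.pop_front();
        assert(I == Expected++); // FIFO: processed in queue order
      }
      return 0;
    }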
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
index 96295683314c..018feb035a4f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -17,6 +17,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Function.h"
@@ -25,7 +26,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils.h"
#include <list>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index c44edbed8ed9..391e43f79121 100644
--- a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -28,6 +28,7 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -64,7 +65,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
#include <cassert>
@@ -476,6 +476,12 @@ findBaseDefiningValueOfVector(Value *I) {
if (auto *BC = dyn_cast<BitCastInst>(I))
return findBaseDefiningValue(BC->getOperand(0));
+ // We assume that functions in the source language only return base
+ // pointers. This should probably be generalized via attributes to support
+ // both source language and internal functions.
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ return BaseDefiningValueResult(I, true);
+
// A PHI or Select is a base defining value. The outer findBasePointer
// algorithm is responsible for constructing a base value for this BDV.
assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
@@ -610,8 +616,8 @@ static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) {
Value *&Cached = Cache[I];
if (!Cached) {
Cached = findBaseDefiningValue(I).BDV;
- DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
- << Cached->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
+ << Cached->getName() << "\n");
}
assert(Cache[I] != nullptr);
return Cached;
@@ -842,9 +848,9 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
}
#ifndef NDEBUG
- DEBUG(dbgs() << "States after initialization:\n");
+ LLVM_DEBUG(dbgs() << "States after initialization:\n");
for (auto Pair : States) {
- DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
}
#endif
@@ -917,9 +923,9 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
}
#ifndef NDEBUG
- DEBUG(dbgs() << "States after meet iteration:\n");
+ LLVM_DEBUG(dbgs() << "States after meet iteration:\n");
for (auto Pair : States) {
- DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
}
#endif
@@ -960,7 +966,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* {
if (isa<PHINode>(I)) {
BasicBlock *BB = I->getParent();
- int NumPreds = std::distance(pred_begin(BB), pred_end(BB));
+ int NumPreds = pred_size(BB);
assert(NumPreds > 0 && "how did we reach here");
std::string Name = suffixed_name_or(I, ".base", "base_phi");
return PHINode::Create(I->getType(), NumPreds, Name, I);
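pred_size() is the llvm/IR/CFG.h convenience helper for the std::distance idiom it replaces here; a small sketch of the equivalence:

#include "llvm/IR/CFG.h"

// Both expressions count the incoming CFG edges of BB; pred_size reads
// better and cannot accidentally pair mismatched iterators.
assert((unsigned)std::distance(pred_begin(BB), pred_end(BB)) ==
       pred_size(BB));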
@@ -1118,10 +1124,11 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
assert(BDV && Base);
assert(!isKnownBaseResult(BDV) && "why did it get added?");
- DEBUG(dbgs() << "Updating base value cache"
- << " for: " << BDV->getName() << " from: "
- << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none")
- << " to: " << Base->getName() << "\n");
+ LLVM_DEBUG(
+ dbgs() << "Updating base value cache"
+ << " for: " << BDV->getName() << " from: "
+ << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none")
+ << " to: " << Base->getName() << "\n");
if (Cache.count(BDV)) {
assert(isKnownBaseResult(Base) &&
@@ -1369,7 +1376,7 @@ public:
assert(OldI != NewI && "Disallowed at construction?!");
assert((!IsDeoptimize || !New) &&
- "Deoptimize instrinsics are not replaced!");
+ "Deoptimize intrinsics are not replaced!");
Old = nullptr;
New = nullptr;
@@ -1379,7 +1386,7 @@ public:
if (IsDeoptimize) {
// Note: we've inserted instructions, so the call to llvm.deoptimize may
- // not necessarilly be followed by the matching return.
+ // not necessarily be followed by the matching return.
auto *RI = cast<ReturnInst>(OldI->getParent()->getTerminator());
new UnreachableInst(RI->getContext(), RI);
RI->eraseFromParent();
@@ -1805,7 +1812,7 @@ static void relocationViaAlloca(
SmallVector<Instruction *, 20> Uses;
// PERF: trade a linear scan for repeated reallocation
- Uses.reserve(std::distance(Def->user_begin(), Def->user_end()));
+ Uses.reserve(Def->getNumUses());
for (User *U : Def->users()) {
if (!isa<ConstantExpr>(U)) {
// If the def has a ConstantExpr use, then the def is either a
@@ -1817,7 +1824,7 @@ static void relocationViaAlloca(
}
}
- std::sort(Uses.begin(), Uses.end());
+ llvm::sort(Uses.begin(), Uses.end());
auto Last = std::unique(Uses.begin(), Uses.end());
Uses.erase(Last, Uses.end());
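llvm::sort is a drop-in replacement for std::sort; when LLVM is built with expensive checks enabled it shuffles the range before sorting, so any hidden dependence on the incoming (often pointer-address) order fails loudly instead of producing non-deterministic output. The sort-then-unique idiom above, for reference:

// Sorting Instruction* values orders them by address, which is not stable
// across runs; llvm::sort's shuffle-under-expensive-checks flushes out any
// code that silently depends on that order.
llvm::sort(Uses.begin(), Uses.end());
Uses.erase(std::unique(Uses.begin(), Uses.end()), Uses.end());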
@@ -1977,7 +1984,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
Cost += 2;
} else {
- llvm_unreachable("unsupported instruciton type during rematerialization");
+ llvm_unreachable("unsupported instruction type during rematerialization");
}
}
@@ -2024,7 +2031,7 @@ static void rematerializeLiveValues(CallSite CS,
SmallVector<Value *, 32> LiveValuesToBeDeleted;
for (Value *LiveValue: Info.LiveSet) {
- // For each live pointer find it's defining chain
+ // For each live pointer find its defining chain
SmallVector<Instruction *, 3> ChainToBase;
assert(Info.PointerToBase.count(LiveValue));
Value *RootOfChain =
@@ -2461,22 +2468,8 @@ static void stripNonValidDataFromBody(Function &F) {
continue;
}
- if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
- assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
- bool IsImmutableTBAA =
- MD->getNumOperands() == 4 &&
- mdconst::extract<ConstantInt>(MD->getOperand(3))->getValue() == 1;
-
- if (!IsImmutableTBAA)
- continue; // no work to do, MD_tbaa is already marked mutable
-
- MDNode *Base = cast<MDNode>(MD->getOperand(0));
- MDNode *Access = cast<MDNode>(MD->getOperand(1));
- uint64_t Offset =
- mdconst::extract<ConstantInt>(MD->getOperand(2))->getZExtValue();
-
- MDNode *MutableTBAA =
- Builder.createTBAAStructTagNode(Base, Access, Offset);
+ if (MDNode *Tag = I.getMetadata(LLVMContext::MD_tbaa)) {
+ MDNode *MutableTBAA = Builder.createMutableTBAAAccessTag(Tag);
I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA);
}
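The replaced hunk open-coded the struct-path access-tag rewrite (and asserted a particular operand shape); MDBuilder::createMutableTBAAAccessTag accepts either tag format and clears the immutability bit. A hedged sketch of the whole-function form, with names as in the surrounding code:

MDBuilder Builder(F.getContext());
for (Instruction &I : instructions(F))
  if (MDNode *Tag = I.getMetadata(LLVMContext::MD_tbaa))
    // Produce an equivalent access tag without the "immutable" flag, since
    // a relocating collector may legitimately update the location.
    I.setMetadata(LLVMContext::MD_tbaa,
                  Builder.createMutableTBAAAccessTag(Tag));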
@@ -2537,30 +2530,31 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
return false;
};
+
+ // Delete any unreachable statepoints so that we don't have unrewritten
+ // statepoints surviving this pass. This makes testing easier and the
+ // resulting IR less confusing to human readers.
+ DeferredDominance DD(DT);
+ bool MadeChange = removeUnreachableBlocks(F, nullptr, &DD);
+ DD.flush();
+
// Gather all the statepoints which need to be rewritten. Be careful to only
// consider those in reachable code since we need to ask dominance queries
// when rewriting; the unreachable ones were already deleted above.
SmallVector<CallSite, 64> ParsePointNeeded;
- bool HasUnreachableStatepoint = false;
for (Instruction &I : instructions(F)) {
// TODO: only the ones with the flag set!
if (NeedsRewrite(I)) {
- if (DT.isReachableFromEntry(I.getParent()))
- ParsePointNeeded.push_back(CallSite(&I));
- else
- HasUnreachableStatepoint = true;
+ // NOTE: removeUnreachableBlocks() is stronger than
+ // DominatorTree::isReachableFromEntry(). In other words,
+ // removeUnreachableBlocks() can remove some blocks for which
+ // isReachableFromEntry() returns true.
+ assert(DT.isReachableFromEntry(I.getParent()) &&
+ "no unreachable blocks expected");
+ ParsePointNeeded.push_back(CallSite(&I));
}
}
- bool MadeChange = false;
-
- // Delete any unreachable statepoints so that we don't have unrewritten
- // statepoints surviving this pass. This makes testing easier and the
- // resulting IR less confusing to human readers. Rather than be fancy, we
- // just reuse a utility function which removes the unreachable blocks.
- if (HasUnreachableStatepoint)
- MadeChange |= removeUnreachableBlocks(F);
-
// Return early if no work to do.
if (ParsePointNeeded.empty())
return MadeChange;
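Worth noting in the rewritten prologue above: unreachable blocks are now removed before statepoints are gathered, with dominator-tree maintenance batched through DeferredDominance instead of applied update-by-update. The idiom in isolation, as a sketch assuming DT is the pass's DominatorTree:

DeferredDominance DD(DT);  // queues edge insert/delete updates against DT
bool Changed = removeUnreachableBlocks(F, /*LVI=*/nullptr, &DD);
DD.flush();  // apply the batch; DT is consistent before any query below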
diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
index 3e12649ddedc..5e3ddeda2d49 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -17,7 +17,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO/SCCP.h"
#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -30,6 +29,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/IR/BasicBlock.h"
@@ -54,9 +54,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
#include <vector>
@@ -71,8 +69,6 @@ STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
-STATISTIC(IPNumRangeInfoUsed, "Number of times constant range info was used by"
- "IPSCCP");
namespace {
@@ -261,7 +257,7 @@ public:
bool MarkBlockExecutable(BasicBlock *BB) {
if (!BBExecutable.insert(BB).second)
return false;
- DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
BBWorkList.push_back(BB); // Add the block to the work list!
return true;
}
@@ -329,6 +325,10 @@ public:
return BBExecutable.count(BB);
}
+ // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+ // block to the 'To' basic block is currently feasible.
+ bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
+
std::vector<LatticeVal> getStructLatticeValueFor(Value *V) const {
std::vector<LatticeVal> StructValues;
auto *STy = dyn_cast<StructType>(V->getType());
@@ -341,20 +341,13 @@ public:
return StructValues;
}
- ValueLatticeElement getLatticeValueFor(Value *V) {
+ const LatticeVal &getLatticeValueFor(Value *V) const {
assert(!V->getType()->isStructTy() &&
"Should use getStructLatticeValueFor");
- std::pair<DenseMap<Value*, ValueLatticeElement>::iterator, bool>
- PI = ParamState.insert(std::make_pair(V, ValueLatticeElement()));
- ValueLatticeElement &LV = PI.first->second;
- if (PI.second) {
- DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V);
- assert(I != ValueState.end() &&
- "V not found in ValueState nor Paramstate map!");
- LV = I->second.toValueLattice();
- }
-
- return LV;
+ DenseMap<Value *, LatticeVal>::const_iterator I = ValueState.find(V);
+ assert(I != ValueState.end() &&
+ "V not found in ValueState or ParamState map!");
+ return I->second;
}
/// getTrackedRetVals - Get the inferred return value map.
@@ -415,55 +408,57 @@ private:
// markConstant - Make a value be marked as "constant". If the value
// is not already a constant, add it to the instruction work list so that
// the users of the instruction are updated later.
- void markConstant(LatticeVal &IV, Value *V, Constant *C) {
- if (!IV.markConstant(C)) return;
- DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
+ bool markConstant(LatticeVal &IV, Value *V, Constant *C) {
+ if (!IV.markConstant(C)) return false;
+ LLVM_DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
pushToWorkList(IV, V);
+ return true;
}
- void markConstant(Value *V, Constant *C) {
+ bool markConstant(Value *V, Constant *C) {
assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
- markConstant(ValueState[V], V, C);
+ return markConstant(ValueState[V], V, C);
}
void markForcedConstant(Value *V, Constant *C) {
assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
LatticeVal &IV = ValueState[V];
IV.markForcedConstant(C);
- DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n');
+ LLVM_DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n');
pushToWorkList(IV, V);
}
// markOverdefined - Make a value be marked as "overdefined". If the
// value is not already overdefined, add it to the overdefined instruction
// work list so that the users of the instruction are updated later.
- void markOverdefined(LatticeVal &IV, Value *V) {
- if (!IV.markOverdefined()) return;
-
- DEBUG(dbgs() << "markOverdefined: ";
- if (auto *F = dyn_cast<Function>(V))
- dbgs() << "Function '" << F->getName() << "'\n";
- else
- dbgs() << *V << '\n');
+ bool markOverdefined(LatticeVal &IV, Value *V) {
+ if (!IV.markOverdefined()) return false;
+
+ LLVM_DEBUG(dbgs() << "markOverdefined: ";
+ if (auto *F = dyn_cast<Function>(V)) dbgs()
+ << "Function '" << F->getName() << "'\n";
+ else dbgs() << *V << '\n');
// Only instructions go on the work list
pushToWorkList(IV, V);
+ return true;
}
- void mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) {
+ bool mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) {
if (IV.isOverdefined() || MergeWithV.isUnknown())
- return; // Noop.
+ return false; // Noop.
if (MergeWithV.isOverdefined())
return markOverdefined(IV, V);
if (IV.isUnknown())
return markConstant(IV, V, MergeWithV.getConstant());
if (IV.getConstant() != MergeWithV.getConstant())
return markOverdefined(IV, V);
+ return false;
}
- void mergeInValue(Value *V, LatticeVal MergeWithV) {
+ bool mergeInValue(Value *V, LatticeVal MergeWithV) {
assert(!V->getType()->isStructTy() &&
"non-structs should use markConstant");
- mergeInValue(ValueState[V], V, MergeWithV);
+ return mergeInValue(ValueState[V], V, MergeWithV);
}
/// getValueState - Return the LatticeVal object that corresponds to the
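The markConstant/markOverdefined/mergeInValue family now reports whether the lattice value actually moved, which later code uses to drive the work list. A worked illustration of the three-level lattice these encode (LatticeVal is the file-local lattice type; the constants C1 and C2 are illustrative):

LatticeVal LV;            // unknown (bottom): nothing proven yet
LV.markConstant(C1);      // unknown -> constant C1, returns true
LV.markConstant(C1);      // same constant: no movement, returns false
// Merging a different constant can only move *up* the lattice:
// mergeInValue(LV, V, C2-state) ends in markOverdefined(LV, V), i.e.
// constant -> overdefined (top), and reports the change by returning true.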
@@ -534,30 +529,27 @@ private:
/// markEdgeExecutable - Mark a basic block as executable, adding it to the BB
/// work list if it is not already executable.
- void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
+ bool markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
- return; // This edge is already known to be executable!
+ return false; // This edge is already known to be executable!
if (!MarkBlockExecutable(Dest)) {
// If the destination is already executable, we just made an *edge*
// feasible that wasn't before. Revisit the PHI nodes in the block
// because they have potentially new operands.
- DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
- << " -> " << Dest->getName() << '\n');
+ LLVM_DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
+ << " -> " << Dest->getName() << '\n');
for (PHINode &PN : Dest->phis())
visitPHINode(PN);
}
+ return true;
}
// getFeasibleSuccessors - Return a vector of booleans to indicate which
// successors are reachable from a given terminator instruction.
void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs);
- // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
- // block to the 'To' basic block is currently feasible.
- bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
-
// OperandChangedState - This method is invoked on all of the users of an
// instruction that was just changed state somehow. Based on this
// information, we need to update the specified user of this instruction.
@@ -614,7 +606,7 @@ private:
void visitInstruction(Instruction &I) {
// All the instructions we don't do any special handling for just
// go to overdefined.
- DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
+ LLVM_DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
markOverdefined(&I);
}
};
@@ -701,68 +693,17 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
return;
}
- DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n');
+ LLVM_DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n');
llvm_unreachable("SCCP: Don't know how to handle this terminator!");
}
// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
// block to the 'To' basic block is currently feasible.
bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
- assert(BBExecutable.count(To) && "Dest should always be alive!");
-
- // Make sure the source basic block is executable!!
- if (!BBExecutable.count(From)) return false;
-
- // Check to make sure this edge itself is actually feasible now.
- TerminatorInst *TI = From->getTerminator();
- if (auto *BI = dyn_cast<BranchInst>(TI)) {
- if (BI->isUnconditional())
- return true;
-
- LatticeVal BCValue = getValueState(BI->getCondition());
-
- // Overdefined condition variables mean the branch could go either way,
- // undef conditions mean that neither edge is feasible yet.
- ConstantInt *CI = BCValue.getConstantInt();
- if (!CI)
- return !BCValue.isUnknown();
-
- // Constant condition variables mean the branch can only go a single way.
- return BI->getSuccessor(CI->isZero()) == To;
- }
-
- // Unwinding instructions successors are always executable.
- if (TI->isExceptional())
- return true;
-
- if (auto *SI = dyn_cast<SwitchInst>(TI)) {
- if (SI->getNumCases() < 1)
- return true;
-
- LatticeVal SCValue = getValueState(SI->getCondition());
- ConstantInt *CI = SCValue.getConstantInt();
-
- if (!CI)
- return !SCValue.isUnknown();
-
- return SI->findCaseValue(CI)->getCaseSuccessor() == To;
- }
-
- // In case of indirect branch and its address is a blockaddress, we mark
- // the target as executable.
- if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
- LatticeVal IBRValue = getValueState(IBR->getAddress());
- BlockAddress *Addr = IBRValue.getBlockAddress();
-
- if (!Addr)
- return !IBRValue.isUnknown();
-
- // At this point, the indirectbr is branching on a blockaddress.
- return Addr->getBasicBlock() == To;
- }
-
- DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n');
- llvm_unreachable("SCCP: Don't know how to handle this terminator!");
+ // Check if we've called markEdgeExecutable on the edge yet. (We could
+ // be more aggressive and try to consider edges which haven't been marked
+ // yet, but there isn't any need.)
+ return KnownFeasibleEdges.count(Edge(From, To));
}
// visit Implementations - Something changed in this instruction, either an
@@ -786,7 +727,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// If this PN returns a struct, just mark the result overdefined.
// TODO: We could do a lot better than this if code actually uses this.
if (PN.getType()->isStructTy())
- return markOverdefined(&PN);
+ return (void)markOverdefined(&PN);
if (getValueState(&PN).isOverdefined())
return; // Quick exit
@@ -794,7 +735,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
// and slow us down a lot. Just mark them overdefined.
if (PN.getNumIncomingValues() > 64)
- return markOverdefined(&PN);
+ return (void)markOverdefined(&PN);
// Look at all of the executable operands of the PHI node. If any of them
// are overdefined, the PHI becomes overdefined as well. If they are all
@@ -810,7 +751,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
continue;
if (IV.isOverdefined()) // PHI node becomes overdefined!
- return markOverdefined(&PN);
+ return (void)markOverdefined(&PN);
if (!OperandVal) { // Grab the first value.
OperandVal = IV.getConstant();
@@ -824,7 +765,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// Check to see if there are two different constants merging, if so, the PHI
// node is overdefined.
if (IV.getConstant() != OperandVal)
- return markOverdefined(&PN);
+ return (void)markOverdefined(&PN);
}
// If we exited the loop, this means that the PHI node only has constant
@@ -892,11 +833,11 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
// If this returns a struct, mark all elements over defined, we don't track
// structs in structs.
if (EVI.getType()->isStructTy())
- return markOverdefined(&EVI);
+ return (void)markOverdefined(&EVI);
// If this is extracting from more than one level of struct, we don't know.
if (EVI.getNumIndices() != 1)
- return markOverdefined(&EVI);
+ return (void)markOverdefined(&EVI);
Value *AggVal = EVI.getAggregateOperand();
if (AggVal->getType()->isStructTy()) {
@@ -905,19 +846,19 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
mergeInValue(getValueState(&EVI), &EVI, EltVal);
} else {
// Otherwise, must be extracting from an array.
- return markOverdefined(&EVI);
+ return (void)markOverdefined(&EVI);
}
}
void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
auto *STy = dyn_cast<StructType>(IVI.getType());
if (!STy)
- return markOverdefined(&IVI);
+ return (void)markOverdefined(&IVI);
// If this has more than one index, we can't handle it, drive all results to
// undef.
if (IVI.getNumIndices() != 1)
- return markOverdefined(&IVI);
+ return (void)markOverdefined(&IVI);
Value *Aggr = IVI.getAggregateOperand();
unsigned Idx = *IVI.idx_begin();
@@ -946,7 +887,7 @@ void SCCPSolver::visitSelectInst(SelectInst &I) {
// If this select returns a struct, just mark the result overdefined.
// TODO: We could do a lot better than this if code actually uses this.
if (I.getType()->isStructTy())
- return markOverdefined(&I);
+ return (void)markOverdefined(&I);
LatticeVal CondValue = getValueState(I.getCondition());
if (CondValue.isUnknown())
@@ -967,12 +908,12 @@ void SCCPSolver::visitSelectInst(SelectInst &I) {
// select ?, C, C -> C.
if (TVal.isConstant() && FVal.isConstant() &&
TVal.getConstant() == FVal.getConstant())
- return markConstant(&I, FVal.getConstant());
+ return (void)markConstant(&I, FVal.getConstant());
if (TVal.isUnknown()) // select ?, undef, X -> X.
- return mergeInValue(&I, FVal);
+ return (void)mergeInValue(&I, FVal);
if (FVal.isUnknown()) // select ?, X, undef -> X.
- return mergeInValue(&I, TVal);
+ return (void)mergeInValue(&I, TVal);
markOverdefined(&I);
}
@@ -990,7 +931,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// X op Y -> undef.
if (isa<UndefValue>(C))
return;
- return markConstant(IV, &I, C);
+ return (void)markConstant(IV, &I, C);
}
// If something is undef, wait for it to resolve.
@@ -1003,7 +944,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// overdefined, and we can replace it with zero.
if (I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv)
if (V1State.isConstant() && V1State.getConstant()->isNullValue())
- return markConstant(IV, &I, V1State.getConstant());
+ return (void)markConstant(IV, &I, V1State.getConstant());
// If this is:
// -> AND/MUL with 0
@@ -1026,12 +967,12 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// X and 0 = 0
// X * 0 = 0
if (NonOverdefVal->getConstant()->isNullValue())
- return markConstant(IV, &I, NonOverdefVal->getConstant());
+ return (void)markConstant(IV, &I, NonOverdefVal->getConstant());
} else {
// X or -1 = -1
if (ConstantInt *CI = NonOverdefVal->getConstantInt())
if (CI->isMinusOne())
- return markConstant(IV, &I, NonOverdefVal->getConstant());
+ return (void)markConstant(IV, &I, NonOverdefVal->getConstant());
}
}
}
@@ -1041,22 +982,36 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// Handle ICmpInst instruction.
void SCCPSolver::visitCmpInst(CmpInst &I) {
- LatticeVal V1State = getValueState(I.getOperand(0));
- LatticeVal V2State = getValueState(I.getOperand(1));
-
LatticeVal &IV = ValueState[&I];
if (IV.isOverdefined()) return;
- if (V1State.isConstant() && V2State.isConstant()) {
- Constant *C = ConstantExpr::getCompare(
- I.getPredicate(), V1State.getConstant(), V2State.getConstant());
+ Value *Op1 = I.getOperand(0);
+ Value *Op2 = I.getOperand(1);
+
+ // For parameters, use ParamState which includes constant range info if
+ // available.
+ auto V1Param = ParamState.find(Op1);
+ ValueLatticeElement V1State = (V1Param != ParamState.end())
+ ? V1Param->second
+ : getValueState(Op1).toValueLattice();
+
+ auto V2Param = ParamState.find(Op2);
+ ValueLatticeElement V2State = V2Param != ParamState.end()
+ ? V2Param->second
+ : getValueState(Op2).toValueLattice();
+
+ Constant *C = V1State.getCompare(I.getPredicate(), I.getType(), V2State);
+ if (C) {
if (isa<UndefValue>(C))
return;
- return markConstant(IV, &I, C);
+ LatticeVal CV;
+ CV.markConstant(C);
+ mergeInValue(&I, CV);
+ return;
}
// If the operands are still unknown, wait for them to resolve.
- if (!V1State.isOverdefined() && !V2State.isOverdefined())
+ if (!V1State.isOverdefined() && !V2State.isOverdefined() && !IV.isConstant())
return;
markOverdefined(&I);
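The rewritten visitCmpInst prefers ParamState, which (unlike the plain LatticeVal lattice) can carry constant ranges deduced for arguments, and folds through ValueLatticeElement::getCompare. A hedged sketch of what range information buys; I32Ty and I1Ty stand for assumed i32/i1 Type pointers:

// Suppose IPSCCP proved an argument lies in the half-open range [0, 10).
ValueLatticeElement X = ValueLatticeElement::getRange(
    ConstantRange(APInt(32, 0), APInt(32, 10)));
ValueLatticeElement Ten =
    ValueLatticeElement::get(ConstantInt::get(I32Ty, 10));
// "icmp ult %x, 10" folds to true even though %x is no single constant.
Constant *C = X.getCompare(CmpInst::ICMP_ULT, I1Ty, Ten);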
@@ -1076,7 +1031,7 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
return; // Operands are not resolved yet.
if (State.isOverdefined())
- return markOverdefined(&I);
+ return (void)markOverdefined(&I);
assert(State.isConstant() && "Unknown state!");
Operands.push_back(State.getConstant());
@@ -1114,7 +1069,7 @@ void SCCPSolver::visitStoreInst(StoreInst &SI) {
void SCCPSolver::visitLoadInst(LoadInst &I) {
// If this load is of a struct, just mark the result overdefined.
if (I.getType()->isStructTy())
- return markOverdefined(&I);
+ return (void)markOverdefined(&I);
LatticeVal PtrVal = getValueState(I.getOperand(0));
if (PtrVal.isUnknown()) return; // The pointer is not resolved yet!
@@ -1123,13 +1078,17 @@ void SCCPSolver::visitLoadInst(LoadInst &I) {
if (IV.isOverdefined()) return;
if (!PtrVal.isConstant() || I.isVolatile())
- return markOverdefined(IV, &I);
+ return (void)markOverdefined(IV, &I);
Constant *Ptr = PtrVal.getConstant();
// load null is undefined.
- if (isa<ConstantPointerNull>(Ptr) && I.getPointerAddressSpace() == 0)
- return;
+ if (isa<ConstantPointerNull>(Ptr)) {
+ if (NullPointerIsDefined(I.getFunction(), I.getPointerAddressSpace()))
+ return (void)markOverdefined(IV, &I);
+ else
+ return;
+ }
// Transform load (constant global) into the value loaded.
if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) {
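The null-load case above is now address-space and attribute aware rather than hard-coding address space 0. A short note on the helper it relies on (a sketch of its intent, not its implementation):

// NullPointerIsDefined (llvm/IR/Function.h) is true when address 0 is a
// legitimate location to access: non-zero address spaces, or functions
// carrying "null-pointer-is-valid" (as emitted by clang under
// -fno-delete-null-pointer-checks). Only when it is *not* defined may the
// solver treat "load null" as undefined behavior and leave the value unknown.
bool Defined =
    NullPointerIsDefined(I.getFunction(), I.getPointerAddressSpace());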
@@ -1148,7 +1107,7 @@ void SCCPSolver::visitLoadInst(LoadInst &I) {
if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) {
if (isa<UndefValue>(C))
return;
- return markConstant(IV, &I, C);
+ return (void)markConstant(IV, &I, C);
}
// Otherwise we cannot say for certain what value this load will produce.
@@ -1180,7 +1139,7 @@ CallOverdefined:
if (State.isUnknown())
return; // Operands are not resolved yet.
if (State.isOverdefined())
- return markOverdefined(I);
+ return (void)markOverdefined(I);
assert(State.isConstant() && "Unknown state!");
Operands.push_back(State.getConstant());
}
@@ -1194,12 +1153,12 @@ CallOverdefined:
// call -> undef.
if (isa<UndefValue>(C))
return;
- return markConstant(I, C);
+ return (void)markConstant(I, C);
}
}
// Otherwise, we don't know anything about this call, mark it overdefined.
- return markOverdefined(I);
+ return (void)markOverdefined(I);
}
// If this is a local function that doesn't have its address taken, mark its
@@ -1227,8 +1186,16 @@ CallOverdefined:
} else {
// Most other parts of the Solver still only use the simpler value
// lattice, so we propagate changes for parameters to both lattices.
- getParamState(&*AI).mergeIn(getValueState(*CAI).toValueLattice(), DL);
- mergeInValue(&*AI, getValueState(*CAI));
+ LatticeVal ConcreteArgument = getValueState(*CAI);
+ bool ParamChanged =
+ getParamState(&*AI).mergeIn(ConcreteArgument.toValueLattice(), DL);
+ bool ValueChanged = mergeInValue(&*AI, ConcreteArgument);
+ // Add the argument to the work list if the state of a parameter changes
+ // but ValueState does not (because it is already overdefined there). We
+ // have to take changes in ParamState into account, as ParamState is used
+ // when evaluating Cmp instructions.
+ if (!ValueChanged && ParamChanged)
+ pushToWorkList(ValueState[&*AI], &*AI);
}
}
}
@@ -1262,7 +1229,7 @@ void SCCPSolver::Solve() {
while (!OverdefinedInstWorkList.empty()) {
Value *I = OverdefinedInstWorkList.pop_back_val();
- DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');
// "I" got into the work list because it either made the transition from
// bottom to constant, or to overdefined.
@@ -1280,7 +1247,7 @@ void SCCPSolver::Solve() {
while (!InstWorkList.empty()) {
Value *I = InstWorkList.pop_back_val();
- DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');
// "I" got into the work list because it made the transition from undef to
// constant.
@@ -1300,7 +1267,7 @@ void SCCPSolver::Solve() {
BasicBlock *BB = BBWorkList.back();
BBWorkList.pop_back();
- DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
+ LLVM_DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
// Notify all instructions in this basic block that they are newly
// executable.
@@ -1521,7 +1488,11 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
break;
case Instruction::ICmp:
// X == undef -> undef. Other comparisons get more complicated.
- if (cast<ICmpInst>(&I)->isEquality())
+ Op0LV = getValueState(I.getOperand(0));
+ Op1LV = getValueState(I.getOperand(1));
+
+ if ((Op0LV.isUnknown() || Op1LV.isUnknown()) &&
+ cast<ICmpInst>(&I)->isEquality())
break;
markOverdefined(&I);
return true;
@@ -1566,11 +1537,14 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
}
// Otherwise, it is a branch on a symbolic value which is currently
- // considered to be undef. Handle this by forcing the input value to the
- // branch to false.
- markForcedConstant(BI->getCondition(),
- ConstantInt::getFalse(TI->getContext()));
- return true;
+ // considered to be undef. Make sure some edge is executable, so a
+ // branch on "undef" always flows somewhere.
+ // FIXME: Distinguish between dead code and an LLVM "undef" value.
+ BasicBlock *DefaultSuccessor = TI->getSuccessor(1);
+ if (markEdgeExecutable(&BB, DefaultSuccessor))
+ return true;
+
+ continue;
}
if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
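The behavioral change here: instead of forcing the still-undef condition to a concrete constant (which bakes in a choice the rest of the solve must then live with), ResolvedUndefsIn merely guarantees that one outgoing edge is feasible. Illustrated for the conditional-branch case, with hypothetical names:

// br i1 %c, label %then, label %else   ; %c still unknown after Solve()
//
// Old: markForcedConstant(%c, i1 false)  -- rewrites %c's lattice value.
// New: markEdgeExecutable(BB, %else)     -- records only that control may
//      reach %else; %c itself stays untouched for later reasoning.
if (markEdgeExecutable(&BB, BI->getSuccessor(1)))  // the false edge
  return true;  // progress: the driver loop re-runs Solve()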
@@ -1591,11 +1565,15 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
}
// Otherwise, it is a branch on a symbolic value which is currently
- // considered to be undef. Handle this by forcing the input value to the
- // branch to the first successor.
- markForcedConstant(IBR->getAddress(),
- BlockAddress::get(IBR->getSuccessor(0)));
- return true;
+ // considered to be undef. Make sure some edge is executable, so a
+ // branch on "undef" always flows somewhere.
+ // FIXME: IndirectBr on "undef" doesn't actually need to go anywhere:
+ // we can assume the branch has undefined behavior instead.
+ BasicBlock *DefaultSuccessor = IBR->getSuccessor(0);
+ if (markEdgeExecutable(&BB, DefaultSuccessor))
+ return true;
+
+ continue;
}
if (auto *SI = dyn_cast<SwitchInst>(TI)) {
@@ -1610,56 +1588,19 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
return true;
}
- markForcedConstant(SI->getCondition(), SI->case_begin()->getCaseValue());
- return true;
- }
- }
-
- return false;
-}
-
-static bool tryToReplaceWithConstantRange(SCCPSolver &Solver, Value *V) {
- bool Changed = false;
-
- // Currently we only use range information for integer values.
- if (!V->getType()->isIntegerTy())
- return false;
-
- const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
- if (!IV.isConstantRange())
- return false;
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Make sure some edge is executable, so a
+ // branch on "undef" always flows somewhere.
+ // FIXME: Distinguish between dead code and an LLVM "undef" value.
+ BasicBlock *DefaultSuccessor = SI->case_begin()->getCaseSuccessor();
+ if (markEdgeExecutable(&BB, DefaultSuccessor))
+ return true;
- for (auto UI = V->uses().begin(), E = V->uses().end(); UI != E;) {
- const Use &U = *UI++;
- auto *Icmp = dyn_cast<ICmpInst>(U.getUser());
- if (!Icmp || !Solver.isBlockExecutable(Icmp->getParent()))
continue;
-
- auto getIcmpLatticeValue = [&](Value *Op) {
- if (auto *C = dyn_cast<Constant>(Op))
- return ValueLatticeElement::get(C);
- return Solver.getLatticeValueFor(Op);
- };
-
- ValueLatticeElement A = getIcmpLatticeValue(Icmp->getOperand(0));
- ValueLatticeElement B = getIcmpLatticeValue(Icmp->getOperand(1));
-
- Constant *C = nullptr;
- if (A.satisfiesPredicate(Icmp->getPredicate(), B))
- C = ConstantInt::getTrue(Icmp->getType());
- else if (A.satisfiesPredicate(Icmp->getInversePredicate(), B))
- C = ConstantInt::getFalse(Icmp->getType());
-
- if (C) {
- Icmp->replaceAllUsesWith(C);
- DEBUG(dbgs() << "Replacing " << *Icmp << " with " << *C
- << ", because of range information " << A << " " << B
- << "\n");
- Icmp->eraseFromParent();
- Changed = true;
}
}
- return Changed;
+
+ return false;
}
static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
@@ -1679,26 +1620,18 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
}
Const = ConstantStruct::get(ST, ConstVals);
} else {
- const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
+ const LatticeVal &IV = Solver.getLatticeValueFor(V);
if (IV.isOverdefined())
return false;
- if (IV.isConstantRange()) {
- if (IV.getConstantRange().isSingleElement())
- Const =
- ConstantInt::get(V->getType(), IV.asConstantInteger().getValue());
- else
- return false;
- } else
- Const =
- IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType());
+ Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType());
}
assert(Const && "Constant is nullptr here!");
// Replacing `musttail` instructions with a constant breaks the `musttail`
// invariant unless the call itself can be removed
CallInst *CI = dyn_cast<CallInst>(V);
- if (CI && CI->isMustTailCall() && !isInstructionTriviallyDead(CI)) {
+ if (CI && CI->isMustTailCall() && !CI->isSafeToRemove()) {
CallSite CS(CI);
Function *F = CS.getCalledFunction();
@@ -1706,12 +1639,12 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
if (F)
Solver.AddMustTailCallee(F);
- DEBUG(dbgs() << " Can\'t treat the result of musttail call : " << *CI
- << " as a constant\n");
+ LLVM_DEBUG(dbgs() << " Can\'t treat the result of musttail call : " << *CI
+ << " as a constant\n");
return false;
}
- DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n');
+ LLVM_DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n');
// Replaces all of the uses of a variable with uses of the constant.
V->replaceAllUsesWith(Const);
@@ -1722,7 +1655,7 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
// and return true if the function was modified.
static bool runSCCP(Function &F, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
- DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
+ LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
SCCPSolver Solver(DL, TLI);
// Mark the first block of the function as being executable.
@@ -1736,7 +1669,7 @@ static bool runSCCP(Function &F, const DataLayout &DL,
bool ResolvedUndefs = true;
while (ResolvedUndefs) {
Solver.Solve();
- DEBUG(dbgs() << "RESOLVING UNDEFs\n");
+ LLVM_DEBUG(dbgs() << "RESOLVING UNDEFs\n");
ResolvedUndefs = Solver.ResolvedUndefsIn(F);
}
@@ -1748,7 +1681,7 @@ static bool runSCCP(Function &F, const DataLayout &DL,
for (BasicBlock &BB : F) {
if (!Solver.isBlockExecutable(&BB)) {
- DEBUG(dbgs() << " BasicBlock Dead:" << BB);
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB);
++NumDeadBlocks;
NumInstRemoved += removeAllNonTerminatorAndEHPadInstructions(&BB);
@@ -1785,6 +1718,7 @@ PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) {
auto PA = PreservedAnalyses();
PA.preserve<GlobalsAA>();
+ PA.preserveSet<CFGAnalyses>();
return PA;
}
@@ -1807,6 +1741,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
}
// runOnFunction - Run the Sparse Conditional Constant Propagation
@@ -1844,15 +1779,15 @@ static void findReturnsToZap(Function &F,
// There is a non-removable musttail call site of this function. Zapping
// returns is not allowed.
if (Solver.isMustTailCallee(&F)) {
- DEBUG(dbgs() << "Can't zap returns of the function : " << F.getName()
- << " due to present musttail call of it\n");
+ LLVM_DEBUG(dbgs() << "Can't zap returns of the function: " << F.getName()
+ << " due to a present musttail call of it\n");
return;
}
for (BasicBlock &BB : F) {
if (CallInst *CI = BB.getTerminatingMustTailCall()) {
- DEBUG(dbgs() << "Can't zap return of the block due to present "
- << "musttail call : " << *CI << "\n");
+ LLVM_DEBUG(dbgs() << "Can't zap return of the block due to a present "
+ << "musttail call: " << *CI << "\n");
(void)CI;
return;
}
@@ -1863,8 +1798,8 @@ static void findReturnsToZap(Function &F,
}
}
-static bool runIPSCCP(Module &M, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
+bool llvm::runIPSCCP(Module &M, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
SCCPSolver Solver(DL, TLI);
// Loop over all functions, marking arguments to those with their addresses
@@ -1904,13 +1839,17 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
// Solve for constants.
bool ResolvedUndefs = true;
+ Solver.Solve();
while (ResolvedUndefs) {
- Solver.Solve();
-
- DEBUG(dbgs() << "RESOLVING UNDEFS\n");
+ LLVM_DEBUG(dbgs() << "RESOLVING UNDEFS\n");
ResolvedUndefs = false;
for (Function &F : M)
- ResolvedUndefs |= Solver.ResolvedUndefsIn(F);
+ if (Solver.ResolvedUndefsIn(F)) {
+ // We re-run Solve() after resolving an undef in a function, because
+ // we might deduce a fact that eliminates an undef in another function.
+ Solver.Solve();
+ ResolvedUndefs = true;
+ }
}
bool MadeChanges = false;
@@ -1930,18 +1869,12 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
++IPNumArgsElimed;
continue;
}
-
- if (!AI->use_empty() && tryToReplaceWithConstantRange(Solver, &*AI))
- ++IPNumRangeInfoUsed;
}
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
if (!Solver.isBlockExecutable(&*BB)) {
- DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
-
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
++NumDeadBlocks;
- NumInstRemoved +=
- changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false);
MadeChanges = true;
@@ -1955,7 +1888,7 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
if (Inst->getType()->isVoidTy())
continue;
if (tryToReplaceWithConstant(Solver, Inst)) {
- if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst))
+ if (Inst->isSafeToRemove())
Inst->eraseFromParent();
// Hey, we just changed something!
MadeChanges = true;
@@ -1964,6 +1897,17 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
}
}
+ // Change dead blocks to unreachable. We do it after replacing constants in
+ // all executable blocks, because changeToUnreachable may remove PHI nodes
+ // in executable blocks we found values for. The function's entry block is
+ // not part of BlocksToErase, so we have to handle it separately.
+ for (BasicBlock *BB : BlocksToErase)
+ NumInstRemoved +=
+ changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false);
+ if (!Solver.isBlockExecutable(&F.front()))
+ NumInstRemoved += changeToUnreachable(F.front().getFirstNonPHI(),
+ /*UseLLVMTrap=*/false);
+
// Now that all instructions in the function are constant folded, erase dead
// blocks, because we can now use ConstantFoldTerminator to get rid of
// in-edges.
@@ -1983,31 +1927,33 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
bool Folded = ConstantFoldTerminator(I->getParent());
if (!Folded) {
- // The constant folder may not have been able to fold the terminator
- // if this is a branch or switch on undef. Fold it manually as a
- // branch to the first successor.
-#ifndef NDEBUG
- if (auto *BI = dyn_cast<BranchInst>(I)) {
- assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
- "Branch should be foldable!");
- } else if (auto *SI = dyn_cast<SwitchInst>(I)) {
- assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
+ // If the branch can't be folded, we must have forced an edge
+ // for an indeterminate value. Force the terminator to fold
+ // to that edge.
+ Constant *C;
+ BasicBlock *Dest;
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ Dest = SI->case_begin()->getCaseSuccessor();
+ C = SI->case_begin()->getCaseValue();
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+ Dest = BI->getSuccessor(1);
+ C = ConstantInt::getFalse(BI->getContext());
+ } else if (IndirectBrInst *IBR = dyn_cast<IndirectBrInst>(I)) {
+ Dest = IBR->getSuccessor(0);
+ C = BlockAddress::get(IBR->getSuccessor(0));
} else {
- llvm_unreachable("Didn't fold away reference to block!");
+ llvm_unreachable("Unexpected terminator instruction");
}
-#endif
-
- // Make this an uncond branch to the first successor.
- TerminatorInst *TI = I->getParent()->getTerminator();
- BranchInst::Create(TI->getSuccessor(0), TI);
+ assert(Solver.isEdgeFeasible(I->getParent(), Dest) &&
+ "Didn't find feasible edge?");
+ (void)Dest;
- // Remove entries in successor phi nodes to remove edges.
- for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
- TI->getSuccessor(i)->removePredecessor(TI->getParent());
-
- // Remove the old terminator.
- TI->eraseFromParent();
+ I->setOperand(0, C);
+ Folded = ConstantFoldTerminator(I->getParent());
}
+ assert(Folded &&
+ "Expect TermInst on ConstantInt or BlockAddress to be folded");
+ (void) Folded;
}
// Finally, delete the basic block.
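The new fallback closes the loop with ResolvedUndefsIn: if ConstantFoldTerminator fails on first try, the solver must have marked an edge feasible without ever assigning the condition a value, so the terminator's operand is pinned to the constant selecting that same edge and folded again. Condensed for the branch case (a sketch; BI is the unfolded conditional branch):

// ResolvedUndefsIn chose the false edge (successor 1), so materialize
// exactly that choice; the second fold then yields an unconditional br.
BI->setOperand(0, ConstantInt::getFalse(BI->getContext()));
bool DidFold = ConstantFoldTerminator(BI->getParent());
assert(DidFold && "branch on a literal i1 constant must fold");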
@@ -2058,7 +2004,8 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
GlobalVariable *GV = I->first;
assert(!I->second.isOverdefined() &&
"Overdefined values should have been taken out of the map!");
- DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n");
+ LLVM_DEBUG(dbgs() << "Found that GV '" << GV->getName()
+ << "' is constant!\n");
while (!GV->use_empty()) {
StoreInst *SI = cast<StoreInst>(GV->user_back());
SI->eraseFromParent();
@@ -2069,55 +2016,3 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
return MadeChanges;
}
-
-PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) {
- const DataLayout &DL = M.getDataLayout();
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
- if (!runIPSCCP(M, DL, &TLI))
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
-
-namespace {
-
-//===--------------------------------------------------------------------===//
-//
-/// IPSCCP Class - This class implements interprocedural Sparse Conditional
-/// Constant Propagation.
-///
-class IPSCCPLegacyPass : public ModulePass {
-public:
- static char ID;
-
- IPSCCPLegacyPass() : ModulePass(ID) {
- initializeIPSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
- const DataLayout &DL = M.getDataLayout();
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- return runIPSCCP(M, DL, TLI);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char IPSCCPLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp",
- "Interprocedural Sparse Conditional Constant Propagation",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(IPSCCPLegacyPass, "ipsccp",
- "Interprocedural Sparse Conditional Constant Propagation",
- false, false)
-
-// createIPSCCPPass - This is the public interface to this file.
-ModulePass *llvm::createIPSCCPPass() { return new IPSCCPLegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
index bfe3754f0769..6c3f012c6280 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -42,6 +42,8 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantFolder.h"
@@ -79,7 +81,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
#include <cassert>
@@ -124,14 +125,9 @@ static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
cl::Hidden);
-/// Hidden option to allow more aggressive splitting.
-static cl::opt<bool>
-SROASplitNonWholeAllocaSlices("sroa-split-nonwhole-alloca-slices",
- cl::init(false), cl::Hidden);
-
namespace {
-/// \brief A custom IRBuilder inserter which prefixes all names, but only in
+/// A custom IRBuilder inserter which prefixes all names, but only in
/// Assert builds.
class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter {
std::string Prefix;
@@ -151,23 +147,23 @@ protected:
}
};
-/// \brief Provide a type for IRBuilder that drops names in release builds.
+/// Provide a type for IRBuilder that drops names in release builds.
using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
-/// \brief A used slice of an alloca.
+/// A used slice of an alloca.
///
/// This structure represents a slice of an alloca used by some instruction. It
/// stores both the begin and end offsets of this use, a pointer to the use
/// itself, and a flag indicating whether we can classify the use as splittable
/// or not when forming partitions of the alloca.
class Slice {
- /// \brief The beginning offset of the range.
+ /// The beginning offset of the range.
uint64_t BeginOffset = 0;
- /// \brief The ending offset, not included in the range.
+ /// The ending offset, not included in the range.
uint64_t EndOffset = 0;
- /// \brief Storage for both the use of this slice and whether it can be
+ /// Storage for both the use of this slice and whether it can be
/// split.
PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
@@ -189,7 +185,7 @@ public:
bool isDead() const { return getUse() == nullptr; }
void kill() { UseAndIsSplittable.setPointer(nullptr); }
- /// \brief Support for ordering ranges.
+ /// Support for ordering ranges.
///
/// This provides an ordering over ranges such that start offsets are
/// always increasing, and within equal start offsets, the end offsets are
@@ -207,7 +203,7 @@ public:
return false;
}
- /// \brief Support comparison with a single offset to allow binary searches.
+ /// Support comparison with a single offset to allow binary searches.
friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
uint64_t RHSOffset) {
return LHS.beginOffset() < RHSOffset;
@@ -233,7 +229,7 @@ template <> struct isPodLike<Slice> { static const bool value = true; };
} // end namespace llvm
-/// \brief Representation of the alloca slices.
+/// Representation of the alloca slices.
///
/// This class represents the slices of an alloca which are formed by its
/// various uses. If a pointer escapes, we can't fully build a representation
@@ -242,16 +238,16 @@ template <> struct isPodLike<Slice> { static const bool value = true; };
/// starting at a particular offset before splittable slices.
class llvm::sroa::AllocaSlices {
public:
- /// \brief Construct the slices of a particular alloca.
+ /// Construct the slices of a particular alloca.
AllocaSlices(const DataLayout &DL, AllocaInst &AI);
- /// \brief Test whether a pointer to the allocation escapes our analysis.
+ /// Test whether a pointer to the allocation escapes our analysis.
///
/// If this is true, the slices are never fully built and should be
/// ignored.
bool isEscaped() const { return PointerEscapingInstr; }
- /// \brief Support for iterating over the slices.
+ /// Support for iterating over the slices.
/// @{
using iterator = SmallVectorImpl<Slice>::iterator;
using range = iterator_range<iterator>;
@@ -266,10 +262,10 @@ public:
const_iterator end() const { return Slices.end(); }
/// @}
- /// \brief Erase a range of slices.
+ /// Erase a range of slices.
void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
- /// \brief Insert new slices for this alloca.
+ /// Insert new slices for this alloca.
///
/// This moves the slices into the alloca's slices collection, and re-sorts
/// everything so that the usual ordering properties of the alloca's slices
@@ -278,7 +274,7 @@ public:
int OldSize = Slices.size();
Slices.append(NewSlices.begin(), NewSlices.end());
auto SliceI = Slices.begin() + OldSize;
- std::sort(SliceI, Slices.end());
+ llvm::sort(SliceI, Slices.end());
std::inplace_merge(Slices.begin(), SliceI, Slices.end());
}
@@ -287,10 +283,10 @@ public:
class partition_iterator;
iterator_range<partition_iterator> partitions();
- /// \brief Access the dead users for this alloca.
+ /// Access the dead users for this alloca.
ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
- /// \brief Access the dead operands referring to this alloca.
+ /// Access the dead operands referring to this alloca.
///
/// These are operands which cannot actually be used to refer to the
/// alloca as they are outside its range and the user doesn't correct for
@@ -316,11 +312,11 @@ private:
friend class AllocaSlices::SliceBuilder;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// \brief Handle to alloca instruction to simplify method interfaces.
+ /// Handle to alloca instruction to simplify method interfaces.
AllocaInst &AI;
#endif
- /// \brief The instruction responsible for this alloca not having a known set
+ /// The instruction responsible for this alloca not having a known set
/// of slices.
///
/// When an instruction (potentially) escapes the pointer to the alloca, we
@@ -328,7 +324,7 @@ private:
/// alloca. This will be null if the alloca slices are analyzed successfully.
Instruction *PointerEscapingInstr;
- /// \brief The slices of the alloca.
+ /// The slices of the alloca.
///
/// We store a vector of the slices formed by uses of the alloca here. This
/// vector is sorted by increasing begin offset, and then the unsplittable
@@ -336,7 +332,7 @@ private:
/// details.
SmallVector<Slice, 8> Slices;
- /// \brief Instructions which will become dead if we rewrite the alloca.
+ /// Instructions which will become dead if we rewrite the alloca.
///
/// Note that these are not separated by slice. This is because we expect an
/// alloca to be completely rewritten or not rewritten at all. If rewritten,
@@ -344,7 +340,7 @@ private:
/// they come from outside of the allocated space.
SmallVector<Instruction *, 8> DeadUsers;
- /// \brief Operands which will become dead if we rewrite the alloca.
+ /// Operands which will become dead if we rewrite the alloca.
///
/// These are operands that in their particular use can be replaced with
/// undef when we rewrite the alloca. These show up in out-of-bounds inputs
@@ -355,7 +351,7 @@ private:
SmallVector<Use *, 8> DeadOperands;
};
-/// \brief A partition of the slices.
+/// A partition of the slices.
///
/// An ephemeral representation for a range of slices which can be viewed as
/// a partition of the alloca. This range represents a span of the alloca's
@@ -371,32 +367,32 @@ private:
using iterator = AllocaSlices::iterator;
- /// \brief The beginning and ending offsets of the alloca for this
+ /// The beginning and ending offsets of the alloca for this
/// partition.
uint64_t BeginOffset, EndOffset;
- /// \brief The start and end iterators of this partition.
+ /// The start and end iterators of this partition.
iterator SI, SJ;
- /// \brief A collection of split slice tails overlapping the partition.
+ /// A collection of split slice tails overlapping the partition.
SmallVector<Slice *, 4> SplitTails;
- /// \brief Raw constructor builds an empty partition starting and ending at
+ /// Raw constructor builds an empty partition starting and ending at
/// the given iterator.
Partition(iterator SI) : SI(SI), SJ(SI) {}
public:
- /// \brief The start offset of this partition.
+ /// The start offset of this partition.
///
/// All of the contained slices start at or after this offset.
uint64_t beginOffset() const { return BeginOffset; }
- /// \brief The end offset of this partition.
+ /// The end offset of this partition.
///
/// All of the contained slices end at or before this offset.
uint64_t endOffset() const { return EndOffset; }
- /// \brief The size of the partition.
+ /// The size of the partition.
///
/// Note that this can never be zero.
uint64_t size() const {
@@ -404,7 +400,7 @@ public:
return EndOffset - BeginOffset;
}
- /// \brief Test whether this partition contains no slices, and merely spans
+ /// Test whether this partition contains no slices, and merely spans
/// a region occupied by split slices.
bool empty() const { return SI == SJ; }
@@ -421,7 +417,7 @@ public:
iterator end() const { return SJ; }
/// @}
- /// \brief Get the sequence of split slice tails.
+ /// Get the sequence of split slice tails.
///
/// These tails are of slices which start before this partition but are
/// split and overlap into the partition. We accumulate these while forming
@@ -429,7 +425,7 @@ public:
ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
};
-/// \brief An iterator over partitions of the alloca's slices.
+/// An iterator over partitions of the alloca's slices.
///
/// This iterator implements the core algorithm for partitioning the alloca's
/// slices. It is a forward iterator as we don't support backtracking for
@@ -443,18 +439,18 @@ class AllocaSlices::partition_iterator
Partition> {
friend class AllocaSlices;
- /// \brief Most of the state for walking the partitions is held in a class
+ /// Most of the state for walking the partitions is held in a class
/// with a nice interface for examining them.
Partition P;
- /// \brief We need to keep the end of the slices to know when to stop.
+ /// We need to keep the end of the slices to know when to stop.
AllocaSlices::iterator SE;
- /// \brief We also need to keep track of the maximum split end offset seen.
+ /// We also need to keep track of the maximum split end offset seen.
/// FIXME: Do we really?
uint64_t MaxSplitSliceEndOffset = 0;
- /// \brief Sets the partition to be empty at given iterator, and sets the
+ /// Sets the partition to be empty at given iterator, and sets the
/// end iterator.
partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
: P(SI), SE(SE) {
@@ -464,7 +460,7 @@ class AllocaSlices::partition_iterator
advance();
}
- /// \brief Advance the iterator to the next partition.
+ /// Advance the iterator to the next partition.
///
/// Requires that the iterator not be at the end of the slices.
void advance() {
@@ -619,7 +615,7 @@ public:
Partition &operator*() { return P; }
};
-/// \brief A forward range over the partitions of the alloca's slices.
+/// A forward range over the partitions of the alloca's slices.
///
/// This accesses an iterator range over the partitions of the alloca's
/// slices. It computes these partitions on the fly based on the overlapping
@@ -643,7 +639,7 @@ static Value *foldSelectInst(SelectInst &SI) {
return nullptr;
}
-/// \brief A helper that folds a PHI node or a select.
+/// A helper that folds a PHI node or a select.
static Value *foldPHINodeOrSelectInst(Instruction &I) {
if (PHINode *PN = dyn_cast<PHINode>(&I)) {
// If PN merges together the same value, return that value.
@@ -652,7 +648,7 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) {
return foldSelectInst(cast<SelectInst>(I));
}
-/// \brief Builder for the alloca slices.
+/// Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
/// of an alloca and making a slice for each load and store at each offset.
@@ -668,7 +664,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
- /// \brief Set to de-duplicate dead instructions found in the use walk.
+ /// Set to de-duplicate dead instructions found in the use walk.
SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
public:
@@ -687,11 +683,12 @@ private:
// Completely skip uses which have a zero size or start either before or
// past the end of the allocation.
if (Size == 0 || Offset.uge(AllocSize)) {
- DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
- << " which has zero size or starts outside of the "
- << AllocSize << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
+ << Offset
+ << " which has zero size or starts outside of the "
+ << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
return markAsDead(I);
}
@@ -706,10 +703,11 @@ private:
// them, and so have to record at least the information here.
assert(AllocSize >= BeginOffset); // Established above.
if (Size > AllocSize - BeginOffset) {
- DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
- << " to remain within the " << AllocSize << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
+ << Offset << " to remain within the " << AllocSize
+ << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
EndOffset = AllocSize;
}
@@ -802,18 +800,18 @@ private:
uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
// If this memory access can be shown to *statically* extend outside the
- // bounds of of the allocation, it's behavior is undefined, so simply
+ // bounds of the allocation, its behavior is undefined, so simply
// ignore it. Note that this is more strict than the generic clamping
// behavior of insertUse. We also try to handle cases which might run the
// risk of overflow.
// FIXME: We should instead consider the pointer to have escaped if this
// function is being instrumented for addressing bugs or race conditions.
if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
- DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
- << " which extends past the end of the " << AllocSize
- << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
+ << Offset << " which extends past the end of the "
+ << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << SI << "\n");
return markAsDead(SI);
}
@@ -1027,7 +1025,7 @@ private:
void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
- /// \brief Disable SROA entirely if there are unhandled users of the alloca.
+ /// Disable SROA entirely if there are unhandled users of the alloca.
void visitInstruction(Instruction &I) { PI.setAborted(&I); }
};
@@ -1062,7 +1060,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
// Sort the uses. This arranges for the offsets to be in ascending order,
// and the sizes to be in descending order.
- std::sort(Slices.begin(), Slices.end());
+ llvm::sort(Slices.begin(), Slices.end());
}
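The std::sort to llvm::sort changes here and below are mechanical: llvm::sort
forwards to std::sort, but when LLVM is built with expensive checks it shuffles
the range first, so a comparator that is not a strict weak ordering, or code
that depends on the unspecified order of equal elements, fails reproducibly
instead of varying across C++ standard libraries. A trivial usage sketch:

    #include "llvm/ADT/STLExtras.h"
    #include <cstdint>
    #include <vector>

    void sortOffsets(std::vector<uint64_t> &Offsets) {
      // Drop-in replacement for std::sort(Offsets.begin(), Offsets.end()).
      llvm::sort(Offsets.begin(), Offsets.end());
    }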
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1240,7 +1238,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) {
}
static void speculatePHINodeLoads(PHINode &PN) {
- DEBUG(dbgs() << " original: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
IRBuilderTy PHIBuilder(&PN);
@@ -1263,10 +1261,21 @@ static void speculatePHINodeLoads(PHINode &PN) {
}
// Inject loads into all of the pred blocks.
+ DenseMap<BasicBlock*, Value*> InjectedLoads;
for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
BasicBlock *Pred = PN.getIncomingBlock(Idx);
- TerminatorInst *TI = Pred->getTerminator();
Value *InVal = PN.getIncomingValue(Idx);
+
+ // A PHI node is allowed to have multiple (duplicated) entries for the same
+ // basic block, as long as the value is the same. So if we already injected
+ // a load in the predecessor, then we should reuse the same load for all
+ // duplicated entries.
+ if (Value *V = InjectedLoads.lookup(Pred)) {
+ NewPN->addIncoming(V, Pred);
+ continue;
+ }
+
+ TerminatorInst *TI = Pred->getTerminator();
IRBuilderTy PredBuilder(TI);
LoadInst *Load = PredBuilder.CreateLoad(
@@ -1276,9 +1285,10 @@ static void speculatePHINodeLoads(PHINode &PN) {
if (AATags)
Load->setAAMetadata(AATags);
NewPN->addIncoming(Load, Pred);
+ InjectedLoads[Pred] = Load;
}
- DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
+ LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
PN.eraseFromParent();
}
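The InjectedLoads map added above exists because LLVM IR allows a PHI to list
the same predecessor block in several entries, provided each carries the same
value; injecting one load per entry would create distinct loads and break that
invariant. A minimal sketch of the caching pattern (illustrative only; the
AA-metadata and naming details of the patch are elided):

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // One speculated load per predecessor, reused for duplicated entries.
    static void injectLoads(PHINode &PN, PHINode *NewPN, Type *LoadTy) {
      DenseMap<BasicBlock *, Value *> InjectedLoads;
      for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num;
           ++Idx) {
        BasicBlock *Pred = PN.getIncomingBlock(Idx);
        // Duplicated (Pred, Value) entry: reuse the load already injected.
        if (Value *V = InjectedLoads.lookup(Pred)) {
          NewPN->addIncoming(V, Pred);
          continue;
        }
        IRBuilder<> B(Pred->getTerminator());
        Value *Load = B.CreateLoad(LoadTy, PN.getIncomingValue(Idx),
                                   PN.getName() + ".sroa.speculated");
        NewPN->addIncoming(Load, Pred);
        InjectedLoads[Pred] = Load;
      }
    }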
@@ -1318,7 +1328,7 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) {
}
static void speculateSelectInstLoads(SelectInst &SI) {
- DEBUG(dbgs() << " original: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
IRBuilderTy IRB(&SI);
Value *TV = SI.getTrueValue();
@@ -1349,14 +1359,14 @@ static void speculateSelectInstLoads(SelectInst &SI) {
Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
LI->getName() + ".sroa.speculated");
- DEBUG(dbgs() << " speculated to: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
LI->replaceAllUsesWith(V);
LI->eraseFromParent();
}
SI.eraseFromParent();
}
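For reference, the speculation above rewrites a load of a select over two safe
pointers into two loads feeding a select of the loaded values. A hedged sketch
of the core rewrite, omitting the alignment and metadata propagation the real
code performs:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    //   %p = select i1 %c, i32* %a, i32* %b ;  %v = load i32, i32* %p
    // becomes
    //   %va = load i32, i32* %a
    //   %vb = load i32, i32* %b
    //   %v  = select i1 %c, i32 %va, i32 %vb
    static Value *speculateOneLoad(SelectInst &Sel, LoadInst &LI) {
      IRBuilder<> IRB(&LI);
      Value *TL = IRB.CreateLoad(LI.getType(), Sel.getTrueValue(),
                                 LI.getName() + ".sroa.speculate.load.true");
      Value *FL = IRB.CreateLoad(LI.getType(), Sel.getFalseValue(),
                                 LI.getName() + ".sroa.speculate.load.false");
      return IRB.CreateSelect(Sel.getCondition(), TL, FL,
                              LI.getName() + ".sroa.speculated");
    }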
-/// \brief Build a GEP out of a base pointer and indices.
+/// Build a GEP out of a base pointer and indices.
///
/// This will return the BasePtr if that is valid, or build a new GEP
/// instruction using the IRBuilder if GEP-ing is needed.
@@ -1374,7 +1384,7 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
NamePrefix + "sroa_idx");
}
-/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
+/// Get a natural GEP off of the BasePtr walking through Ty toward
/// TargetTy without changing the offset of the pointer.
///
/// This routine assumes we've already established a properly offset GEP with
@@ -1423,7 +1433,7 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
return buildGEP(IRB, BasePtr, Indices, NamePrefix);
}
-/// \brief Recursively compute indices for a natural GEP.
+/// Recursively compute indices for a natural GEP.
///
/// This is the recursive step for getNaturalGEPWithOffset that walks down the
/// element types adding appropriate indices for the GEP.
@@ -1491,7 +1501,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
Indices, NamePrefix);
}
-/// \brief Get a natural GEP from a base pointer to a particular offset and
+/// Get a natural GEP from a base pointer to a particular offset and
/// resulting in a particular type.
///
/// The goal is to produce a "natural" looking GEP that works with the existing
@@ -1526,7 +1536,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
Indices, NamePrefix);
}
-/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
+/// Compute an adjusted pointer from Ptr by Offset bytes where the
/// resulting pointer has PointerTy.
///
/// This tries very hard to compute a "natural" GEP which arrives at the offset
@@ -1635,7 +1645,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
return Ptr;
}
-/// \brief Compute the adjusted alignment for a load or store from an offset.
+/// Compute the adjusted alignment for a load or store from an offset.
static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
const DataLayout &DL) {
unsigned Alignment;
@@ -1656,7 +1666,7 @@ static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
return MinAlign(Alignment, Offset);
}
-/// \brief Test whether we can convert a value from the old to the new type.
+/// Test whether we can convert a value from the old to the new type.
///
/// This predicate should be used to guard calls to convertValue in order to
/// ensure that we only try to convert viable values. The strategy is that we
@@ -1707,7 +1717,7 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
return true;
}
-/// \brief Generic routine to convert an SSA value to a value of a different
+/// Generic routine to convert an SSA value to a value of a different
/// type.
///
/// This will try various different casting techniques, such as bitcasts,
@@ -1759,7 +1769,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
return IRB.CreateBitCast(V, NewTy);
}
-/// \brief Test whether the given slice use can be promoted to a vector.
+/// Test whether the given slice use can be promoted to a vector.
///
/// This function is called to test each entry in a partition which is slated
/// for a single slice.
@@ -1830,7 +1840,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
return true;
}
-/// \brief Test whether the given alloca partitioning and range of slices can be
+/// Test whether the given alloca partitioning and range of slices can be
/// promoted to a vector.
///
/// This is a quick test to check whether we can rewrite a particular alloca
@@ -1896,7 +1906,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
"All non-integer types eliminated!");
return RHSTy->getNumElements() < LHSTy->getNumElements();
};
- std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
+ llvm::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
CandidateTys.erase(
std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
CandidateTys.end());
@@ -1943,7 +1953,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
return nullptr;
}
-/// \brief Test whether a slice of an alloca is valid for integer widening.
+/// Test whether a slice of an alloca is valid for integer widening.
///
/// This implements the necessary checking for the \c isIntegerWideningViable
/// test below on a single slice of the alloca.
@@ -1970,6 +1980,10 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
// We can't handle loads that extend past the allocated memory.
if (DL.getTypeStoreSize(LI->getType()) > Size)
return false;
+ // So far, AllocaSliceRewriter does not support widening split slice tails
+ // in rewriteIntegerLoad.
+ if (S.beginOffset() < AllocBeginOffset)
+ return false;
// Note that we don't count vector loads or stores as whole-alloca
// operations which enable integer widening because we would prefer to use
// vector widening instead.
@@ -1991,6 +2005,10 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
// We can't handle stores that extend past the allocated memory.
if (DL.getTypeStoreSize(ValueTy) > Size)
return false;
+ // So far, AllocaSliceRewriter does not support widening split slice tails
+ // in rewriteIntegerStore.
+ if (S.beginOffset() < AllocBeginOffset)
+ return false;
// Note that we don't count vector loads or stores as whole-alloca
// operations which enable integer widening because we would prefer to use
// vector widening instead.
@@ -2021,7 +2039,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
return true;
}
-/// \brief Test whether the given alloca partition's integer operations can be
+/// Test whether the given alloca partition's integer operations can be
/// widened to promotable ones.
///
/// This is a quick test to check whether we can rewrite the integer loads and
@@ -2072,7 +2090,7 @@ static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
IntegerType *Ty, uint64_t Offset,
const Twine &Name) {
- DEBUG(dbgs() << " start: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
IntegerType *IntTy = cast<IntegerType>(V->getType());
assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
"Element extends past full value");
@@ -2081,13 +2099,13 @@ static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
if (ShAmt) {
V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
- DEBUG(dbgs() << " shifted: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
}
assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
"Cannot extract to a larger integer!");
if (Ty != IntTy) {
V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
- DEBUG(dbgs() << " trunced: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
}
return V;
}
@@ -2098,10 +2116,10 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
IntegerType *Ty = cast<IntegerType>(V->getType());
assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
"Cannot insert a larger integer!");
- DEBUG(dbgs() << " start: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
if (Ty != IntTy) {
V = IRB.CreateZExt(V, IntTy, Name + ".ext");
- DEBUG(dbgs() << " extended: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
}
assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
"Element store outside of alloca store");
@@ -2110,15 +2128,15 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
if (ShAmt) {
V = IRB.CreateShl(V, ShAmt, Name + ".shift");
- DEBUG(dbgs() << " shifted: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
}
if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
- DEBUG(dbgs() << " masked: " << *Old << "\n");
+ LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
V = IRB.CreateOr(Old, V, Name + ".insert");
- DEBUG(dbgs() << " inserted: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
}
return V;
}
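The ShAmt computation in extractInteger/insertInteger is endianness-aware:
Offset is a byte offset within the in-memory value, so little-endian targets
shift by 8 * Offset while big-endian targets address the field from the
opposite end of the wide integer. The same arithmetic on plain integers, as a
standalone sketch:

    #include <cassert>
    #include <cstdint>

    // Extract a FieldBytes-wide field at byte Offset of a WideBytes value,
    // mirroring extractInteger's ".shift" and ".trunc" steps.
    static uint64_t extractField(uint64_t Wide, unsigned WideBytes,
                                 unsigned FieldBytes, unsigned Offset,
                                 bool LittleEndian) {
      assert(Offset + FieldBytes <= WideBytes && "field extends past value");
      unsigned ShAmt = LittleEndian ? 8 * Offset
                                    : 8 * (WideBytes - FieldBytes - Offset);
      uint64_t V = Wide >> ShAmt;
      if (FieldBytes < 8)
        V &= (UINT64_C(1) << (8 * FieldBytes)) - 1;
      return V;
    }

    // E.g. the i16 at byte offset 2 of little-endian i64 0x1122334455667788
    // is (Wide >> 16) & 0xFFFF == 0x5566.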
@@ -2135,7 +2153,7 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
if (NumElements == 1) {
V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
Name + ".extract");
- DEBUG(dbgs() << " extract: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
return V;
}
@@ -2145,7 +2163,7 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
Mask.push_back(IRB.getInt32(i));
V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
ConstantVector::get(Mask), Name + ".extract");
- DEBUG(dbgs() << " shuffle: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
return V;
}
@@ -2159,7 +2177,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
// Single element to insert.
V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
Name + ".insert");
- DEBUG(dbgs() << " insert: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
return V;
}
@@ -2184,7 +2202,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
ConstantVector::get(Mask), Name + ".expand");
- DEBUG(dbgs() << " shuffle: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
Mask.clear();
for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
@@ -2192,11 +2210,11 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend");
- DEBUG(dbgs() << " blend: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
return V;
}
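In insertVector, the ".expand" shuffle above widens the incoming subvector to
the alloca's full vector width, and the ".blend" select then takes lanes
[BeginIndex, EndIndex) from it and every other lane from the old value. A
sketch of the blend-mask construction elided from this hunk (assumed shape,
not quoted from the source):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Build the i1 mask for the ".blend" select: true exactly for the
    // lanes being replaced by the new subvector.
    static Constant *buildBlendMask(IRBuilder<> &IRB, unsigned NumElements,
                                    unsigned BeginIndex, unsigned EndIndex) {
      SmallVector<Constant *, 8> Mask;
      for (unsigned i = 0; i != NumElements; ++i)
        Mask.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
      return ConstantVector::get(Mask);
    }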
-/// \brief Visitor to rewrite instructions using p particular slice of an alloca
+/// Visitor to rewrite instructions using a particular slice of an alloca
/// to use a new alloca.
///
/// Also implements the rewriting to vector-based accesses when the partition
@@ -2295,9 +2313,9 @@ public:
IsSplittable = I->isSplittable();
IsSplit =
BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
- DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
- DEBUG(AS.printSlice(dbgs(), I, ""));
- DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
+ LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
+ LLVM_DEBUG(dbgs() << "\n");
// Compute the intersecting offset range.
assert(BeginOffset < NewAllocaEndOffset);
@@ -2327,7 +2345,7 @@ private:
// Every instruction which can end up as a user must have a rewrite rule.
bool visitInstruction(Instruction &I) {
- DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
+ LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
llvm_unreachable("No rewrite rule for this instruction!");
}
@@ -2369,7 +2387,7 @@ private:
);
}
- /// \brief Compute suitable alignment to access this slice of the *new*
+ /// Compute suitable alignment to access this slice of the *new*
/// alloca.
///
/// You can optionally pass a type to this routine and if that type's ABI
@@ -2431,10 +2449,13 @@ private:
}
bool visitLoadInst(LoadInst &LI) {
- DEBUG(dbgs() << " original: " << LI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
Value *OldOp = LI.getOperand(0);
assert(OldOp == OldPtr);
+ AAMDNodes AATags;
+ LI.getAAMetadata(AATags);
+
unsigned AS = LI.getPointerAddressSpace();
Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
@@ -2453,6 +2474,8 @@ private:
TargetTy->isIntegerTy()))) {
LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
LI.isVolatile(), LI.getName());
+ if (AATags)
+ NewLI->setAAMetadata(AATags);
if (LI.isVolatile())
NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
@@ -2488,6 +2511,8 @@ private:
LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
getSliceAlign(TargetTy),
LI.isVolatile(), LI.getName());
+ if (AATags)
+ NewLI->setAAMetadata(AATags);
if (LI.isVolatile())
NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
@@ -2524,11 +2549,12 @@ private:
Pass.DeadInsts.insert(&LI);
deleteIfTriviallyDead(OldOp);
- DEBUG(dbgs() << " to: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
return !LI.isVolatile() && !IsPtrAdjusted;
}
- bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) {
+ bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
+ AAMDNodes AATags) {
if (V->getType() != VecTy) {
unsigned BeginIndex = getIndex(NewBeginOffset);
unsigned EndIndex = getIndex(NewEndOffset);
@@ -2546,14 +2572,15 @@ private:
V = insertVector(IRB, Old, V, BeginIndex, "vec");
}
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ if (AATags)
+ Store->setAAMetadata(AATags);
Pass.DeadInsts.insert(&SI);
- (void)Store;
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return true;
}
- bool rewriteIntegerStore(Value *V, StoreInst &SI) {
+ bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
assert(IntTy && "We cannot extract an integer from the alloca");
assert(!SI.isVolatile());
if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
@@ -2567,16 +2594,21 @@ private:
V = convertValue(DL, IRB, V, NewAllocaTy);
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
Store->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
+ if (AATags)
+ Store->setAAMetadata(AATags);
Pass.DeadInsts.insert(&SI);
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return true;
}
bool visitStoreInst(StoreInst &SI) {
- DEBUG(dbgs() << " original: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
Value *OldOp = SI.getOperand(1);
assert(OldOp == OldPtr);
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+
Value *V = SI.getValueOperand();
// Strip all inbounds GEPs and pointer casts to try to dig out any root
@@ -2598,9 +2630,9 @@ private:
}
if (VecTy)
- return rewriteVectorizedStoreInst(V, SI, OldOp);
+ return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
if (IntTy && V->getType()->isIntegerTy())
- return rewriteIntegerStore(V, SI);
+ return rewriteIntegerStore(V, SI, AATags);
const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize;
StoreInst *NewSI;
@@ -2631,16 +2663,18 @@ private:
SI.isVolatile());
}
NewSI->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
+ if (AATags)
+ NewSI->setAAMetadata(AATags);
if (SI.isVolatile())
NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
Pass.DeadInsts.insert(&SI);
deleteIfTriviallyDead(OldOp);
- DEBUG(dbgs() << " to: " << *NewSI << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
}
- /// \brief Compute an integer value from splatting an i8 across the given
+ /// Compute an integer value from splatting an i8 across the given
/// number of bytes.
///
/// Note that this routine assumes an i8 is a byte. If that isn't true, don't
@@ -2667,25 +2701,27 @@ private:
return V;
}
- /// \brief Compute a vector splat for a given element value.
+ /// Compute a vector splat for a given element value.
Value *getVectorSplat(Value *V, unsigned NumElements) {
V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
- DEBUG(dbgs() << " splat: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
return V;
}
bool visitMemSetInst(MemSetInst &II) {
- DEBUG(dbgs() << " original: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
assert(II.getRawDest() == OldPtr);
+ AAMDNodes AATags;
+ II.getAAMetadata(AATags);
+
// If the memset has a variable size, it cannot be split, just adjust the
// pointer to the new alloca.
if (!isa<Constant>(II.getLength())) {
assert(!IsSplit);
assert(NewBeginOffset == BeginOffset);
II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
- Type *CstTy = II.getAlignmentCst()->getType();
- II.setAlignment(ConstantInt::get(CstTy, getSliceAlign()));
+ II.setDestAlignment(getSliceAlign());
deleteIfTriviallyDead(OldPtr);
return false;
@@ -2710,8 +2746,9 @@ private:
CallInst *New = IRB.CreateMemSet(
getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
getSliceAlign(), II.isVolatile());
- (void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ if (AATags)
+ New->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return false;
}
@@ -2773,10 +2810,11 @@ private:
V = convertValue(DL, IRB, V, AllocaTy);
}
- Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
- II.isVolatile());
- (void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ StoreInst *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+ II.isVolatile());
+ if (AATags)
+ New->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return !II.isVolatile();
}
@@ -2784,7 +2822,10 @@ private:
// Rewriting of memory transfer instructions can be a bit tricky. We break
// them into two categories: split intrinsics and unsplit intrinsics.
- DEBUG(dbgs() << " original: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
+
+ AAMDNodes AATags;
+ II.getAAMetadata(AATags);
bool IsDest = &II.getRawDestUse() == OldUse;
assert((IsDest && II.getRawDest() == OldPtr) ||
@@ -2801,18 +2842,16 @@ private:
// update both source and dest of a single call.
if (!IsSplittable) {
Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
- if (IsDest)
+ if (IsDest) {
II.setDest(AdjustedPtr);
- else
+ II.setDestAlignment(SliceAlign);
+ } else {
II.setSource(AdjustedPtr);
-
- if (II.getAlignment() > SliceAlign) {
- Type *CstTy = II.getAlignmentCst()->getType();
- II.setAlignment(
- ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign)));
+ II.setSourceAlignment(SliceAlign);
}
- DEBUG(dbgs() << " to: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << II << "\n");
deleteIfTriviallyDead(OldPtr);
return false;
}
@@ -2862,8 +2901,10 @@ private:
// Compute the relative offset for the other pointer within the transfer.
unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS);
APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
- unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1,
- OtherOffset.zextOrTrunc(64).getZExtValue());
+ unsigned OtherAlign =
+ IsDest ? II.getSourceAlignment() : II.getDestAlignment();
+ OtherAlign = MinAlign(OtherAlign ? OtherAlign : 1,
+ OtherOffset.zextOrTrunc(64).getZExtValue());
if (EmitMemCpy) {
// Compute the other pointer, folding as much as possible to produce
@@ -2875,11 +2916,25 @@ private:
Type *SizeTy = II.getLength()->getType();
Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
- CallInst *New = IRB.CreateMemCpy(
- IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size,
- MinAlign(SliceAlign, OtherAlign), II.isVolatile());
- (void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ Value *DestPtr, *SrcPtr;
+ unsigned DestAlign, SrcAlign;
+ // Note: IsDest is true iff we're copying into the new alloca slice
+ if (IsDest) {
+ DestPtr = OurPtr;
+ DestAlign = SliceAlign;
+ SrcPtr = OtherPtr;
+ SrcAlign = OtherAlign;
+ } else {
+ DestPtr = OtherPtr;
+ DestAlign = OtherAlign;
+ SrcPtr = OurPtr;
+ SrcAlign = SliceAlign;
+ }
+ CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
+ Size, II.isVolatile());
+ if (AATags)
+ New->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return false;
}
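These memset/memcpy hunks track the LLVM 7 API change that split the single
alignment argument of the memory intrinsics into independent destination and
source alignments: getAlignmentCst()/setAlignment() gave way to the
per-operand accessors used above, and CreateMemCpy grew separate alignment
parameters. A sketch of the new call shape (values illustrative):

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Old (pre-LLVM 7): one shared alignment for both operands, e.g.
    //   IRB.CreateMemCpy(Dst, Src, Size, MinAlign(DstAlign, SrcAlign));
    // New: each pointer operand carries its own alignment attribute.
    static CallInst *emitCopy(IRBuilder<> &IRB, Value *Dst, unsigned DstAlign,
                              Value *Src, unsigned SrcAlign, Value *Size) {
      return IRB.CreateMemCpy(Dst, DstAlign, Src, SrcAlign, Size,
                              /*isVolatile=*/false);
    }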
@@ -2927,8 +2982,11 @@ private:
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
} else {
- Src =
- IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");
+ LoadInst *Load = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(),
+ "copyload");
+ if (AATags)
+ Load->setAAMetadata(AATags);
+ Src = Load;
}
if (VecTy && !IsWholeAlloca && IsDest) {
@@ -2946,15 +3004,16 @@ private:
StoreInst *Store = cast<StoreInst>(
IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
- (void)Store;
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ if (AATags)
+ Store->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
return !II.isVolatile();
}
bool visitIntrinsicInst(IntrinsicInst &II) {
assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
II.getIntrinsicID() == Intrinsic::lifetime_end);
- DEBUG(dbgs() << " original: " << II << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
assert(II.getArgOperand(1) == OldPtr);
// Record this instruction for deletion.
@@ -2982,13 +3041,13 @@ private:
New = IRB.CreateLifetimeEnd(Ptr, Size);
(void)New;
- DEBUG(dbgs() << " to: " << *New << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
return true;
}
bool visitPHINode(PHINode &PN) {
- DEBUG(dbgs() << " original: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
@@ -3007,7 +3066,7 @@ private:
// Replace the operands which were using the old pointer.
std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
- DEBUG(dbgs() << " to: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
deleteIfTriviallyDead(OldPtr);
// PHIs can't be promoted on their own, but often can be speculated. We
@@ -3018,7 +3077,7 @@ private:
}
bool visitSelectInst(SelectInst &SI) {
- DEBUG(dbgs() << " original: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
"Pointer isn't an operand!");
assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
@@ -3031,7 +3090,7 @@ private:
if (SI.getOperand(2) == OldPtr)
SI.setOperand(2, NewPtr);
- DEBUG(dbgs() << " to: " << SI << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
deleteIfTriviallyDead(OldPtr);
// Selects can't be promoted on their own, but often can be speculated. We
@@ -3044,7 +3103,7 @@ private:
namespace {
-/// \brief Visitor to rewrite aggregate loads and stores as scalar.
+/// Visitor to rewrite aggregate loads and stores as scalar.
///
/// This pass aggressively rewrites all aggregate loads and stores on
/// a particular pointer (or any pointer derived from it which we can identify)
@@ -3067,7 +3126,7 @@ public:
/// Rewrite loads and stores through a pointer and all pointers derived from
/// it.
bool rewrite(Instruction &I) {
- DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
+ LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
enqueueUsers(I);
bool Changed = false;
while (!Queue.empty()) {
@@ -3089,7 +3148,7 @@ private:
// Conservative default is to not rewrite anything.
bool visitInstruction(Instruction &I) { return false; }
- /// \brief Generic recursive split emission class.
+ /// Generic recursive split emission class.
template <typename Derived> class OpSplitter {
protected:
/// The builder used to form new instructions.
@@ -3113,7 +3172,7 @@ private:
: IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
public:
- /// \brief Generic recursive split emission routine.
+ /// Generic recursive split emission routine.
///
/// This method recursively splits an aggregate op (load or store) into
/// scalar or vector ops. It splits recursively until it hits a single value
@@ -3165,8 +3224,10 @@ private:
};
struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
- LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
- : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
+ AAMDNodes AATags;
+
+ LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, AAMDNodes AATags)
+ : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr), AATags(AATags) {}
/// Emit a leaf load of a single value. This is called at the leaves of the
/// recursive emission to actually load values.
@@ -3175,9 +3236,11 @@ private:
// Load the single value and insert it using the indices.
Value *GEP =
IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
- Value *Load = IRB.CreateLoad(GEP, Name + ".load");
+ LoadInst *Load = IRB.CreateLoad(GEP, Name + ".load");
+ if (AATags)
+ Load->setAAMetadata(AATags);
Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
- DEBUG(dbgs() << " to: " << *Load << "\n");
+ LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
}
};
@@ -3187,8 +3250,10 @@ private:
return false;
// We have an aggregate being loaded, split it apart.
- DEBUG(dbgs() << " original: " << LI << "\n");
- LoadOpSplitter Splitter(&LI, *U);
+ LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
+ AAMDNodes AATags;
+ LI.getAAMetadata(AATags);
+ LoadOpSplitter Splitter(&LI, *U, AATags);
Value *V = UndefValue::get(LI.getType());
Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
LI.replaceAllUsesWith(V);
@@ -3197,8 +3262,9 @@ private:
}
struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
- StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
- : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
+ StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, AAMDNodes AATags)
+ : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr), AATags(AATags) {}
+ AAMDNodes AATags;
/// Emit a leaf store of a single value. This is called at the leaves of the
/// recursive emission to actually produce stores.
@@ -3212,9 +3278,10 @@ private:
IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
Value *InBoundsGEP =
IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
- Value *Store = IRB.CreateStore(ExtractValue, InBoundsGEP);
- (void)Store;
- DEBUG(dbgs() << " to: " << *Store << "\n");
+ StoreInst *Store = IRB.CreateStore(ExtractValue, InBoundsGEP);
+ if (AATags)
+ Store->setAAMetadata(AATags);
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
}
};
@@ -3226,8 +3293,10 @@ private:
return false;
// We have an aggregate being stored, split it apart.
- DEBUG(dbgs() << " original: " << SI << "\n");
- StoreOpSplitter Splitter(&SI, *U);
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+ StoreOpSplitter Splitter(&SI, *U, AATags);
Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
SI.eraseFromParent();
return true;
@@ -3256,7 +3325,7 @@ private:
} // end anonymous namespace
-/// \brief Strip aggregate type wrapping.
+/// Strip aggregate type wrapping.
///
/// This removes no-op aggregate types wrapping an underlying type. It will
/// strip as many layers of types as it can without changing either the type
@@ -3286,7 +3355,7 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
return stripAggregateTypeWrapping(DL, InnerTy);
}
-/// \brief Try to find a partition of the aggregate type passed in for a given
+/// Try to find a partition of the aggregate type passed in for a given
/// offset and size.
///
/// This recurses through the aggregate type and tries to compute a subtype
@@ -3392,7 +3461,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
return SubTy;
}
-/// \brief Pre-split loads and stores to simplify rewriting.
+/// Pre-split loads and stores to simplify rewriting.
///
/// We want to break up the splittable load+store pairs as much as
/// possible. This is important to do as a preprocessing step, as once we
@@ -3423,7 +3492,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
///
/// \returns true if any changes are made.
bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
- DEBUG(dbgs() << "Pre-splitting loads and stores\n");
+ LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
// Track the loads and stores which are candidates for pre-splitting here, in
// the order they first appear during the partition scan. These give stable
@@ -3455,7 +3524,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
// maybe it would make it more principled?
SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
- DEBUG(dbgs() << " Searching for candidate loads and stores\n");
+ LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
for (auto &P : AS.partitions()) {
for (Slice &S : P) {
Instruction *I = cast<Instruction>(S.getUse()->getUser());
@@ -3510,7 +3579,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
}
// Record the initial split.
- DEBUG(dbgs() << " Candidate: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
auto &Offsets = SplitOffsetsMap[I];
assert(Offsets.Splits.empty() &&
"Should not have splits the first time we see an instruction!");
@@ -3570,10 +3639,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
if (LoadOffsets.Splits == StoreOffsets.Splits)
return false;
- DEBUG(dbgs()
- << " Mismatched splits for load and store:\n"
- << " " << *LI << "\n"
- << " " << *SI << "\n");
+ LLVM_DEBUG(
+ dbgs()
+ << " Mismatched splits for load and store:\n"
+ << " " << *LI << "\n"
+ << " " << *SI << "\n");
// We've found a store and load that we need to split
// with mismatched relative splits. Just give up on them
@@ -3646,7 +3716,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
IRB.SetInsertPoint(LI);
- DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
+ LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
int Idx = 0, Size = Offsets.Splits.size();
@@ -3656,7 +3726,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
auto *PartPtrTy = PartTy->getPointerTo(AS);
LoadInst *PLoad = IRB.CreateAlignedLoad(
getAdjustedPtr(IRB, DL, BasePtr,
- APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
PartPtrTy, BasePtr->getName() + "."),
getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
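The getPointerSizeInBits to getIndexSizeInBits substitutions in this function
(and in SeparateConstOffsetFromGEP later in this diff) follow the LLVM 7
DataLayout split between a pointer's storage size and the width of its index
type used for address arithmetic; the two agree on common targets, but offset
APInts fed to GEPs must use the index width. A sketch:

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/DataLayout.h"
    #include <cstdint>

    // Build a byte-offset APInt at the width DataLayout declares for GEP
    // indices in this address space (may be narrower than the pointer).
    llvm::APInt makeGEPOffset(const llvm::DataLayout &DL, unsigned AS,
                              uint64_t Bytes) {
      return llvm::APInt(DL.getIndexSizeInBits(AS), Bytes);
    }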
@@ -3671,9 +3741,9 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
&PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
/*IsSplittable*/ false));
- DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
- << ", " << NewSlices.back().endOffset() << "): " << *PLoad
- << "\n");
+ LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset()
+ << "): " << *PLoad << "\n");
// See if we've handled all the splits.
if (Idx >= Size)
@@ -3693,14 +3763,15 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
StoreInst *SI = cast<StoreInst>(LU);
if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
DeferredStores = true;
- DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
+ << "\n");
continue;
}
Value *StoreBasePtr = SI->getPointerOperand();
IRB.SetInsertPoint(SI);
- DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
LoadInst *PLoad = SplitLoads[Idx];
@@ -3712,11 +3783,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
StoreInst *PStore = IRB.CreateAlignedStore(
PLoad,
getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
PartPtrTy, StoreBasePtr->getName() + "."),
getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
PStore->copyMetadata(*LI, LLVMContext::MD_mem_parallel_loop_access);
- DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
+ LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
}
// We want to immediately iterate on any allocas impacted by splitting
@@ -3765,7 +3836,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
Value *LoadBasePtr = LI->getPointerOperand();
Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
- DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
+ LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
// Check whether we have an already split load.
auto SplitLoadsMapI = SplitLoadsMap.find(LI);
@@ -3775,7 +3846,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
"Too few split loads for the number of splits in the store!");
} else {
- DEBUG(dbgs() << " of load: " << *LI << "\n");
+ LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
}
uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
@@ -3794,7 +3865,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
auto AS = LI->getPointerAddressSpace();
PLoad = IRB.CreateAlignedLoad(
getAdjustedPtr(IRB, DL, LoadBasePtr,
- APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
LoadPartPtrTy, LoadBasePtr->getName() + "."),
getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
@@ -3806,7 +3877,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
StoreInst *PStore = IRB.CreateAlignedStore(
PLoad,
getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
StorePartPtrTy, StoreBasePtr->getName() + "."),
getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
@@ -3815,11 +3886,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
&PStore->getOperandUse(PStore->getPointerOperandIndex()),
/*IsSplittable*/ false));
- DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
- << ", " << NewSlices.back().endOffset() << "): " << *PStore
- << "\n");
+ LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset()
+ << "): " << *PStore << "\n");
if (!SplitLoads) {
- DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
+ LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
}
// See if we've finished all the splits.
@@ -3874,10 +3945,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
// sequence.
AS.insert(NewSlices);
- DEBUG(dbgs() << " Pre-split slices:\n");
+ LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
#ifndef NDEBUG
for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
- DEBUG(AS.print(dbgs(), I, " "));
+ LLVM_DEBUG(AS.print(dbgs(), I, " "));
#endif
// Finally, don't try to promote any allocas that now require re-splitting.
@@ -3891,7 +3962,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
return true;
}
-/// \brief Rewrite an alloca partition's users.
+/// Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
/// to rewrite uses of an alloca partition to be conducive for SSA value
@@ -3934,10 +4005,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// exact same type as the original, and with the same access offsets. In that
// case, re-use the existing alloca, but still run through the rewriter to
// perform phi and select speculation.
+ // P.beginOffset() can be non-zero even with the same type in a case with
+ // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
AllocaInst *NewAI;
- if (SliceTy == AI.getAllocatedType()) {
- assert(P.beginOffset() == 0 &&
- "Non-zero begin offset but same alloca type");
+ if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
NewAI = &AI;
// FIXME: We should be able to bail at this point with "nothing changed".
// FIXME: We might want to defer PHI speculation until after here.
@@ -3958,12 +4029,14 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
NewAI = new AllocaInst(
SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment,
AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
+ // Copy the old AI debug location over to the new one.
+ NewAI->setDebugLoc(AI.getDebugLoc());
++NumNewAllocas;
}
- DEBUG(dbgs() << "Rewriting alloca partition "
- << "[" << P.beginOffset() << "," << P.endOffset()
- << ") to: " << *NewAI << "\n");
+ LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
+ << "[" << P.beginOffset() << "," << P.endOffset()
+ << ") to: " << *NewAI << "\n");
// Track the high watermark on the worklist as it is only relevant for
// promoted allocas. We will reset it to this point if the alloca is not in
@@ -4040,7 +4113,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
return NewAI;
}
-/// \brief Walks the slices of an alloca and form partitions based on them,
+/// Walks the slices of an alloca and form partitions based on them,
/// rewriting each of their uses.
bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
if (AS.begin() == AS.end())
@@ -4063,7 +4136,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
uint64_t AllocaSize = DL.getTypeAllocSize(AI.getAllocatedType());
const uint64_t MaxBitVectorSize = 1024;
- if (SROASplitNonWholeAllocaSlices && AllocaSize <= MaxBitVectorSize) {
+ if (AllocaSize <= MaxBitVectorSize) {
// If a byte boundary is included in any load or store, a slice starting or
// ending at the boundary is not splittable.
SmallBitVector SplittableOffset(AllocaSize + 1, true);
@@ -4106,7 +4179,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
}
if (!IsSorted)
- std::sort(AS.begin(), AS.end());
+ llvm::sort(AS.begin(), AS.end());
/// Describes the allocas introduced by rewritePartition in order to migrate
/// the debug info.
@@ -4201,7 +4274,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
return Changed;
}
-/// \brief Clobber a use with undef, deleting the used value if it becomes dead.
+/// Clobber a use with undef, deleting the used value if it becomes dead.
void SROA::clobberUse(Use &U) {
Value *OldV = U;
// Replace the use with an undef value.
@@ -4216,13 +4289,13 @@ void SROA::clobberUse(Use &U) {
}
}
-/// \brief Analyze an alloca for SROA.
+/// Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// the slices of the alloca, and then hands it off to be split and
/// rewritten as needed.
bool SROA::runOnAlloca(AllocaInst &AI) {
- DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
+ LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
++NumAllocasAnalyzed;
// Special case dead allocas, as they're trivial.
@@ -4246,7 +4319,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
// Build the slices using a recursive instruction-visiting builder.
AllocaSlices AS(DL, AI);
- DEBUG(AS.print(dbgs()));
+ LLVM_DEBUG(AS.print(dbgs()));
if (AS.isEscaped())
return Changed;
@@ -4274,18 +4347,18 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
Changed |= splitAlloca(AI, AS);
- DEBUG(dbgs() << " Speculating PHIs\n");
+ LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
while (!SpeculatablePHIs.empty())
speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
- DEBUG(dbgs() << " Speculating Selects\n");
+ LLVM_DEBUG(dbgs() << " Speculating Selects\n");
while (!SpeculatableSelects.empty())
speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
return Changed;
}
-/// \brief Delete the dead instructions accumulated in this run.
+/// Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
@@ -4299,7 +4372,7 @@ bool SROA::deleteDeadInstructions(
bool Changed = false;
while (!DeadInsts.empty()) {
Instruction *I = DeadInsts.pop_back_val();
- DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
+ LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
// If the instruction is an alloca, find the possible dbg.declare connected
// to it, and remove it too. We must do this before calling RAUW or we will
@@ -4327,7 +4400,7 @@ bool SROA::deleteDeadInstructions(
return Changed;
}
-/// \brief Promote the allocas, using the best available technique.
+/// Promote the allocas, using the best available technique.
///
/// This attempts to promote whatever allocas have been identified as viable in
/// the PromotableAllocas list. If that list is empty, there is nothing to do.
@@ -4338,7 +4411,7 @@ bool SROA::promoteAllocas(Function &F) {
NumPromoted += PromotableAllocas.size();
- DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
+ LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
PromoteMemToReg(PromotableAllocas, *DT, AC);
PromotableAllocas.clear();
return true;
@@ -4346,7 +4419,7 @@ bool SROA::promoteAllocas(Function &F) {
PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
AssumptionCache &RunAC) {
- DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
C = &F.getContext();
DT = &RunDT;
AC = &RunAC;
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
index 3b99ddff2e06..526487d3477e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -45,6 +45,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeScalarizerPass(Registry);
initializeDSELegacyPassPass(Registry);
initializeGuardWideningLegacyPassPass(Registry);
+ initializeLoopGuardWideningLegacyPassPass(Registry);
initializeGVNLegacyPassPass(Registry);
initializeNewGVNLegacyPassPass(Registry);
initializeEarlyCSELegacyPassPass(Registry);
@@ -52,9 +53,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeGVNHoistLegacyPassPass(Registry);
initializeGVNSinkLegacyPassPass(Registry);
initializeFlattenCFGPassPass(Registry);
- initializeInductiveRangeCheckEliminationPass(Registry);
+ initializeIRCELegacyPassPass(Registry);
initializeIndVarSimplifyLegacyPassPass(Registry);
initializeInferAddressSpacesPass(Registry);
+ initializeInstSimplifyLegacyPassPass(Registry);
initializeJumpThreadingPass(Registry);
initializeLegacyLICMPassPass(Registry);
initializeLegacyLoopSinkPassPass(Registry);
@@ -68,6 +70,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopStrengthReducePass(Registry);
initializeLoopRerollPass(Registry);
initializeLoopUnrollPass(Registry);
+ initializeLoopUnrollAndJamPass(Registry);
initializeLoopUnswitchPass(Registry);
initializeLoopVersioningLICMPass(Registry);
initializeLoopIdiomRecognizeLegacyPassPass(Registry);
@@ -83,7 +86,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeRegToMemPass(Registry);
initializeRewriteStatepointsForGCLegacyPassPass(Registry);
initializeSCCPLegacyPassPass(Registry);
- initializeIPSCCPLegacyPassPass(Registry);
initializeSROALegacyPassPass(Registry);
initializeCFGSimplifyPassPass(Registry);
initializeStructurizeCFGPass(Registry);
@@ -104,6 +106,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializePostInlineEntryExitInstrumenterPass(Registry);
}
+void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopSimplifyCFGPass());
+}
+
void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
initializeScalarOpts(*unwrap(R));
}
@@ -148,10 +154,6 @@ void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createIndVarSimplifyPass());
}
-void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createInstructionCombiningPass());
-}
-
void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createJumpThreadingPass());
}
@@ -180,14 +182,14 @@ void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopRerollPass());
}
-void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopSimplifyCFGPass());
-}
-
void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopUnrollPass());
}
+void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopUnrollAndJamPass());
+}
+
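Alongside the pass registrations, this file's C API bindings change:
LLVMAddLoopUnrollAndJamPass is new and LLVMAddLoopSimplifyCFGPass moves
earlier in the file. A hedged sketch of driving these bindings through the
legacy pass manager (M is assumed to be a valid LLVMModuleRef built
elsewhere):

    #include "llvm-c/Core.h"
    #include "llvm-c/Transforms/Scalar.h"

    void runLoopPasses(LLVMModuleRef M) {
      LLVMPassManagerRef PM = LLVMCreatePassManager();
      LLVMAddLoopSimplifyCFGPass(PM);  // binding relocated by this diff
      LLVMAddLoopUnrollAndJamPass(PM); // binding added by this diff
      LLVMRunPassManager(PM, M);
      LLVMDisposePassManager(PM);
    }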
void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopUnswitchPass());
}
@@ -200,14 +202,6 @@ void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createPartiallyInlineLibCallsPass());
}
-void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLowerSwitchPass());
-}
-
-void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createPromoteMemoryToRegisterPass());
-}
-
void LLVMAddReassociatePass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createReassociatePass());
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 4a96e0ddca16..967f4a42a8fb 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -165,8 +165,8 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -190,7 +190,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <string>
@@ -213,7 +212,7 @@ static cl::opt<bool>
namespace {
-/// \brief A helper class for separating a constant offset from a GEP index.
+/// A helper class for separating a constant offset from a GEP index.
///
/// In real programs, a GEP index may be more complicated than a simple addition
/// of something and a constant integer which can be trivially split. For
@@ -340,16 +339,15 @@ private:
const DominatorTree *DT;
};
-/// \brief A pass that tries to split every GEP in the function into a variadic
+/// A pass that tries to split every GEP in the function into a variadic
/// base and a constant offset. It is a FunctionPass because searching for the
/// constant offset may inspect other basic blocks.
class SeparateConstOffsetFromGEP : public FunctionPass {
public:
static char ID;
- SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr,
- bool LowerGEP = false)
- : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) {
+ SeparateConstOffsetFromGEP(bool LowerGEP = false)
+ : FunctionPass(ID), LowerGEP(LowerGEP) {
initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
}
@@ -450,7 +448,6 @@ private:
const DataLayout *DL = nullptr;
DominatorTree *DT = nullptr;
ScalarEvolution *SE;
- const TargetMachine *TM;
LoopInfo *LI;
TargetLibraryInfo *TLI;
@@ -480,10 +477,8 @@ INITIALIZE_PASS_END(
"Split GEPs to a variadic base and a constant offset for better CSE", false,
false)
-FunctionPass *
-llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM,
- bool LowerGEP) {
- return new SeparateConstOffsetFromGEP(TM, LowerGEP);
+FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
+ return new SeparateConstOffsetFromGEP(LowerGEP);
}
bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
@@ -502,6 +497,8 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
// Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
// don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
+ // FIXME: this does not appear to be covered by any tests
+ // (with x86/aarch64 backends at least)
if (BO->getOpcode() == Instruction::Or &&
!haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
return false;
@@ -590,6 +587,10 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
// Trace into subexpressions for more hoisting opportunities.
if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
+ } else if (isa<TruncInst>(V)) {
+ ConstantOffset =
+ find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
+ .trunc(BitWidth);
} else if (isa<SExtInst>(V)) {
ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
ZeroExtended, NonNegative).sext(BitWidth);
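The new TruncInst case lets the extractor trace a constant offset through a
truncation the same way the existing sext/zext cases trace through extensions:
the offset found in the wider operand is narrowed to the index's bit width, so
for "trunc i64 (%x + 288) to i32" it reports a 32-bit offset of 288. The
narrowing step, sketched:

    #include "llvm/ADT/APInt.h"

    // Mirror of the .trunc(BitWidth) call above: narrow an offset found in
    // a wider operand to the truncated index's width.
    llvm::APInt narrowFoundOffset(const llvm::APInt &WideOffset,
                                  unsigned IndexBitWidth) {
      return WideOffset.trunc(IndexBitWidth);
    }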
@@ -654,8 +655,9 @@ ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
}
if (CastInst *Cast = dyn_cast<CastInst>(U)) {
- assert((isa<SExtInst>(Cast) || isa<ZExtInst>(Cast)) &&
- "We only traced into two types of CastInst: sext and zext");
+ assert(
+ (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
+ "Only following instructions can be traced: sext, zext & trunc");
ExtInsts.push_back(Cast);
UserChain[ChainIndex] = nullptr;
return distributeExtsAndCloneChain(ChainIndex - 1);
@@ -706,7 +708,7 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
BinaryOperator::BinaryOps NewOp = BO->getOpcode();
if (BO->getOpcode() == Instruction::Or) {
// Rebuild "or" as "add", because "or" may be invalid for the new
- // epxression.
+ // expression.
//
// For instance, given
// a | (b + 5) where a and b + 5 have no common bits,
@@ -943,6 +945,10 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
if (!NeedsExtraction)
return Changed;
+
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*GEP->getFunction());
+
// If LowerGEP is disabled, before really splitting the GEP, check whether the
// backend supports the addressing mode we are about to produce. If no, this
// splitting probably won't be beneficial.
@@ -951,9 +957,6 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
// of variable indices. Therefore, we don't check for addressing modes in that
// case.
if (!LowerGEP) {
- TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *GEP->getParent()->getParent());
unsigned AddrSpace = GEP->getPointerAddressSpace();
if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
/*BaseGV=*/nullptr, AccumulativeByteOffset,
@@ -1016,7 +1019,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
if (LowerGEP) {
// As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
// arithmetic operations if the target uses alias analysis in codegen.
- if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA())
+ if (TTI.useAA())
lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
else
lowerToArithmetics(GEP, AccumulativeByteOffset);
@@ -1065,7 +1068,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
DL->getTypeAllocSize(GEP->getResultElementType()));
Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
- // Very likely. As long as %gep is natually aligned, the byte offset we
+ // Very likely. As long as %gep is naturally aligned, the byte offset we
// extracted should be a multiple of sizeof(*%gep).
int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
@@ -1295,7 +1298,7 @@ void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
// We changed p+o+c to p+c+o, p+c may not be inbound anymore.
const DataLayout &DAL = First->getModule()->getDataLayout();
- APInt Offset(DAL.getPointerSizeInBits(
+ APInt Offset(DAL.getIndexSizeInBits(
cast<PointerType>(First->getType())->getAddressSpace()),
0);
Value *NewBase =
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index aba732bc413f..34510cb40732 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -17,10 +17,14 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -66,180 +70,65 @@ static cl::opt<int>
UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
cl::desc("The cost threshold for unswitching a loop."));
-static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,
- Constant &Replacement) {
- assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
-
- // Replace uses of LIC in the loop with the given constant.
- for (auto UI = LIC.use_begin(), UE = LIC.use_end(); UI != UE;) {
- // Grab the use and walk past it so we can clobber it in the use list.
- Use *U = &*UI++;
- Instruction *UserI = dyn_cast<Instruction>(U->getUser());
- if (!UserI || !L.contains(UserI))
- continue;
-
- // Replace this use within the loop body.
- *U = &Replacement;
- }
-}
-
-/// Update the IDom for a basic block whose predecessor set has changed.
-///
-/// This routine is designed to work when the domtree update is relatively
-/// localized by leveraging a known common dominator, often a loop header.
+/// Collect all of the loop invariant input values transitively used by the
+/// homogeneous instruction graph from a given root.
///
-/// FIXME: Should consider hand-rolling a slightly more efficient non-DFS
-/// approach here as we can do that easily by persisting the candidate IDom's
-/// dominating set between each predecessor.
-///
-/// FIXME: Longer term, many uses of this can be replaced by an incremental
-/// domtree update strategy that starts from a known dominating block and
-/// rebuilds that subtree.
-static bool updateIDomWithKnownCommonDominator(BasicBlock *BB,
- BasicBlock *KnownDominatingBB,
- DominatorTree &DT) {
- assert(pred_begin(BB) != pred_end(BB) &&
- "This routine does not handle unreachable blocks!");
-
- BasicBlock *OrigIDom = DT[BB]->getIDom()->getBlock();
-
- BasicBlock *IDom = *pred_begin(BB);
- assert(DT.dominates(KnownDominatingBB, IDom) &&
- "Bad known dominating block!");
-
- // Walk all of the other predecessors finding the nearest common dominator
- // until all predecessors are covered or we reach the loop header. The loop
- // header necessarily dominates all loop exit blocks in loop simplified form
- // so we can early-exit the moment we hit that block.
- for (auto PI = std::next(pred_begin(BB)), PE = pred_end(BB);
- PI != PE && IDom != KnownDominatingBB; ++PI) {
- assert(DT.dominates(KnownDominatingBB, *PI) &&
- "Bad known dominating block!");
- IDom = DT.findNearestCommonDominator(IDom, *PI);
- }
+/// This essentially walks from a root recursively through loop variant operands
+/// which have the exact same opcode and finds all inputs which are loop
+/// invariant. For some operations these can be re-associated and unswitched out
+/// of the loop entirely.
+static TinyPtrVector<Value *>
+collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root,
+ LoopInfo &LI) {
+ assert(!L.isLoopInvariant(&Root) &&
+ "Only need to walk the graph if root itself is not invariant.");
+ TinyPtrVector<Value *> Invariants;
+
+ // Build a worklist and recurse through operators collecting invariants.
+ SmallVector<Instruction *, 4> Worklist;
+ SmallPtrSet<Instruction *, 8> Visited;
+ Worklist.push_back(&Root);
+ Visited.insert(&Root);
+ do {
+ Instruction &I = *Worklist.pop_back_val();
+ for (Value *OpV : I.operand_values()) {
+ // Skip constants as unswitching isn't interesting for them.
+ if (isa<Constant>(OpV))
+ continue;
- if (IDom == OrigIDom)
- return false;
+ // Add it to our result if loop invariant.
+ if (L.isLoopInvariant(OpV)) {
+ Invariants.push_back(OpV);
+ continue;
+ }
- DT.changeImmediateDominator(BB, IDom);
- return true;
-}
+ // If not an instruction with the same opcode, nothing we can do.
+ Instruction *OpI = dyn_cast<Instruction>(OpV);
+ if (!OpI || OpI->getOpcode() != Root.getOpcode())
+ continue;
-// Note that we don't currently use the IDFCalculator here for two reasons:
-// 1) It computes dominator tree levels for the entire function on each run
-// of 'compute'. While this isn't terrible, given that we expect to update
-// relatively small subtrees of the domtree, it isn't necessarily the right
-// tradeoff.
-// 2) The interface doesn't fit this usage well. It doesn't operate in
-// append-only, and builds several sets that we don't need.
-//
-// FIXME: Neither of these issues are a big deal and could be addressed with
-// some amount of refactoring of IDFCalculator. That would allow us to share
-// the core logic here (which is solving the same core problem).
-static void appendDomFrontier(DomTreeNode *Node,
- SmallSetVector<BasicBlock *, 4> &Worklist,
- SmallVectorImpl<DomTreeNode *> &DomNodes,
- SmallPtrSetImpl<BasicBlock *> &DomSet) {
- assert(DomNodes.empty() && "Must start with no dominator nodes.");
- assert(DomSet.empty() && "Must start with an empty dominator set.");
-
- // First flatten this subtree into sequence of nodes by doing a pre-order
- // walk.
- DomNodes.push_back(Node);
- // We intentionally re-evaluate the size as each node can add new children.
- // Because this is a tree walk, this cannot add any duplicates.
- for (int i = 0; i < (int)DomNodes.size(); ++i)
- DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end());
-
- // Now create a set of the basic blocks so we can quickly test for
- // dominated successors. We could in theory use the DFS numbers of the
- // dominator tree for this, but we want this to remain predictably fast
- // even while we mutate the dominator tree in ways that would invalidate
- // the DFS numbering.
- for (DomTreeNode *InnerN : DomNodes)
- DomSet.insert(InnerN->getBlock());
-
- // Now re-walk the nodes, appending every successor of every node that isn't
- // in the set. Note that we don't append the node itself, even though if it
- // is a successor it does not strictly dominate itself and thus it would be
- // part of the dominance frontier. The reason we don't append it is that
- // the node passed in came *from* the worklist and so it has already been
- // processed.
- for (DomTreeNode *InnerN : DomNodes)
- for (BasicBlock *SuccBB : successors(InnerN->getBlock()))
- if (!DomSet.count(SuccBB))
- Worklist.insert(SuccBB);
-
- DomNodes.clear();
- DomSet.clear();
-}
+ // Visit this operand.
+ if (Visited.insert(OpI).second)
+ Worklist.push_back(OpI);
+ }
+ } while (!Worklist.empty());
-/// Update the dominator tree after unswitching a particular former exit block.
-///
-/// This handles the full update of the dominator tree after hoisting a block
-/// that previously was an exit block (or split off of an exit block) up to be
-/// reached from the new immediate dominator of the preheader.
-///
-/// The common case is simple -- we just move the unswitched block to have an
-/// immediate dominator of the old preheader. But in complex cases, there may
-/// be other blocks reachable from the unswitched block that are immediately
-/// dominated by some node between the unswitched one and the old preheader.
-/// All of these also need to be hoisted in the dominator tree. We also want to
-/// minimize queries to the dominator tree because each step of this
-/// invalidates any DFS numbers that would make queries fast.
-static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH,
- DominatorTree &DT) {
- DomTreeNode *OldPHNode = DT[OldPH];
- DomTreeNode *UnswitchedNode = DT[UnswitchedBB];
- // If the dominator tree has already been updated for this unswitched node,
- // we're done. This makes it easier to use this routine if there are multiple
- // paths to the same unswitched destination.
- if (UnswitchedNode->getIDom() == OldPHNode)
- return;
+ return Invariants;
+}
- // First collect the domtree nodes that we are hoisting over. These are the
- // set of nodes which may have children that need to be hoisted as well.
- SmallPtrSet<DomTreeNode *, 4> DomChain;
- for (auto *IDom = UnswitchedNode->getIDom(); IDom != OldPHNode;
- IDom = IDom->getIDom())
- DomChain.insert(IDom);
-
- // The unswitched block ends up immediately dominated by the old preheader --
- // regardless of whether it is the loop exit block or split off of the loop
- // exit block.
- DT.changeImmediateDominator(UnswitchedNode, OldPHNode);
-
- // For everything that moves up the dominator tree, we need to examine the
- // dominator frontier to see if it additionally should move up the dominator
- // tree. This lambda appends the dominator frontier for a node on the
- // worklist.
- SmallSetVector<BasicBlock *, 4> Worklist;
-
- // Scratch data structures reused by domfrontier finding.
- SmallVector<DomTreeNode *, 4> DomNodes;
- SmallPtrSet<BasicBlock *, 4> DomSet;
-
- // Append the initial dom frontier nodes.
- appendDomFrontier(UnswitchedNode, Worklist, DomNodes, DomSet);
-
- // Walk the worklist. We grow the list in the loop and so must recompute size.
- for (int i = 0; i < (int)Worklist.size(); ++i) {
- auto *BB = Worklist[i];
-
- DomTreeNode *Node = DT[BB];
- assert(!DomChain.count(Node) &&
- "Cannot be dominated by a block you can reach!");
-
- // If this block had an immediate dominator somewhere in the chain
- // we hoisted over, then its position in the domtree needs to move as it is
- // reachable from a node hoisted over this chain.
- if (!DomChain.count(Node->getIDom()))
- continue;
+static void replaceLoopInvariantUses(Loop &L, Value *Invariant,
+ Constant &Replacement) {
+ assert(!isa<Constant>(Invariant) && "Why are we unswitching on a constant?");
- DT.changeImmediateDominator(Node, OldPHNode);
+ // Replace uses of the invariant in the loop with the given constant.
+ for (auto UI = Invariant->use_begin(), UE = Invariant->use_end(); UI != UE;) {
+ // Grab the use and walk past it so we can clobber it in the use list.
+ Use *U = &*UI++;
+ Instruction *UserI = dyn_cast<Instruction>(U->getUser());
- // Now add this node's dominator frontier to the worklist as well.
- appendDomFrontier(Node, Worklist, DomNodes, DomSet);
+ // Replace this use within the loop body.
+ if (UserI && L.contains(UserI))
+ U->set(&Replacement);
}
}
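The new collectHomogenousInstGraphLoopInvariants above walks a graph of same-opcode instructions from a root condition and gathers the loop-invariant leaves. A standalone model of that worklist walk, with a hypothetical `Node` type standing in for the LLVM instruction classes and invariance reduced to a per-node flag:

```cpp
#include <string>
#include <unordered_set>
#include <vector>

// Hypothetical stand-in for an instruction graph node.
struct Node {
  std::string Opcode;          // e.g. "or"
  bool IsLoopInvariant = false;
  bool IsConstant = false;
  std::vector<Node *> Operands;
};

// Mirrors the routine above: recurse only through loop-variant operands with
// the *same* opcode as the root, collecting loop-invariant inputs on the way.
std::vector<Node *> collectInvariantInputs(Node &Root) {
  std::vector<Node *> Invariants;
  std::vector<Node *> Worklist{&Root};
  std::unordered_set<Node *> Visited{&Root};
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    for (Node *Op : N->Operands) {
      if (Op->IsConstant)
        continue; // constants are not interesting to unswitch
      if (Op->IsLoopInvariant) {
        Invariants.push_back(Op);
        continue;
      }
      if (Op->Opcode == Root.Opcode && Visited.insert(Op).second)
        Worklist.push_back(Op); // same opcode: keep walking the graph
    }
  }
  return Invariants;
}

int main() {
  Node A{"x", true}, B{"y", true}, V{"load"};   // A, B invariant; V variant
  Node Inner{"or", false, false, {&A, &V}};
  Node Root{"or", false, false, {&Inner, &B}};  // (A | V) | B
  return collectInvariantInputs(Root).size() == 2 ? 0 : 1; // finds A and B
}
```

Restricting the walk to the root's opcode is what makes the later re-association sound: a homogeneous `or` (or `and`) graph can be split into an invariant part and a variant part.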
@@ -261,6 +150,26 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
llvm_unreachable("Basic blocks should never be empty!");
}
+/// Insert code to test a set of loop invariant values, and conditionally branch
+/// on them.
+static void buildPartialUnswitchConditionalBranch(BasicBlock &BB,
+ ArrayRef<Value *> Invariants,
+ bool Direction,
+ BasicBlock &UnswitchedSucc,
+ BasicBlock &NormalSucc) {
+ IRBuilder<> IRB(&BB);
+ Value *Cond = Invariants.front();
+ for (Value *Invariant :
+ make_range(std::next(Invariants.begin()), Invariants.end()))
+ if (Direction)
+ Cond = IRB.CreateOr(Cond, Invariant);
+ else
+ Cond = IRB.CreateAnd(Cond, Invariant);
+
+ IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
+ Direction ? &NormalSucc : &UnswitchedSucc);
+}
+
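buildPartialUnswitchConditionalBranch folds the collected invariants with `or` or `and` depending on which edge exits. A standalone sketch of the same fold over plain booleans (illustrative only, not the IRBuilder calls):

```cpp
#include <cassert>
#include <vector>

// Combine invariant i1 values the way the helper above does: `or` them
// together when the exit is on the true edge (Direction == true), `and` them
// together when it is on the false edge.
bool foldInvariants(const std::vector<bool> &Invariants, bool Direction) {
  bool Cond = Invariants.front();
  for (size_t i = 1; i < Invariants.size(); ++i)
    Cond = Direction ? (Cond || Invariants[i]) : (Cond && Invariants[i]);
  return Cond;
}

int main() {
  // Exit on the true edge: any true invariant sends control to the exit.
  assert(foldInvariants({false, true, false}, /*Direction=*/true) == true);
  // Exit on the false edge: any false invariant sends control to the exit.
  assert(foldInvariants({true, false, true}, /*Direction=*/false) == false);
  return 0;
}
```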
/// Rewrite the PHI nodes in an unswitched loop exit basic block.
///
/// Requires that the loop exit and unswitched basic block are the same, and
@@ -293,7 +202,8 @@ static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
BasicBlock &UnswitchedBB,
BasicBlock &OldExitingBB,
- BasicBlock &OldPH) {
+ BasicBlock &OldPH,
+ bool FullUnswitch) {
assert(&ExitBB != &UnswitchedBB &&
"Must have different loop exit and unswitched blocks!");
Instruction *InsertPt = &*UnswitchedBB.begin();
@@ -314,7 +224,11 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
if (PN.getIncomingBlock(i) != &OldExitingBB)
continue;
- Value *Incoming = PN.removeIncomingValue(i);
+ Value *Incoming = PN.getIncomingValue(i);
+ if (FullUnswitch)
+ // No more edge from the old exiting block to the exit block.
+ PN.removeIncomingValue(i);
+
NewPN->addIncoming(Incoming, &OldPH);
}
@@ -325,6 +239,76 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
}
}
+/// Hoist the current loop up to the innermost loop containing a remaining exit.
+///
+/// Because we've removed an exit from the loop, we may have changed the set of
+/// loops reachable and need to move the current loop up the loop nest or even
+/// to an entirely separate nest.
+static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader,
+ DominatorTree &DT, LoopInfo &LI) {
+ // If the loop is already at the top level, we can't hoist it anywhere.
+ Loop *OldParentL = L.getParentLoop();
+ if (!OldParentL)
+ return;
+
+ SmallVector<BasicBlock *, 4> Exits;
+ L.getExitBlocks(Exits);
+ Loop *NewParentL = nullptr;
+ for (auto *ExitBB : Exits)
+ if (Loop *ExitL = LI.getLoopFor(ExitBB))
+ if (!NewParentL || NewParentL->contains(ExitL))
+ NewParentL = ExitL;
+
+ if (NewParentL == OldParentL)
+ return;
+
+ // The new parent loop (if different) should always contain the old one.
+ if (NewParentL)
+ assert(NewParentL->contains(OldParentL) &&
+ "Can only hoist this loop up the nest!");
+
+ // The preheader will need to move with the body of this loop. However,
+ // because it isn't in this loop we also need to update the primary loop map.
+ assert(OldParentL == LI.getLoopFor(&Preheader) &&
+ "Parent loop of this loop should contain this loop's preheader!");
+ LI.changeLoopFor(&Preheader, NewParentL);
+
+ // Remove this loop from its old parent.
+ OldParentL->removeChildLoop(&L);
+
+ // Add the loop either to the new parent or as a top-level loop.
+ if (NewParentL)
+ NewParentL->addChildLoop(&L);
+ else
+ LI.addTopLevelLoop(&L);
+
+ // Remove this loop's blocks from the old parent and every other loop up the
+ // nest until reaching the new parent. Also update all of these
+ // no-longer-containing loops to reflect the nesting change.
+ for (Loop *OldContainingL = OldParentL; OldContainingL != NewParentL;
+ OldContainingL = OldContainingL->getParentLoop()) {
+ llvm::erase_if(OldContainingL->getBlocksVector(),
+ [&](const BasicBlock *BB) {
+ return BB == &Preheader || L.contains(BB);
+ });
+
+ OldContainingL->getBlocksSet().erase(&Preheader);
+ for (BasicBlock *BB : L.blocks())
+ OldContainingL->getBlocksSet().erase(BB);
+
+ // Because we just hoisted a loop out of this one, we have essentially
+ // created new exit paths from it. That means we need to form LCSSA PHI
+ // nodes for values used in the no-longer-nested loop.
+ formLCSSA(*OldContainingL, DT, &LI, nullptr);
+
+ // We shouldn't need to form dedicated exits because the exit introduced
+ // here is the (just split by unswitching) preheader. As such, it is
+ // necessarily dedicated.
+ assert(OldContainingL->hasDedicatedExits() &&
+ "Unexpected predecessor of hoisted loop preheader!");
+ }
+}
+
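hoistLoopToNewParent picks, among the loops containing the remaining exits, the innermost one as the new parent. A standalone model using loop depth as a proxy for containment (hypothetical descriptor type; the real code asks LoopInfo's `contains`):

```cpp
#include <vector>

// Hypothetical loop descriptor: deeper depth == more deeply nested.
struct LoopDesc { int Depth; };

// Pick the innermost loop among those containing an exit block, mirroring the
// NewParentL computation above. On a proper loop nest, "deepest" and
// "contained by all previous candidates" coincide; returning nullptr means
// the hoisted loop becomes a top-level loop.
const LoopDesc *pickNewParent(const std::vector<const LoopDesc *> &ExitLoops) {
  const LoopDesc *NewParent = nullptr;
  for (const LoopDesc *L : ExitLoops) {
    if (!L)
      continue; // this exit leaves every loop; it imposes no constraint
    if (!NewParent || L->Depth > NewParent->Depth)
      NewParent = L;
  }
  return NewParent;
}

int main() {
  LoopDesc Outer{1}, Mid{2};
  const LoopDesc *P = pickNewParent({&Outer, &Mid, nullptr});
  return (P == &Mid) ? 0 : 1; // innermost candidate wins
}
```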
/// Unswitch a trivial branch if the condition is loop invariant.
///
/// This routine should only be called when loop code leading to the branch has
@@ -339,48 +323,83 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
/// (splitting the exit block as necessary). It simplifies the branch within
/// the loop to an unconditional branch but doesn't remove it entirely. Further
/// cleanup can be done with some simplify-cfg like pass.
+///
+/// If `SE` is not null, it will be updated based on the potential loop SCEVs
+/// invalidated by this.
static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
- LoopInfo &LI) {
+ LoopInfo &LI, ScalarEvolution *SE) {
assert(BI.isConditional() && "Can only unswitch a conditional branch!");
- DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
+ LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
- Value *LoopCond = BI.getCondition();
+ // The loop invariant values that we want to unswitch.
+ TinyPtrVector<Value *> Invariants;
- // Need a trivial loop condition to unswitch.
- if (!L.isLoopInvariant(LoopCond))
- return false;
+ // When true, we're fully unswitching the branch rather than just unswitching
+ // some input conditions to the branch.
+ bool FullUnswitch = false;
+
+ if (L.isLoopInvariant(BI.getCondition())) {
+ Invariants.push_back(BI.getCondition());
+ FullUnswitch = true;
+ } else {
+ if (auto *CondInst = dyn_cast<Instruction>(BI.getCondition()))
+ Invariants = collectHomogenousInstGraphLoopInvariants(L, *CondInst, LI);
+ if (Invariants.empty())
+ // Couldn't find invariant inputs!
+ return false;
+ }
- // FIXME: We should compute this once at the start and update it!
- SmallVector<BasicBlock *, 16> ExitBlocks;
- L.getExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
-
- // Check to see if a successor of the branch is guaranteed to
- // exit through a unique exit block without having any
- // side-effects. If so, determine the value of Cond that causes
- // it to do this.
- ConstantInt *CondVal = ConstantInt::getTrue(BI.getContext());
- ConstantInt *Replacement = ConstantInt::getFalse(BI.getContext());
+ // Check that one of the branch's successors exits, and which one.
+ bool ExitDirection = true;
int LoopExitSuccIdx = 0;
auto *LoopExitBB = BI.getSuccessor(0);
- if (!ExitBlockSet.count(LoopExitBB)) {
- std::swap(CondVal, Replacement);
+ if (L.contains(LoopExitBB)) {
+ ExitDirection = false;
LoopExitSuccIdx = 1;
LoopExitBB = BI.getSuccessor(1);
- if (!ExitBlockSet.count(LoopExitBB))
+ if (L.contains(LoopExitBB))
return false;
}
auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx);
- assert(L.contains(ContinueBB) &&
- "Cannot have both successors exit and still be in the loop!");
-
auto *ParentBB = BI.getParent();
if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
return false;
- DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal
- << " == " << LoopCond << "\n");
+ // When unswitching only part of the branch's condition, we need the exit
+ // block to be reached directly from the partially unswitched input. This can
+ // be done when the exit block is along the true edge and the branch condition
+ // is a graph of `or` operations, or the exit block is along the false edge
+ // and the condition is a graph of `and` operations.
+ if (!FullUnswitch) {
+ if (ExitDirection) {
+ if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::Or)
+ return false;
+ } else {
+ if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::And)
+ return false;
+ }
+ }
+
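The check just above encodes when partial unswitching is sound: with the exit on the true edge the condition must be an `or` graph, because a true invariant then forces the exit regardless of the loop-variant inputs; dually, the false edge requires an `and` graph. A small truth-style demonstration of both facts:

```cpp
#include <cassert>

int main() {
  // cond = invariant | variant: once the invariant is true, the branch takes
  // the true edge no matter what the variant bit is -- so it is safe to test
  // the invariant alone outside the loop when the exit is on the true edge.
  for (int variant = 0; variant <= 1; ++variant)
    assert((1 | variant) == 1);

  // cond = invariant & variant: once the invariant is false, the branch takes
  // the false edge no matter what -- the exit-on-false-edge case.
  for (int variant = 0; variant <= 1; ++variant)
    assert((0 & variant) == 0);
  return 0;
}
```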
+ LLVM_DEBUG({
+ dbgs() << " unswitching trivial invariant conditions for: " << BI
+ << "\n";
+ for (Value *Invariant : Invariants) {
+ dbgs() << " " << *Invariant << " == true";
+ if (Invariant != Invariants.back())
+ dbgs() << " ||";
+ dbgs() << "\n";
+ }
+ });
+
+ // If we have scalar evolutions, we need to invalidate them including this
+ // loop and the loop containing the exit block.
+ if (SE) {
+ if (Loop *ExitL = LI.getLoopFor(LoopExitBB))
+ SE->forgetLoop(ExitL);
+ else
+ // Forget the entire nest as this exits the entire nest.
+ SE->forgetTopmostLoop(&L);
+ }
// Split the preheader, so that we know that there is a safe place to insert
// the conditional branch. We will change the preheader to have a conditional
@@ -393,45 +412,73 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
// unswitching. We need to split this if there are other loop predecessors.
// Because the loop is in simplified form, *any* other predecessor is enough.
BasicBlock *UnswitchedBB;
- if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) {
- (void)PredBB;
- assert(PredBB == BI.getParent() &&
+ if (FullUnswitch && LoopExitBB->getUniquePredecessor()) {
+ assert(LoopExitBB->getUniquePredecessor() == BI.getParent() &&
"A branch's parent isn't a predecessor!");
UnswitchedBB = LoopExitBB;
} else {
UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI);
}
- // Now splice the branch to gate reaching the new preheader and re-point its
- // successors.
- OldPH->getInstList().splice(std::prev(OldPH->end()),
- BI.getParent()->getInstList(), BI);
+ // Actually move the invariant uses into the unswitched position. If possible,
+ // we do this by moving the instructions, but when doing partial unswitching
+ // we do it by building a new merge of the values in the unswitched position.
OldPH->getTerminator()->eraseFromParent();
- BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
- BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
-
- // Create a new unconditional branch that will continue the loop as a new
- // terminator.
- BranchInst::Create(ContinueBB, ParentBB);
+ if (FullUnswitch) {
+ // If fully unswitching, we can use the existing branch instruction.
+ // Splice it into the old PH to gate reaching the new preheader and re-point
+ // its successors.
+ OldPH->getInstList().splice(OldPH->end(), BI.getParent()->getInstList(),
+ BI);
+ BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
+ BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
+
+ // Create a new unconditional branch that will continue the loop as a new
+ // terminator.
+ BranchInst::Create(ContinueBB, ParentBB);
+ } else {
+ // Only unswitching a subset of inputs to the condition, so we will need to
+ // build a new branch that merges the invariant inputs.
+ if (ExitDirection)
+ assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::Or &&
+ "Must have an `or` of `i1`s for the condition!");
+ else
+ assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::And &&
+ "Must have an `and` of `i1`s for the condition!");
+ buildPartialUnswitchConditionalBranch(*OldPH, Invariants, ExitDirection,
+ *UnswitchedBB, *NewPH);
+ }
// Rewrite the relevant PHI nodes.
if (UnswitchedBB == LoopExitBB)
rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
else
rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
- *ParentBB, *OldPH);
+ *ParentBB, *OldPH, FullUnswitch);
// Now we need to update the dominator tree.
- updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
- // But if we split something off of the loop exit block then we also removed
- // one of the predecessors for the loop exit block and may need to update its
- // idom.
- if (UnswitchedBB != LoopExitBB)
- updateIDomWithKnownCommonDominator(LoopExitBB, L.getHeader(), DT);
+ DT.insertEdge(OldPH, UnswitchedBB);
+ if (FullUnswitch)
+ DT.deleteEdge(ParentBB, UnswitchedBB);
+
+ // The constant we can replace all of our invariants with inside the loop
+ // body. If any of the invariants have a value other than this the loop won't
+ // be entered.
+ ConstantInt *Replacement = ExitDirection
+ ? ConstantInt::getFalse(BI.getContext())
+ : ConstantInt::getTrue(BI.getContext());
// Since this is an i1 condition we can also trivially replace uses of it
// within the loop with a constant.
- replaceLoopUsesWithConstant(L, *LoopCond, *Replacement);
+ for (Value *Invariant : Invariants)
+ replaceLoopInvariantUses(L, Invariant, *Replacement);
+
+ // If this was full unswitching, we may have changed the nesting relationship
+ // for this loop so hoist it to its correct parent if needed.
+ if (FullUnswitch)
+ hoistLoopToNewParent(L, *NewPH, DT, LI);
++NumTrivial;
++NumBranches;
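Once the branch is hoisted, every in-loop use of an unswitched invariant can be replaced by the constant that keeps control inside the loop: `false` when the exit sits on the true edge of an `or` chain, `true` for the `and` case. A standalone sketch of that replacement (illustrative names, with uses modeled as stored flags):

```cpp
#include <vector>

// The value each invariant must have had for the loop body to be reached,
// mirroring the Replacement computation above: if the or-chain were true the
// exit would have been taken, so inside the loop every unswitched invariant
// is known false (and dually for the and-chain).
bool replacementFor(bool ExitDirection) {
  return ExitDirection ? false : true;
}

void replaceLoopUses(std::vector<bool> &InLoopUses, bool ExitDirection) {
  bool R = replacementFor(ExitDirection);
  for (size_t i = 0; i < InLoopUses.size(); ++i)
    InLoopUses[i] = R;
}

int main() {
  std::vector<bool> Uses{true, true};
  replaceLoopUses(Uses, /*ExitDirection=*/true);
  return (!Uses[0] && !Uses[1]) ? 0 : 1; // all known false inside the loop
}
```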
@@ -461,9 +508,12 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
/// switch will not be revisited. If after unswitching there is only a single
/// in-loop successor, the switch is further simplified to an unconditional
/// branch. Still more cleanup can be done with some simplify-cfg like pass.
+///
+/// If `SE` is not null, it will be updated based on the potential loop SCEVs
+/// invalidated by this.
static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
- LoopInfo &LI) {
- DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n");
+ LoopInfo &LI, ScalarEvolution *SE) {
+ LLVM_DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n");
Value *LoopCond = SI.getCondition();
// If this isn't switching on an invariant condition, we can't unswitch it.
@@ -472,41 +522,62 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
auto *ParentBB = SI.getParent();
- // FIXME: We should compute this once at the start and update it!
- SmallVector<BasicBlock *, 16> ExitBlocks;
- L.getExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
-
SmallVector<int, 4> ExitCaseIndices;
for (auto Case : SI.cases()) {
auto *SuccBB = Case.getCaseSuccessor();
- if (ExitBlockSet.count(SuccBB) &&
+ if (!L.contains(SuccBB) &&
areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB))
ExitCaseIndices.push_back(Case.getCaseIndex());
}
BasicBlock *DefaultExitBB = nullptr;
- if (ExitBlockSet.count(SI.getDefaultDest()) &&
+ if (!L.contains(SI.getDefaultDest()) &&
areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) &&
!isa<UnreachableInst>(SI.getDefaultDest()->getTerminator()))
DefaultExitBB = SI.getDefaultDest();
else if (ExitCaseIndices.empty())
return false;
- DEBUG(dbgs() << " unswitching trivial cases...\n");
+ LLVM_DEBUG(dbgs() << " unswitching trivial cases...\n");
+
+ // We may need to invalidate SCEVs for the outermost loop reached by any of
+ // the exits.
+ Loop *OuterL = &L;
+ if (DefaultExitBB) {
+ // Clear out the default destination temporarily to allow accurate
+ // predecessor lists to be examined below.
+ SI.setDefaultDest(nullptr);
+ // Check the loop containing this exit.
+ Loop *ExitL = LI.getLoopFor(DefaultExitBB);
+ if (!ExitL || ExitL->contains(OuterL))
+ OuterL = ExitL;
+ }
+
+ // Store the exit cases into a separate data structure and remove them from
+ // the switch.
SmallVector<std::pair<ConstantInt *, BasicBlock *>, 4> ExitCases;
ExitCases.reserve(ExitCaseIndices.size());
// We walk the case indices backwards so that we remove the last case first
// and don't disrupt the earlier indices.
for (unsigned Index : reverse(ExitCaseIndices)) {
auto CaseI = SI.case_begin() + Index;
+ // Compute the outer loop from this exit.
+ Loop *ExitL = LI.getLoopFor(CaseI->getCaseSuccessor());
+ if (!ExitL || ExitL->contains(OuterL))
+ OuterL = ExitL;
// Save the value of this case.
ExitCases.push_back({CaseI->getCaseValue(), CaseI->getCaseSuccessor()});
// Delete the unswitched cases.
SI.removeCase(CaseI);
}
+ if (SE) {
+ if (OuterL)
+ SE->forgetLoop(OuterL);
+ else
+ SE->forgetTopmostLoop(&L);
+ }
+
// Check if after this all of the remaining cases point at the same
// successor.
BasicBlock *CommonSuccBB = nullptr;
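The removal loop above visits the exit case indices in reverse so that erasing a later case never shifts an earlier, still-pending index. The same idiom on a plain vector:

```cpp
#include <algorithm>
#include <vector>

// Erase the elements at the given positions. Walking the indices from the
// largest down means each erase only shifts elements *after* positions we
// have already handled, so the remaining indices stay valid.
void eraseIndices(std::vector<int> &V, std::vector<size_t> Indices) {
  std::sort(Indices.begin(), Indices.end());
  for (auto It = Indices.rbegin(); It != Indices.rend(); ++It)
    V.erase(V.begin() + *It);
}

int main() {
  std::vector<int> V{10, 20, 30, 40, 50};
  eraseIndices(V, {1, 3}); // remove 20 and 40
  return (V == std::vector<int>{10, 30, 50}) ? 0 : 1;
}
```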
@@ -517,23 +588,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
SI.case_begin()->getCaseSuccessor();
}))
CommonSuccBB = SI.case_begin()->getCaseSuccessor();
-
- if (DefaultExitBB) {
- // We can't remove the default edge so replace it with an edge to either
- // the single common remaining successor (if we have one) or an unreachable
- // block.
- if (CommonSuccBB) {
- SI.setDefaultDest(CommonSuccBB);
- } else {
- BasicBlock *UnreachableBB = BasicBlock::Create(
- ParentBB->getContext(),
- Twine(ParentBB->getName()) + ".unreachable_default",
- ParentBB->getParent());
- new UnreachableInst(ParentBB->getContext(), UnreachableBB);
- SI.setDefaultDest(UnreachableBB);
- DT.addNewBlock(UnreachableBB, ParentBB);
- }
- } else {
+ if (!DefaultExitBB) {
// If we're not unswitching the default, we need it to match any cases to
// have a common successor or if we have no cases it is the common
// successor.
@@ -570,9 +625,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
} else {
auto *SplitBB =
SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
- rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
- *ParentBB, *OldPH);
- updateIDomWithKnownCommonDominator(DefaultExitBB, L.getHeader(), DT);
+ rewritePHINodesForExitAndUnswitchedBlocks(
+ *DefaultExitBB, *SplitBB, *ParentBB, *OldPH, /*FullUnswitch*/ true);
DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
}
}
@@ -597,9 +651,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
if (!SplitExitBB) {
// If this is the first time we see this, do the split and remember it.
SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
- rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
- *ParentBB, *OldPH);
- updateIDomWithKnownCommonDominator(ExitBB, L.getHeader(), DT);
+ rewritePHINodesForExitAndUnswitchedBlocks(
+ *ExitBB, *SplitExitBB, *ParentBB, *OldPH, /*FullUnswitch*/ true);
}
// Update the case pair to point to the split block.
CasePair.second = SplitExitBB;
@@ -612,14 +665,12 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
BasicBlock *UnswitchedBB = CasePair.second;
NewSI->addCase(CaseVal, UnswitchedBB);
- updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
}
// If the default was unswitched, re-point it and add explicit cases for
// entering the loop.
if (DefaultExitBB) {
NewSI->setDefaultDest(DefaultExitBB);
- updateDTAfterUnswitch(DefaultExitBB, OldPH, DT);
// We removed all the exit cases, so we just copy the cases to the
// unswitched switch.
@@ -633,11 +684,57 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
// pointing at unreachable and other complexity.
if (CommonSuccBB) {
BasicBlock *BB = SI.getParent();
+ // We may have had multiple edges to this common successor block, so remove
+ // them as predecessors. We skip the first one, either the default or the
+ // actual first case.
+ bool SkippedFirst = DefaultExitBB == nullptr;
+ for (auto Case : SI.cases()) {
+ assert(Case.getCaseSuccessor() == CommonSuccBB &&
+ "Non-common successor!");
+ (void)Case;
+ if (!SkippedFirst) {
+ SkippedFirst = true;
+ continue;
+ }
+ CommonSuccBB->removePredecessor(BB,
+ /*DontDeleteUselessPHIs*/ true);
+ }
+ // Now nuke the switch and replace it with a direct branch.
SI.eraseFromParent();
BranchInst::Create(CommonSuccBB, BB);
+ } else if (DefaultExitBB) {
+ assert(SI.getNumCases() > 0 &&
+ "If we had no cases we'd have a common successor!");
+ // Move the last case to the default successor. This is valid because, once
+ // the default has been unswitched, it can no longer be reached. It also has
+ // the advantage of being simple, keeping the number of edges from this
+ // switch to its successors the same, and avoiding any PHI update complexity.
+ auto LastCaseI = std::prev(SI.case_end());
+ SI.setDefaultDest(LastCaseI->getCaseSuccessor());
+ SI.removeCase(LastCaseI);
}
- DT.verifyDomTree();
+ // Walk the unswitched exit blocks and the unswitched split blocks and update
+ // the dominator tree based on the CFG edits. While we are walking unordered
+ // containers here, the API for applyUpdates takes an unordered list of
+ // updates and requires them to not contain duplicates.
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+ for (auto *UnswitchedExitBB : UnswitchedExitBBs) {
+ DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedExitBB});
+ DTUpdates.push_back({DT.Insert, OldPH, UnswitchedExitBB});
+ }
+ for (auto SplitUnswitchedPair : SplitExitBBMap) {
+ auto *UnswitchedBB = SplitUnswitchedPair.second;
+ DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedBB});
+ DTUpdates.push_back({DT.Insert, OldPH, UnswitchedBB});
+ }
+ DT.applyUpdates(DTUpdates);
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+
+ // We may have changed the nesting relationship for this loop so hoist it to
+ // its correct parent if needed.
+ hoistLoopToNewParent(L, *NewPH, DT, LI);
+
++NumTrivial;
++NumSwitches;
return true;
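This hunk replaces the eager `DT.verifyDomTree()` with a batch of edge updates handed to applyUpdates once; the batch may be unordered but must not contain duplicates, which matters when several switch cases target the same successor. A standalone sketch of collecting deduplicated edge updates (hypothetical `Update` type, not DominatorTree::UpdateType):

```cpp
#include <set>
#include <string>
#include <tuple>
#include <vector>

enum class Kind { Insert, Delete };

// Hypothetical stand-in for a domtree update record.
struct Update {
  Kind K;
  std::string From, To;
};

// Collect updates while filtering duplicates, e.g. when several switch cases
// point at the same successor but only one CFG/domtree edge exists.
struct UpdateBatch {
  std::vector<Update> Updates;
  std::set<std::tuple<Kind, std::string, std::string>> Seen;

  void add(Kind K, const std::string &From, const std::string &To) {
    if (Seen.insert(std::make_tuple(K, From, To)).second)
      Updates.push_back({K, From, To});
  }
};

int main() {
  UpdateBatch B;
  B.add(Kind::Delete, "parent", "exit"); // two cases, same successor:
  B.add(Kind::Delete, "parent", "exit"); // the second add is dropped
  B.add(Kind::Insert, "oldph", "exit");
  return B.Updates.size() == 2 ? 0 : 1;  // then applied in one batch
}
```

Batching the updates is the design point: it avoids repeatedly invalidating the tree's internal numbering the way the per-edge updates in the deleted code did.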
@@ -652,8 +749,11 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
///
/// The return value indicates whether anything was unswitched (and therefore
/// changed).
+///
+/// If `SE` is not null, it will be updated based on the potential loop SCEVs
+/// invalidated by this.
static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
- LoopInfo &LI) {
+ LoopInfo &LI, ScalarEvolution *SE) {
bool Changed = false;
// If loop header has only one reachable successor we should keep looking for
@@ -687,8 +787,8 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
if (isa<Constant>(SI->getCondition()))
return Changed;
- if (!unswitchTrivialSwitch(L, *SI, DT, LI))
- // Coludn't unswitch this one so we're done.
+ if (!unswitchTrivialSwitch(L, *SI, DT, LI, SE))
+ // Couldn't unswitch this one so we're done.
return Changed;
// Mark that we managed to unswitch something.
@@ -719,17 +819,19 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
// Found a trivial condition candidate: non-foldable conditional branch. If
// we fail to unswitch this, we can't do anything else that is trivial.
- if (!unswitchTrivialBranch(L, *BI, DT, LI))
+ if (!unswitchTrivialBranch(L, *BI, DT, LI, SE))
return Changed;
// Mark that we managed to unswitch something.
Changed = true;
- // We unswitched the branch. This should always leave us with an
- // unconditional branch that we can follow now.
+ // If we only unswitched some of the conditions feeding the branch, we won't
+ // have collapsed it to a single successor.
BI = cast<BranchInst>(CurrentBB->getTerminator());
- assert(!BI->isConditional() &&
- "Cannot form a conditional branch by unswitching1");
+ if (BI->isConditional())
+ return Changed;
+
+ // Follow the newly unconditional branch into its successor.
CurrentBB = BI->getSuccessor(0);
// When continuing, if we exit the loop or reach a previous visited block,
@@ -748,8 +850,12 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
///
/// This routine handles cloning all of the necessary loop blocks and exit
/// blocks including rewriting their instructions and the relevant PHI nodes.
-/// It skips loop and exit blocks that are not necessary based on the provided
-/// set. It also correctly creates the unconditional branch in the cloned
+/// Any loop blocks or exit blocks which are dominated by a different successor
+/// than the one for this clone of the loop blocks can be trivially skipped. We
+/// use the `DominatingSucc` map to determine whether a block satisfies that
+/// property with a simple map lookup.
+///
+/// It also correctly creates the unconditional branch in the cloned
/// unswitched parent block to only point at the unswitched successor.
///
/// This does not handle most of the necessary updates to `LoopInfo`. Only exit
@@ -763,9 +869,10 @@ static BasicBlock *buildClonedLoopBlocks(
Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB,
ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB,
BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB,
- const SmallPtrSetImpl<BasicBlock *> &SkippedLoopAndExitBlocks,
- ValueToValueMapTy &VMap, AssumptionCache &AC, DominatorTree &DT,
- LoopInfo &LI) {
+ const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc,
+ ValueToValueMapTy &VMap,
+ SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC,
+ DominatorTree &DT, LoopInfo &LI) {
SmallVector<BasicBlock *, 4> NewBlocks;
NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size());
@@ -780,26 +887,29 @@ static BasicBlock *buildClonedLoopBlocks(
NewBlocks.push_back(NewBB);
VMap[OldBB] = NewBB;
- // Add the block to the domtree. We'll move it to the correct position
- // below.
- DT.addNewBlock(NewBB, SplitBB);
-
return NewBB;
};
+ // We skip cloning blocks when they have a dominating succ that is not the
+ // succ we are cloning for.
+ auto SkipBlock = [&](BasicBlock *BB) {
+ auto It = DominatingSucc.find(BB);
+ return It != DominatingSucc.end() && It->second != UnswitchedSuccBB;
+ };
+
// First, clone the preheader.
auto *ClonedPH = CloneBlock(LoopPH);
// Then clone all the loop blocks, skipping the ones that aren't necessary.
for (auto *LoopBB : L.blocks())
- if (!SkippedLoopAndExitBlocks.count(LoopBB))
+ if (!SkipBlock(LoopBB))
CloneBlock(LoopBB);
// Split all the loop exit edges so that when we clone the exit blocks, if
// any of the exit blocks are *also* a preheader for some other loop, we
// don't create multiple predecessors entering the loop header.
for (auto *ExitBB : ExitBlocks) {
- if (SkippedLoopAndExitBlocks.count(ExitBB))
+ if (SkipBlock(ExitBB))
continue;
// When we are going to clone an exit, we don't need to clone all the
@@ -822,17 +932,6 @@ static BasicBlock *buildClonedLoopBlocks(
assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB &&
"Cloned exit block has the wrong successor!");
- // Move the merge block's idom to be the split point as one exit is
- // dominated by one header, and the other by another, so we know the split
- // point dominates both. While the dominator tree isn't fully accurate, we
- // want sub-trees within the original loop to be correctly reflect
- // dominance within that original loop (at least) and that requires moving
- // the merge block out of that subtree.
- // FIXME: This is very brittle as we essentially have a partial contract on
- // the dominator tree. We really need to instead update it and keep it
- // valid or stop relying on it.
- DT.changeImmediateDominator(MergeBB, SplitBB);
-
// Remap any cloned instructions and create a merge phi node for them.
for (auto ZippedInsts : llvm::zip_first(
llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())),
@@ -872,28 +971,63 @@ static BasicBlock *buildClonedLoopBlocks(
AC.registerAssumption(II);
}
- // Remove the cloned parent as a predecessor of the cloned continue successor
- // if we did in fact clone it.
- auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
- if (auto *ClonedContinueSuccBB =
- cast_or_null<BasicBlock>(VMap.lookup(ContinueSuccBB)))
- ClonedContinueSuccBB->removePredecessor(ClonedParentBB,
- /*DontDeleteUselessPHIs*/ true);
- // Replace the cloned branch with an unconditional branch to the cloneed
- // unswitched successor.
- auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
- ClonedParentBB->getTerminator()->eraseFromParent();
- BranchInst::Create(ClonedSuccBB, ClonedParentBB);
-
// Update any PHI nodes in the cloned successors of the skipped blocks to not
// have spurious incoming values.
for (auto *LoopBB : L.blocks())
- if (SkippedLoopAndExitBlocks.count(LoopBB))
+ if (SkipBlock(LoopBB))
for (auto *SuccBB : successors(LoopBB))
if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)))
for (PHINode &PN : ClonedSuccBB->phis())
PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false);
+ // Remove the cloned parent as a predecessor of any successor we ended up
+ // cloning other than the unswitched one.
+ auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
+ for (auto *SuccBB : successors(ParentBB)) {
+ if (SuccBB == UnswitchedSuccBB)
+ continue;
+
+ auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB));
+ if (!ClonedSuccBB)
+ continue;
+
+ ClonedSuccBB->removePredecessor(ClonedParentBB,
+ /*DontDeleteUselessPHIs*/ true);
+ }
+
+ // Replace the cloned branch with an unconditional branch to the cloned
+ // unswitched successor.
+ auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
+ ClonedParentBB->getTerminator()->eraseFromParent();
+ BranchInst::Create(ClonedSuccBB, ClonedParentBB);
+
+ // If there are duplicate entries in the PHI nodes because of multiple edges
+ // to the unswitched successor, we need to nuke all but one as we replaced it
+ // with a direct branch.
+ for (PHINode &PN : ClonedSuccBB->phis()) {
+ bool Found = false;
+ // Loop over the incoming operands backwards so we can easily delete as we
+ // go without invalidating the index.
+ for (int i = PN.getNumOperands() - 1; i >= 0; --i) {
+ if (PN.getIncomingBlock(i) != ClonedParentBB)
+ continue;
+ if (!Found) {
+ Found = true;
+ continue;
+ }
+ PN.removeIncomingValue(i, /*DeletePHIIfEmpty*/ false);
+ }
+ }
+
+ // Record the domtree updates for the new blocks.
+ SmallPtrSet<BasicBlock *, 4> SuccSet;
+ for (auto *ClonedBB : NewBlocks) {
+ for (auto *SuccBB : successors(ClonedBB))
+ if (SuccSet.insert(SuccBB).second)
+ DTUpdates.push_back({DominatorTree::Insert, ClonedBB, SuccBB});
+ SuccSet.clear();
+ }
+
return ClonedPH;
}
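The PHI cleanup added in this hunk keeps exactly one incoming entry per predecessor after a multi-edge terminator is replaced by a single unconditional branch. A standalone model of the backwards dedup loop (illustrative types):

```cpp
#include <string>
#include <utility>
#include <vector>

using Incoming = std::pair<std::string /*pred*/, int /*value*/>;

// Keep only one incoming entry for Pred, deleting the rest. Walking backwards
// lets us erase without invalidating the indices still to be visited.
void dedupIncoming(std::vector<Incoming> &PHI, const std::string &Pred) {
  bool Found = false;
  for (int i = (int)PHI.size() - 1; i >= 0; --i) {
    if (PHI[i].first != Pred)
      continue;
    if (!Found) {
      Found = true; // this is the entry we keep
      continue;
    }
    PHI.erase(PHI.begin() + i);
  }
}

int main() {
  // Two switch cases both came from the cloned parent; after the rewrite to a
  // direct branch only one edge remains, so only one entry may survive.
  std::vector<Incoming> PHI{{"parent", 1}, {"other", 7}, {"parent", 2}};
  dedupIncoming(PHI, "parent");
  return (PHI.size() == 2) ? 0 : 1;
}
```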
@@ -911,11 +1045,8 @@ static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
for (auto *BB : OrigL.blocks()) {
auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB));
ClonedL.addBlockEntry(ClonedBB);
- if (LI.getLoopFor(BB) == &OrigL) {
- assert(!LI.getLoopFor(ClonedBB) &&
- "Should not have an existing loop for this block!");
+ if (LI.getLoopFor(BB) == &OrigL)
LI.changeLoopFor(ClonedBB, &ClonedL);
- }
}
};
@@ -965,9 +1096,9 @@ static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
/// original loop, multiple cloned sibling loops may be created. All of them
/// are returned so that the newly introduced loop nest roots can be
/// identified.
-static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
- const ValueToValueMapTy &VMap, LoopInfo &LI,
- SmallVectorImpl<Loop *> &NonChildClonedLoops) {
+static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
+ const ValueToValueMapTy &VMap, LoopInfo &LI,
+ SmallVectorImpl<Loop *> &NonChildClonedLoops) {
Loop *ClonedL = nullptr;
auto *OrigPH = OrigL.getLoopPreheader();
@@ -1060,6 +1191,7 @@ static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
} else {
LI.addTopLevelLoop(ClonedL);
}
+ NonChildClonedLoops.push_back(ClonedL);
ClonedL->reserveBlocks(BlocksInClonedLoop.size());
// We don't want to just add the cloned loop blocks based on how we
@@ -1128,11 +1260,11 @@ static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
// matter as we're just trying to build up the map from inside-out; we use
// the map in a more stably ordered way below.
auto OrderedClonedExitsInLoops = ClonedExitsInLoops;
- std::sort(OrderedClonedExitsInLoops.begin(), OrderedClonedExitsInLoops.end(),
- [&](BasicBlock *LHS, BasicBlock *RHS) {
- return ExitLoopMap.lookup(LHS)->getLoopDepth() <
- ExitLoopMap.lookup(RHS)->getLoopDepth();
- });
+ llvm::sort(OrderedClonedExitsInLoops.begin(), OrderedClonedExitsInLoops.end(),
+ [&](BasicBlock *LHS, BasicBlock *RHS) {
+ return ExitLoopMap.lookup(LHS)->getLoopDepth() <
+ ExitLoopMap.lookup(RHS)->getLoopDepth();
+ });
// Populate the existing ExitLoopMap with everything reachable from each
// exit, starting from the inner most exit.
@@ -1212,60 +1344,69 @@ static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
NonChildClonedLoops.push_back(cloneLoopNest(
*ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI));
}
+}
+
+static void
+deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
+ ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
+ DominatorTree &DT) {
+ // Find all the dead clones, and remove them from their successors.
+ SmallVector<BasicBlock *, 16> DeadBlocks;
+ for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks))
+ for (auto &VMap : VMaps)
+ if (BasicBlock *ClonedBB = cast_or_null<BasicBlock>(VMap->lookup(BB)))
+ if (!DT.isReachableFromEntry(ClonedBB)) {
+ for (BasicBlock *SuccBB : successors(ClonedBB))
+ SuccBB->removePredecessor(ClonedBB);
+ DeadBlocks.push_back(ClonedBB);
+ }
- // Return the main cloned loop if any.
- return ClonedL;
+ // Drop any remaining references to break cycles.
+ for (BasicBlock *BB : DeadBlocks)
+ BB->dropAllReferences();
+ // Erase them from the IR.
+ for (BasicBlock *BB : DeadBlocks)
+ BB->eraseFromParent();
}
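deleteDeadClonedBlocks now finds dead clones purely by reachability (via the already-updated dominator tree) and unhooks them from their successors before erasing them. A standalone model of the same two-phase deletion on a small adjacency graph (hypothetical structures, plain BFS in place of the domtree query):

```cpp
#include <queue>
#include <unordered_set>
#include <vector>

// Hypothetical CFG: Succs[b] lists successors of block b; block 0 is entry.
using Graph = std::vector<std::vector<int>>;

std::unordered_set<int> reachableFromEntry(const Graph &Succs) {
  std::unordered_set<int> Seen{0};
  std::queue<int> Q;
  Q.push(0);
  while (!Q.empty()) {
    int B = Q.front();
    Q.pop();
    for (int S : Succs[B])
      if (Seen.insert(S).second)
        Q.push(S);
  }
  return Seen;
}

// Mirrors the two-phase deletion: first sever the dead blocks' outgoing edges
// (removePredecessor in the real code), then the blocks can be erased without
// leaving dangling references.
std::vector<int> collectDeadBlocks(Graph &Succs) {
  auto Live = reachableFromEntry(Succs);
  std::vector<int> Dead;
  for (int B = 0; B < (int)Succs.size(); ++B)
    if (!Live.count(B)) {
      Succs[B].clear(); // unhook the dead block from its successors
      Dead.push_back(B);
    }
  return Dead;
}

int main() {
  Graph G{{1}, {}, {1}}; // block 2 is unreachable from entry 0
  return collectDeadBlocks(G) == std::vector<int>{2} ? 0 : 1;
}
```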
-static void deleteDeadBlocksFromLoop(Loop &L, BasicBlock *DeadSubtreeRoot,
- SmallVectorImpl<BasicBlock *> &ExitBlocks,
- DominatorTree &DT, LoopInfo &LI) {
- // Walk the dominator tree to build up the set of blocks we will delete here.
- // The order is designed to allow us to always delete bottom-up and avoid any
- // dangling uses.
- SmallSetVector<BasicBlock *, 16> DeadBlocks;
- DeadBlocks.insert(DeadSubtreeRoot);
- for (int i = 0; i < (int)DeadBlocks.size(); ++i)
- for (DomTreeNode *ChildN : *DT[DeadBlocks[i]]) {
- // FIXME: This assert should pass and that means we don't change nearly
- // as much below! Consider rewriting all of this to avoid deleting
- // blocks. They are always cloned before being deleted, and so instead
- // could just be moved.
- // FIXME: This in turn means that we might actually be more able to
- // update the domtree.
- assert((L.contains(ChildN->getBlock()) ||
- llvm::find(ExitBlocks, ChildN->getBlock()) != ExitBlocks.end()) &&
- "Should never reach beyond the loop and exits when deleting!");
- DeadBlocks.insert(ChildN->getBlock());
+static void
+deleteDeadBlocksFromLoop(Loop &L,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ DominatorTree &DT, LoopInfo &LI) {
+ // Find all the dead blocks, and remove them from their successors.
+ SmallVector<BasicBlock *, 16> DeadBlocks;
+ for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks))
+ if (!DT.isReachableFromEntry(BB)) {
+ for (BasicBlock *SuccBB : successors(BB))
+ SuccBB->removePredecessor(BB);
+ DeadBlocks.push_back(BB);
}
+ SmallPtrSet<BasicBlock *, 16> DeadBlockSet(DeadBlocks.begin(),
+ DeadBlocks.end());
+
// Filter out the dead blocks from the exit blocks list so that it can be
// used in the caller.
llvm::erase_if(ExitBlocks,
- [&](BasicBlock *BB) { return DeadBlocks.count(BB); });
-
- // Remove these blocks from their successors.
- for (auto *BB : DeadBlocks)
- for (BasicBlock *SuccBB : successors(BB))
- SuccBB->removePredecessor(BB, /*DontDeleteUselessPHIs*/ true);
+ [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
// Walk from this loop up through its parents removing all of the dead blocks.
for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) {
for (auto *BB : DeadBlocks)
ParentL->getBlocksSet().erase(BB);
llvm::erase_if(ParentL->getBlocksVector(),
- [&](BasicBlock *BB) { return DeadBlocks.count(BB); });
+ [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
}
// Now delete the dead child loops. This raw delete will clear them
// recursively.
llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) {
- if (!DeadBlocks.count(ChildL->getHeader()))
+ if (!DeadBlockSet.count(ChildL->getHeader()))
return false;
assert(llvm::all_of(ChildL->blocks(),
[&](BasicBlock *ChildBB) {
- return DeadBlocks.count(ChildBB);
+ return DeadBlockSet.count(ChildBB);
}) &&
"If the child loop header is dead all blocks in the child loop must "
"be dead as well!");
@@ -1273,19 +1414,20 @@ static void deleteDeadBlocksFromLoop(Loop &L, BasicBlock *DeadSubtreeRoot,
return true;
});
- // Remove the mappings for the dead blocks.
- for (auto *BB : DeadBlocks)
+ // Remove the loop mappings for the dead blocks and drop all the references
+ // from these blocks to others to handle cyclic references as we start
+ // deleting the blocks themselves.
+ for (auto *BB : DeadBlocks) {
+ // Check that the dominator tree has already been updated.
+ assert(!DT.getNode(BB) && "Should already have cleared domtree!");
LI.changeLoopFor(BB, nullptr);
-
- // Drop all the references from these blocks to others to handle cyclic
- // references as we start deleting the blocks themselves.
- for (auto *BB : DeadBlocks)
BB->dropAllReferences();
+ }
- for (auto *BB : llvm::reverse(DeadBlocks)) {
- DT.eraseNode(BB);
+ // Actually delete the blocks now that they've been fully unhooked from the
+ // IR.
+ for (auto *BB : DeadBlocks)
BB->eraseFromParent();
- }
}
/// Recompute the set of blocks in a loop after unswitching.
@@ -1333,14 +1475,15 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
if (LoopBlockSet.empty())
return LoopBlockSet;
- // Add the loop header to the set.
- LoopBlockSet.insert(Header);
-
// We found backedges, recurse through them to identify the loop blocks.
while (!Worklist.empty()) {
BasicBlock *BB = Worklist.pop_back_val();
assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!");
+ // No need to walk past the header.
+ if (BB == Header)
+ continue;
+
// Because we know the inner loop structure remains valid we can use the
// loop structure to jump immediately across the entire nested loop.
// Further, because it is in loop simplified form, we can directly jump
@@ -1361,9 +1504,10 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
continue;
// Insert all of the blocks (other than those already present) into
- // the loop set. The only block we expect to already be in the set is
- // the one we used to find this loop as we immediately handle the
- // others the first time we encounter the loop.
+ // the loop set. We expect at least the block that led us to find the
+ // inner loop to be in the block set, but we may also have other loop
+ // blocks if they were already enqueued as predecessors of some other
+ // outer loop block.
for (auto *InnerBB : InnerL->blocks()) {
if (InnerBB == BB) {
assert(LoopBlockSet.count(InnerBB) &&
@@ -1371,9 +1515,7 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
continue;
}
- bool Inserted = LoopBlockSet.insert(InnerBB).second;
- (void)Inserted;
- assert(Inserted && "Should only insert an inner loop once!");
+ LoopBlockSet.insert(InnerBB);
}
// Add the preheader to the worklist so we will continue past the
@@ -1389,6 +1531,8 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
Worklist.push_back(Pred);
}
+ assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!");
+
// We've found all the blocks participating in the loop, return our completed
// set.
return LoopBlockSet;
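recomputeLoopBlockSet rebuilds the loop's block set by walking predecessors backwards from the latches until the header, as the hunks above refine. A standalone sketch over a predecessor map (illustrative types; the real code additionally jumps across intact inner loops in one step):

```cpp
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

using Preds = std::unordered_map<std::string, std::vector<std::string>>;

// Walk predecessors from each backedge source until the header: everything
// visited is still part of the loop after unswitching.
std::unordered_set<std::string>
loopBlocksFromBackedges(const Preds &P, const std::string &Header,
                        const std::vector<std::string> &Latches) {
  std::unordered_set<std::string> Set(Latches.begin(), Latches.end());
  Set.insert(Header);
  std::vector<std::string> Worklist(Latches.begin(), Latches.end());
  while (!Worklist.empty()) {
    std::string BB = Worklist.back();
    Worklist.pop_back();
    if (BB == Header)
      continue; // no need to walk past the header
    auto It = P.find(BB);
    if (It == P.end())
      continue;
    for (const std::string &Pred : It->second)
      if (Set.insert(Pred).second)
        Worklist.push_back(Pred);
  }
  return Set;
}

int main() {
  Preds P{{"latch", {"body"}},
          {"body", {"header"}},
          {"header", {"preheader", "latch"}}};
  auto Set = loopBlocksFromBackedges(P, "header", {"latch"});
  return (Set.size() == 3 && Set.count("header") && !Set.count("preheader"))
             ? 0 : 1;
}
```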
@@ -1636,32 +1780,58 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
} while (!DomWorklist.empty());
}
-/// Take an invariant branch that has been determined to be safe and worthwhile
-/// to unswitch despite being non-trivial to do so and perform the unswitch.
-///
-/// This directly updates the CFG to hoist the predicate out of the loop, and
-/// clone the necessary parts of the loop to maintain behavior.
-///
-/// It also updates both dominator tree and loopinfo based on the unswitching.
-///
-/// Once unswitching has been performed it runs the provided callback to report
-/// the new loops and no-longer valid loops to the caller.
-static bool unswitchInvariantBranch(
- Loop &L, BranchInst &BI, DominatorTree &DT, LoopInfo &LI,
- AssumptionCache &AC,
- function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
- assert(BI.isConditional() && "Can only unswitch a conditional branch!");
- assert(L.isLoopInvariant(BI.getCondition()) &&
- "Can only unswitch an invariant branch condition!");
+static bool unswitchNontrivialInvariants(
+ Loop &L, TerminatorInst &TI, ArrayRef<Value *> Invariants,
+ DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
+ function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+ ScalarEvolution *SE) {
+ auto *ParentBB = TI.getParent();
+ BranchInst *BI = dyn_cast<BranchInst>(&TI);
+ SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI);
+
+ // We can only unswitch switches, conditional branches with an invariant
+ // condition, or combining invariant conditions with an instruction.
+ assert((SI || BI->isConditional()) &&
+ "Can only unswitch switches and conditional branch!");
+ bool FullUnswitch = SI || BI->getCondition() == Invariants[0];
+ if (FullUnswitch)
+ assert(Invariants.size() == 1 &&
+ "Cannot have other invariants with full unswitching!");
+ else
+ assert(isa<Instruction>(BI->getCondition()) &&
+ "Partial unswitching requires an instruction as the condition!");
+
+ // Constant and BBs tracking the cloned and continuing successor. When we are
+ // unswitching the entire condition, this can just be trivially chosen to
+ // unswitch towards `true`. However, when we are unswitching a set of
+ // invariants combined with `and` or `or`, the combining operation determines
+ // the best direction to unswitch: we want to unswitch the direction that will
+ // collapse the branch.
+ bool Direction = true;
+ int ClonedSucc = 0;
+ if (!FullUnswitch) {
+ if (cast<Instruction>(BI->getCondition())->getOpcode() != Instruction::Or) {
+ assert(cast<Instruction>(BI->getCondition())->getOpcode() ==
+ Instruction::And &&
+ "Only `or` and `and` instructions can combine invariants being "
+ "unswitched.");
+ Direction = false;
+ ClonedSucc = 1;
+ }
+ }
- // Constant and BBs tracking the cloned and continuing successor.
- const int ClonedSucc = 0;
- auto *ParentBB = BI.getParent();
- auto *UnswitchedSuccBB = BI.getSuccessor(ClonedSucc);
- auto *ContinueSuccBB = BI.getSuccessor(1 - ClonedSucc);
+ BasicBlock *RetainedSuccBB =
+ BI ? BI->getSuccessor(1 - ClonedSucc) : SI->getDefaultDest();
+ SmallSetVector<BasicBlock *, 4> UnswitchedSuccBBs;
+ if (BI)
+ UnswitchedSuccBBs.insert(BI->getSuccessor(ClonedSucc));
+ else
+ for (auto Case : SI->cases())
+ if (Case.getCaseSuccessor() != RetainedSuccBB)
+ UnswitchedSuccBBs.insert(Case.getCaseSuccessor());
- assert(UnswitchedSuccBB != ContinueSuccBB &&
- "Should not unswitch a branch that always goes to the same place!");
+ assert(!UnswitchedSuccBBs.count(RetainedSuccBB) &&
+ "Should not unswitch the same successor we are retaining!");
// The branch should be in this exact loop. Any inner loop's invariant branch
// should be handled by unswitching that inner loop. The caller of this
@@ -1680,9 +1850,6 @@ static bool unswitchInvariantBranch(
if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI()))
return false;
- SmallPtrSet<BasicBlock *, 4> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
-
// Compute the parent loop now before we start hacking on things.
Loop *ParentL = L.getParentLoop();
@@ -1701,27 +1868,31 @@ static bool unswitchInvariantBranch(
OuterExitL = NewOuterExitL;
}
- // If the edge we *aren't* cloning in the unswitch (the continuing edge)
- // dominates its target, we can skip cloning the dominated region of the loop
- // and its exits. We compute this as a set of nodes to be skipped.
- SmallPtrSet<BasicBlock *, 4> SkippedLoopAndExitBlocks;
- if (ContinueSuccBB->getUniquePredecessor() ||
- llvm::all_of(predecessors(ContinueSuccBB), [&](BasicBlock *PredBB) {
- return PredBB == ParentBB || DT.dominates(ContinueSuccBB, PredBB);
- })) {
- visitDomSubTree(DT, ContinueSuccBB, [&](BasicBlock *BB) {
- SkippedLoopAndExitBlocks.insert(BB);
- return true;
- });
+ // At this point, we're definitely going to unswitch something so invalidate
+ // any cached information in ScalarEvolution for the outer most loop
+ // containing an exit block and all nested loops.
+ if (SE) {
+ if (OuterExitL)
+ SE->forgetLoop(OuterExitL);
+ else
+ SE->forgetTopmostLoop(&L);
}
- // Similarly, if the edge we *are* cloning in the unswitch (the unswitched
- // edge) dominates its target, we will end up with dead nodes in the original
- // loop and its exits that will need to be deleted. Here, we just retain that
- // the property holds and will compute the deleted set later.
- bool DeleteUnswitchedSucc =
- UnswitchedSuccBB->getUniquePredecessor() ||
- llvm::all_of(predecessors(UnswitchedSuccBB), [&](BasicBlock *PredBB) {
- return PredBB == ParentBB || DT.dominates(UnswitchedSuccBB, PredBB);
+
+ // If the edge from this terminator to a successor dominates that successor,
+ // store a map from each block in its dominator subtree to it. This lets us
+ // tell when cloning for a particular successor if a block is dominated by
+ // some *other* successor with a single data structure. We use this to
+ // significantly reduce cloning.
+ SmallDenseMap<BasicBlock *, BasicBlock *, 16> DominatingSucc;
+ for (auto *SuccBB : llvm::concat<BasicBlock *const>(
+ makeArrayRef(RetainedSuccBB), UnswitchedSuccBBs))
+ if (SuccBB->getUniquePredecessor() ||
+ llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
+ return PredBB == ParentBB || DT.dominates(SuccBB, PredBB);
+ }))
+ visitDomSubTree(DT, SuccBB, [&](BasicBlock *BB) {
+ DominatingSucc[BB] = SuccBB;
+ return true;
});
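The DominatingSucc map built above records, for each terminator successor that dominates its edge target, every block in that successor's dominator subtree; the SkipBlock lambda added earlier in this diff then answers "is this block owned by some *other* successor?" with one lookup. A standalone model of that pair (hypothetical names, blocks as strings):

```cpp
#include <string>
#include <unordered_map>

// DominatingSucc[BB] == S means BB is dominated by successor S of the
// terminator being unswitched. When cloning the copy of the loop for
// successor CloningFor, any block owned by a *different* successor can never
// be reached in that clone and is skipped.
using DomSuccMap = std::unordered_map<std::string, std::string>;

bool skipBlock(const DomSuccMap &DominatingSucc, const std::string &BB,
               const std::string &CloningFor) {
  auto It = DominatingSucc.find(BB);
  return It != DominatingSucc.end() && It->second != CloningFor;
}

int main() {
  DomSuccMap M{{"then.body", "then"}, {"else.body", "else"}};
  // Cloning for "then": blocks dominated by "else" are dead in that clone.
  bool SkippedElse = skipBlock(M, "else.body", "then");  // true
  bool KeptThen = !skipBlock(M, "then.body", "then");    // true
  bool KeptShared = !skipBlock(M, "shared", "then");     // true: not in map
  return (SkippedElse && KeptThen && KeptShared) ? 0 : 1;
}
```

Using one shared map instead of a per-successor skip set is what keeps the cost reasonable when a switch has many unswitched successors.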
// Split the preheader, so that we know that there is a safe place to insert
@@ -1732,52 +1903,162 @@ static bool unswitchInvariantBranch(
BasicBlock *SplitBB = L.getLoopPreheader();
BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI);
- // Keep a mapping for the cloned values.
- ValueToValueMapTy VMap;
+ // Keep track of the dominator tree updates needed.
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+
+ // Clone the loop for each unswitched successor.
+ SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
+ VMaps.reserve(UnswitchedSuccBBs.size());
+ SmallDenseMap<BasicBlock *, BasicBlock *, 4> ClonedPHs;
+ for (auto *SuccBB : UnswitchedSuccBBs) {
+ VMaps.emplace_back(new ValueToValueMapTy());
+ ClonedPHs[SuccBB] = buildClonedLoopBlocks(
+ L, LoopPH, SplitBB, ExitBlocks, ParentBB, SuccBB, RetainedSuccBB,
+ DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI);
+ }
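Aside: the loop above clones the body once per unswitched successor, each clone with its own value-remapping table. A minimal sketch of that shape, with invented names standing in for buildClonedLoopBlocks and ValueToValueMapTy:

    #include <memory>
    #include <string>
    #include <unordered_map>
    #include <vector>

    using ValueMap = std::unordered_map<std::string, std::string>;

    // Stand-in for buildClonedLoopBlocks: records one sample remapping and
    // returns the clone's preheader name.
    static std::string cloneForSuccessor(const std::string &Succ, ValueMap &VM) {
      VM["loop.header"] = "loop.header.us." + Succ;
      return "preheader.us." + Succ;
    }

    int main() {
      std::vector<std::string> UnswitchedSuccs = {"case1", "case2"};
      std::vector<std::unique_ptr<ValueMap>> VMaps;
      std::unordered_map<std::string, std::string> ClonedPHs;
      for (const auto &S : UnswitchedSuccs) {
        VMaps.push_back(std::make_unique<ValueMap>());
        ClonedPHs[S] = cloneForSuccessor(S, *VMaps.back());
      }
    }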
- // Build the cloned blocks from the loop.
- auto *ClonedPH = buildClonedLoopBlocks(
- L, LoopPH, SplitBB, ExitBlocks, ParentBB, UnswitchedSuccBB,
- ContinueSuccBB, SkippedLoopAndExitBlocks, VMap, AC, DT, LI);
+ // The stitching of the branched code back together depends on whether we're
+ // doing full unswitching or not, with the exception that we always want to
+ // nuke the initial terminator placed in the split block.
+ SplitBB->getTerminator()->eraseFromParent();
+ if (FullUnswitch) {
+ // First we need to unhook the successor relationship as we'll be replacing
+ // the terminator with a direct branch. This is much simpler for branches
+ // than switches so we handle those first.
+ if (BI) {
+ // Remove the parent as a predecessor of the unswitched successor.
+ assert(UnswitchedSuccBBs.size() == 1 &&
+ "Only one possible unswitched block for a branch!");
+ BasicBlock *UnswitchedSuccBB = *UnswitchedSuccBBs.begin();
+ UnswitchedSuccBB->removePredecessor(ParentBB,
+ /*DontDeleteUselessPHIs*/ true);
+ DTUpdates.push_back({DominatorTree::Delete, ParentBB, UnswitchedSuccBB});
+ } else {
+ // Note that we actually want to remove the parent block as a predecessor
+ // of *every* case successor. The case successor is either unswitched,
+ // completely eliminating an edge from the parent to that successor, or it
+ // is a duplicate edge to the retained successor as the retained successor
+ // is always the default successor and as we'll replace this with a direct
+ // branch we no longer need the duplicate entries in the PHI nodes.
+ assert(SI->getDefaultDest() == RetainedSuccBB &&
+ "Not retaining default successor!");
+ for (auto &Case : SI->cases())
+ Case.getCaseSuccessor()->removePredecessor(
+ ParentBB,
+ /*DontDeleteUselessPHIs*/ true);
+
+ // We need to use the set to populate domtree updates as even when there
+ // are multiple cases pointing at the same successor we only want to
+ // remove and insert one edge in the domtree.
+ for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+ DTUpdates.push_back({DominatorTree::Delete, ParentBB, SuccBB});
+ }
- // Build the cloned loop structure itself. This may be substantially
- // different from the original structure due to the simplified CFG. This also
- // handles inserting all the cloned blocks into the correct loops.
- SmallVector<Loop *, 4> NonChildClonedLoops;
- Loop *ClonedL =
- buildClonedLoops(L, ExitBlocks, VMap, LI, NonChildClonedLoops);
+ // Now that we've unhooked the successor relationship, splice the terminator
+ // from the original loop to the split.
+ SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), TI);
- // Remove the parent as a predecessor of the unswitched successor.
- UnswitchedSuccBB->removePredecessor(ParentBB, /*DontDeleteUselessPHIs*/ true);
+ // Now wire up the terminator to the preheaders.
+ if (BI) {
+ BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+ BI->setSuccessor(ClonedSucc, ClonedPH);
+ BI->setSuccessor(1 - ClonedSucc, LoopPH);
+ DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
+ } else {
+ assert(SI && "Must either be a branch or switch!");
+
+ // Walk the cases and directly update their successors.
+ SI->setDefaultDest(LoopPH);
+ for (auto &Case : SI->cases())
+ if (Case.getCaseSuccessor() == RetainedSuccBB)
+ Case.setSuccessor(LoopPH);
+ else
+ Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second);
+
+ // We need to use the set to populate domtree updates as even when there
+ // are multiple cases pointing at the same successor we only want to
+ // remove and insert one edge in the domtree.
+ for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+ DTUpdates.push_back(
+ {DominatorTree::Insert, SplitBB, ClonedPHs.find(SuccBB)->second});
+ }
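Aside: both comments about "using the set" come down to the same pattern: several switch cases may share a successor, but the dominator tree wants exactly one update per CFG edge. A small sketch under those assumptions (invented Update type, not LLVM's):

    #include <set>
    #include <string>
    #include <vector>

    struct Update { bool IsInsert; std::string From, To; };

    // One update per unique successor, however many cases point at it.
    static std::vector<Update>
    insertEdgeUpdates(const std::string &From, const std::set<std::string> &Succs) {
      std::vector<Update> Updates;
      for (const auto &To : Succs)
        Updates.push_back({true, From, To});
      return Updates;
    }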
- // Now splice the branch from the original loop and use it to select between
- // the two loops.
- SplitBB->getTerminator()->eraseFromParent();
- SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), BI);
- BI.setSuccessor(ClonedSucc, ClonedPH);
- BI.setSuccessor(1 - ClonedSucc, LoopPH);
+ // Create a new unconditional branch to the continuing block (as opposed to
+ // the one cloned).
+ BranchInst::Create(RetainedSuccBB, ParentBB);
+ } else {
+ assert(BI && "Only branches have partial unswitching.");
+ assert(UnswitchedSuccBBs.size() == 1 &&
+ "Only one possible unswitched block for a branch!");
+ BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+ // When doing a partial unswitch, we have to do a bit more work to build up
+ // the branch in the split block.
+ buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction,
+ *ClonedPH, *LoopPH);
+ DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
+ }
- // Create a new unconditional branch to the continuing block (as opposed to
- // the one cloned).
- BranchInst::Create(ContinueSuccBB, ParentBB);
+ // Apply the updates accumulated above to get an up-to-date dominator tree.
+ DT.applyUpdates(DTUpdates);
- // Delete anything that was made dead in the original loop due to
- // unswitching.
- if (DeleteUnswitchedSucc)
- deleteDeadBlocksFromLoop(L, UnswitchedSuccBB, ExitBlocks, DT, LI);
+ // Now that we have an accurate dominator tree, first delete the dead cloned
+ // blocks so that we can accurately build any cloned loops. It is important to
+ // not delete the blocks from the original loop yet because we still want to
+ // reference the original loop to understand the cloned loop's structure.
+ deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT);
+
+ // Build the cloned loop structure itself. This may be substantially
+ // different from the original structure due to the simplified CFG. This also
+ // handles inserting all the cloned blocks into the correct loops.
+ SmallVector<Loop *, 4> NonChildClonedLoops;
+ for (std::unique_ptr<ValueToValueMapTy> &VMap : VMaps)
+ buildClonedLoops(L, ExitBlocks, *VMap, LI, NonChildClonedLoops);
+ // Now that our cloned loops have been built, we can update the original loop.
+ // First we delete the dead blocks from it and then we rebuild the loop
+ // structure taking these deletions into account.
+ deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI);
SmallVector<Loop *, 4> HoistedLoops;
bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops);
- // This will have completely invalidated the dominator tree. We can't easily
- // bound how much is invalid because in some cases we will refine the
- // predecessor set of exit blocks of the loop which can move large unrelated
- // regions of code into a new subtree.
- //
- // FIXME: Eventually, we should use an incremental update utility that
- // leverages the existing information in the dominator tree (and potentially
- // the nature of the change) to more efficiently update things.
- DT.recalculate(*SplitBB->getParent());
+ // This transformation has a high risk of corrupting the dominator tree, and
+ // the below steps to rebuild loop structures will result in hard-to-debug
+ // errors in that case, so verify that the dominator tree is sane first.
+ // FIXME: Remove this when the bugs stop showing up and rely on existing
+ // verification steps.
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+
+ if (BI) {
+ // If we unswitched a branch which collapses the condition to a known
+ // constant we want to replace all the uses of the invariants within both
+ // the original and cloned blocks. We do this here so that we can use the
+ // now updated dominator tree to identify which side the users are on.
+ assert(UnswitchedSuccBBs.size() == 1 &&
+ "Only one possible unswitched block for a branch!");
+ BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+ ConstantInt *UnswitchedReplacement =
+ Direction ? ConstantInt::getTrue(BI->getContext())
+ : ConstantInt::getFalse(BI->getContext());
+ ConstantInt *ContinueReplacement =
+ Direction ? ConstantInt::getFalse(BI->getContext())
+ : ConstantInt::getTrue(BI->getContext());
+ for (Value *Invariant : Invariants)
+ for (auto UI = Invariant->use_begin(), UE = Invariant->use_end();
+ UI != UE;) {
+ // Grab the use and walk past it so we can clobber it in the use list.
+ Use *U = &*UI++;
+ Instruction *UserI = dyn_cast<Instruction>(U->getUser());
+ if (!UserI)
+ continue;
+
+ // Replace it with the 'continue' side if in the main loop body, and the
+ // unswitched if in the cloned blocks.
+ if (DT.dominates(LoopPH, UserI->getParent()))
+ U->set(ContinueReplacement);
+ else if (DT.dominates(ClonedPH, UserI->getParent()))
+ U->set(UnswitchedReplacement);
+ }
+ }
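Aside: the use-rewriting above picks the replacement constant by which preheader dominates the user. A compact model of that rule, with a dominance callback standing in for DominatorTree::dominates (all names invented):

    #include <functional>
    #include <vector>

    struct Use { int *Slot; int UserBlock; };

    static void replaceInvariantUses(std::vector<Use> &Uses, int LoopPH,
                                     int ClonedPH,
                                     const std::function<bool(int, int)> &Dominates,
                                     int ContinueVal, int UnswitchedVal) {
      for (Use &U : Uses) {
        if (Dominates(LoopPH, U.UserBlock))
          *U.Slot = ContinueVal;    // user sits in the retained loop body
        else if (Dominates(ClonedPH, U.UserBlock))
          *U.Slot = UnswitchedVal;  // user sits in the cloned, unswitched body
      }
    }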
// We can change which blocks are exit blocks of all the cloned sibling
// loops, the current loop, and any parent loops which shared exit blocks
@@ -1791,57 +2072,50 @@ static bool unswitchInvariantBranch(
// also need to cover any intervening loops. We add all of these loops to
// a list and sort them by loop depth to achieve this without updating
// unnecessary loops.
- auto UpdateLCSSA = [&](Loop &UpdateL) {
+ auto UpdateLoop = [&](Loop &UpdateL) {
#ifndef NDEBUG
- for (Loop *ChildL : UpdateL)
+ UpdateL.verifyLoop();
+ for (Loop *ChildL : UpdateL) {
+ ChildL->verifyLoop();
assert(ChildL->isRecursivelyLCSSAForm(DT, LI) &&
"Perturbed a child loop's LCSSA form!");
+ }
#endif
+ // First build LCSSA for this loop so that we can preserve it when
+ // forming dedicated exits. We don't want to perturb some other loop's
+ // LCSSA while doing that CFG edit.
formLCSSA(UpdateL, DT, &LI, nullptr);
+
+ // For loops reached by this loop's original exit blocks we may have
+ // introduced new, non-dedicated exits. At least try to re-form dedicated
+ // exits for these loops. This may fail if they couldn't have dedicated
+ // exits to start with.
+ formDedicatedExitBlocks(&UpdateL, &DT, &LI, /*PreserveLCSSA*/ true);
};
// For non-child cloned loops and hoisted loops, we just need to update LCSSA
// and we can do it in any order as they don't nest relative to each other.
- for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
- UpdateLCSSA(*UpdatedL);
+ //
+ // Also check if any of the loops we have updated have become top-level loops
+ // as that will necessitate widening the outer loop scope.
+ for (Loop *UpdatedL :
+ llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) {
+ UpdateLoop(*UpdatedL);
+ if (!UpdatedL->getParentLoop())
+ OuterExitL = nullptr;
+ }
+ if (IsStillLoop) {
+ UpdateLoop(L);
+ if (!L.getParentLoop())
+ OuterExitL = nullptr;
+ }
// If the original loop had exit blocks, walk up through the outermost loop
// of those exit blocks to update LCSSA and form updated dedicated exits.
- if (OuterExitL != &L) {
- SmallVector<Loop *, 4> OuterLoops;
- // We start with the cloned loop and the current loop if they are loops and
- // move toward OuterExitL. Also, if either the cloned loop or the current
- // loop have become top level loops we need to walk all the way out.
- if (ClonedL) {
- OuterLoops.push_back(ClonedL);
- if (!ClonedL->getParentLoop())
- OuterExitL = nullptr;
- }
- if (IsStillLoop) {
- OuterLoops.push_back(&L);
- if (!L.getParentLoop())
- OuterExitL = nullptr;
- }
- // Grab all of the enclosing loops now.
+ if (OuterExitL != &L)
for (Loop *OuterL = ParentL; OuterL != OuterExitL;
OuterL = OuterL->getParentLoop())
- OuterLoops.push_back(OuterL);
-
- // Finally, update our list of outer loops. This is nicely ordered to work
- // inside-out.
- for (Loop *OuterL : OuterLoops) {
- // First build LCSSA for this loop so that we can preserve it when
- // forming dedicated exits. We don't want to perturb some other loop's
- // LCSSA while doing that CFG edit.
- UpdateLCSSA(*OuterL);
-
- // For loops reached by this loop's original exit blocks we may
- // introduced new, non-dedicated exits. At least try to re-form dedicated
- // exits for these loops. This may fail if they couldn't have dedicated
- // exits to start with.
- formDedicatedExitBlocks(OuterL, &DT, &LI, /*PreserveLCSSA*/ true);
- }
- }
+ UpdateLoop(*OuterL);
#ifndef NDEBUG
// Verify the entire loop structure to catch any incorrect updates before we
@@ -1856,7 +2130,7 @@ static bool unswitchInvariantBranch(
for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
if (UpdatedL->getParentLoop() == ParentL)
SibLoops.push_back(UpdatedL);
- NonTrivialUnswitchCB(IsStillLoop, SibLoops);
+ UnswitchCB(IsStillLoop, SibLoops);
++NumBranches;
return true;
@@ -1895,50 +2169,69 @@ computeDomSubtreeCost(DomTreeNode &N,
return Cost;
}
-/// Unswitch control flow predicated on loop invariant conditions.
-///
-/// This first hoists all branches or switches which are trivial (IE, do not
-/// require duplicating any part of the loop) out of the loop body. It then
-/// looks at other loop invariant control flows and tries to unswitch those as
-/// well by cloning the loop if the result is small enough.
static bool
-unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
- TargetTransformInfo &TTI, bool NonTrivial,
- function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
- assert(L.isRecursivelyLCSSAForm(DT, LI) &&
- "Loops must be in LCSSA form before unswitching.");
- bool Changed = false;
+unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC, TargetTransformInfo &TTI,
+ function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+ ScalarEvolution *SE) {
+ // Collect all invariant conditions within this loop (as opposed to an inner
+ // loop which would be handled when visiting that inner loop).
+ SmallVector<std::pair<TerminatorInst *, TinyPtrVector<Value *>>, 4>
+ UnswitchCandidates;
+ for (auto *BB : L.blocks()) {
+ if (LI.getLoopFor(BB) != &L)
+ continue;
- // Must be in loop simplified form: we need a preheader and dedicated exits.
- if (!L.isLoopSimplifyForm())
- return false;
+ if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ // We can only consider fully loop-invariant switch conditions as we need
+ // to completely eliminate the switch after unswitching.
+ if (!isa<Constant>(SI->getCondition()) &&
+ L.isLoopInvariant(SI->getCondition()))
+ UnswitchCandidates.push_back({SI, {SI->getCondition()}});
+ continue;
+ }
- // Try trivial unswitch first before loop over other basic blocks in the loop.
- Changed |= unswitchAllTrivialConditions(L, DT, LI);
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional() || isa<Constant>(BI->getCondition()) ||
+ BI->getSuccessor(0) == BI->getSuccessor(1))
+ continue;
- // If we're not doing non-trivial unswitching, we're done. We both accept
- // a parameter but also check a local flag that can be used for testing
- // a debugging.
- if (!NonTrivial && !EnableNonTrivialUnswitch)
- return Changed;
-
- // Collect all remaining invariant branch conditions within this loop (as
- // opposed to an inner loop which would be handled when visiting that inner
- // loop).
- SmallVector<TerminatorInst *, 4> UnswitchCandidates;
- for (auto *BB : L.blocks())
- if (LI.getLoopFor(BB) == &L)
- if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator()))
- if (BI->isConditional() && L.isLoopInvariant(BI->getCondition()) &&
- BI->getSuccessor(0) != BI->getSuccessor(1))
- UnswitchCandidates.push_back(BI);
+ if (L.isLoopInvariant(BI->getCondition())) {
+ UnswitchCandidates.push_back({BI, {BI->getCondition()}});
+ continue;
+ }
+
+ Instruction &CondI = *cast<Instruction>(BI->getCondition());
+ if (CondI.getOpcode() != Instruction::And &&
+ CondI.getOpcode() != Instruction::Or)
+ continue;
+
+ TinyPtrVector<Value *> Invariants =
+ collectHomogenousInstGraphLoopInvariants(L, CondI, LI);
+ if (Invariants.empty())
+ continue;
+
+ UnswitchCandidates.push_back({BI, std::move(Invariants)});
+ }
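Aside: the candidate collection above bottoms out in collectHomogenousInstGraphLoopInvariants, which walks a condition built purely out of `and` (or purely out of `or`) and harvests the loop-invariant pieces. A toy model of that walk over an invented expression tree:

    #include <vector>

    struct Expr {
      enum Kind { And, Or, Leaf } Op;
      bool Invariant = false;
      std::vector<Expr *> Operands;
    };

    // Collect invariant subtrees reachable through operators matching RootOp;
    // a mixed `and`/`or` sub-expression is treated as opaque.
    static void collectInvariants(Expr *E, Expr::Kind RootOp,
                                  std::vector<Expr *> &Out) {
      if (E->Invariant) {
        Out.push_back(E); // an invariant subtree is taken whole
        return;
      }
      if (E->Op != RootOp)
        return; // variant leaf, or heterogeneous operator: stop here
      for (Expr *Operand : E->Operands)
        collectInvariants(Operand, RootOp, Out);
    }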
// If we didn't find any candidates, we're done.
if (UnswitchCandidates.empty())
- return Changed;
+ return false;
- DEBUG(dbgs() << "Considering " << UnswitchCandidates.size()
- << " non-trivial loop invariant conditions for unswitching.\n");
+ // Check if there are irreducible CFG cycles in this loop. If so, we cannot
+ // easily unswitch non-trivial edges out of the loop. Doing so might turn the
+ // irreducible control flow into reducible control flow and introduce new
+ // loops "out of thin air". If we ever discover important use cases for doing
+ // this, we can add support to loop unswitch, but it is a lot of complexity
+ // for what seems like little or no real-world benefit.
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(&LI);
+ if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
+ return false;
+
+ LLVM_DEBUG(
+ dbgs() << "Considering " << UnswitchCandidates.size()
+ << " non-trivial loop invariant conditions for unswitching.\n");
// Given that unswitching these terminators will require duplicating parts of
// the loop, we need to be able to model that cost. Compute the ephemeral
@@ -1962,10 +2255,10 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
continue;
if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
- return Changed;
+ return false;
if (auto CS = CallSite(&I))
if (CS.isConvergent() || CS.cannotDuplicate())
- return Changed;
+ return false;
Cost += TTI.getUserCost(&I);
}
@@ -1974,7 +2267,7 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
assert(LoopCost >= 0 && "Must not have negative loop costs!");
BBCostMap[BB] = Cost;
}
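Aside: the cost model here is just a guarded sum: per-block user costs accumulate into LoopCost and are cached per block, and any instruction that cannot be duplicated aborts the whole attempt. A self-contained sketch with plain structs standing in for LLVM types and TTI.getUserCost:

    #include <map>
    #include <vector>

    struct Inst { bool CannotDuplicate; int Cost; };
    using Block = std::vector<Inst>;

    static bool computeLoopCost(const std::map<int, Block> &Blocks,
                                std::map<int, int> &BBCostMap, int &LoopCost) {
      LoopCost = 0;
      for (const auto &IdAndBlock : Blocks) {
        int Cost = 0;
        for (const Inst &I : IdAndBlock.second) {
          if (I.CannotDuplicate)
            return false; // mirrors the convergent/cannotDuplicate bail-out
          Cost += I.Cost;
        }
        BBCostMap[IdAndBlock.first] = Cost;
        LoopCost += Cost;
      }
      return true;
    }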
- DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n");
+ LLVM_DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n");
// Now we find the best candidate by searching for the one with the following
// properties in order:
@@ -1993,8 +2286,8 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
SmallDenseMap<DomTreeNode *, int, 4> DTCostMap;
// Given a terminator which might be unswitched, computes the non-duplicated
// cost for that terminator.
- auto ComputeUnswitchedCost = [&](TerminatorInst *TI) {
- BasicBlock &BB = *TI->getParent();
+ auto ComputeUnswitchedCost = [&](TerminatorInst &TI, bool FullUnswitch) {
+ BasicBlock &BB = *TI.getParent();
SmallPtrSet<BasicBlock *, 4> Visited;
int Cost = LoopCost;
@@ -2003,6 +2296,26 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
if (!Visited.insert(SuccBB).second)
continue;
+ // If this is a partial unswitch candidate, then it must be a conditional
+ // branch with a condition of either `or` or `and`. In that case, one of
+ // the successors is necessarily duplicated, so don't even try to remove
+ // its cost.
+ if (!FullUnswitch) {
+ auto &BI = cast<BranchInst>(TI);
+ if (cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::And) {
+ if (SuccBB == BI.getSuccessor(1))
+ continue;
+ } else {
+ assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::Or &&
+ "Only `and` and `or` conditions can result in a partial "
+ "unswitch!");
+ if (SuccBB == BI.getSuccessor(0))
+ continue;
+ }
+ }
+
// This successor's domtree will not need to be duplicated after
// unswitching if the edge to the successor dominates it (and thus the
// entire tree). This essentially means there is no other path into this
@@ -2026,27 +2339,95 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
};
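Aside: the `and`/`or` asymmetry in the lambda is worth spelling out. For `br (a && b)`, peeling the invariant-false case jumps straight to the false successor, so that successor remains reachable from both copies and its cost cannot be deducted; for `a || b` it is the true successor. A one-function sketch of the rule (successor 0 taken on true, 1 on false; invented enum):

    enum class Opcode { And, Or };

    // Returns the successor index that is necessarily duplicated by a
    // partial unswitch and therefore keeps its cost.
    static int necessarilyDuplicatedSuccessor(Opcode ConditionOp) {
      return ConditionOp == Opcode::And ? 1 : 0;
    }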
TerminatorInst *BestUnswitchTI = nullptr;
int BestUnswitchCost;
- for (TerminatorInst *CandidateTI : UnswitchCandidates) {
- int CandidateCost = ComputeUnswitchedCost(CandidateTI);
- DEBUG(dbgs() << " Computed cost of " << CandidateCost
- << " for unswitch candidate: " << *CandidateTI << "\n");
+ ArrayRef<Value *> BestUnswitchInvariants;
+ for (auto &TerminatorAndInvariants : UnswitchCandidates) {
+ TerminatorInst &TI = *TerminatorAndInvariants.first;
+ ArrayRef<Value *> Invariants = TerminatorAndInvariants.second;
+ BranchInst *BI = dyn_cast<BranchInst>(&TI);
+ int CandidateCost = ComputeUnswitchedCost(
+ TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 &&
+ Invariants[0] == BI->getCondition()));
+ LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost
+ << " for unswitch candidate: " << TI << "\n");
if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) {
- BestUnswitchTI = CandidateTI;
+ BestUnswitchTI = &TI;
BestUnswitchCost = CandidateCost;
+ BestUnswitchInvariants = Invariants;
}
}
- if (BestUnswitchCost < UnswitchThreshold) {
- DEBUG(dbgs() << " Trying to unswitch non-trivial (cost = "
- << BestUnswitchCost << ") branch: " << *BestUnswitchTI
- << "\n");
- Changed |= unswitchInvariantBranch(L, cast<BranchInst>(*BestUnswitchTI), DT,
- LI, AC, NonTrivialUnswitchCB);
- } else {
- DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << BestUnswitchCost
- << "\n");
+ if (BestUnswitchCost >= UnswitchThreshold) {
+ LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: "
+ << BestUnswitchCost << "\n");
+ return false;
}
+ LLVM_DEBUG(dbgs() << " Trying to unswitch non-trivial (cost = "
+ << BestUnswitchCost << ") terminator: " << *BestUnswitchTI
+ << "\n");
+ return unswitchNontrivialInvariants(
+ L, *BestUnswitchTI, BestUnswitchInvariants, DT, LI, AC, UnswitchCB, SE);
+}
+
+/// Unswitch control flow predicated on loop invariant conditions.
+///
+/// This first hoists all branches or switches which are trivial (i.e., do not
+/// require duplicating any part of the loop) out of the loop body. It then
+/// looks at other loop invariant control flows and tries to unswitch those as
+/// well by cloning the loop if the result is small enough.
+///
+/// The `DT`, `LI`, `AC`, `TTI` parameters are required analyses that are also
+/// updated based on the unswitch.
+///
+/// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is
+/// true, we will attempt to do non-trivial unswitching as well as trivial
+/// unswitching.
+///
+/// The `UnswitchCB` callback provided will be run after unswitching is
+/// complete, with the first parameter set to `true` if the provided loop
+/// remains a loop, and a list of new sibling loops created.
+///
+/// If `SE` is non-null, we will update that analysis based on the unswitching
+/// done.
+static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC, TargetTransformInfo &TTI,
+ bool NonTrivial,
+ function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+ ScalarEvolution *SE) {
+ assert(L.isRecursivelyLCSSAForm(DT, LI) &&
+ "Loops must be in LCSSA form before unswitching.");
+ bool Changed = false;
+
+ // Must be in loop simplified form: we need a preheader and dedicated exits.
+ if (!L.isLoopSimplifyForm())
+ return false;
+
+ // Try trivial unswitch first before loop over other basic blocks in the loop.
+ if (unswitchAllTrivialConditions(L, DT, LI, SE)) {
+ // If we unswitched successfully we will want to clean up the loop before
+ // processing it further so just mark it as unswitched and return.
+ UnswitchCB(/*CurrentLoopValid*/ true, {});
+ return true;
+ }
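Aside: the UnswitchCB contract is small but easy to misread: it fires once after any successful unswitch, with a flag saying whether the current loop survived plus the list of newly created sibling loops (empty for trivial unswitching, as here). A sketch under those assumptions:

    #include <functional>
    #include <string>
    #include <vector>

    using UnswitchCallback =
        std::function<void(bool /*CurrentLoopValid*/,
                           const std::vector<std::string> & /*NewLoops*/)>;

    static void reportTrivialUnswitch(const UnswitchCallback &CB) {
      // Trivial unswitching creates no new loops but keeps the loop valid.
      CB(/*CurrentLoopValid=*/true, {});
    }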
+
+ // If we're not doing non-trivial unswitching, we're done. We both accept
+ // a parameter but also check a local flag that can be used for testing
+ // and debugging.
+ if (!NonTrivial && !EnableNonTrivialUnswitch)
+ return false;
+
+ // For non-trivial unswitching, because it often creates new loops, we rely on
+ // the pass manager to iterate on the loops rather than trying to immediately
+ // reach a fixed point. There is no substantial advantage to iterating
+ // internally, and if any of the new loops are simplified enough to contain
+ // trivial unswitching we want to prefer those.
+
+ // Try to unswitch the best invariant condition. We prefer this full unswitch
+ // to a partial unswitch when possible below the threshold.
+ if (unswitchBestCondition(L, DT, LI, AC, TTI, UnswitchCB, SE))
+ return true;
+
+ // No other opportunities to unswitch.
return Changed;
}
@@ -2056,16 +2437,18 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
Function &F = *L.getHeader()->getParent();
(void)F;
- DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n");
+ LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L
+ << "\n");
// Save the current loop name in a variable so that we can report it even
// after it has been deleted.
std::string LoopName = L.getName();
- auto NonTrivialUnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
- ArrayRef<Loop *> NewLoops) {
+ auto UnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
+ ArrayRef<Loop *> NewLoops) {
// If we did a non-trivial unswitch, we have added new (cloned) loops.
- U.addSiblingLoops(NewLoops);
+ if (!NewLoops.empty())
+ U.addSiblingLoops(NewLoops);
// If the current loop remains valid, we should revisit it to catch any
// other unswitch opportunities. Otherwise, we need to mark it as deleted.
@@ -2075,15 +2458,13 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
U.markLoopAsDeleted(L, LoopName);
};
- if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial,
- NonTrivialUnswitchCB))
+ if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial, UnswitchCB,
+ &AR.SE))
return PreservedAnalyses::all();
-#ifndef NDEBUG
// Historically this pass has had issues with the dominator tree so verify it
// in asserts builds.
- AR.DT.verifyDomTree();
-#endif
+ assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast));
return getLoopPassPreservedAnalyses();
}
@@ -2118,15 +2499,19 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
Function &F = *L->getHeader()->getParent();
- DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L << "\n");
+ LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L
+ << "\n");
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto NonTrivialUnswitchCB = [&L, &LPM](bool CurrentLoopValid,
- ArrayRef<Loop *> NewLoops) {
+ auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ auto *SE = SEWP ? &SEWP->getSE() : nullptr;
+
+ auto UnswitchCB = [&L, &LPM](bool CurrentLoopValid,
+ ArrayRef<Loop *> NewLoops) {
// If we did a non-trivial unswitch, we have added new (cloned) loops.
for (auto *NewL : NewLoops)
LPM.addLoop(*NewL);
@@ -2140,18 +2525,16 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
LPM.markLoopAsDeleted(*L);
};
- bool Changed =
- unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, NonTrivialUnswitchCB);
+ bool Changed = unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, UnswitchCB, SE);
// If anything was unswitched, also clear any cached information about this
// loop.
LPM.deleteSimpleAnalysisLoop(L);
-#ifndef NDEBUG
// Historically this pass has had issues with the dominator tree so verify it
// in asserts builds.
- DT.verifyDomTree();
-#endif
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+
return Changed;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 1522170dc3b9..b7b1db76b492 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -28,6 +28,7 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -39,7 +40,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/SimplifyCFG.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <utility>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
index cfb8a062299f..ca6b93e0b4a9 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -114,7 +114,7 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
// We cannot sink a load across a critical edge - there may be stores in
// other code paths.
- if (isa<LoadInst>(Inst))
+ if (Inst->mayReadFromMemory())
return false;
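Aside: the hunk widens the old isa<LoadInst> test to anything that may read memory. The hazard it guards against: across a critical edge, another path into the target can contain a store, so the read could observe a different value. A toy version of the check (invented stand-ins, not the LLVM API):

    struct Inst { bool MayReadFromMemory; int ParentBlock; };

    // UniquePred is the target's sole predecessor, or -1 if it has several.
    static bool mayCrossEdgeSafely(const Inst &I, int UniquePred) {
      bool Critical = UniquePred != I.ParentBlock; // other paths reach the target
      return !(Critical && I.MayReadFromMemory);
    }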
// We don't want to sink across a critical edge if we don't dominate the
@@ -187,11 +187,9 @@ static bool SinkInstruction(Instruction *Inst,
if (!SuccToSinkTo)
return false;
- DEBUG(dbgs() << "Sink" << *Inst << " (";
- Inst->getParent()->printAsOperand(dbgs(), false);
- dbgs() << " -> ";
- SuccToSinkTo->printAsOperand(dbgs(), false);
- dbgs() << ")\n");
+ LLVM_DEBUG(dbgs() << "Sink" << *Inst << " (";
+ Inst->getParent()->printAsOperand(dbgs(), false); dbgs() << " -> ";
+ SuccToSinkTo->printAsOperand(dbgs(), false); dbgs() << ")\n");
// Move the instruction.
Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt());
@@ -244,7 +242,7 @@ static bool iterativelySinkInstructions(Function &F, DominatorTree &DT,
do {
MadeChange = false;
- DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
+ LLVM_DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
// Process all basic blocks.
for (BasicBlock &I : F)
MadeChange |= ProcessBlock(I, DT, LI, AA);
diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
index 23156d5a4d83..6743e19a7c92 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@@ -64,7 +64,7 @@ isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
// block. We should consider using actual post-dominance here in the
// future.
if (UI->getParent() != PhiBB) {
- DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
+ LLVM_DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
return false;
}
@@ -75,7 +75,7 @@ isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
// probably change this to do at least a limited scan of the intervening
// instructions and allow handling stores in easily proven safe cases.
if (mayBeMemoryDependent(*UI)) {
- DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
+ LLVM_DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
return false;
}
@@ -126,8 +126,8 @@ isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
// If when we directly test whether this is safe it fails, bail.
if (UnsafeSet.count(OpI) || ParentBB != PhiBB ||
mayBeMemoryDependent(*OpI)) {
- DEBUG(dbgs() << " Unsafe: can't speculate transitive use: " << *OpI
- << "\n");
+ LLVM_DEBUG(dbgs() << " Unsafe: can't speculate transitive use: "
+ << *OpI << "\n");
// Record the stack of instructions which reach this node as unsafe
// so we prune subsequent searches.
UnsafeSet.insert(OpI);
@@ -229,7 +229,7 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
NonFreeMat |= MatCost != TTI.TCC_Free;
}
if (!NonFreeMat) {
- DEBUG(dbgs() << " Free: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " Free: " << PN << "\n");
// No profit in free materialization.
return false;
}
@@ -237,7 +237,7 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
// Now check that the uses of this PHI can actually be speculated,
// otherwise we'll still have to materialize the PHI value.
if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
- DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
return false;
}
@@ -266,7 +266,7 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
// Assume we will commute the constant to the RHS to be canonical.
Idx = 1;
- // Get the intrinsic ID if this user is an instrinsic.
+ // Get the intrinsic ID if this user is an intrinsic.
Intrinsic::ID IID = Intrinsic::not_intrinsic;
if (auto *UserII = dyn_cast<IntrinsicInst>(UserI))
IID = UserII->getIntrinsicID();
@@ -288,9 +288,13 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
// just bail. We're only interested in cases where folding the incoming
// constants is at least break-even on all paths.
if (FoldedCost > MatCost) {
- DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC << "\n"
- " Materializing cost: " << MatCost << "\n"
- " Accumulated folded cost: " << FoldedCost << "\n");
+ LLVM_DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC
+ << "\n"
+ " Materializing cost: "
+ << MatCost
+ << "\n"
+ " Accumulated folded cost: "
+ << FoldedCost << "\n");
return false;
}
}
@@ -310,8 +314,8 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
"less that its materialized cost, "
"the sum must be as well.");
- DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
- << ": " << PN << "\n");
+ LLVM_DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
+ << ": " << PN << "\n");
CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
return true;
}
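Aside: the savings bookkeeping is plain arithmetic: what the PHI's constants would cost to materialize, minus what they cost once folded into users, recorded only when non-negative. A sketch with plain ints standing in for TTI cost values:

    #include <map>

    static bool recordCostSavings(int TotalMatCost, int TotalFoldedCost,
                                  int PhiId, std::map<int, int> &CostSavingsMap) {
      if (TotalFoldedCost > TotalMatCost)
        return false; // folding must be at least break-even
      CostSavingsMap[PhiId] = TotalMatCost - TotalFoldedCost;
      return true;
    }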
@@ -489,9 +493,13 @@ findProfitablePHIs(ArrayRef<PHINode *> PNs,
// and zero out the cost of everything it depends on.
int CostSavings = CostSavingsMap.find(PN)->second;
if (SpecCost > CostSavings) {
- DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN << "\n"
- " Cost savings: " << CostSavings << "\n"
- " Speculation cost: " << SpecCost << "\n");
+ LLVM_DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN
+ << "\n"
+ " Cost savings: "
+ << CostSavings
+ << "\n"
+ " Speculation cost: "
+ << SpecCost << "\n");
continue;
}
@@ -545,7 +553,7 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
SmallSetVector<BasicBlock *, 16> &PredSet,
DominatorTree &DT) {
- DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
+ LLVM_DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
NumPHIsSpeculated += SpecPNs.size();
// Split any critical edges so that we have a block to hoist into.
@@ -558,8 +566,8 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
if (NewPredBB) {
++NumEdgesSplit;
- DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
+ << "\n");
SpecPreds.push_back(NewPredBB);
} else {
assert(PredBB->getSingleSuccessor() == ParentBB &&
@@ -593,14 +601,15 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
int NumSpecInsts = SpecList.size() * SpecPreds.size();
int NumRedundantInsts = NumSpecInsts - SpecList.size();
- DEBUG(dbgs() << " Inserting " << NumSpecInsts << " speculated instructions, "
- << NumRedundantInsts << " redundancies\n");
+ LLVM_DEBUG(dbgs() << " Inserting " << NumSpecInsts
+ << " speculated instructions, " << NumRedundantInsts
+ << " redundancies\n");
NumSpeculatedInstructions += NumSpecInsts;
NumNewRedundantInstructions += NumRedundantInsts;
// Each predecessor is numbered by its index in `SpecPreds`, so for each
// instruction we speculate, the speculated instruction is stored in that
- // index of the vector asosciated with the original instruction. We also
+ // index of the vector associated with the original instruction. We also
// store the incoming values for each predecessor from any PHIs used.
SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;
@@ -716,7 +725,7 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
/// true when at least some speculation occurs.
static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
DominatorTree &DT, TargetTransformInfo &TTI) {
- DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
+ LLVM_DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
// Savings in cost from speculating around a PHI node.
SmallDenseMap<PHINode *, int, 16> CostSavingsMap;
@@ -745,7 +754,7 @@ static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
PNs.end());
// If no PHIs were profitable, skip.
if (PNs.empty()) {
- DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
+ LLVM_DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
return false;
}
@@ -763,13 +772,13 @@ static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
// differently.
if (isa<IndirectBrInst>(PredBB->getTerminator()) ||
isa<InvokeInst>(PredBB->getTerminator())) {
- DEBUG(dbgs() << " Invalid: predecessor terminator: " << PredBB->getName()
- << "\n");
+ LLVM_DEBUG(dbgs() << " Invalid: predecessor terminator: "
+ << PredBB->getName() << "\n");
return false;
}
}
if (PredSet.size() < 2) {
- DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
+ LLVM_DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
return false;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index a7c308b59877..f5e1dd6ed850 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -62,7 +62,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
@@ -137,6 +137,7 @@ INITIALIZE_PASS_END(SpeculativeExecutionLegacyPass, "speculative-execution",
void SpeculativeExecutionLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
}
bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) {
@@ -151,8 +152,8 @@ namespace llvm {
bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence()) {
- DEBUG(dbgs() << "Not running SpeculativeExecution because "
- "TTI->hasBranchDivergence() is false.\n");
+ LLVM_DEBUG(dbgs() << "Not running SpeculativeExecution because "
+ "TTI->hasBranchDivergence() is false.\n");
return false;
}
@@ -251,7 +252,7 @@ static unsigned ComputeSpeculationCost(const Instruction *I,
bool SpeculativeExecutionPass::considerHoistingFromTo(
BasicBlock &FromBlock, BasicBlock &ToBlock) {
- SmallSet<const Instruction *, 8> NotHoisted;
+ SmallPtrSet<const Instruction *, 8> NotHoisted;
const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](User *U) {
for (Value* V : U->operand_values()) {
if (Instruction *I = dyn_cast<Instruction>(V)) {
@@ -314,6 +315,7 @@ PreservedAnalyses SpeculativeExecutionPass::run(Function &F,
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
+ PA.preserveSet<CFGAnalyses>();
return PA;
}
} // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index ce40af1223f6..2061db13639a 100644
--- a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -61,6 +61,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -80,7 +81,6 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <limits>
diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index b8fb80b6cc26..d650264176aa 100644
--- a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -40,6 +40,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
#include <cassert>
@@ -55,6 +56,12 @@ static const char *const FlowBlockName = "Flow";
namespace {
+static cl::opt<bool> ForceSkipUniformRegions(
+ "structurizecfg-skip-uniform-regions",
+ cl::Hidden,
+ cl::desc("Force whether the StructurizeCFG pass skips uniform regions"),
+ cl::init(false));
+
// Definition of the complex types used in this pass.
using BBValuePair = std::pair<BasicBlock *, Value *>;
@@ -120,7 +127,7 @@ public:
bool resultIsRememberedBlock() { return ResultIsRemembered; }
};
-/// @brief Transforms the control flow graph on one single entry/exit region
+/// Transforms the control flow graph on one single entry/exit region
/// at a time.
///
/// After the transform all "If"/"Then"/"Else" style control flow looks like
@@ -176,6 +183,7 @@ class StructurizeCFG : public RegionPass {
Function *Func;
Region *ParentRegion;
+ DivergenceAnalysis *DA;
DominatorTree *DT;
LoopInfo *LI;
@@ -196,6 +204,9 @@ class StructurizeCFG : public RegionPass {
void orderNodes();
+ Loop *getAdjustedLoop(RegionNode *RN);
+ unsigned getAdjustedLoopDepth(RegionNode *RN);
+
void analyzeLoops(RegionNode *N);
Value *invert(Value *Condition);
@@ -242,8 +253,11 @@ class StructurizeCFG : public RegionPass {
public:
static char ID;
- explicit StructurizeCFG(bool SkipUniformRegions = false)
- : RegionPass(ID), SkipUniformRegions(SkipUniformRegions) {
+ explicit StructurizeCFG(bool SkipUniformRegions_ = false)
+ : RegionPass(ID),
+ SkipUniformRegions(SkipUniformRegions_) {
+ if (ForceSkipUniformRegions.getNumOccurrences())
+ SkipUniformRegions = ForceSkipUniformRegions.getValue();
initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
}
@@ -278,7 +292,7 @@ INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
false, false)
-/// \brief Initialize the types and constants used in the pass
+/// Initialize the types and constants used in the pass
bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
LLVMContext &Context = R->getEntry()->getContext();
@@ -290,7 +304,27 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
return false;
}
-/// \brief Build up the general order of nodes
+/// Use the exit block to determine the loop if RN is a SubRegion.
+Loop *StructurizeCFG::getAdjustedLoop(RegionNode *RN) {
+ if (RN->isSubRegion()) {
+ Region *SubRegion = RN->getNodeAs<Region>();
+ return LI->getLoopFor(SubRegion->getExit());
+ }
+
+ return LI->getLoopFor(RN->getEntry());
+}
+
+/// Use the exit block to determine the loop depth if RN is a SubRegion.
+unsigned StructurizeCFG::getAdjustedLoopDepth(RegionNode *RN) {
+ if (RN->isSubRegion()) {
+ Region *SubR = RN->getNodeAs<Region>();
+ return LI->getLoopDepth(SubR->getExit());
+ }
+
+ return LI->getLoopDepth(RN->getEntry());
+}
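Aside: both helpers above apply the same adjustment: a sub-region node is classified by the loop of its exit block rather than its entry, since the entry may still sit inside an inner loop that the region wholly contains. A compact model (invented types, with LoopForBlock standing in for the LoopInfo queries):

    #include <functional>

    struct Node {
      bool IsSubRegion;
      int Entry;
      int Exit; // only meaningful for sub-regions
    };

    static int adjustedLoopFor(const Node &N,
                               const std::function<int(int)> &LoopForBlock) {
      return LoopForBlock(N.IsSubRegion ? N.Exit : N.Entry);
    }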
+
+/// Build up the general order of nodes
void StructurizeCFG::orderNodes() {
ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
@@ -299,16 +333,15 @@ void StructurizeCFG::orderNodes() {
// to what we want. The only problem with it is that sometimes backedges
// for outer loops will be visited before backedges for inner loops.
for (RegionNode *RN : RPOT) {
- BasicBlock *BB = RN->getEntry();
- Loop *Loop = LI->getLoopFor(BB);
+ Loop *Loop = getAdjustedLoop(RN);
++LoopBlocks[Loop];
}
unsigned CurrentLoopDepth = 0;
Loop *CurrentLoop = nullptr;
for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
- BasicBlock *BB = (*I)->getEntry();
- unsigned LoopDepth = LI->getLoopDepth(BB);
+ RegionNode *RN = cast<RegionNode>(*I);
+ unsigned LoopDepth = getAdjustedLoopDepth(RN);
if (is_contained(Order, *I))
continue;
@@ -320,15 +353,14 @@ void StructurizeCFG::orderNodes() {
auto LoopI = I;
while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
LoopI++;
- BasicBlock *LoopBB = (*LoopI)->getEntry();
- if (LI->getLoopFor(LoopBB) == CurrentLoop) {
+ if (getAdjustedLoop(cast<RegionNode>(*LoopI)) == CurrentLoop) {
--BlockCount;
Order.push_back(*LoopI);
}
}
}
- CurrentLoop = LI->getLoopFor(BB);
+ CurrentLoop = getAdjustedLoop(RN);
if (CurrentLoop)
LoopBlocks[CurrentLoop]--;
@@ -343,7 +375,7 @@ void StructurizeCFG::orderNodes() {
std::reverse(Order.begin(), Order.end());
}
-/// \brief Determine the end of the loops
+/// Determine the end of the loops
void StructurizeCFG::analyzeLoops(RegionNode *N) {
if (N->isSubRegion()) {
// Test for exit as back edge
@@ -362,15 +394,16 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
}
}
-/// \brief Invert the given condition
+/// Invert the given condition
Value *StructurizeCFG::invert(Value *Condition) {
// First: Check if it's a constant
if (Constant *C = dyn_cast<Constant>(Condition))
return ConstantExpr::getNot(C);
// Second: If the condition is already inverted, return the original value
- if (match(Condition, m_Not(m_Value(Condition))))
- return Condition;
+ Value *NotCondition;
+ if (match(Condition, m_Not(m_Value(NotCondition))))
+ return NotCondition;
if (Instruction *Inst = dyn_cast<Instruction>(Condition)) {
// Third: Check all the users for an invert
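Aside: the NotCondition hunk fixes a subtle clobber. Matchers in the m_Not style write their capture through the reference they are handed before the whole pattern is known to match, so binding the capture to the very value being matched can corrupt it when the match fails partway. A toy reproduction of the failure mode (this models the behavior; it is not LLVM's PatternMatch API):

    #include <cassert>

    struct Expr {
      enum Kind { Xor, Other } Op = Other;
      Expr *LHS = nullptr;
      bool RHSAllOnes = false;
    };

    // Matches `x ^ -1`, storing x through Out. The capture is written before
    // the all-ones check, as combinator-style matchers do.
    static bool matchNot(Expr *E, Expr *&Out) {
      if (E->Op != Expr::Xor)
        return false;
      Out = E->LHS;         // capture written eagerly
      return E->RHSAllOnes; // pattern may still fail afterwards
    }

    int main() {
      Expr X;
      Expr NotQuite{Expr::Xor, &X, /*RHSAllOnes=*/false}; // say, x ^ 5
      Expr *Cond = &NotQuite;
      matchNot(Cond, Cond);   // failed match, yet Cond now points at X
      assert(Cond == &X);     // the original condition has been lost
      Cond = &NotQuite;
      Expr *Fresh = nullptr;  // a fresh binding leaves Cond intact
      if (!matchNot(Cond, Fresh))
        assert(Cond == &NotQuite);
    }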
@@ -394,7 +427,7 @@ Value *StructurizeCFG::invert(Value *Condition) {
llvm_unreachable("Unhandled condition to invert");
}
-/// \brief Build the condition for one edge
+/// Build the condition for one edge
Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
bool Invert) {
Value *Cond = Invert ? BoolFalse : BoolTrue;
@@ -407,7 +440,7 @@ Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
return Cond;
}
-/// \brief Analyze the predecessors of each block and build up predicates
+/// Analyze the predecessors of each block and build up predicates
void StructurizeCFG::gatherPredicates(RegionNode *N) {
RegionInfo *RI = ParentRegion->getRegionInfo();
BasicBlock *BB = N->getEntry();
@@ -465,7 +498,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
}
}
-/// \brief Collect various loop and predicate infos
+/// Collect various loop and predicate infos
void StructurizeCFG::collectInfos() {
// Reset predicate
Predicates.clear();
@@ -478,10 +511,10 @@ void StructurizeCFG::collectInfos() {
Visited.clear();
for (RegionNode *RN : reverse(Order)) {
- DEBUG(dbgs() << "Visiting: "
- << (RN->isSubRegion() ? "SubRegion with entry: " : "")
- << RN->getEntry()->getName() << " Loop Depth: "
- << LI->getLoopDepth(RN->getEntry()) << "\n");
+ LLVM_DEBUG(dbgs() << "Visiting: "
+ << (RN->isSubRegion() ? "SubRegion with entry: " : "")
+ << RN->getEntry()->getName() << " Loop Depth: "
+ << LI->getLoopDepth(RN->getEntry()) << "\n");
// Analyze all the conditions leading to a node
gatherPredicates(RN);
@@ -494,7 +527,7 @@ void StructurizeCFG::collectInfos() {
}
}
-/// \brief Insert the missing branch conditions
+/// Insert the missing branch conditions
void StructurizeCFG::insertConditions(bool Loops) {
BranchVector &Conds = Loops ? LoopConds : Conditions;
Value *Default = Loops ? BoolTrue : BoolFalse;
@@ -540,7 +573,7 @@ void StructurizeCFG::insertConditions(bool Loops) {
}
}
-/// \brief Remove all PHI values coming from "From" into "To" and remember
+/// Remove all PHI values coming from "From" into "To" and remember
/// them in DeletedPhis
void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
PhiMap &Map = DeletedPhis[To];
@@ -552,7 +585,7 @@ void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
}
}
-/// \brief Add a dummy PHI value as soon as we knew the new predecessor
+/// Add a dummy PHI value as soon as we know the new predecessor
void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
for (PHINode &Phi : To->phis()) {
Value *Undef = UndefValue::get(Phi.getType());
@@ -561,7 +594,7 @@ void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
AddedPhis[To].push_back(From);
}
-/// \brief Add the real PHI value as soon as everything is set up
+/// Add the real PHI value as soon as everything is set up
void StructurizeCFG::setPhiValues() {
SSAUpdater Updater;
for (const auto &AddedPhi : AddedPhis) {
@@ -601,7 +634,7 @@ void StructurizeCFG::setPhiValues() {
assert(DeletedPhis.empty());
}
-/// \brief Remove phi values from all successors and then remove the terminator.
+/// Remove phi values from all successors and then remove the terminator.
void StructurizeCFG::killTerminator(BasicBlock *BB) {
TerminatorInst *Term = BB->getTerminator();
if (!Term)
@@ -611,10 +644,12 @@ void StructurizeCFG::killTerminator(BasicBlock *BB) {
SI != SE; ++SI)
delPhiValues(BB, *SI);
+ if (DA)
+ DA->removeValue(Term);
Term->eraseFromParent();
}
-/// \brief Let node exit(s) point to NewExit
+/// Let node exit(s) point to NewExit
void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
bool IncludeDominator) {
if (Node->isSubRegion()) {
@@ -660,7 +695,7 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
}
}
-/// \brief Create a new flow node and update dominator tree and region info
+/// Create a new flow node and update dominator tree and region info
BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
LLVMContext &Context = Func->getContext();
BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
@@ -672,7 +707,7 @@ BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
return Flow;
}
-/// \brief Create a new or reuse the previous node as flow node
+/// Create a new or reuse the previous node as flow node
BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
BasicBlock *Entry = PrevNode->getEntry();
@@ -691,7 +726,7 @@ BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
return Flow;
}
-/// \brief Returns the region exit if possible, otherwise just a new flow node
+/// Returns the region exit if possible, otherwise just a new flow node
BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
bool ExitUseAllowed) {
if (!Order.empty() || !ExitUseAllowed)
@@ -703,13 +738,13 @@ BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
return Exit;
}
-/// \brief Set the previous node
+/// Set the previous node
void StructurizeCFG::setPrevNode(BasicBlock *BB) {
PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB)
: nullptr;
}
-/// \brief Does BB dominate all the predicates of Node?
+/// Does BB dominate all the predicates of Node?
bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
BBPredicates &Preds = Predicates[Node->getEntry()];
return llvm::all_of(Preds, [&](std::pair<BasicBlock *, Value *> Pred) {
@@ -717,7 +752,7 @@ bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
});
}
-/// \brief Can we predict that this node will always be called?
+/// Can we predict that this node will always be called?
bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
BBPredicates &Preds = Predicates[Node->getEntry()];
bool Dominated = false;
@@ -845,7 +880,7 @@ void StructurizeCFG::createFlow() {
}
/// Handle a rare case where the disintegrated nodes' instructions
-/// no longer dominate all their uses. Not sure if this is really nessasary
+/// no longer dominate all their uses. Not sure if this is really necessary
void StructurizeCFG::rebuildSSA() {
SSAUpdater Updater;
for (BasicBlock *BB : ParentRegion->blocks())
@@ -878,30 +913,60 @@ void StructurizeCFG::rebuildSSA() {
}
}
-static bool hasOnlyUniformBranches(const Region *R,
+static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID,
const DivergenceAnalysis &DA) {
- for (const BasicBlock *BB : R->blocks()) {
- const BranchInst *Br = dyn_cast<BranchInst>(BB->getTerminator());
- if (!Br || !Br->isConditional())
- continue;
+ for (auto E : R->elements()) {
+ if (!E->isSubRegion()) {
+ auto Br = dyn_cast<BranchInst>(E->getEntry()->getTerminator());
+ if (!Br || !Br->isConditional())
+ continue;
- if (!DA.isUniform(Br->getCondition()))
- return false;
- DEBUG(dbgs() << "BB: " << BB->getName() << " has uniform terminator\n");
+ if (!DA.isUniform(Br))
+ return false;
+ LLVM_DEBUG(dbgs() << "BB: " << Br->getParent()->getName()
+ << " has uniform terminator\n");
+ } else {
+ // Explicitly refuse to treat regions as uniform if they have non-uniform
+ // subregions. We cannot rely on DivergenceAnalysis for branches in
+ // subregions because those branches may have been removed and re-created,
+ // so we look for our metadata instead.
+ //
+ // Warning: It would be nice to treat regions as uniform based only on
+ // their direct child basic blocks' terminators, regardless of whether
+ // subregions are uniform or not. However, this requires a very careful
+ // look at SIAnnotateControlFlow to make sure nothing breaks there.
+ for (auto BB : E->getNodeAs<Region>()->blocks()) {
+ auto Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br || !Br->isConditional())
+ continue;
+
+ if (!Br->getMetadata(UniformMDKindID))
+ return false;
+ }
+ }
}
return true;
}
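Aside: the reworked test is two-tier: branches in direct child blocks ask the divergence analysis, while branches inside sub-regions must carry the marker metadata left by an earlier run, because structurization may have rebuilt them since the analysis was computed. A compact model under those assumptions:

    #include <vector>

    struct Branch { bool Conditional; bool UniformPerDA; bool HasUniformMark; };
    struct Element { bool IsSubRegion; std::vector<Branch> Branches; };

    static bool onlyUniformBranches(const std::vector<Element> &Elements) {
      for (const Element &E : Elements)
        for (const Branch &B : E.Branches) {
          if (!B.Conditional)
            continue;
          if (E.IsSubRegion ? !B.HasUniformMark : !B.UniformPerDA)
            return false;
        }
      return true;
    }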
-/// \brief Run the transformation for each region found
+/// Run the transformation for each region found
bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
if (R->isTopLevelRegion())
return false;
+ DA = nullptr;
+
if (SkipUniformRegions) {
// TODO: We could probably be smarter here with how we handle sub-regions.
- auto &DA = getAnalysis<DivergenceAnalysis>();
- if (hasOnlyUniformBranches(R, DA)) {
- DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R << '\n');
+ // We currently rely on the fact that metadata is set by earlier invocations
+ // of the pass on sub-regions, and that this metadata doesn't get lost --
+ // but we shouldn't rely on metadata for correctness!
+ unsigned UniformMDKindID =
+ R->getEntry()->getContext().getMDKindID("structurizecfg.uniform");
+ DA = &getAnalysis<DivergenceAnalysis>();
+
+ if (hasOnlyUniformBranches(R, UniformMDKindID, *DA)) {
+ LLVM_DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R
+ << '\n');
// Mark all direct child block terminators as having been treated as
// uniform. To account for a possible future in which non-uniform
@@ -913,7 +978,7 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
continue;
if (Instruction *Term = E->getEntry()->getTerminator())
- Term->setMetadata("structurizecfg.uniform", MD);
+ Term->setMetadata(UniformMDKindID, MD);
}
return false;
diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 2a1106b41de2..f8cd6c17a5a6 100644
--- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -87,7 +87,7 @@ STATISTIC(NumEliminated, "Number of tail calls removed");
STATISTIC(NumRetDuped, "Number of return duplicated");
STATISTIC(NumAccumAdded, "Number of accumulators introduced");
-/// \brief Scan the specified function for alloca instructions.
+/// Scan the specified function for alloca instructions.
/// If it contains any dynamic allocas, returns false.
static bool canTRE(Function &F) {
// Because of PR962, we don't TRE dynamic allocas.
@@ -302,7 +302,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls,
if (Visited[CI->getParent()] != ESCAPED) {
// If the escape point was part way through the block, calls after the
// escape point wouldn't have been put into DeferredTails.
- DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
+ LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
CI->setTailCall();
Modified = true;
} else {
@@ -699,8 +699,8 @@ static bool foldReturnAndProcessPred(
BranchInst *BI = UncondBranchPreds.pop_back_val();
BasicBlock *Pred = BI->getParent();
if (CallInst *CI = findTRECandidate(BI, CannotTailCallElimCallsMarkedTail, TTI)){
- DEBUG(dbgs() << "FOLDING: " << *BB
- << "INTO UNCOND BRANCH PRED: " << *Pred);
+ LLVM_DEBUG(dbgs() << "FOLDING: " << *BB
+ << "INTO UNCOND BRANCH PRED: " << *Pred);
ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
// Cleanup: if all predecessors of BB have been eliminated by