path: root/lib/Transforms/Scalar
author    Dimitry Andric <dim@FreeBSD.org>  2019-10-23 17:51:42 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2019-10-23 17:51:42 +0000
commit    1d5ae1026e831016fc29fd927877c86af904481f (patch)
tree      2cdfd12620fcfa5d9e4a0389f85368e8e36f63f9 /lib/Transforms/Scalar
parent    e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (diff)
Diffstat (limited to 'lib/Transforms/Scalar')
-rw-r--r--  lib/Transforms/Scalar/AlignmentFromAssumptions.cpp | 8
-rw-r--r--  lib/Transforms/Scalar/CallSiteSplitting.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/ConstantHoisting.cpp | 24
-rw-r--r--  lib/Transforms/Scalar/ConstantProp.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/CorrelatedValuePropagation.cpp | 180
-rw-r--r--  lib/Transforms/Scalar/DCE.cpp | 31
-rw-r--r--  lib/Transforms/Scalar/DeadStoreElimination.cpp | 7
-rw-r--r--  lib/Transforms/Scalar/DivRemPairs.cpp | 219
-rw-r--r--  lib/Transforms/Scalar/EarlyCSE.cpp | 22
-rw-r--r--  lib/Transforms/Scalar/FlattenCFGPass.cpp | 24
-rw-r--r--  lib/Transforms/Scalar/Float2Int.cpp | 47
-rw-r--r--  lib/Transforms/Scalar/GVN.cpp | 201
-rw-r--r--  lib/Transforms/Scalar/GVNHoist.cpp | 17
-rw-r--r--  lib/Transforms/Scalar/GuardWidening.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/IndVarSimplify.cpp | 389
-rw-r--r--  lib/Transforms/Scalar/InferAddressSpaces.cpp | 38
-rw-r--r--  lib/Transforms/Scalar/InstSimplifyPass.cpp | 48
-rw-r--r--  lib/Transforms/Scalar/JumpThreading.cpp | 18
-rw-r--r--  lib/Transforms/Scalar/LICM.cpp | 55
-rw-r--r--  lib/Transforms/Scalar/LoopDataPrefetch.cpp | 4
-rw-r--r--  lib/Transforms/Scalar/LoopDeletion.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/LoopFuse.cpp | 640
-rw-r--r--  lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 890
-rw-r--r--  lib/Transforms/Scalar/LoopInstSimplify.cpp | 5
-rw-r--r--  lib/Transforms/Scalar/LoopInterchange.cpp | 62
-rw-r--r--  lib/Transforms/Scalar/LoopLoadElimination.cpp | 3
-rw-r--r--  lib/Transforms/Scalar/LoopPredication.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/LoopRerollPass.cpp | 3
-rw-r--r--  lib/Transforms/Scalar/LoopRotation.cpp | 10
-rw-r--r--  lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 4
-rw-r--r--  lib/Transforms/Scalar/LoopSink.cpp | 9
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp | 20
-rw-r--r--  lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp | 8
-rw-r--r--  lib/Transforms/Scalar/LoopUnrollPass.cpp | 128
-rw-r--r--  lib/Transforms/Scalar/LoopUnswitch.cpp | 87
-rw-r--r--  lib/Transforms/Scalar/LoopVersioningLICM.cpp | 31
-rw-r--r--  lib/Transforms/Scalar/LowerConstantIntrinsics.cpp | 170
-rw-r--r--  lib/Transforms/Scalar/LowerExpectIntrinsic.cpp | 33
-rw-r--r--  lib/Transforms/Scalar/MemCpyOptimizer.cpp | 110
-rw-r--r--  lib/Transforms/Scalar/MergeICmps.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/MergedLoadStoreMotion.cpp | 167
-rw-r--r--  lib/Transforms/Scalar/NaryReassociate.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/NewGVN.cpp | 25
-rw-r--r--  lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/PlaceSafepoints.cpp | 6
-rw-r--r--  lib/Transforms/Scalar/Reassociate.cpp | 190
-rw-r--r--  lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 6
-rw-r--r--  lib/Transforms/Scalar/SCCP.cpp | 75
-rw-r--r--  lib/Transforms/Scalar/SROA.cpp | 40
-rw-r--r--  lib/Transforms/Scalar/Scalar.cpp | 9
-rw-r--r--  lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 25
-rw-r--r--  lib/Transforms/Scalar/SpeculateAroundPHIs.cpp | 6
-rw-r--r--  lib/Transforms/Scalar/StructurizeCFG.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/TailRecursionElimination.cpp | 2
55 files changed, 3197 insertions(+), 919 deletions(-)
diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index de9a62e88c27..0e9f03a06061 100644
--- a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -93,9 +93,7 @@ static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
const SCEV *AlignSCEV,
ScalarEvolution *SE) {
// DiffUnits = Diff % int64_t(Alignment)
- const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV);
- const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV);
- const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV);
+ const SCEV *DiffUnitsSCEV = SE->getURemExpr(DiffSCEV, AlignSCEV);
LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is "
<< *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
@@ -323,7 +321,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) {
LI->getPointerOperand(), SE);
if (NewAlignment > LI->getAlignment()) {
- LI->setAlignment(NewAlignment);
+ LI->setAlignment(MaybeAlign(NewAlignment));
++NumLoadAlignChanged;
}
} else if (StoreInst *SI = dyn_cast<StoreInst>(J)) {
@@ -331,7 +329,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) {
SI->getPointerOperand(), SE);
if (NewAlignment > SI->getAlignment()) {
- SI->setAlignment(NewAlignment);
+ SI->setAlignment(MaybeAlign(NewAlignment));
++NumStoreAlignChanged;
}
} else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) {
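[Illustration only, not part of this commit: several hunks in this import wrap raw alignment values in MaybeAlign when calling setAlignment() or the LoadInst/StoreInst constructors. The sketch below shows the convention this wrapper is assumed to follow in this LLVM version (llvm/Support/Alignment.h: MaybeAlign is an Optional<Align>, and a value of 0 means "alignment unspecified").]

  #include "llvm/Support/Alignment.h"

  void maybeAlignSketch() {
    llvm::MaybeAlign Unknown(0);   // 0 converts to "no alignment known" (empty optional)
    llvm::MaybeAlign Sixteen(16);  // any non-zero value must be a power of two
    // setAlignment(MaybeAlign(LI->getAlignment())) therefore keeps the old
    // "0 == unknown" convention while moving to the typed wrapper.
    (void)Unknown; (void)Sixteen;
  }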
diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp
index 3519b000a33f..c3fba923104f 100644
--- a/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -562,7 +562,7 @@ struct CallSiteSplittingLegacyPass : public FunctionPass {
if (skipFunction(F))
return false;
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
return doCallSiteSplitting(F, TLI, TTI, DT);
diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp
index 98243a23f1ef..9f340afbf7c2 100644
--- a/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -204,7 +204,7 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
/// set found in \p BBs.
static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
BasicBlock *Entry,
- SmallPtrSet<BasicBlock *, 8> &BBs) {
+ SetVector<BasicBlock *> &BBs) {
assert(!BBs.count(Entry) && "Assume Entry is not in BBs");
// Nodes on the current path to the root.
SmallPtrSet<BasicBlock *, 8> Path;
@@ -257,7 +257,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
// Visit Orders in bottom-up order.
using InsertPtsCostPair =
- std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency>;
+ std::pair<SetVector<BasicBlock *>, BlockFrequency>;
// InsertPtsMap is a map from a BB to the best insertion points for the
// subtree of BB (subtree not including the BB itself).
@@ -266,7 +266,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) {
BasicBlock *Node = *RIt;
bool NodeInBBs = BBs.count(Node);
- SmallPtrSet<BasicBlock *, 16> &InsertPts = InsertPtsMap[Node].first;
+ auto &InsertPts = InsertPtsMap[Node].first;
BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second;
// Return the optimal insert points in BBs.
@@ -283,7 +283,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock();
// Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child
// will update its parent's ParentInsertPts and ParentPtsFreq.
- SmallPtrSet<BasicBlock *, 16> &ParentInsertPts = InsertPtsMap[Parent].first;
+ auto &ParentInsertPts = InsertPtsMap[Parent].first;
BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second;
// Choose to insert in Node or in subtree of Node.
// Don't hoist to EHPad because we may not find a proper place to insert
@@ -305,12 +305,12 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
}
/// Find an insertion point that dominates all uses.
-SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
+SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint(
const ConstantInfo &ConstInfo) const {
assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
// Collect all basic blocks.
- SmallPtrSet<BasicBlock *, 8> BBs;
- SmallPtrSet<Instruction *, 8> InsertPts;
+ SetVector<BasicBlock *> BBs;
+ SetVector<Instruction *> InsertPts;
for (auto const &RCI : ConstInfo.RebasedConstants)
for (auto const &U : RCI.Uses)
BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent());
@@ -333,15 +333,13 @@ SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
while (BBs.size() >= 2) {
BasicBlock *BB, *BB1, *BB2;
- BB1 = *BBs.begin();
- BB2 = *std::next(BBs.begin());
+ BB1 = BBs.pop_back_val();
+ BB2 = BBs.pop_back_val();
BB = DT->findNearestCommonDominator(BB1, BB2);
if (BB == Entry) {
InsertPts.insert(&Entry->front());
return InsertPts;
}
- BBs.erase(BB1);
- BBs.erase(BB2);
BBs.insert(BB);
}
assert((BBs.size() == 1) && "Expected only one element.");
@@ -403,7 +401,7 @@ void ConstantHoistingPass::collectConstantCandidates(
return;
// Get offset from the base GV.
- PointerType *GVPtrTy = dyn_cast<PointerType>(BaseGV->getType());
+ PointerType *GVPtrTy = cast<PointerType>(BaseGV->getType());
IntegerType *PtrIntTy = DL->getIntPtrType(*Ctx, GVPtrTy->getAddressSpace());
APInt Offset(DL->getTypeSizeInBits(PtrIntTy), /*val*/0, /*isSigned*/true);
auto *GEPO = cast<GEPOperator>(ConstExpr);
@@ -830,7 +828,7 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) {
SmallVectorImpl<consthoist::ConstantInfo> &ConstInfoVec =
BaseGV ? ConstGEPInfoMap[BaseGV] : ConstIntInfoVec;
for (auto const &ConstInfo : ConstInfoVec) {
- SmallPtrSet<Instruction *, 8> IPSet = findConstantInsertionPoint(ConstInfo);
+ SetVector<Instruction *> IPSet = findConstantInsertionPoint(ConstInfo);
// We can have an empty set if the function contains unreachable blocks.
if (IPSet.empty())
continue;
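[Illustration only, not part of this commit: the ConstantHoisting hunks above replace SmallPtrSet with SetVector for the insertion-point sets, which keeps set semantics but iterates in insertion order (presumably so results do not depend on pointer values) and supports pop_back_val(), which the rewritten loop uses. A minimal sketch assuming the llvm/ADT/SetVector.h API:]

  #include "llvm/ADT/SetVector.h"

  void setVectorSketch(llvm::SetVector<int *> &BBs, int *A, int *B) {
    BBs.insert(A);                    // deduplicates like a set
    BBs.insert(B);
    BBs.insert(A);                    // no-op: already present
    while (BBs.size() >= 2) {
      int *X = BBs.pop_back_val();    // removes and returns the last-inserted element
      int *Y = BBs.pop_back_val();
      (void)X; (void)Y;               // a real caller would combine X and Y here
    }
  }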
diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp
index 770321c740a0..e9e6afe3fdd4 100644
--- a/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/lib/Transforms/Scalar/ConstantProp.cpp
@@ -82,7 +82,7 @@ bool ConstantPropagation::runOnFunction(Function &F) {
bool Changed = false;
const DataLayout &DL = F.getParent()->getDataLayout();
TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
while (!WorkList.empty()) {
SmallVector<Instruction*, 16> NewWorkListVec;
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 89497177524f..2ef85268df48 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -62,6 +62,23 @@ STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
STATISTIC(NumUDivs, "Number of udivs whose width was decreased");
STATISTIC(NumAShrs, "Number of ashr converted to lshr");
STATISTIC(NumSRems, "Number of srem converted to urem");
+STATISTIC(NumSExt, "Number of sext converted to zext");
+STATISTIC(NumAnd, "Number of ands removed");
+STATISTIC(NumNW, "Number of no-wrap deductions");
+STATISTIC(NumNSW, "Number of no-signed-wrap deductions");
+STATISTIC(NumNUW, "Number of no-unsigned-wrap deductions");
+STATISTIC(NumAddNW, "Number of no-wrap deductions for add");
+STATISTIC(NumAddNSW, "Number of no-signed-wrap deductions for add");
+STATISTIC(NumAddNUW, "Number of no-unsigned-wrap deductions for add");
+STATISTIC(NumSubNW, "Number of no-wrap deductions for sub");
+STATISTIC(NumSubNSW, "Number of no-signed-wrap deductions for sub");
+STATISTIC(NumSubNUW, "Number of no-unsigned-wrap deductions for sub");
+STATISTIC(NumMulNW, "Number of no-wrap deductions for mul");
+STATISTIC(NumMulNSW, "Number of no-signed-wrap deductions for mul");
+STATISTIC(NumMulNUW, "Number of no-unsigned-wrap deductions for mul");
+STATISTIC(NumShlNW, "Number of no-wrap deductions for shl");
+STATISTIC(NumShlNSW, "Number of no-signed-wrap deductions for shl");
+STATISTIC(NumShlNUW, "Number of no-unsigned-wrap deductions for shl");
STATISTIC(NumOverflows, "Number of overflow checks removed");
STATISTIC(NumSaturating,
"Number of saturating arithmetics converted to normal arithmetics");
@@ -85,6 +102,7 @@ namespace {
AU.addRequired<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LazyValueInfoWrapperPass>();
}
};
@@ -416,37 +434,96 @@ static bool willNotOverflow(BinaryOpIntrinsic *BO, LazyValueInfo *LVI) {
return NWRegion.contains(LRange);
}
-static void processOverflowIntrinsic(WithOverflowInst *WO) {
- IRBuilder<> B(WO);
- Value *NewOp = B.CreateBinOp(
- WO->getBinaryOp(), WO->getLHS(), WO->getRHS(), WO->getName());
- // Constant-folding could have happened.
- if (auto *Inst = dyn_cast<Instruction>(NewOp)) {
- if (WO->isSigned())
+static void setDeducedOverflowingFlags(Value *V, Instruction::BinaryOps Opcode,
+ bool NewNSW, bool NewNUW) {
+ Statistic *OpcNW, *OpcNSW, *OpcNUW;
+ switch (Opcode) {
+ case Instruction::Add:
+ OpcNW = &NumAddNW;
+ OpcNSW = &NumAddNSW;
+ OpcNUW = &NumAddNUW;
+ break;
+ case Instruction::Sub:
+ OpcNW = &NumSubNW;
+ OpcNSW = &NumSubNSW;
+ OpcNUW = &NumSubNUW;
+ break;
+ case Instruction::Mul:
+ OpcNW = &NumMulNW;
+ OpcNSW = &NumMulNSW;
+ OpcNUW = &NumMulNUW;
+ break;
+ case Instruction::Shl:
+ OpcNW = &NumShlNW;
+ OpcNSW = &NumShlNSW;
+ OpcNUW = &NumShlNUW;
+ break;
+ default:
+ llvm_unreachable("Will not be called with other binops");
+ }
+
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (NewNSW) {
+ ++NumNW;
+ ++*OpcNW;
+ ++NumNSW;
+ ++*OpcNSW;
+ if (Inst)
Inst->setHasNoSignedWrap();
- else
+ }
+ if (NewNUW) {
+ ++NumNW;
+ ++*OpcNW;
+ ++NumNUW;
+ ++*OpcNUW;
+ if (Inst)
Inst->setHasNoUnsignedWrap();
}
+}
- Value *NewI = B.CreateInsertValue(UndefValue::get(WO->getType()), NewOp, 0);
- NewI = B.CreateInsertValue(NewI, ConstantInt::getFalse(WO->getContext()), 1);
+static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI);
+
+// Rewrite this with.overflow intrinsic as non-overflowing.
+static void processOverflowIntrinsic(WithOverflowInst *WO, LazyValueInfo *LVI) {
+ IRBuilder<> B(WO);
+ Instruction::BinaryOps Opcode = WO->getBinaryOp();
+ bool NSW = WO->isSigned();
+ bool NUW = !WO->isSigned();
+
+ Value *NewOp =
+ B.CreateBinOp(Opcode, WO->getLHS(), WO->getRHS(), WO->getName());
+ setDeducedOverflowingFlags(NewOp, Opcode, NSW, NUW);
+
+ StructType *ST = cast<StructType>(WO->getType());
+ Constant *Struct = ConstantStruct::get(ST,
+ { UndefValue::get(ST->getElementType(0)),
+ ConstantInt::getFalse(ST->getElementType(1)) });
+ Value *NewI = B.CreateInsertValue(Struct, NewOp, 0);
WO->replaceAllUsesWith(NewI);
WO->eraseFromParent();
++NumOverflows;
+
+ // See if we can infer the other no-wrap too.
+ if (auto *BO = dyn_cast<BinaryOperator>(NewOp))
+ processBinOp(BO, LVI);
}
-static void processSaturatingInst(SaturatingInst *SI) {
+static void processSaturatingInst(SaturatingInst *SI, LazyValueInfo *LVI) {
+ Instruction::BinaryOps Opcode = SI->getBinaryOp();
+ bool NSW = SI->isSigned();
+ bool NUW = !SI->isSigned();
BinaryOperator *BinOp = BinaryOperator::Create(
- SI->getBinaryOp(), SI->getLHS(), SI->getRHS(), SI->getName(), SI);
+ Opcode, SI->getLHS(), SI->getRHS(), SI->getName(), SI);
BinOp->setDebugLoc(SI->getDebugLoc());
- if (SI->isSigned())
- BinOp->setHasNoSignedWrap();
- else
- BinOp->setHasNoUnsignedWrap();
+ setDeducedOverflowingFlags(BinOp, Opcode, NSW, NUW);
SI->replaceAllUsesWith(BinOp);
SI->eraseFromParent();
++NumSaturating;
+
+ // See if we can infer the other no-wrap too.
+ if (auto *BO = dyn_cast<BinaryOperator>(BinOp))
+ processBinOp(BO, LVI);
}
/// Infer nonnull attributes for the arguments at the specified callsite.
@@ -456,14 +533,14 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
if (auto *WO = dyn_cast<WithOverflowInst>(CS.getInstruction())) {
if (WO->getLHS()->getType()->isIntegerTy() && willNotOverflow(WO, LVI)) {
- processOverflowIntrinsic(WO);
+ processOverflowIntrinsic(WO, LVI);
return true;
}
}
if (auto *SI = dyn_cast<SaturatingInst>(CS.getInstruction())) {
if (SI->getType()->isIntegerTy() && willNotOverflow(SI, LVI)) {
- processSaturatingInst(SI);
+ processSaturatingInst(SI, LVI);
return true;
}
}
@@ -632,6 +709,27 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
return true;
}
+static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) {
+ if (SDI->getType()->isVectorTy())
+ return false;
+
+ Value *Base = SDI->getOperand(0);
+
+ Constant *Zero = ConstantInt::get(Base->getType(), 0);
+ if (LVI->getPredicateAt(ICmpInst::ICMP_SGE, Base, Zero, SDI) !=
+ LazyValueInfo::True)
+ return false;
+
+ ++NumSExt;
+ auto *ZExt =
+ CastInst::CreateZExtOrBitCast(Base, SDI->getType(), SDI->getName(), SDI);
+ ZExt->setDebugLoc(SDI->getDebugLoc());
+ SDI->replaceAllUsesWith(ZExt);
+ SDI->eraseFromParent();
+
+ return true;
+}
+
static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) {
using OBO = OverflowingBinaryOperator;
@@ -648,6 +746,7 @@ static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) {
BasicBlock *BB = BinOp->getParent();
+ Instruction::BinaryOps Opcode = BinOp->getOpcode();
Value *LHS = BinOp->getOperand(0);
Value *RHS = BinOp->getOperand(1);
@@ -655,24 +754,48 @@ static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) {
ConstantRange RRange = LVI->getConstantRange(RHS, BB, BinOp);
bool Changed = false;
+ bool NewNUW = false, NewNSW = false;
if (!NUW) {
ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion(
- BinOp->getOpcode(), RRange, OBO::NoUnsignedWrap);
- bool NewNUW = NUWRange.contains(LRange);
- BinOp->setHasNoUnsignedWrap(NewNUW);
+ Opcode, RRange, OBO::NoUnsignedWrap);
+ NewNUW = NUWRange.contains(LRange);
Changed |= NewNUW;
}
if (!NSW) {
ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion(
- BinOp->getOpcode(), RRange, OBO::NoSignedWrap);
- bool NewNSW = NSWRange.contains(LRange);
- BinOp->setHasNoSignedWrap(NewNSW);
+ Opcode, RRange, OBO::NoSignedWrap);
+ NewNSW = NSWRange.contains(LRange);
Changed |= NewNSW;
}
+ setDeducedOverflowingFlags(BinOp, Opcode, NewNSW, NewNUW);
+
return Changed;
}
+static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) {
+ if (BinOp->getType()->isVectorTy())
+ return false;
+
+ // Pattern match (and lhs, C) where C includes a superset of bits which might
+ // be set in lhs. This is a common truncation idiom created by instcombine.
+ BasicBlock *BB = BinOp->getParent();
+ Value *LHS = BinOp->getOperand(0);
+ ConstantInt *RHS = dyn_cast<ConstantInt>(BinOp->getOperand(1));
+ if (!RHS || !RHS->getValue().isMask())
+ return false;
+
+ ConstantRange LRange = LVI->getConstantRange(LHS, BB, BinOp);
+ if (!LRange.getUnsignedMax().ule(RHS->getValue()))
+ return false;
+
+ BinOp->replaceAllUsesWith(LHS);
+ BinOp->eraseFromParent();
+ NumAnd++;
+ return true;
+}
+
+
static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
if (Constant *C = LVI->getConstant(V, At->getParent(), At))
return C;
@@ -740,10 +863,18 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
case Instruction::AShr:
BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
break;
+ case Instruction::SExt:
+ BBChanged |= processSExt(cast<SExtInst>(II), LVI);
+ break;
case Instruction::Add:
case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::Shl:
BBChanged |= processBinOp(cast<BinaryOperator>(II), LVI);
break;
+ case Instruction::And:
+ BBChanged |= processAnd(cast<BinaryOperator>(II), LVI);
+ break;
}
}
@@ -796,5 +927,6 @@ CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LazyValueAnalysis>();
return PA;
}
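[Illustration only, not part of this commit: two of the new CorrelatedValuePropagation transforms above rest on simple arithmetic facts. processSExt turns a sign extend into a zero extend when LVI proves the operand is non-negative, and processAnd drops a mask that already covers every bit the value can carry. A standalone C++ check of both facts:]

  #include <cassert>
  #include <cstdint>

  int main() {
    // sext -> zext: for a non-negative value both extensions agree bit-for-bit.
    int32_t X = 1234;                       // stand-in for "LVI proved X >= 0"
    assert((int64_t)X == (int64_t)(uint64_t)(uint32_t)X);

    // and-removal: masking with a superset of the possible bits is a no-op.
    uint32_t Y = (uint32_t)X & 0xFFu;       // stand-in for "range fits inside the mask"
    assert((Y & 0xFFu) == Y);
    return 0;
  }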
diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp
index 479e0ed74074..a79d775aa7f3 100644
--- a/lib/Transforms/Scalar/DCE.cpp
+++ b/lib/Transforms/Scalar/DCE.cpp
@@ -38,17 +38,19 @@ namespace {
//===--------------------------------------------------------------------===//
// DeadInstElimination pass implementation
//
- struct DeadInstElimination : public BasicBlockPass {
- static char ID; // Pass identification, replacement for typeid
- DeadInstElimination() : BasicBlockPass(ID) {
- initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry());
- }
- bool runOnBasicBlock(BasicBlock &BB) override {
- if (skipBasicBlock(BB))
- return false;
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
- bool Changed = false;
+struct DeadInstElimination : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ DeadInstElimination() : FunctionPass(ID) {
+ initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+
+ bool Changed = false;
+ for (auto &BB : F) {
for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
Instruction *Inst = &*DI++;
if (isInstructionTriviallyDead(Inst, TLI)) {
@@ -60,13 +62,14 @@ namespace {
++DIEEliminated;
}
}
- return Changed;
}
+ return Changed;
+ }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
}
- };
+};
}
char DeadInstElimination::ID = 0;
@@ -154,7 +157,7 @@ struct DCELegacyPass : public FunctionPass {
return false;
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
return eliminateDeadCode(F, TLI);
}
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index a81645745b48..685de82810ed 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1254,8 +1254,9 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
auto *SI = new StoreInst(
ConstantInt::get(Earlier->getValueOperand()->getType(), Merged),
- Earlier->getPointerOperand(), false, Earlier->getAlignment(),
- Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite);
+ Earlier->getPointerOperand(), false,
+ MaybeAlign(Earlier->getAlignment()), Earlier->getOrdering(),
+ Earlier->getSyncScopeID(), DepWrite);
unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa,
LLVMContext::MD_alias_scope,
@@ -1361,7 +1362,7 @@ public:
MemoryDependenceResults *MD =
&getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
return eliminateDeadStores(F, AA, MD, DT, TLI);
}
diff --git a/lib/Transforms/Scalar/DivRemPairs.cpp b/lib/Transforms/Scalar/DivRemPairs.cpp
index 876681b4f9de..934853507478 100644
--- a/lib/Transforms/Scalar/DivRemPairs.cpp
+++ b/lib/Transforms/Scalar/DivRemPairs.cpp
@@ -1,4 +1,4 @@
-//===- DivRemPairs.cpp - Hoist/decompose division and remainder -*- C++ -*-===//
+//===- DivRemPairs.cpp - Hoist/[dr]ecompose division and remainder --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass hoists and/or decomposes integer division and remainder
+// This pass hoists and/or decomposes/recomposes integer division and remainder
// instructions to enable CFG improvements and better codegen.
//
//===----------------------------------------------------------------------===//
@@ -19,37 +19,105 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+
using namespace llvm;
+using namespace llvm::PatternMatch;
#define DEBUG_TYPE "div-rem-pairs"
STATISTIC(NumPairs, "Number of div/rem pairs");
+STATISTIC(NumRecomposed, "Number of instructions recomposed");
STATISTIC(NumHoisted, "Number of instructions hoisted");
STATISTIC(NumDecomposed, "Number of instructions decomposed");
DEBUG_COUNTER(DRPCounter, "div-rem-pairs-transform",
"Controls transformations in div-rem-pairs pass");
-/// Find matching pairs of integer div/rem ops (they have the same numerator,
-/// denominator, and signedness). If they exist in different basic blocks, bring
-/// them together by hoisting or replace the common division operation that is
-/// implicit in the remainder:
-/// X % Y <--> X - ((X / Y) * Y).
-///
-/// We can largely ignore the normal safety and cost constraints on speculation
-/// of these ops when we find a matching pair. This is because we are already
-/// guaranteed that any exceptions and most cost are already incurred by the
-/// first member of the pair.
-///
-/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or
-/// SimplifyCFG, but it's split off on its own because it's different enough
-/// that it doesn't quite match the stated objectives of those passes.
-static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
- const DominatorTree &DT) {
- bool Changed = false;
+namespace {
+struct ExpandedMatch {
+ DivRemMapKey Key;
+ Instruction *Value;
+};
+} // namespace
+
+/// See if we can match: (which is the form we expand into)
+/// X - ((X ?/ Y) * Y)
+/// which is equivalent to:
+/// X ?% Y
+static llvm::Optional<ExpandedMatch> matchExpandedRem(Instruction &I) {
+ Value *Dividend, *XroundedDownToMultipleOfY;
+ if (!match(&I, m_Sub(m_Value(Dividend), m_Value(XroundedDownToMultipleOfY))))
+ return llvm::None;
+
+ Value *Divisor;
+ Instruction *Div;
+ // Look for ((X / Y) * Y)
+ if (!match(
+ XroundedDownToMultipleOfY,
+ m_c_Mul(m_CombineAnd(m_IDiv(m_Specific(Dividend), m_Value(Divisor)),
+ m_Instruction(Div)),
+ m_Deferred(Divisor))))
+ return llvm::None;
+
+ ExpandedMatch M;
+ M.Key.SignedOp = Div->getOpcode() == Instruction::SDiv;
+ M.Key.Dividend = Dividend;
+ M.Key.Divisor = Divisor;
+ M.Value = &I;
+ return M;
+}
+
+/// A thin wrapper to store two values that we matched as div-rem pair.
+/// We want this extra indirection to avoid dealing with RAUW'ing the map keys.
+struct DivRemPairWorklistEntry {
+ /// The actual udiv/sdiv instruction. Source of truth.
+ AssertingVH<Instruction> DivInst;
+
+ /// The instruction that we have matched as a remainder instruction.
+ /// Should only be used as Value, don't introspect it.
+ AssertingVH<Instruction> RemInst;
+
+ DivRemPairWorklistEntry(Instruction *DivInst_, Instruction *RemInst_)
+ : DivInst(DivInst_), RemInst(RemInst_) {
+ assert((DivInst->getOpcode() == Instruction::UDiv ||
+ DivInst->getOpcode() == Instruction::SDiv) &&
+ "Not a division.");
+ assert(DivInst->getType() == RemInst->getType() && "Types should match.");
+ // We can't check anything else about remainder instruction,
+ // it's not strictly required to be a urem/srem.
+ }
+ /// The type for this pair, identical for both the div and rem.
+ Type *getType() const { return DivInst->getType(); }
+
+ /// Is this pair signed or unsigned?
+ bool isSigned() const { return DivInst->getOpcode() == Instruction::SDiv; }
+
+ /// In this pair, what are the divident and divisor?
+ Value *getDividend() const { return DivInst->getOperand(0); }
+ Value *getDivisor() const { return DivInst->getOperand(1); }
+
+ bool isRemExpanded() const {
+ switch (RemInst->getOpcode()) {
+ case Instruction::SRem:
+ case Instruction::URem:
+ return false; // single 'rem' instruction - unexpanded form.
+ default:
+ return true; // anything else means we have remainder in expanded form.
+ }
+ }
+};
+using DivRemWorklistTy = SmallVector<DivRemPairWorklistEntry, 4>;
+
+/// Find matching pairs of integer div/rem ops (they have the same numerator,
+/// denominator, and signedness). Place those pairs into a worklist for further
+/// processing. This indirection is needed because we have to use TrackingVH<>
+/// because we will be doing RAUW, and if one of the rem instructions we change
+/// happens to be an input to another div/rem in the maps, we'd have problems.
+static DivRemWorklistTy getWorklist(Function &F) {
// Insert all divide and remainder instructions into maps keyed by their
// operands and opcode (signed or unsigned).
DenseMap<DivRemMapKey, Instruction *> DivMap;
@@ -66,9 +134,14 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
RemMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
else if (I.getOpcode() == Instruction::URem)
RemMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
+ else if (auto Match = matchExpandedRem(I))
+ RemMap[Match->Key] = Match->Value;
}
}
+ // We'll accumulate the matching pairs of div-rem instructions here.
+ DivRemWorklistTy Worklist;
+
// We can iterate over either map because we are only looking for matched
// pairs. Choose remainders for efficiency because they are usually even more
// rare than division.
@@ -78,12 +151,77 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
if (!DivInst)
continue;
- // We have a matching pair of div/rem instructions. If one dominates the
- // other, hoist and/or replace one.
+ // We have a matching pair of div/rem instructions.
NumPairs++;
Instruction *RemInst = RemPair.second;
- bool IsSigned = DivInst->getOpcode() == Instruction::SDiv;
- bool HasDivRemOp = TTI.hasDivRemOp(DivInst->getType(), IsSigned);
+
+ // Place it in the worklist.
+ Worklist.emplace_back(DivInst, RemInst);
+ }
+
+ return Worklist;
+}
+
+/// Find matching pairs of integer div/rem ops (they have the same numerator,
+/// denominator, and signedness). If they exist in different basic blocks, bring
+/// them together by hoisting or replace the common division operation that is
+/// implicit in the remainder:
+/// X % Y <--> X - ((X / Y) * Y).
+///
+/// We can largely ignore the normal safety and cost constraints on speculation
+/// of these ops when we find a matching pair. This is because we are already
+/// guaranteed that any exceptions and most cost are already incurred by the
+/// first member of the pair.
+///
+/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or
+/// SimplifyCFG, but it's split off on its own because it's different enough
+/// that it doesn't quite match the stated objectives of those passes.
+static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
+ const DominatorTree &DT) {
+ bool Changed = false;
+
+ // Get the matching pairs of div-rem instructions. We want this extra
+ // indirection to avoid dealing with having to RAUW the keys of the maps.
+ DivRemWorklistTy Worklist = getWorklist(F);
+
+ // Process each entry in the worklist.
+ for (DivRemPairWorklistEntry &E : Worklist) {
+ if (!DebugCounter::shouldExecute(DRPCounter))
+ continue;
+
+ bool HasDivRemOp = TTI.hasDivRemOp(E.getType(), E.isSigned());
+
+ auto &DivInst = E.DivInst;
+ auto &RemInst = E.RemInst;
+
+ const bool RemOriginallyWasInExpandedForm = E.isRemExpanded();
+ (void)RemOriginallyWasInExpandedForm; // suppress unused variable warning
+
+ if (HasDivRemOp && E.isRemExpanded()) {
+ // The target supports div+rem but the rem is expanded.
+ // We should recompose it first.
+ Value *X = E.getDividend();
+ Value *Y = E.getDivisor();
+ Instruction *RealRem = E.isSigned() ? BinaryOperator::CreateSRem(X, Y)
+ : BinaryOperator::CreateURem(X, Y);
+ // Note that we place it right next to the original expanded instruction,
+ // and letting further handling to move it if needed.
+ RealRem->setName(RemInst->getName() + ".recomposed");
+ RealRem->insertAfter(RemInst);
+ Instruction *OrigRemInst = RemInst;
+ // Update AssertingVH<> with new instruction so it doesn't assert.
+ RemInst = RealRem;
+ // And replace the original instruction with the new one.
+ OrigRemInst->replaceAllUsesWith(RealRem);
+ OrigRemInst->eraseFromParent();
+ NumRecomposed++;
+ // Note that we have left ((X / Y) * Y) around.
+ // If it had other uses we could rewrite it as X - X % Y
+ }
+
+ assert((!E.isRemExpanded() || !HasDivRemOp) &&
+ "*If* the target supports div-rem, then by now the RemInst *is* "
+ "Instruction::[US]Rem.");
// If the target supports div+rem and the instructions are in the same block
// already, there's nothing to do. The backend should handle this. If the
@@ -92,10 +230,16 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
continue;
bool DivDominates = DT.dominates(DivInst, RemInst);
- if (!DivDominates && !DT.dominates(RemInst, DivInst))
+ if (!DivDominates && !DT.dominates(RemInst, DivInst)) {
+ // We have matching div-rem pair, but they are in two different blocks,
+ // neither of which dominates one another.
+ // FIXME: We could hoist both ops to the common predecessor block?
continue;
+ }
- if (!DebugCounter::shouldExecute(DRPCounter))
+ // The target does not have a single div/rem operation,
+ // and the rem is already in expanded form. Nothing to do.
+ if (!HasDivRemOp && E.isRemExpanded())
continue;
if (HasDivRemOp) {
@@ -107,11 +251,17 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
DivInst->moveAfter(RemInst);
NumHoisted++;
} else {
- // The target does not have a single div/rem operation. Decompose the
- // remainder calculation as:
+ // The target does not have a single div/rem operation,
+ // and the rem is *not* in a already-expanded form.
+ // Decompose the remainder calculation as:
// X % Y --> X - ((X / Y) * Y).
- Value *X = RemInst->getOperand(0);
- Value *Y = RemInst->getOperand(1);
+
+ assert(!RemOriginallyWasInExpandedForm &&
+ "We should not be expanding if the rem was in expanded form to "
+ "begin with.");
+
+ Value *X = E.getDividend();
+ Value *Y = E.getDivisor();
Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y);
Instruction *Sub = BinaryOperator::CreateSub(X, Mul);
@@ -152,8 +302,13 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
// Now kill the explicit remainder. We have replaced it with:
// (sub X, (mul (div X, Y), Y)
- RemInst->replaceAllUsesWith(Sub);
- RemInst->eraseFromParent();
+ Sub->setName(RemInst->getName() + ".decomposed");
+ Instruction *OrigRemInst = RemInst;
+ // Update AssertingVH<> with new instruction so it doesn't assert.
+ RemInst = Sub;
+ // And replace the original instruction with the new one.
+ OrigRemInst->replaceAllUsesWith(Sub);
+ OrigRemInst->eraseFromParent();
NumDecomposed++;
}
Changed = true;
@@ -188,7 +343,7 @@ struct DivRemPairsLegacyPass : public FunctionPass {
return optimizeDivRem(F, TTI, DT);
}
};
-}
+} // namespace
char DivRemPairsLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(DivRemPairsLegacyPass, "div-rem-pairs",
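[Illustration only, not part of this commit: the recompose/decompose logic added above relies on the remainder identity spelled out in the pass comments, X % Y == X - (X / Y) * Y, for both the unsigned and signed forms. A standalone check of that identity:]

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t X = 1000, Y = 37;
    assert(X - (X / Y) * Y == X % Y);       // unsigned form (udiv/urem)

    int32_t SX = -1000, SY = 37;            // signed form (sdiv/srem);
    assert(SX - (SX / SY) * SY == SX % SY); // C and LLVM both truncate toward zero
    return 0;
  }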
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index f1f075257020..ce540683dae2 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -108,11 +108,12 @@ struct SimpleValue {
// This can only handle non-void readnone functions.
if (CallInst *CI = dyn_cast<CallInst>(Inst))
return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
- return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) ||
- isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) ||
- isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
- isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
- isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst);
+ return isa<CastInst>(Inst) || isa<UnaryOperator>(Inst) ||
+ isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) ||
+ isa<CmpInst>(Inst) || isa<SelectInst>(Inst) ||
+ isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
+ isa<ShuffleVectorInst>(Inst) || isa<ExtractValueInst>(Inst) ||
+ isa<InsertValueInst>(Inst);
}
};
@@ -240,7 +241,7 @@ static unsigned getHashValueImpl(SimpleValue Val) {
assert((isa<CallInst>(Inst) || isa<GetElementPtrInst>(Inst) ||
isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
- isa<ShuffleVectorInst>(Inst)) &&
+ isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst)) &&
"Invalid/unknown instruction");
// Mix in the opcode.
@@ -526,7 +527,7 @@ public:
const TargetTransformInfo &TTI, DominatorTree &DT,
AssumptionCache &AC, MemorySSA *MSSA)
: TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA),
- MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {}
+ MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {}
bool run();
@@ -651,7 +652,7 @@ private:
bool isInvariantLoad() const {
if (auto *LI = dyn_cast<LoadInst>(Inst))
- return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
+ return LI->hasMetadata(LLVMContext::MD_invariant_load);
return false;
}
@@ -790,7 +791,7 @@ bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) {
// A location loaded from with an invariant_load is assumed to *never* change
// within the visible scope of the compilation.
if (auto *LI = dyn_cast<LoadInst>(I))
- if (LI->getMetadata(LLVMContext::MD_invariant_load))
+ if (LI->hasMetadata(LLVMContext::MD_invariant_load))
return true;
auto MemLocOpt = MemoryLocation::getOrNone(I);
@@ -1359,7 +1360,7 @@ public:
if (skipFunction(F))
return false;
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -1381,6 +1382,7 @@ public:
AU.addPreserved<MemorySSAWrapperPass>();
}
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
AU.setPreservesCFG();
}
};
diff --git a/lib/Transforms/Scalar/FlattenCFGPass.cpp b/lib/Transforms/Scalar/FlattenCFGPass.cpp
index 31670b1464e4..e6abf1ceb026 100644
--- a/lib/Transforms/Scalar/FlattenCFGPass.cpp
+++ b/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -11,10 +11,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+
using namespace llvm;
#define DEBUG_TYPE "flattencfg"
@@ -52,15 +54,23 @@ FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); }
static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
bool Changed = false;
bool LocalChange = true;
+
+ // Use block handles instead of iterating over function blocks directly
+ // to avoid using iterators invalidated by erasing blocks.
+ std::vector<WeakVH> Blocks;
+ Blocks.reserve(F.size());
+ for (auto &BB : F)
+ Blocks.push_back(&BB);
+
while (LocalChange) {
LocalChange = false;
- // Loop over all of the basic blocks and remove them if they are unneeded...
- //
- for (Function::iterator BBIt = F.begin(); BBIt != F.end();) {
- if (FlattenCFG(&*BBIt++, AA)) {
- LocalChange = true;
- }
+ // Loop over all of the basic blocks and try to flatten them.
+ for (WeakVH &BlockHandle : Blocks) {
+ // Skip blocks erased by FlattenCFG.
+ if (auto *BB = cast_or_null<BasicBlock>(BlockHandle))
+ if (FlattenCFG(BB, AA))
+ LocalChange = true;
}
Changed |= LocalChange;
}
diff --git a/lib/Transforms/Scalar/Float2Int.cpp b/lib/Transforms/Scalar/Float2Int.cpp
index 4f83e869b303..4d2eac0451df 100644
--- a/lib/Transforms/Scalar/Float2Int.cpp
+++ b/lib/Transforms/Scalar/Float2Int.cpp
@@ -60,11 +60,13 @@ namespace {
if (skipFunction(F))
return false;
- return Impl.runImpl(F);
+ const DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return Impl.runImpl(F, DT);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
@@ -116,21 +118,29 @@ static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) {
// Find the roots - instructions that convert from the FP domain to
// integer domain.
-void Float2IntPass::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) {
- for (auto &I : instructions(F)) {
- if (isa<VectorType>(I.getType()))
+void Float2IntPass::findRoots(Function &F, const DominatorTree &DT,
+ SmallPtrSet<Instruction*,8> &Roots) {
+ for (BasicBlock &BB : F) {
+ // Unreachable code can take on strange forms that we are not prepared to
+ // handle. For example, an instruction may have itself as an operand.
+ if (!DT.isReachableFromEntry(&BB))
continue;
- switch (I.getOpcode()) {
- default: break;
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- Roots.insert(&I);
- break;
- case Instruction::FCmp:
- if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) !=
- CmpInst::BAD_ICMP_PREDICATE)
+
+ for (Instruction &I : BB) {
+ if (isa<VectorType>(I.getType()))
+ continue;
+ switch (I.getOpcode()) {
+ default: break;
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
Roots.insert(&I);
- break;
+ break;
+ case Instruction::FCmp:
+ if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) !=
+ CmpInst::BAD_ICMP_PREDICATE)
+ Roots.insert(&I);
+ break;
+ }
}
}
}
@@ -503,7 +513,7 @@ void Float2IntPass::cleanup() {
I.first->eraseFromParent();
}
-bool Float2IntPass::runImpl(Function &F) {
+bool Float2IntPass::runImpl(Function &F, const DominatorTree &DT) {
LLVM_DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
// Clear out all state.
ECs = EquivalenceClasses<Instruction*>();
@@ -513,7 +523,7 @@ bool Float2IntPass::runImpl(Function &F) {
Ctx = &F.getParent()->getContext();
- findRoots(F, Roots);
+ findRoots(F, DT, Roots);
walkBackwards(Roots);
walkForwards();
@@ -527,8 +537,9 @@ bool Float2IntPass::runImpl(Function &F) {
namespace llvm {
FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); }
-PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &) {
- if (!runImpl(F))
+PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &AM) {
+ const DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ if (!runImpl(F, DT))
return PreservedAnalyses::all();
PreservedAnalyses PA;
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 1a02e9d33f49..743353eaea22 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -70,6 +70,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -626,6 +627,8 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<GlobalsAA>();
PA.preserve<TargetLibraryAnalysis>();
+ if (LI)
+ PA.preserve<LoopAnalysis>();
return PA;
}
@@ -1161,15 +1164,30 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// Do PHI translation to get its value in the predecessor if necessary. The
// returned pointer (if non-null) is guaranteed to dominate UnavailablePred.
+ // We do the translation for each edge we skipped by going from LI's block
+ // to LoadBB, otherwise we might miss pieces needing translation.
// If all preds have a single successor, then we know it is safe to insert
// the load on the pred (?!?), so we can insert code to materialize the
// pointer if it is not available.
- PHITransAddr Address(LI->getPointerOperand(), DL, AC);
- Value *LoadPtr = nullptr;
- LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred,
- *DT, NewInsts);
+ Value *LoadPtr = LI->getPointerOperand();
+ BasicBlock *Cur = LI->getParent();
+ while (Cur != LoadBB) {
+ PHITransAddr Address(LoadPtr, DL, AC);
+ LoadPtr = Address.PHITranslateWithInsertion(
+ Cur, Cur->getSinglePredecessor(), *DT, NewInsts);
+ if (!LoadPtr) {
+ CanDoPRE = false;
+ break;
+ }
+ Cur = Cur->getSinglePredecessor();
+ }
+ if (LoadPtr) {
+ PHITransAddr Address(LoadPtr, DL, AC);
+ LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, *DT,
+ NewInsts);
+ }
// If we couldn't find or insert a computation of this phi translated value,
// we fail PRE.
if (!LoadPtr) {
@@ -1184,8 +1202,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
if (!CanDoPRE) {
while (!NewInsts.empty()) {
- Instruction *I = NewInsts.pop_back_val();
- markInstructionForDeletion(I);
+ // Erase instructions generated by the failed PHI translation before
+ // trying to number them. PHI translation might insert instructions
+ // in basic blocks other than the current one, and we delete them
+ // directly, as markInstructionForDeletion only allows removing from the
+ // current basic block.
+ NewInsts.pop_back_val()->eraseFromParent();
}
// HINT: Don't revert the edge-splitting as following transformation may
// also need to split these critical edges.
@@ -1219,10 +1241,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
BasicBlock *UnavailablePred = PredLoad.first;
Value *LoadPtr = PredLoad.second;
- auto *NewLoad =
- new LoadInst(LI->getType(), LoadPtr, LI->getName() + ".pre",
- LI->isVolatile(), LI->getAlignment(), LI->getOrdering(),
- LI->getSyncScopeID(), UnavailablePred->getTerminator());
+ auto *NewLoad = new LoadInst(
+ LI->getType(), LoadPtr, LI->getName() + ".pre", LI->isVolatile(),
+ MaybeAlign(LI->getAlignment()), LI->getOrdering(), LI->getSyncScopeID(),
+ UnavailablePred->getTerminator());
NewLoad->setDebugLoc(LI->getDebugLoc());
// Transfer the old load's AA tags to the new load.
@@ -1365,6 +1387,14 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks);
}
+static bool hasUsersIn(Value *V, BasicBlock *BB) {
+ for (User *U : V->users())
+ if (isa<Instruction>(U) &&
+ cast<Instruction>(U)->getParent() == BB)
+ return true;
+ return false;
+}
+
bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume &&
"This function can only be called with llvm.assume intrinsic");
@@ -1403,12 +1433,23 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
// We can replace assume value with true, which covers cases like this:
// call void @llvm.assume(i1 %cmp)
// br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true
- ReplaceWithConstMap[V] = True;
-
- // If one of *cmp *eq operand is const, adding it to map will cover this:
+ ReplaceOperandsWithMap[V] = True;
+
+ // If we find an equality fact, canonicalize all dominated uses in this block
+ // to one of the two values. We heuristically choice the "oldest" of the
+ // two where age is determined by value number. (Note that propagateEquality
+ // above handles the cross block case.)
+ //
+ // Key case to cover are:
+ // 1)
// %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen
// call void @llvm.assume(i1 %cmp)
// ret float %0 ; will change it to ret float 3.000000e+00
+ // 2)
+ // %load = load float, float* %addr
+ // %cmp = fcmp oeq float %load, %0
+ // call void @llvm.assume(i1 %cmp)
+ // ret float %load ; will change it to ret float %0
if (auto *CmpI = dyn_cast<CmpInst>(V)) {
if (CmpI->getPredicate() == CmpInst::Predicate::ICMP_EQ ||
CmpI->getPredicate() == CmpInst::Predicate::FCMP_OEQ ||
@@ -1416,13 +1457,50 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
CmpI->getFastMathFlags().noNaNs())) {
Value *CmpLHS = CmpI->getOperand(0);
Value *CmpRHS = CmpI->getOperand(1);
- if (isa<Constant>(CmpLHS))
+ // Heuristically pick the better replacement -- the choice of heuristic
+ // isn't terribly important here, but the fact we canonicalize on some
+ // replacement is for exposing other simplifications.
+ // TODO: pull this out as a helper function and reuse w/existing
+ // (slightly different) logic.
+ if (isa<Constant>(CmpLHS) && !isa<Constant>(CmpRHS))
std::swap(CmpLHS, CmpRHS);
- auto *RHSConst = dyn_cast<Constant>(CmpRHS);
+ if (!isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))
+ std::swap(CmpLHS, CmpRHS);
+ if ((isa<Argument>(CmpLHS) && isa<Argument>(CmpRHS)) ||
+ (isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))) {
+ // Move the 'oldest' value to the right-hand side, using the value
+ // number as a proxy for age.
+ uint32_t LVN = VN.lookupOrAdd(CmpLHS);
+ uint32_t RVN = VN.lookupOrAdd(CmpRHS);
+ if (LVN < RVN)
+ std::swap(CmpLHS, CmpRHS);
+ }
- // If only one operand is constant.
- if (RHSConst != nullptr && !isa<Constant>(CmpLHS))
- ReplaceWithConstMap[CmpLHS] = RHSConst;
+ // Handle degenerate case where we either haven't pruned a dead path or a
+ // removed a trivial assume yet.
+ if (isa<Constant>(CmpLHS) && isa<Constant>(CmpRHS))
+ return Changed;
+
+ // +0.0 and -0.0 compare equal, but do not imply equivalence. Unless we
+ // can prove equivalence, bail.
+ if (CmpRHS->getType()->isFloatTy() &&
+ (!isa<ConstantFP>(CmpRHS) || cast<ConstantFP>(CmpRHS)->isZero()))
+ return Changed;
+
+ LLVM_DEBUG(dbgs() << "Replacing dominated uses of "
+ << *CmpLHS << " with "
+ << *CmpRHS << " in block "
+ << IntrinsicI->getParent()->getName() << "\n");
+
+
+ // Setup the replacement map - this handles uses within the same block
+ if (hasUsersIn(CmpLHS, IntrinsicI->getParent()))
+ ReplaceOperandsWithMap[CmpLHS] = CmpRHS;
+
+ // NOTE: The non-block local cases are handled by the call to
+ // propagateEquality above; this block is just about handling the block
+ // local cases. TODO: There's a bunch of logic in propagateEqualiy which
+ // isn't duplicated for the block local case, can we share it somehow?
}
}
return Changed;
@@ -1522,6 +1600,41 @@ uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred,
return NewNum;
}
+// Return true if the value number \p Num and NewNum have equal value.
+// Return false if the result is unknown.
+bool GVN::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum,
+ const BasicBlock *Pred,
+ const BasicBlock *PhiBlock, GVN &Gvn) {
+ CallInst *Call = nullptr;
+ LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
+ while (Vals) {
+ Call = dyn_cast<CallInst>(Vals->Val);
+ if (Call && Call->getParent() == PhiBlock)
+ break;
+ Vals = Vals->Next;
+ }
+
+ if (AA->doesNotAccessMemory(Call))
+ return true;
+
+ if (!MD || !AA->onlyReadsMemory(Call))
+ return false;
+
+ MemDepResult local_dep = MD->getDependency(Call);
+ if (!local_dep.isNonLocal())
+ return false;
+
+ const MemoryDependenceResults::NonLocalDepInfo &deps =
+ MD->getNonLocalCallDependency(Call);
+
+ // Check to see if the Call has no function local clobber.
+ for (unsigned i = 0; i < deps.size(); i++) {
+ if (deps[i].getResult().isNonFuncLocal())
+ return true;
+ }
+ return false;
+}
+
/// Translate value number \p Num using phis, so that it has the values of
/// the phis in BB.
uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
@@ -1568,8 +1681,11 @@ uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
}
}
- if (uint32_t NewNum = expressionNumbering[Exp])
+ if (uint32_t NewNum = expressionNumbering[Exp]) {
+ if (Exp.opcode == Instruction::Call && NewNum != Num)
+ return areCallValsEqual(Num, NewNum, Pred, PhiBlock, Gvn) ? NewNum : Num;
return NewNum;
+ }
return Num;
}
@@ -1637,16 +1753,12 @@ void GVN::assignBlockRPONumber(Function &F) {
InvalidBlockRPONumbers = false;
}
-// Tries to replace instruction with const, using information from
-// ReplaceWithConstMap.
-bool GVN::replaceOperandsWithConsts(Instruction *Instr) const {
+bool GVN::replaceOperandsForInBlockEquality(Instruction *Instr) const {
bool Changed = false;
for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) {
- Value *Operand = Instr->getOperand(OpNum);
- auto it = ReplaceWithConstMap.find(Operand);
- if (it != ReplaceWithConstMap.end()) {
- assert(!isa<Constant>(Operand) &&
- "Replacing constants with constants is invalid");
+ Value *Operand = Instr->getOperand(OpNum);
+ auto it = ReplaceOperandsWithMap.find(Operand);
+ if (it != ReplaceOperandsWithMap.end()) {
LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with "
<< *it->second << " in instruction " << *Instr << '\n');
Instr->setOperand(OpNum, it->second);
@@ -1976,6 +2088,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
MD = RunMD;
ImplicitControlFlowTracking ImplicitCFT(DT);
ICF = &ImplicitCFT;
+ this->LI = LI;
VN.setMemDep(MD);
ORE = RunORE;
InvalidBlockRPONumbers = true;
@@ -2037,13 +2150,13 @@ bool GVN::processBlock(BasicBlock *BB) {
return false;
// Clearing map before every BB because it can be used only for single BB.
- ReplaceWithConstMap.clear();
+ ReplaceOperandsWithMap.clear();
bool ChangedFunction = false;
for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
BI != BE;) {
- if (!ReplaceWithConstMap.empty())
- ChangedFunction |= replaceOperandsWithConsts(&*BI);
+ if (!ReplaceOperandsWithMap.empty())
+ ChangedFunction |= replaceOperandsForInBlockEquality(&*BI);
ChangedFunction |= processInstruction(&*BI);
if (InstrsToErase.empty()) {
@@ -2335,7 +2448,7 @@ bool GVN::performPRE(Function &F) {
/// the block inserted to the critical edge.
BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
BasicBlock *BB =
- SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT));
+ SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT, LI));
if (MD)
MD->invalidateCachedPredecessors();
InvalidBlockRPONumbers = true;
@@ -2350,7 +2463,7 @@ bool GVN::splitCriticalEdges() {
do {
std::pair<Instruction *, unsigned> Edge = toSplit.pop_back_val();
SplitCriticalEdge(Edge.first, Edge.second,
- CriticalEdgeSplittingOptions(DT));
+ CriticalEdgeSplittingOptions(DT, LI));
} while (!toSplit.empty());
if (MD) MD->invalidateCachedPredecessors();
InvalidBlockRPONumbers = true;
@@ -2456,18 +2569,26 @@ void GVN::addDeadBlock(BasicBlock *BB) {
if (DeadBlocks.count(B))
continue;
+ // First, split the critical edges. This might also create additional blocks
+ // to preserve LoopSimplify form and adjust edges accordingly.
SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B));
for (BasicBlock *P : Preds) {
if (!DeadBlocks.count(P))
continue;
- if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) {
+ if (llvm::any_of(successors(P),
+ [B](BasicBlock *Succ) { return Succ == B; }) &&
+ isCriticalEdge(P->getTerminator(), B)) {
if (BasicBlock *S = splitCriticalEdges(P, B))
DeadBlocks.insert(P = S);
}
+ }
- for (BasicBlock::iterator II = B->begin(); isa<PHINode>(II); ++II) {
- PHINode &Phi = cast<PHINode>(*II);
+ // Now undef the incoming values from the dead predecessors.
+ for (BasicBlock *P : predecessors(B)) {
+ if (!DeadBlocks.count(P))
+ continue;
+ for (PHINode &Phi : B->phis()) {
Phi.setIncomingValueForBlock(P, UndefValue::get(Phi.getType()));
if (MD)
MD->invalidateCachedPointerInfo(&Phi);
@@ -2544,10 +2665,11 @@ public:
return Impl.runImpl(
F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F),
getAnalysis<AAResultsWrapperPass>().getAAResults(),
- NoMemDepAnalysis ? nullptr
- : &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(),
+ NoMemDepAnalysis
+ ? nullptr
+ : &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(),
LIWP ? &LIWP->getLoopInfo() : nullptr,
&getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE());
}
@@ -2556,6 +2678,7 @@ public:
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
if (!NoMemDepAnalysis)
AU.addRequired<MemoryDependenceWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
@@ -2563,6 +2686,8 @@ public:
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreservedID(LoopSimplifyID);
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp
index 7614599653c4..c87e41484b13 100644
--- a/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/lib/Transforms/Scalar/GVNHoist.cpp
@@ -257,7 +257,7 @@ public:
GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA,
MemoryDependenceResults *MD, MemorySSA *MSSA)
: DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA),
- MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {}
+ MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {}
bool run(Function &F) {
NumFuncArgs = F.arg_size();
@@ -539,7 +539,7 @@ private:
// Check for unsafe hoistings due to side effects.
if (K == InsKind::Store) {
- if (hasEHOrLoadsOnPath(NewPt, dyn_cast<MemoryDef>(U), NBBsOnAllPaths))
+ if (hasEHOrLoadsOnPath(NewPt, cast<MemoryDef>(U), NBBsOnAllPaths))
return false;
} else if (hasEHOnPath(NewBB, OldBB, NBBsOnAllPaths))
return false;
@@ -889,19 +889,18 @@ private:
void updateAlignment(Instruction *I, Instruction *Repl) {
if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) {
- ReplacementLoad->setAlignment(
- std::min(ReplacementLoad->getAlignment(),
- cast<LoadInst>(I)->getAlignment()));
+ ReplacementLoad->setAlignment(MaybeAlign(std::min(
+ ReplacementLoad->getAlignment(), cast<LoadInst>(I)->getAlignment())));
++NumLoadsRemoved;
} else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) {
ReplacementStore->setAlignment(
- std::min(ReplacementStore->getAlignment(),
- cast<StoreInst>(I)->getAlignment()));
+ MaybeAlign(std::min(ReplacementStore->getAlignment(),
+ cast<StoreInst>(I)->getAlignment())));
++NumStoresRemoved;
} else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) {
ReplacementAlloca->setAlignment(
- std::max(ReplacementAlloca->getAlignment(),
- cast<AllocaInst>(I)->getAlignment()));
+ MaybeAlign(std::max(ReplacementAlloca->getAlignment(),
+ cast<AllocaInst>(I)->getAlignment())));
} else if (isa<CallInst>(Repl)) {
++NumCallsRemoved;
}
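
The change above is mechanical (wrapping the merged value in MaybeAlign), but the merge rule itself is worth spelling out: the surviving instruction may only claim the alignment both originals guaranteed, while an alloca is the allocation itself and can simply satisfy the larger requirement. A compact restatement under hypothetical helper names:

  #include "llvm/IR/Instructions.h"
  #include "llvm/Support/Alignment.h"
  #include <algorithm>
  using namespace llvm;

  // Hoisting keeps Repl and drops Removed, so Repl may only promise the
  // alignment both instructions guaranteed: take the min for loads/stores.
  static void mergeLoadAlignment(LoadInst *Repl, const LoadInst *Removed) {
    Repl->setAlignment(
        MaybeAlign(std::min(Repl->getAlignment(), Removed->getAlignment())));
  }
  // An alloca can be over-aligned, so take the max of the two requirements.
  static void mergeAllocaAlignment(AllocaInst *Repl, const AllocaInst *Removed) {
    Repl->setAlignment(
        MaybeAlign(std::max(Repl->getAlignment(), Removed->getAlignment())));
  }
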
diff --git a/lib/Transforms/Scalar/GuardWidening.cpp b/lib/Transforms/Scalar/GuardWidening.cpp
index e14f44bb7069..2697d7809568 100644
--- a/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/lib/Transforms/Scalar/GuardWidening.cpp
@@ -591,7 +591,7 @@ bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1,
else
Result = RC.getCheckInst();
}
-
+ assert(Result && "Failed to find result value");
Result->setName("wide.chk");
}
return true;
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index f9fc698a4a9b..5519a00c12c9 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -124,6 +124,11 @@ static cl::opt<bool>
DisableLFTR("disable-lftr", cl::Hidden, cl::init(false),
cl::desc("Disable Linear Function Test Replace optimization"));
+static cl::opt<bool>
+LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(false),
+ cl::desc("Predicate conditions in read only loops"));
+
+
namespace {
struct RewritePhi;
@@ -144,7 +149,11 @@ class IndVarSimplify {
bool rewriteNonIntegerIVs(Loop *L);
bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI);
- bool optimizeLoopExits(Loop *L);
+ /// Try to eliminate loop exits based on analyzable exit counts.
+ bool optimizeLoopExits(Loop *L, SCEVExpander &Rewriter);
+ /// Try to form loop invariant tests for loop exits by changing how many
+ /// iterations of the loop run when that is unobservable.
+ bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter);
bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet);
bool rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);
@@ -628,12 +637,30 @@ bool IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
// Okay, this instruction has a user outside of the current loop
// and varies predictably *inside* the loop. Evaluate the value it
- // contains when the loop exits, if possible.
+ // contains when the loop exits, if possible. We prefer to start with
+ // expressions which are true for all exits (so as to maximize
+ // expression reuse by the SCEVExpander), but resort to per-exit
+ // evaluation if that fails.
const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
- if (!SE->isLoopInvariant(ExitValue, L) ||
- !isSafeToExpand(ExitValue, *SE))
- continue;
-
+ if (isa<SCEVCouldNotCompute>(ExitValue) ||
+ !SE->isLoopInvariant(ExitValue, L) ||
+ !isSafeToExpand(ExitValue, *SE)) {
+ // TODO: This should probably be sunk into SCEV in some way; maybe a
+ // getSCEVForExit(SCEV*, L, ExitingBB)? It can be generalized for
+ // most SCEV expressions and other recurrence types (e.g. shift
+ // recurrences). Is there existing code we can reuse?
+ const SCEV *ExitCount = SE->getExitCount(L, PN->getIncomingBlock(i));
+ if (isa<SCEVCouldNotCompute>(ExitCount))
+ continue;
+ if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Inst)))
+ if (AddRec->getLoop() == L)
+ ExitValue = AddRec->evaluateAtIteration(ExitCount, *SE);
+ if (isa<SCEVCouldNotCompute>(ExitValue) ||
+ !SE->isLoopInvariant(ExitValue, L) ||
+ !isSafeToExpand(ExitValue, *SE))
+ continue;
+ }
+
// Computing the value outside of the loop brings no benefit if it is
// definitely used inside the loop in a way which can not be optimized
// away. Avoid doing so unless we know we have a value which computes
@@ -804,7 +831,7 @@ bool IndVarSimplify::canLoopBeDeleted(
L->getExitingBlocks(ExitingBlocks);
SmallVector<BasicBlock *, 8> ExitBlocks;
L->getUniqueExitBlocks(ExitBlocks);
- if (ExitBlocks.size() > 1 || ExitingBlocks.size() > 1)
+ if (ExitBlocks.size() != 1 || ExitingBlocks.size() != 1)
return false;
BasicBlock *ExitBlock = ExitBlocks[0];
@@ -1654,6 +1681,10 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
return nullptr;
}
+ // If we reached this point, then we are going to replace
+ // DU.NarrowUse with WideUse. Reattach any debug values now.
+ replaceAllDbgUsesWith(*DU.NarrowUse, *WideUse, *WideUse, *DT);
+
ExtendKindMap[DU.NarrowUse] = WideAddRec.second;
// Returning WideUse pushes it on the worklist.
return WideUse;
@@ -1779,14 +1810,9 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
DeadInsts.emplace_back(DU.NarrowDef);
}
- // Attach any debug information to the new PHI. Since OrigPhi and WidePHI
- // evaluate the same recurrence, we can just copy the debug info over.
- SmallVector<DbgValueInst *, 1> DbgValues;
- llvm::findDbgValues(DbgValues, OrigPhi);
- auto *MDPhi = MetadataAsValue::get(WidePhi->getContext(),
- ValueAsMetadata::get(WidePhi));
- for (auto &DbgValue : DbgValues)
- DbgValue->setOperand(0, MDPhi);
+ // Attach any debug information to the new PHI.
+ replaceAllDbgUsesWith(*OrigPhi, *WidePhi, *WidePhi, *DT);
+
return WidePhi;
}
@@ -1817,8 +1843,8 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS));
auto CmpConstrainedLHSRange =
ConstantRange::makeAllowedICmpRegion(P, CmpRHSRange);
- auto NarrowDefRange =
- CmpConstrainedLHSRange.addWithNoSignedWrap(*NarrowDefRHS);
+ auto NarrowDefRange = CmpConstrainedLHSRange.addWithNoWrap(
+ *NarrowDefRHS, OverflowingBinaryOperator::NoSignedWrap);
updatePostIncRangeInfo(NarrowDef, NarrowUser, NarrowDefRange);
};
@@ -2242,8 +2268,8 @@ static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB,
if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy())
continue;
- const auto *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
-
+ const auto *AR = cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
+
// AR may be a pointer type, while BECount is an integer type.
// AR may be wider than BECount. With eq/ne tests overflow is immaterial.
// AR may not be a narrower type, or we may never exit.
@@ -2624,74 +2650,125 @@ bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
return MadeAnyChanges;
}
-bool IndVarSimplify::optimizeLoopExits(Loop *L) {
+/// Return a symbolic upper bound for the backedge taken count of the loop.
+/// This is more general than getConstantMaxBackedgeTakenCount as it returns
+/// an arbitrary expression as opposed to only constants.
+/// TODO: Move into the ScalarEvolution class.
+static const SCEV* getMaxBackedgeTakenCount(ScalarEvolution &SE,
+ DominatorTree &DT, Loop *L) {
SmallVector<BasicBlock*, 16> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
// Form an expression for the maximum exit count possible for this loop. We
// merge the max and exact information to approximate a version of
- // getMaxBackedgeTakenInfo which isn't restricted to just constants.
- // TODO: factor this out as a version of getMaxBackedgeTakenCount which
- // isn't guaranteed to return a constant.
+ // getConstantMaxBackedgeTakenCount which isn't restricted to just constants.
SmallVector<const SCEV*, 4> ExitCounts;
- const SCEV *MaxConstEC = SE->getMaxBackedgeTakenCount(L);
+ const SCEV *MaxConstEC = SE.getConstantMaxBackedgeTakenCount(L);
if (!isa<SCEVCouldNotCompute>(MaxConstEC))
ExitCounts.push_back(MaxConstEC);
for (BasicBlock *ExitingBB : ExitingBlocks) {
- const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+ const SCEV *ExitCount = SE.getExitCount(L, ExitingBB);
if (!isa<SCEVCouldNotCompute>(ExitCount)) {
- assert(DT->dominates(ExitingBB, L->getLoopLatch()) &&
+ assert(DT.dominates(ExitingBB, L->getLoopLatch()) &&
"We should only have known counts for exiting blocks that "
"dominate latch!");
ExitCounts.push_back(ExitCount);
}
}
if (ExitCounts.empty())
- return false;
- const SCEV *MaxExitCount = SE->getUMinFromMismatchedTypes(ExitCounts);
+ return SE.getCouldNotCompute();
+ return SE.getUMinFromMismatchedTypes(ExitCounts);
+}
- bool Changed = false;
- for (BasicBlock *ExitingBB : ExitingBlocks) {
+bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
+ SmallVector<BasicBlock*, 16> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ // Remove all exits which aren't both rewritable and analyzable.
+ auto NewEnd = llvm::remove_if(ExitingBlocks,
+ [&](BasicBlock *ExitingBB) {
// If our exitting block exits multiple loops, we can only rewrite the
// innermost one. Otherwise, we're changing how many times the innermost
// loop runs before it exits.
if (LI->getLoopFor(ExitingBB) != L)
- continue;
+ return true;
// Can't rewrite non-branch yet.
BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
if (!BI)
- continue;
+ return true;
// If already constant, nothing to do.
if (isa<Constant>(BI->getCondition()))
- continue;
+ return true;
const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
if (isa<SCEVCouldNotCompute>(ExitCount))
- continue;
+ return true;
+ return false;
+ });
+ ExitingBlocks.erase(NewEnd, ExitingBlocks.end());
+
+ if (ExitingBlocks.empty())
+ return false;
+
+ // Get a symbolic upper bound on the loop backedge taken count.
+ const SCEV *MaxExitCount = getMaxBackedgeTakenCount(*SE, *DT, L);
+ if (isa<SCEVCouldNotCompute>(MaxExitCount))
+ return false;
+
+ // Visit our exit blocks in order of dominance. We know from the fact that
+ // all remaining exits are analyzable that there must be a total dominance
+ // order between them, as each must dominate the latch. The visit order only
+ // matters for the provably equal case.
+ llvm::sort(ExitingBlocks,
+ [&](BasicBlock *A, BasicBlock *B) {
+ // std::sort sorts in ascending order, so we want the inverse of
+ // the normal dominance relation.
+ if (DT->properlyDominates(A, B)) return true;
+ if (DT->properlyDominates(B, A)) return false;
+ llvm_unreachable("expected total dominance order!");
+ });
+#ifdef ASSERT
+ for (unsigned i = 1; i < ExitingBlocks.size(); i++) {
+ assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]));
+ }
+#endif
+
+ auto FoldExit = [&](BasicBlock *ExitingBB, bool IsTaken) {
+ BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
+ auto *OldCond = BI->getCondition();
+ auto *NewCond = ConstantInt::get(OldCond->getType(),
+ IsTaken ? ExitIfTrue : !ExitIfTrue);
+ BI->setCondition(NewCond);
+ if (OldCond->use_empty())
+ DeadInsts.push_back(OldCond);
+ };
+ bool Changed = false;
+ SmallSet<const SCEV*, 8> DominatingExitCounts;
+ for (BasicBlock *ExitingBB : ExitingBlocks) {
+ const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+ assert(!isa<SCEVCouldNotCompute>(ExitCount) && "checked above");
+
// If we know we'd exit on the first iteration, rewrite the exit to
// reflect this. This does not imply the loop must exit through this
// exit; there may be an earlier one taken on the first iteration.
// TODO: Given we know the backedge can't be taken, we should go ahead
// and break it. Or at least, kill all the header phis and simplify.
if (ExitCount->isZero()) {
- bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
- auto *OldCond = BI->getCondition();
- auto *NewCond = ExitIfTrue ? ConstantInt::getTrue(OldCond->getType()) :
- ConstantInt::getFalse(OldCond->getType());
- BI->setCondition(NewCond);
- if (OldCond->use_empty())
- DeadInsts.push_back(OldCond);
+ FoldExit(ExitingBB, true);
Changed = true;
continue;
}
- // If we end up with a pointer exit count, bail.
+ // If we end up with a pointer exit count, bail. Note that we can end up
+ // with a pointer exit count for one exiting block, and not for another in
+ // the same loop.
if (!ExitCount->getType()->isIntegerTy() ||
!MaxExitCount->getType()->isIntegerTy())
- return false;
+ continue;
Type *WiderType =
SE->getWiderType(MaxExitCount->getType(), ExitCount->getType());
@@ -2700,35 +2777,198 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L) {
assert(MaxExitCount->getType() == ExitCount->getType());
// Can we prove that some other exit must be taken strictly before this
- // one? TODO: handle cases where ule is known, and equality is covered
- // by a dominating exit
+ // one?
if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT,
MaxExitCount, ExitCount)) {
- bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
- auto *OldCond = BI->getCondition();
- auto *NewCond = ExitIfTrue ? ConstantInt::getFalse(OldCond->getType()) :
- ConstantInt::getTrue(OldCond->getType());
- BI->setCondition(NewCond);
- if (OldCond->use_empty())
- DeadInsts.push_back(OldCond);
+ FoldExit(ExitingBB, false);
Changed = true;
continue;
}
- // TODO: If we can prove that the exiting iteration is equal to the exit
- // count for this exit and that no previous exit oppurtunities exist within
- // the loop, then we can discharge all other exits. (May fall out of
- // previous TODO.)
-
- // TODO: If we can't prove any relation between our exit count and the
- // loops exit count, but taking this exit doesn't require actually running
- // the loop (i.e. no side effects, no computed values used in exit), then
- // we can replace the exit test with a loop invariant test which exits on
- // the first iteration.
+ // As we run, keep track of which exit counts we've encountered. If we
+ // find a duplicate, we've found an exit which would have exited on the
+ // exiting iteration, but (from the visit order) strictly follows another
+ // which does the same and is thus dead.
+ if (!DominatingExitCounts.insert(ExitCount).second) {
+ FoldExit(ExitingBB, false);
+ Changed = true;
+ continue;
+ }
+
+ // TODO: There might be another opportunity to leverage SCEV's reasoning
+ // here. If we kept track of the min of dominating exits so far, we could
+ // discharge exits with EC >= MDEC. This is less powerful than the existing
+ // transform (since later exits aren't considered), but potentially more
+ // powerful for any case where SCEV can prove a >=u b, but neither a == b
+ // nor a >u b. Such a case is not currently known.
}
return Changed;
}
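
A hypothetical source-level example (not from the patch, and assuming n >= 0) of the duplicate-exit-count case the new DominatingExitCounts set handles: SCEV computes the same exit count for both exits, and since the first dominates the second, the second test is provably never taken and can be folded to a constant branch condition.

  int sumUpTo(const int *a, int n) {
    int s = 0;
    for (int i = 0; ; ++i) {
      if (i == n) break;   // exit count: n
      s += a[i];
      if (i == n) break;   // same exit count, dominated: never taken
    }
    return s;
  }
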
+bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
+ SmallVector<BasicBlock*, 16> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ bool Changed = false;
+
+ // Finally, see if we can rewrite our exit conditions into a loop invariant
+ // form. If we have a read-only loop, and we can tell that we must exit down
+ // a path which does not need any of the values computed within the loop, we
+ // can rewrite the loop to exit on the first iteration. Note that this
+ // doesn't either a) tell us the loop exits on the first iteration (unless
+ // *all* exits are predicatable) or b) tell us *which* exit might be taken.
+ // This transformation looks a lot like a restricted form of dead loop
+ // elimination, but restricted to read-only loops and without necessarily
+ // needing to kill the loop entirely.
+ if (!LoopPredication)
+ return Changed;
+
+ if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+ return Changed;
+
+ // Note: ExactBTC is the exact backedge taken count *iff* the loop exits
+ // through *explicit* control flow. We have to eliminate the possibility of
+ // implicit exits (see below) before we know it's truly exact.
+ const SCEV *ExactBTC = SE->getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(ExactBTC) ||
+ !SE->isLoopInvariant(ExactBTC, L) ||
+ !isSafeToExpand(ExactBTC, *SE))
+ return Changed;
+
+ auto BadExit = [&](BasicBlock *ExitingBB) {
+ // If our exiting block exits multiple loops, we can only rewrite the
+ // innermost one. Otherwise, we're changing how many times the innermost
+ // loop runs before it exits.
+ if (LI->getLoopFor(ExitingBB) != L)
+ return true;
+
+ // Can't rewrite non-branch yet.
+ BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ if (!BI)
+ return true;
+
+ // If already constant, nothing to do.
+ if (isa<Constant>(BI->getCondition()))
+ return true;
+
+ // If the exit block has phis, we need to be able to compute the values
+ // within the loop which contains them. This assumes trivial LCSSA phis
+ // have already been removed; TODO: generalize.
+ BasicBlock *ExitBlock =
+ BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0);
+ if (!ExitBlock->phis().empty())
+ return true;
+
+ const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+ assert(!isa<SCEVCouldNotCompute>(ExactBTC) && "implied by having exact trip count");
+ if (!SE->isLoopInvariant(ExitCount, L) ||
+ !isSafeToExpand(ExitCount, *SE))
+ return true;
+
+ return false;
+ };
+
+ // If we have any exits which can't be predicated themselves, then we can't
+ // predicate any exit which isn't guaranteed to execute before it. Consider
+ // two exits (a) and (b) which would both exit on the same iteration. If we
+ // can predicate (b), but not (a), and (a) precedes (b) along some path, then
+ // we could convert a loop from exiting through (a) to one exiting through
+ // (b). Note that this problem exists only for exits with the same exit
+ // count, and we could be more aggressive when exit counts are known inequal.
+ llvm::sort(ExitingBlocks,
+ [&](BasicBlock *A, BasicBlock *B) {
+ // std::sort sorts in ascending order, so we want the inverse of
+ // the normal dominance relation, plus a tie breaker for blocks
+ // unordered by dominance.
+ if (DT->properlyDominates(A, B)) return true;
+ if (DT->properlyDominates(B, A)) return false;
+ return A->getName() < B->getName();
+ });
+ // Check to see if our exit blocks are a total order (i.e. a linear chain of
+ // exits before the backedge). If they aren't, reasoning about reachability
+ // is complicated and we choose not to for now.
+ for (unsigned i = 1; i < ExitingBlocks.size(); i++)
+ if (!DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]))
+ return Changed;
+
+ // Given our sorted total order, we know that exit[j] must be evaluated
+ // after all exit[i] such j > i.
+ for (unsigned i = 0, e = ExitingBlocks.size(); i < e; i++)
+ if (BadExit(ExitingBlocks[i])) {
+ ExitingBlocks.resize(i);
+ break;
+ }
+
+ if (ExitingBlocks.empty())
+ return Changed;
+
+ // We rely on not being able to reach an exiting block on a later iteration
+ // than its statically computed exit count. The implementation of
+ // getExitCount currently has this invariant, but assert it here so that
+ // breakage is obvious if this ever changes.
+ assert(llvm::all_of(ExitingBlocks, [&](BasicBlock *ExitingBB) {
+ return DT->dominates(ExitingBB, L->getLoopLatch());
+ }));
+
+ // At this point, ExitingBlocks consists of only those blocks which are
+ // predicatable. Given that, we know we have at least one exit we can
+ // predicate if the loop doesn't have side effects and doesn't have any
+ // implicit exits (because then our exact BTC isn't actually exact).
+ // @Reviewers - As structured, this is O(I^2) for loop nests. Any
+ // suggestions on how to improve this? I can obviously bail out for outer
+ // loops, but that seems less than ideal. MemorySSA can find memory writes,
+ // is that enough for *all* side effects?
+ for (BasicBlock *BB : L->blocks())
+ for (auto &I : *BB)
+ // TODO:isGuaranteedToTransfer
+ if (I.mayHaveSideEffects() || I.mayThrow())
+ return Changed;
+
+ // Finally, do the actual predication for all predicatable blocks. A couple
+ // of notes here:
+ // 1) We don't bother to constant fold dominated exits with identical exit
+ // counts; that's simply a form of CSE/equality propagation and we leave
+ // it for dedicated passes.
+ // 2) We insert the comparison at the branch. Hoisting introduces additional
+ // legality constraints and we leave that to dedicated logic. We want to
+ // predicate even if we can't insert a loop invariant expression as
+ // peeling or unrolling will likely reduce the cost of the otherwise loop
+ // varying check.
+ Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator());
+ IRBuilder<> B(L->getLoopPreheader()->getTerminator());
+ Value *ExactBTCV = nullptr; // lazily generated if needed
+ for (BasicBlock *ExitingBB : ExitingBlocks) {
+ const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+
+ auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ Value *NewCond;
+ if (ExitCount == ExactBTC) {
+ NewCond = L->contains(BI->getSuccessor(0)) ?
+ B.getFalse() : B.getTrue();
+ } else {
+ Value *ECV = Rewriter.expandCodeFor(ExitCount);
+ if (!ExactBTCV)
+ ExactBTCV = Rewriter.expandCodeFor(ExactBTC);
+ Value *RHS = ExactBTCV;
+ if (ECV->getType() != RHS->getType()) {
+ Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType());
+ ECV = B.CreateZExt(ECV, WiderTy);
+ RHS = B.CreateZExt(RHS, WiderTy);
+ }
+ auto Pred = L->contains(BI->getSuccessor(0)) ?
+ ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
+ NewCond = B.CreateICmp(Pred, ECV, RHS);
+ }
+ Value *OldCond = BI->getCondition();
+ BI->setCondition(NewCond);
+ if (OldCond->use_empty())
+ DeadInsts.push_back(OldCond);
+ Changed = true;
+ }
+
+ return Changed;
+}
+
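
The overall effect of predicateLoopExits is easier to see on a hypothetical source-level pair (neither function is from the patch; n and m are assumed nonnegative):

  // Read-only loop with two analyzable exits; the exact backedge-taken count
  // is min(n, m).
  int before(int n, int m) {
    for (int i = 0; ; ++i) {
      if (i == n) return 0;          // exit count: n
      if (i == m) return 1;          // exit count: m
    }
  }
  // What -indvars-predicate-loops aims for: each exit test becomes the loop
  // invariant question "is this exit's count the exact backedge-taken count?",
  // so the loop still leaves through the same exit, just without iterating.
  int after(int n, int m) {
    int exactBTC = n < m ? n : m;    // expanded once, in the preheader
    for (int i = 0; ; ++i) {
      if (n == exactBTC) return 0;
      if (m == exactBTC) return 1;
    }
  }
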
//===----------------------------------------------------------------------===//
// IndVarSimplify driver. Manage several subpasses of IV simplification.
//===----------------------------------------------------------------------===//
@@ -2755,7 +2995,10 @@ bool IndVarSimplify::run(Loop *L) {
// transform them to use integer recurrences.
Changed |= rewriteNonIntegerIVs(L);
+#ifndef NDEBUG
+ // Used below for a consistency check only
const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+#endif
// Create a rewriter object which we'll use to transform the code with.
SCEVExpander Rewriter(*SE, DL, "indvars");
@@ -2772,20 +3015,22 @@ bool IndVarSimplify::run(Loop *L) {
Rewriter.disableCanonicalMode();
Changed |= simplifyAndExtend(L, Rewriter, LI);
- // Check to see if this loop has a computable loop-invariant execution count.
- // If so, this means that we can compute the final value of any expressions
+ // Check to see if we can compute the final value of any expressions
// that are recurrent in the loop, and substitute the exit values from the
- // loop into any instructions outside of the loop that use the final values of
- // the current expressions.
- //
- if (ReplaceExitValue != NeverRepl &&
- !isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ // loop into any instructions outside of the loop that use the final values
+ // of the current expressions.
+ if (ReplaceExitValue != NeverRepl)
Changed |= rewriteLoopExitValues(L, Rewriter);
// Eliminate redundant IV cycles.
NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts);
- Changed |= optimizeLoopExits(L);
+ // Try to eliminate loop exits based on analyzable exit counts.
+ Changed |= optimizeLoopExits(L, Rewriter);
+
+ // Try to form loop invariant tests for loop exits by changing how many
+ // iterations of the loop run when that is unobservable.
+ Changed |= predicateLoopExits(L, Rewriter);
// If we have a trip count expression, rewrite the loop's exit condition
// using it.
@@ -2825,7 +3070,7 @@ bool IndVarSimplify::run(Loop *L) {
// that our definition of "high cost" is not exactly principled.
if (Rewriter.isHighCostExpansion(ExitCount, L))
continue;
-
+
// Check preconditions for proper SCEVExpander operation. SCEV does not
// express SCEVExpander's dependencies, such as LoopSimplify. Instead
// any pass that uses the SCEVExpander must do it. This does not work
@@ -2924,7 +3169,7 @@ struct IndVarSimplifyLegacyPass : public LoopPass {
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ auto *TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
auto *TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;
const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
diff --git a/lib/Transforms/Scalar/InferAddressSpaces.cpp b/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 5f0e2001c73d..e7e73a132fbe 100644
--- a/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -141,6 +141,8 @@ using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
/// InferAddressSpaces
class InferAddressSpaces : public FunctionPass {
+ const TargetTransformInfo *TTI;
+
/// Target specific address space which uses of should be replaced if
/// possible.
unsigned FlatAddrSpace;
@@ -264,17 +266,6 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
Module *M = II->getParent()->getParent()->getParent();
switch (II->getIntrinsicID()) {
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
- case Intrinsic::amdgcn_ds_fadd:
- case Intrinsic::amdgcn_ds_fmin:
- case Intrinsic::amdgcn_ds_fmax: {
- const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
- if (!IsVolatile || !IsVolatile->isZero())
- return false;
-
- LLVM_FALLTHROUGH;
- }
case Intrinsic::objectsize: {
Type *DestTy = II->getType();
Type *SrcTy = NewV->getType();
@@ -285,25 +276,27 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
return true;
}
default:
- return false;
+ return TTI->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
}
}
-// TODO: Move logic to TTI?
void InferAddressSpaces::collectRewritableIntrinsicOperands(
IntrinsicInst *II, std::vector<std::pair<Value *, bool>> &PostorderStack,
DenseSet<Value *> &Visited) const {
- switch (II->getIntrinsicID()) {
+ auto IID = II->getIntrinsicID();
+ switch (IID) {
case Intrinsic::objectsize:
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
- case Intrinsic::amdgcn_ds_fadd:
- case Intrinsic::amdgcn_ds_fmin:
- case Intrinsic::amdgcn_ds_fmax:
appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
PostorderStack, Visited);
break;
default:
+ SmallVector<int, 2> OpIndexes;
+ if (TTI->collectFlatAddressOperands(OpIndexes, IID)) {
+ for (int Idx : OpIndexes) {
+ appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(Idx),
+ PostorderStack, Visited);
+ }
+ }
break;
}
}
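
The pass now defers target knowledge to two TTI hooks: collectFlatAddressOperands reports which operands of an intrinsic are rewritable pointers, and rewriteIntrinsicWithAddressSpace performs the rewrite. A rough sketch of the first hook's contract, with a stand-in intrinsic and operand index that are not taken from any real target:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/Intrinsics.h"
  using namespace llvm;

  // Stand-in for a target's collectFlatAddressOperands(): report which operand
  // indexes of a given intrinsic hold pointers that may be rewritten into a
  // specific address space. The intrinsic chosen here is only an example.
  static bool exampleCollectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                                Intrinsic::ID IID) {
    switch (IID) {
    case Intrinsic::prefetch:
      OpIndexes.push_back(0);   // the pointer operand
      return true;
    default:
      return false;
    }
  }
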
@@ -631,11 +624,10 @@ bool InferAddressSpaces::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
- const TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
if (FlatAddrSpace == UninitializedAddressSpace) {
- FlatAddrSpace = TTI.getFlatAddressSpace();
+ FlatAddrSpace = TTI->getFlatAddressSpace();
if (FlatAddrSpace == UninitializedAddressSpace)
return false;
}
@@ -650,7 +642,7 @@ bool InferAddressSpaces::runOnFunction(Function &F) {
// Changes the address spaces of the flat address expressions who are inferred
// to point to a specific address space.
- return rewriteWithNewAddressSpaces(TTI, Postorder, InferredAddrSpace, &F);
+ return rewriteWithNewAddressSpaces(*TTI, Postorder, InferredAddrSpace, &F);
}
// Constants need to be tracked through RAUW to handle cases with nested
diff --git a/lib/Transforms/Scalar/InstSimplifyPass.cpp b/lib/Transforms/Scalar/InstSimplifyPass.cpp
index 6616364ab203..ec28f790f252 100644
--- a/lib/Transforms/Scalar/InstSimplifyPass.cpp
+++ b/lib/Transforms/Scalar/InstSimplifyPass.cpp
@@ -33,37 +33,39 @@ static bool runImpl(Function &F, const SimplifyQuery &SQ,
bool Changed = false;
do {
- for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
- // Here be subtlety: the iterator must be incremented before the loop
- // body (not sure why), so a range-for loop won't work here.
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
- Instruction *I = &*BI++;
- // The first time through the loop ToSimplify is empty and we try to
- // simplify all instructions. On later iterations ToSimplify is not
+ for (BasicBlock &BB : F) {
+ // Unreachable code can take on strange forms that we are not prepared to
+ // handle. For example, an instruction may have itself as an operand.
+ if (!SQ.DT->isReachableFromEntry(&BB))
+ continue;
+
+ SmallVector<Instruction *, 8> DeadInstsInBB;
+ for (Instruction &I : BB) {
+ // The first time through the loop, ToSimplify is empty and we try to
+ // simplify all instructions. On later iterations, ToSimplify is not
// empty and we only bother simplifying instructions that are in it.
- if (!ToSimplify->empty() && !ToSimplify->count(I))
+ if (!ToSimplify->empty() && !ToSimplify->count(&I))
continue;
- // Don't waste time simplifying unused instructions.
- if (!I->use_empty()) {
- if (Value *V = SimplifyInstruction(I, SQ, ORE)) {
+ // Don't waste time simplifying dead/unused instructions.
+ if (isInstructionTriviallyDead(&I)) {
+ DeadInstsInBB.push_back(&I);
+ Changed = true;
+ } else if (!I.use_empty()) {
+ if (Value *V = SimplifyInstruction(&I, SQ, ORE)) {
// Mark all uses for resimplification next time round the loop.
- for (User *U : I->users())
+ for (User *U : I.users())
Next->insert(cast<Instruction>(U));
- I->replaceAllUsesWith(V);
+ I.replaceAllUsesWith(V);
++NumSimplified;
Changed = true;
+ // A call can get simplified, but it may not be trivially dead.
+ if (isInstructionTriviallyDead(&I))
+ DeadInstsInBB.push_back(&I);
}
}
- if (RecursivelyDeleteTriviallyDeadInstructions(I, SQ.TLI)) {
- // RecursivelyDeleteTriviallyDeadInstruction can remove more than one
- // instruction, so simply incrementing the iterator does not work.
- // When instructions get deleted re-iterate instead.
- BI = BB->begin();
- BE = BB->end();
- Changed = true;
- }
}
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInstsInBB, SQ.TLI);
}
// Place the list of instructions to simplify on the next loop iteration
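
The rewrite above replaces erase-while-iterating with a collect-then-delete pattern. The same pattern in isolation, under a hypothetical helper name, mirroring the calls used in the new code:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/Transforms/Utils/Local.h"
  using namespace llvm;

  // Never erase while range-iterating a block; collect trivially dead
  // instructions first and hand the batch to
  // RecursivelyDeleteTriviallyDeadInstructions once iteration is done.
  static bool deleteDeadInBlock(BasicBlock &BB, const TargetLibraryInfo *TLI) {
    SmallVector<Instruction *, 8> Dead;
    for (Instruction &I : BB)
      if (isInstructionTriviallyDead(&I, TLI))
        Dead.push_back(&I);
    if (Dead.empty())
      return false;
    RecursivelyDeleteTriviallyDeadInstructions(Dead, TLI);
    return true;
  }
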
@@ -90,7 +92,7 @@ struct InstSimplifyLegacyPass : public FunctionPass {
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
- /// runOnFunction - Remove instructions that simplify.
+ /// Remove instructions that simplify.
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
@@ -98,7 +100,7 @@ struct InstSimplifyLegacyPass : public FunctionPass {
const DominatorTree *DT =
&getAnalysis<DominatorTreeWrapperPass>().getDomTree();
const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
AssumptionCache *AC =
&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
OptimizationRemarkEmitter *ORE =
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index b86bf2fefbe5..0cf00baaa24a 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -224,13 +224,21 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
BasicBlock *PhiBB) -> std::pair<BasicBlock *, BasicBlock *> {
auto *PredBB = IncomingBB;
auto *SuccBB = PhiBB;
+ SmallPtrSet<BasicBlock *, 16> Visited;
while (true) {
BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
if (PredBr && PredBr->isConditional())
return {PredBB, SuccBB};
+ Visited.insert(PredBB);
auto *SinglePredBB = PredBB->getSinglePredecessor();
if (!SinglePredBB)
return {nullptr, nullptr};
+
+ // Stop searching when SinglePredBB has been visited. It means we have
+ // found an unreachable loop.
+ if (Visited.count(SinglePredBB))
+ return {nullptr, nullptr};
+
SuccBB = PredBB;
PredBB = SinglePredBB;
}
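
The guard added above, reduced to its essentials: a visited set stops the walk up single-predecessor chains from spinning on a cycle of unreachable blocks. The helper name below is illustrative:

  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/IR/BasicBlock.h"
  using namespace llvm;

  // Walk up through single predecessors, remembering visited blocks so a cycle
  // of unreachable blocks (each the sole predecessor of the next) terminates.
  static BasicBlock *walkSinglePredChain(BasicBlock *BB) {
    SmallPtrSet<BasicBlock *, 16> Visited;
    while (true) {
      Visited.insert(BB);
      BasicBlock *Pred = BB->getSinglePredecessor();
      if (!Pred)
        return BB;          // no unique predecessor left to follow
      if (Visited.count(Pred))
        return nullptr;     // unreachable cycle, give up
      BB = Pred;
    }
  }
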
@@ -253,7 +261,9 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
return;
BasicBlock *PredBB = PredOutEdge.first;
- BranchInst *PredBr = cast<BranchInst>(PredBB->getTerminator());
+ BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
+ if (!PredBr)
+ return;
uint64_t PredTrueWeight, PredFalseWeight;
// FIXME: We currently only set the profile data when it is missing.
@@ -286,7 +296,7 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
bool JumpThreading::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
- auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
// Get DT analysis before LVI. When LVI is initialized it conditionally adds
// DT if it's available.
auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -1461,7 +1471,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) {
"Can't handle critical edge here!");
LoadInst *NewVal = new LoadInst(
LoadI->getType(), LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
- LoadI->getName() + ".pr", false, LoadI->getAlignment(),
+ LoadI->getName() + ".pr", false, MaybeAlign(LoadI->getAlignment()),
LoadI->getOrdering(), LoadI->getSyncScopeID(),
UnavailablePred->getTerminator());
NewVal->setDebugLoc(LoadI->getDebugLoc());
@@ -2423,7 +2433,7 @@ void JumpThreadingPass::UnfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB,
// |-----
// v
// BB
- BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
+ BranchInst *PredTerm = cast<BranchInst>(Pred->getTerminator());
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold",
BB->getParent(), BB);
// Move the unconditional branch to NewBB.
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index d9dda4cef2d2..6ce4831a7359 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -220,7 +220,8 @@ struct LegacyLICMPass : public LoopPass {
&getAnalysis<AAResultsWrapperPass>().getAAResults(),
&getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
&getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ *L->getHeader()->getParent()),
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
*L->getHeader()->getParent()),
SE ? &SE->getSE() : nullptr, MSSA, &ORE, false);
@@ -294,7 +295,7 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<LoopAnalysis>();
- if (EnableMSSALoopDependency)
+ if (AR.MSSA)
PA.preserve<MemorySSAAnalysis>();
return PA;
@@ -330,6 +331,12 @@ bool LoopInvariantCodeMotion::runOnLoop(
assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
+ // If this loop has metadata indicating that LICM is not to be performed then
+ // just exit.
+ if (hasDisableLICMTransformsHint(L)) {
+ return false;
+ }
+
std::unique_ptr<AliasSetTracker> CurAST;
std::unique_ptr<MemorySSAUpdater> MSSAU;
bool NoOfMemAccTooLarge = false;
@@ -340,7 +347,7 @@ bool LoopInvariantCodeMotion::runOnLoop(
CurAST = collectAliasInfoForLoop(L, LI, AA);
} else {
LLVM_DEBUG(dbgs() << "LICM: Using MemorySSA.\n");
- MSSAU = make_unique<MemorySSAUpdater>(MSSA);
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
unsigned AccessCapCount = 0;
for (auto *BB : L->getBlocks()) {
@@ -956,7 +963,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
// Now that we've finished hoisting make sure that LI and DT are still
// valid.
-#ifndef NDEBUG
+#ifdef EXPENSIVE_CHECKS
if (Changed) {
assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
"Dominator tree verification failed");
@@ -1026,7 +1033,8 @@ namespace {
bool isHoistableAndSinkableInst(Instruction &I) {
// Only these instructions are hoistable/sinkable.
return (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<CallInst>(I) ||
- isa<FenceInst>(I) || isa<BinaryOperator>(I) || isa<CastInst>(I) ||
+ isa<FenceInst>(I) || isa<CastInst>(I) ||
+ isa<UnaryOperator>(I) || isa<BinaryOperator>(I) ||
isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) ||
@@ -1092,7 +1100,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
// in the same alias set as something that ends up being modified.
if (AA->pointsToConstantMemory(LI->getOperand(0)))
return true;
- if (LI->getMetadata(LLVMContext::MD_invariant_load))
+ if (LI->hasMetadata(LLVMContext::MD_invariant_load))
return true;
if (LI->isAtomic() && !TargetExecutesOncePerLoop)
@@ -1240,12 +1248,22 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
// FIXME: More precise: no Uses that alias SI.
if (!Flags->IsSink && !MSSA->dominates(SIMD, MU))
return false;
- } else if (const auto *MD = dyn_cast<MemoryDef>(&MA))
+ } else if (const auto *MD = dyn_cast<MemoryDef>(&MA)) {
if (auto *LI = dyn_cast<LoadInst>(MD->getMemoryInst())) {
(void)LI; // Silence warning.
assert(!LI->isUnordered() && "Expected unordered load");
return false;
}
+ // Any call, while it may not be clobbering SI, it may be a use.
+ if (auto *CI = dyn_cast<CallInst>(MD->getMemoryInst())) {
+ // Check if the call may read from the memory location written
+ // to by SI. Check CI's attributes and arguments; the number of
+ // such checks performed is limited above by NoOfMemAccTooLarge.
+ ModRefInfo MRI = AA->getModRefInfo(CI, MemoryLocation::get(SI));
+ if (isModOrRefSet(MRI))
+ return false;
+ }
+ }
}
auto *Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(SI);
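
The added check asks alias analysis whether the call might touch the location the store writes; any mod or ref makes the motion unsafe. The same query in isolation (helper name illustrative):

  #include "llvm/Analysis/AliasAnalysis.h"
  #include "llvm/Analysis/MemoryLocation.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Before moving the store SI past the call CI, ask alias analysis whether
  // CI may read or write the stored-to location.
  static bool callBlocksStoreMotion(AAResults &AA, CallInst *CI, StoreInst *SI) {
    ModRefInfo MRI = AA.getModRefInfo(CI, MemoryLocation::get(SI));
    return isModOrRefSet(MRI);
  }
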
@@ -1375,8 +1393,7 @@ static Instruction *CloneInstructionInExitBlock(
if (!I.getName().empty())
New->setName(I.getName() + ".le");
- MemoryAccess *OldMemAcc;
- if (MSSAU && (OldMemAcc = MSSAU->getMemorySSA()->getMemoryAccess(&I))) {
+ if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
// Create a new MemoryAccess and let MemorySSA set its defining access.
MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
New, nullptr, New->getParent(), MemorySSA::Beginning);
@@ -1385,7 +1402,7 @@ static Instruction *CloneInstructionInExitBlock(
MSSAU->insertDef(MemDef, /*RenameUses=*/true);
else {
auto *MemUse = cast<MemoryUse>(NewMemAcc);
- MSSAU->insertUse(MemUse);
+ MSSAU->insertUse(MemUse, /*RenameUses=*/true);
}
}
}
@@ -1783,7 +1800,7 @@ public:
StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
if (UnorderedAtomic)
NewSI->setOrdering(AtomicOrdering::Unordered);
- NewSI->setAlignment(Alignment);
+ NewSI->setAlignment(MaybeAlign(Alignment));
NewSI->setDebugLoc(DL);
if (AATags)
NewSI->setAAMetadata(AATags);
@@ -2016,7 +2033,8 @@ bool llvm::promoteLoopAccessesToScalars(
if (!DereferenceableInPH) {
DereferenceableInPH = isDereferenceableAndAlignedPointer(
Store->getPointerOperand(), Store->getValueOperand()->getType(),
- Store->getAlignment(), MDL, Preheader->getTerminator(), DT);
+ MaybeAlign(Store->getAlignment()), MDL,
+ Preheader->getTerminator(), DT);
}
} else
return false; // Not a load or store.
@@ -2101,20 +2119,21 @@ bool llvm::promoteLoopAccessesToScalars(
SomePtr->getName() + ".promoted", Preheader->getTerminator());
if (SawUnorderedAtomic)
PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
- PreheaderLoad->setAlignment(Alignment);
+ PreheaderLoad->setAlignment(MaybeAlign(Alignment));
PreheaderLoad->setDebugLoc(DL);
if (AATags)
PreheaderLoad->setAAMetadata(AATags);
SSA.AddAvailableValue(Preheader, PreheaderLoad);
- MemoryAccess *PreheaderLoadMemoryAccess;
if (MSSAU) {
- PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB(
+ MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB(
PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End);
MemoryUse *NewMemUse = cast<MemoryUse>(PreheaderLoadMemoryAccess);
- MSSAU->insertUse(NewMemUse);
+ MSSAU->insertUse(NewMemUse, /*RenameUses=*/true);
}
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
// Rewrite all the loads in the loop and remember all the definitions from
// stores in the loop.
Promoter.run(LoopUses);
@@ -2161,7 +2180,7 @@ LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
LoopToAliasSetMap.erase(MapI);
}
if (!CurAST)
- CurAST = make_unique<AliasSetTracker>(*AA);
+ CurAST = std::make_unique<AliasSetTracker>(*AA);
// Add everything from the sub loops that are no longer directly available.
for (Loop *InnerL : RecomputeLoops)
@@ -2180,7 +2199,7 @@ std::unique_ptr<AliasSetTracker>
LoopInvariantCodeMotion::collectAliasInfoForLoopWithMSSA(
Loop *L, AliasAnalysis *AA, MemorySSAUpdater *MSSAU) {
auto *MSSA = MSSAU->getMemorySSA();
- auto CurAST = make_unique<AliasSetTracker>(*AA, MSSA, L);
+ auto CurAST = std::make_unique<AliasSetTracker>(*AA, MSSA, L);
CurAST->addAllInstructionsInLoopUsingMSSA();
return CurAST;
}
diff --git a/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index 1fcf1315a177..a972d6fa2fcd 100644
--- a/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -312,8 +312,8 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
IRBuilder<> Builder(MemI);
Module *M = BB->getParent()->getParent();
Type *I32 = Type::getInt32Ty(BB->getContext());
- Function *PrefetchFunc =
- Intrinsic::getDeclaration(M, Intrinsic::prefetch);
+ Function *PrefetchFunc = Intrinsic::getDeclaration(
+ M, Intrinsic::prefetch, PrefPtrValue->getType());
Builder.CreateCall(
PrefetchFunc,
{PrefPtrValue,
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 8371367e24e7..cee197cf8354 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -191,7 +191,7 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
// Don't remove loops for which we can't solve the trip count.
// They could be infinite, in which case we'd be changing program behavior.
- const SCEV *S = SE.getMaxBackedgeTakenCount(L);
+ const SCEV *S = SE.getConstantMaxBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(S)) {
LLVM_DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n");
return Changed ? LoopDeletionResult::Modified
diff --git a/lib/Transforms/Scalar/LoopFuse.cpp b/lib/Transforms/Scalar/LoopFuse.cpp
index 0bc2bcff2ae1..9f93c68e6128 100644
--- a/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/lib/Transforms/Scalar/LoopFuse.cpp
@@ -66,7 +66,7 @@ using namespace llvm;
#define DEBUG_TYPE "loop-fusion"
-STATISTIC(FuseCounter, "Count number of loop fusions performed");
+STATISTIC(FuseCounter, "Loops fused");
STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");
STATISTIC(InvalidPreheader, "Loop has invalid preheader");
STATISTIC(InvalidHeader, "Loop has invalid header");
@@ -79,12 +79,15 @@ STATISTIC(MayThrowException, "Loop may throw an exception");
STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");
STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");
STATISTIC(InvalidDependencies, "Dependencies prevent fusion");
-STATISTIC(InvalidTripCount,
- "Loop does not have invariant backedge taken count");
+STATISTIC(UnknownTripCount, "Loop has unknown trip count");
STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");
-STATISTIC(NonEqualTripCount, "Candidate trip counts are not the same");
-STATISTIC(NonAdjacent, "Candidates are not adjacent");
-STATISTIC(NonEmptyPreheader, "Candidate has a non-empty preheader");
+STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
+STATISTIC(NonAdjacent, "Loops are not adjacent");
+STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader");
+STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
+STATISTIC(NonIdenticalGuards, "Candidates have different guards");
+STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block");
+STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block");
enum FusionDependenceAnalysisChoice {
FUSION_DEPENDENCE_ANALYSIS_SCEV,
@@ -110,6 +113,7 @@ static cl::opt<bool>
cl::Hidden, cl::init(false), cl::ZeroOrMore);
#endif
+namespace {
/// This class is used to represent a candidate for loop fusion. When it is
/// constructed, it checks the conditions for loop fusion to ensure that it
/// represents a valid candidate. It caches several parts of a loop that are
@@ -143,6 +147,8 @@ struct FusionCandidate {
SmallVector<Instruction *, 16> MemWrites;
/// Are all of the members of this fusion candidate still valid
bool Valid;
+ /// Guard branch of the loop, if it exists
+ BranchInst *GuardBranch;
/// Dominator and PostDominator trees are needed for the
/// FusionCandidateCompare function, required by FusionCandidateSet to
@@ -151,11 +157,20 @@ struct FusionCandidate {
const DominatorTree *DT;
const PostDominatorTree *PDT;
+ OptimizationRemarkEmitter &ORE;
+
FusionCandidate(Loop *L, const DominatorTree *DT,
- const PostDominatorTree *PDT)
+ const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE)
: Preheader(L->getLoopPreheader()), Header(L->getHeader()),
ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
- Latch(L->getLoopLatch()), L(L), Valid(true), DT(DT), PDT(PDT) {
+ Latch(L->getLoopLatch()), L(L), Valid(true), GuardBranch(nullptr),
+ DT(DT), PDT(PDT), ORE(ORE) {
+
+ // TODO: This is temporary while we fuse both rotated and non-rotated
+ // loops. Once we switch to only fusing rotated loops, the initialization of
+ // GuardBranch can be moved into the initialization list above.
+ if (isRotated())
+ GuardBranch = L->getLoopGuardBranch();
// Walk over all blocks in the loop and check for conditions that may
// prevent fusion. For each block, walk over all instructions and collect
@@ -163,28 +178,28 @@ struct FusionCandidate {
// found, invalidate this object and return.
for (BasicBlock *BB : L->blocks()) {
if (BB->hasAddressTaken()) {
- AddressTakenBB++;
invalidate();
+ reportInvalidCandidate(AddressTakenBB);
return;
}
for (Instruction &I : *BB) {
if (I.mayThrow()) {
- MayThrowException++;
invalidate();
+ reportInvalidCandidate(MayThrowException);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
if (SI->isVolatile()) {
- ContainsVolatileAccess++;
invalidate();
+ reportInvalidCandidate(ContainsVolatileAccess);
return;
}
}
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
if (LI->isVolatile()) {
- ContainsVolatileAccess++;
invalidate();
+ reportInvalidCandidate(ContainsVolatileAccess);
return;
}
}
@@ -214,19 +229,96 @@ struct FusionCandidate {
assert(Latch == L->getLoopLatch() && "Latch is out of sync");
}
+ /// Get the entry block for this fusion candidate.
+ ///
+ /// If this fusion candidate represents a guarded loop, the entry block is the
+ /// loop guard block. If it represents an unguarded loop, the entry block is
+ /// the preheader of the loop.
+ BasicBlock *getEntryBlock() const {
+ if (GuardBranch)
+ return GuardBranch->getParent();
+ else
+ return Preheader;
+ }
+
+ /// Given a guarded loop, get the successor of the guard that is not in the
+ /// loop.
+ ///
+ /// This method returns the successor of the loop guard that is not located
+ /// within the loop (i.e., the successor of the guard that is not the
+ /// preheader).
+ /// This method is only valid for guarded loops.
+ BasicBlock *getNonLoopBlock() const {
+ assert(GuardBranch && "Only valid on guarded loops.");
+ assert(GuardBranch->isConditional() &&
+ "Expecting guard to be a conditional branch.");
+ return (GuardBranch->getSuccessor(0) == Preheader)
+ ? GuardBranch->getSuccessor(1)
+ : GuardBranch->getSuccessor(0);
+ }
+
+ bool isRotated() const {
+ assert(L && "Expecting loop to be valid.");
+ assert(Latch && "Expecting latch to be valid.");
+ return L->isLoopExiting(Latch);
+ }
+
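
For orientation, a hypothetical source-level shape of a guarded, rotated loop (not from the patch): the guard branch is the test that can skip the loop entirely, so getEntryBlock() returns the guard's block for guarded candidates and getNonLoopBlock() returns the guard successor that bypasses the loop.

  void guarded(int *a, int n) {
    if (n > 0) {            // guard branch
      int i = 0;
      do {                  // rotated loop: the latch is also the exiting block
        a[i] = 0;
      } while (++i < n);
    }
    // code after the if is the guard's non-loop successor
  }
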
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void dump() const {
- dbgs() << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
+ dbgs() << "\tGuardBranch: "
+ << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"
+ << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
<< "\n"
<< "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"
<< "\tExitingBB: "
<< (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"
<< "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")
<< "\n"
- << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n";
+ << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"
+ << "\tEntryBlock: "
+ << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")
+ << "\n";
}
#endif
+ /// Determine if a fusion candidate (representing a loop) is eligible for
+ /// fusion. Note that this only checks whether a single loop can be fused - it
+ /// does not check whether it is *legal* to fuse two loops together.
+ bool isEligibleForFusion(ScalarEvolution &SE) const {
+ if (!isValid()) {
+ LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");
+ if (!Preheader)
+ ++InvalidPreheader;
+ if (!Header)
+ ++InvalidHeader;
+ if (!ExitingBlock)
+ ++InvalidExitingBlock;
+ if (!ExitBlock)
+ ++InvalidExitBlock;
+ if (!Latch)
+ ++InvalidLatch;
+ if (L->isInvalid())
+ ++InvalidLoop;
+
+ return false;
+ }
+
+ // Require ScalarEvolution to be able to determine a trip count.
+ if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
+ LLVM_DEBUG(dbgs() << "Loop " << L->getName()
+ << " trip count not computable!\n");
+ return reportInvalidCandidate(UnknownTripCount);
+ }
+
+ if (!L->isLoopSimplifyForm()) {
+ LLVM_DEBUG(dbgs() << "Loop " << L->getName()
+ << " is not in simplified form!\n");
+ return reportInvalidCandidate(NotSimplifiedForm);
+ }
+
+ return true;
+ }
+
private:
// This is only used internally for now, to clear the MemWrites and MemReads
// list and setting Valid to false. I can't envision other uses of this right
@@ -239,17 +331,18 @@ private:
MemReads.clear();
Valid = false;
}
-};
-inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
- const FusionCandidate &FC) {
- if (FC.isValid())
- OS << FC.Preheader->getName();
- else
- OS << "<Invalid>";
-
- return OS;
-}
+ bool reportInvalidCandidate(llvm::Statistic &Stat) const {
+ using namespace ore;
+ assert(L && Preheader && "Fusion candidate not initialized properly!");
+ ++Stat;
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(),
+ L->getStartLoc(), Preheader)
+ << "[" << Preheader->getParent()->getName() << "]: "
+ << "Loop is not a candidate for fusion: " << Stat.getDesc());
+ return false;
+ }
+};
struct FusionCandidateCompare {
/// Comparison functor to sort two Control Flow Equivalent fusion candidates
@@ -260,21 +353,24 @@ struct FusionCandidateCompare {
const FusionCandidate &RHS) const {
const DominatorTree *DT = LHS.DT;
+ BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
+ BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
+
// Do not save PDT to local variable as it is only used in asserts and thus
// will trigger an unused variable warning if building without asserts.
assert(DT && LHS.PDT && "Expecting valid dominator tree");
// Do this compare first so if LHS == RHS, function returns false.
- if (DT->dominates(RHS.Preheader, LHS.Preheader)) {
+ if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {
// RHS dominates LHS
// Verify LHS post-dominates RHS
- assert(LHS.PDT->dominates(LHS.Preheader, RHS.Preheader));
+ assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
return false;
}
- if (DT->dominates(LHS.Preheader, RHS.Preheader)) {
+ if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {
// Verify RHS Postdominates LHS
- assert(LHS.PDT->dominates(RHS.Preheader, LHS.Preheader));
+ assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
return true;
}
@@ -286,7 +382,6 @@ struct FusionCandidateCompare {
}
};
-namespace {
using LoopVector = SmallVector<Loop *, 4>;
// Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance
@@ -301,17 +396,26 @@ using LoopVector = SmallVector<Loop *, 4>;
// keeps the FusionCandidateSet sorted will also simplify the implementation.
using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;
-} // namespace
-inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
+#if !defined(NDEBUG)
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
+ const FusionCandidate &FC) {
+ if (FC.isValid())
+ OS << FC.Preheader->getName();
+ else
+ OS << "<Invalid>";
+
+ return OS;
+}
+
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
const FusionCandidateSet &CandSet) {
- for (auto IT : CandSet)
- OS << IT << "\n";
+ for (const FusionCandidate &FC : CandSet)
+ OS << FC << '\n';
return OS;
}
-#if !defined(NDEBUG)
static void
printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
dbgs() << "Fusion Candidates: \n";
@@ -391,16 +495,6 @@ static void printLoopVector(const LoopVector &LV) {
}
#endif
-static void reportLoopFusion(const FusionCandidate &FC0,
- const FusionCandidate &FC1,
- OptimizationRemarkEmitter &ORE) {
- using namespace ore;
- ORE.emit(
- OptimizationRemark(DEBUG_TYPE, "LoopFusion", FC0.Preheader->getParent())
- << "Fused " << NV("Cand1", StringRef(FC0.Preheader->getName()))
- << " with " << NV("Cand2", StringRef(FC1.Preheader->getName())));
-}
-
struct LoopFuser {
private:
// Sets of control flow equivalent fusion candidates for a given nest level.
@@ -497,53 +591,16 @@ private:
const FusionCandidate &FC1) const {
assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
- if (DT.dominates(FC0.Preheader, FC1.Preheader))
- return PDT.dominates(FC1.Preheader, FC0.Preheader);
+ BasicBlock *FC0EntryBlock = FC0.getEntryBlock();
+ BasicBlock *FC1EntryBlock = FC1.getEntryBlock();
- if (DT.dominates(FC1.Preheader, FC0.Preheader))
- return PDT.dominates(FC0.Preheader, FC1.Preheader);
+ if (DT.dominates(FC0EntryBlock, FC1EntryBlock))
+ return PDT.dominates(FC1EntryBlock, FC0EntryBlock);
- return false;
- }
-
- /// Determine if a fusion candidate (representing a loop) is eligible for
- /// fusion. Note that this only checks whether a single loop can be fused - it
- /// does not check whether it is *legal* to fuse two loops together.
- bool eligibleForFusion(const FusionCandidate &FC) const {
- if (!FC.isValid()) {
- LLVM_DEBUG(dbgs() << "FC " << FC << " has invalid CFG requirements!\n");
- if (!FC.Preheader)
- InvalidPreheader++;
- if (!FC.Header)
- InvalidHeader++;
- if (!FC.ExitingBlock)
- InvalidExitingBlock++;
- if (!FC.ExitBlock)
- InvalidExitBlock++;
- if (!FC.Latch)
- InvalidLatch++;
- if (FC.L->isInvalid())
- InvalidLoop++;
+ if (DT.dominates(FC1EntryBlock, FC0EntryBlock))
+ return PDT.dominates(FC0EntryBlock, FC1EntryBlock);
- return false;
- }
-
- // Require ScalarEvolution to be able to determine a trip count.
- if (!SE.hasLoopInvariantBackedgeTakenCount(FC.L)) {
- LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName()
- << " trip count not computable!\n");
- InvalidTripCount++;
- return false;
- }
-
- if (!FC.L->isLoopSimplifyForm()) {
- LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName()
- << " is not in simplified form!\n");
- NotSimplifiedForm++;
- return false;
- }
-
- return true;
+ return false;
}
/// Iterate over all loops in the given loop set and identify the loops that
@@ -551,8 +608,8 @@ private:
/// Flow Equivalent sets, sorted by dominance.
void collectFusionCandidates(const LoopVector &LV) {
for (Loop *L : LV) {
- FusionCandidate CurrCand(L, &DT, &PDT);
- if (!eligibleForFusion(CurrCand))
+ FusionCandidate CurrCand(L, &DT, &PDT, ORE);
+ if (!CurrCand.isEligibleForFusion(SE))
continue;
// Go through each list in FusionCandidates and determine if L is control
@@ -664,31 +721,64 @@ private:
if (!identicalTripCounts(*FC0, *FC1)) {
LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
"counts. Not fusing.\n");
- NonEqualTripCount++;
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEqualTripCount);
continue;
}
if (!isAdjacent(*FC0, *FC1)) {
LLVM_DEBUG(dbgs()
<< "Fusion candidates are not adjacent. Not fusing.\n");
- NonAdjacent++;
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent);
continue;
}
- // For now we skip fusing if the second candidate has any instructions
- // in the preheader. This is done because we currently do not have the
- // safety checks to determine if it is save to move the preheader of
- // the second candidate past the body of the first candidate. Once
- // these checks are added, this condition can be removed.
+ // Ensure that FC0 and FC1 have identical guards.
+ // If one (or both) are not guarded, this check is not necessary.
+ if (FC0->GuardBranch && FC1->GuardBranch &&
+ !haveIdenticalGuards(*FC0, *FC1)) {
+ LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
+ "guards. Not Fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonIdenticalGuards);
+ continue;
+ }
+
+ // The following three checks look for empty blocks in FC0 and FC1. If
+ // any of these blocks are non-empty, we do not fuse. This is done
+ // because we currently do not have the safety checks to determine if
+ // it is safe to move the blocks past other blocks in the loop. Once
+ // these checks are added, these conditions can be relaxed.
if (!isEmptyPreheader(*FC1)) {
LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty "
"preheader. Not fusing.\n");
- NonEmptyPreheader++;
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEmptyPreheader);
+ continue;
+ }
+
+ if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) {
+ LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit "
+ "block. Not fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEmptyExitBlock);
+ continue;
+ }
+
+ if (FC1->GuardBranch && !isEmptyGuardBlock(*FC1)) {
+ LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty guard "
+ "block. Not fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEmptyGuardBlock);
continue;
}
+ // Check the dependencies across the loops and do not fuse if it would
+ // violate them.
if (!dependencesAllowFusion(*FC0, *FC1)) {
LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ InvalidDependencies);
continue;
}
@@ -696,9 +786,11 @@ private:
LLVM_DEBUG(dbgs()
<< "\tFusion appears to be "
<< (BeneficialToFuse ? "" : "un") << "profitable!\n");
- if (!BeneficialToFuse)
+ if (!BeneficialToFuse) {
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ FusionNotBeneficial);
continue;
-
+ }
// All analysis has completed and has determined that fusion is legal
// and profitable. At this point, start transforming the code and
// perform fusion.
@@ -710,15 +802,14 @@ private:
// Note this needs to be done *before* performFusion because
// performFusion will change the original loops, making it not
// possible to identify them after fusion is complete.
- reportLoopFusion(*FC0, *FC1, ORE);
+ reportLoopFusion<OptimizationRemark>(*FC0, *FC1, FuseCounter);
- FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT);
+ FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE);
FusedCand.verify();
- assert(eligibleForFusion(FusedCand) &&
+ assert(FusedCand.isEligibleForFusion(SE) &&
"Fused candidate should be eligible for fusion!");
// Notify the loop-depth-tree that these loops are not valid objects
- // anymore.
LDT.removeLoop(FC1->L);
CandidateSet.erase(FC0);
@@ -889,7 +980,7 @@ private:
LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
<< "\n");
assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
- assert(DT.dominates(FC0.Preheader, FC1.Preheader));
+ assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
for (Instruction *WriteL0 : FC0.MemWrites) {
for (Instruction *WriteL1 : FC1.MemWrites)
@@ -939,18 +1030,89 @@ private:
return true;
}
- /// Determine if the exit block of \p FC0 is the preheader of \p FC1. In this
- /// case, there is no code in between the two fusion candidates, thus making
- /// them adjacent.
+ /// Determine if two fusion candidates are adjacent in the CFG.
+ ///
+ /// This method will determine if there are additional basic blocks in the CFG
+ /// between the exit of \p FC0 and the entry of \p FC1.
+ /// If the two candidates are guarded loops, then it checks whether the
+ /// non-loop successor of the \p FC0 guard branch is the entry block of \p
+ /// FC1. If not, then the loops are not adjacent. If the two candidates are
+ /// not guarded loops, then it checks whether the exit block of \p FC0 is the
+ /// preheader of \p FC1.
bool isAdjacent(const FusionCandidate &FC0,
const FusionCandidate &FC1) const {
- return FC0.ExitBlock == FC1.Preheader;
+    // If the non-loop successor of the FC0 guard branch is FC1's entry block, the loops are adjacent.
+ if (FC0.GuardBranch)
+ return FC0.getNonLoopBlock() == FC1.getEntryBlock();
+ else
+ return FC0.ExitBlock == FC1.getEntryBlock();
+ }
+
+ /// Determine if two fusion candidates have identical guards
+ ///
+ /// This method will determine if two fusion candidates have the same guards.
+ /// The guards are considered the same if:
+ /// 1. The instructions to compute the condition used in the compare are
+ /// identical.
+ /// 2. The successors of the guard have the same flow into/around the loop.
+ /// If the compare instructions are identical, then the first successor of the
+ /// guard must go to the same place (either the preheader of the loop or the
+  /// NonLoopBlock). In other words, the first successor of both loops must
+ /// both go into the loop (i.e., the preheader) or go around the loop (i.e.,
+ /// the NonLoopBlock). The same must be true for the second successor.
+ bool haveIdenticalGuards(const FusionCandidate &FC0,
+ const FusionCandidate &FC1) const {
+ assert(FC0.GuardBranch && FC1.GuardBranch &&
+ "Expecting FC0 and FC1 to be guarded loops.");
+
+ if (auto FC0CmpInst =
+ dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))
+ if (auto FC1CmpInst =
+ dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))
+ if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))
+ return false;
+
+ // The compare instructions are identical.
+    // Now make sure the successors of the guards have the same flow into/around
+    // the loop.
+ if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)
+ return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);
+ else
+ return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
+ }
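To make the adjacency and identical-guard requirements concrete, here is a hypothetical source-level pattern of guarded, adjacent loops this is aimed at (function and variable names are purely illustrative, not taken from this change), together with the conceptual effect of fusing them:

  // Hypothetical input: two rotated, guarded loops. Each guard tests n > 0
  // with an identical compare, and the non-loop successor of the first guard
  // is the guard block of the second loop, so isAdjacent() and
  // haveIdenticalGuards() would both be satisfied (assuming the dependence
  // and profitability checks also pass).
  void fuse_me(int *a, int *b, int n) {
    if (n > 0)
      for (int i = 0; i < n; ++i)
        a[i] = i;
    if (n > 0)
      for (int i = 0; i < n; ++i)
        b[i] = a[i] + 1;
  }

  // Conceptually, after fusion there is a single guard and a single loop:
  //   if (n > 0)
  //     for (int i = 0; i < n; ++i) { a[i] = i; b[i] = a[i] + 1; }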
+
+ /// Check that the guard for \p FC *only* contains the cmp/branch for the
+ /// guard.
+ /// Once we are able to handle intervening code, any code in the guard block
+ /// for FC1 will need to be treated as intervening code and checked whether
+ /// it can safely move around the loops.
+ bool isEmptyGuardBlock(const FusionCandidate &FC) const {
+ assert(FC.GuardBranch && "Expecting a fusion candidate with guard branch.");
+ if (auto *CmpInst = dyn_cast<Instruction>(FC.GuardBranch->getCondition())) {
+ auto *GuardBlock = FC.GuardBranch->getParent();
+ // If the generation of the cmp value is in GuardBlock, then the size of
+ // the guard block should be 2 (cmp + branch). If the generation of the
+ // cmp value is in a different block, then the size of the guard block
+ // should only be 1.
+ if (CmpInst->getParent() == GuardBlock)
+ return GuardBlock->size() == 2;
+ else
+ return GuardBlock->size() == 1;
+ }
+
+ return false;
}
bool isEmptyPreheader(const FusionCandidate &FC) const {
+ assert(FC.Preheader && "Expecting a valid preheader");
return FC.Preheader->size() == 1;
}
+ bool isEmptyExitBlock(const FusionCandidate &FC) const {
+ assert(FC.ExitBlock && "Expecting a valid exit block");
+ return FC.ExitBlock->size() == 1;
+ }
+
/// Fuse two fusion candidates, creating a new fused loop.
///
/// This method contains the mechanics of fusing two loops, represented by \p
@@ -987,6 +1149,12 @@ private:
LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
+ // Fusing guarded loops is handled slightly differently than non-guarded
+ // loops and has been broken out into a separate method instead of trying to
+ // intersperse the logic within a single method.
+ if (FC0.GuardBranch)
+ return fuseGuardedLoops(FC0, FC1);
+
assert(FC1.Preheader == FC0.ExitBlock);
assert(FC1.Preheader->size() == 1 &&
FC1.Preheader->getSingleSuccessor() == FC1.Header);
@@ -1131,7 +1299,258 @@ private:
SE.verify();
#endif
- FuseCounter++;
+ LLVM_DEBUG(dbgs() << "Fusion done:\n");
+
+ return FC0.L;
+ }
+
+ /// Report details on loop fusion opportunities.
+ ///
+ /// This template function can be used to report both successful and missed
+ /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should
+ /// be one of:
+ /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful
+ /// given two valid fusion candidates.
+ /// - OptimizationRemark to report successful fusion of two fusion
+ /// candidates.
+ /// The remarks will be printed using the form:
+ /// <path/filename>:<line number>:<column number>: [<function name>]:
+ /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
+ template <typename RemarkKind>
+ void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
+ llvm::Statistic &Stat) {
+ assert(FC0.Preheader && FC1.Preheader &&
+ "Expecting valid fusion candidates");
+ using namespace ore;
+ ++Stat;
+ ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),
+ FC0.Preheader)
+ << "[" << FC0.Preheader->getParent()->getName()
+ << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName()))
+ << " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))
+ << ": " << Stat.getDesc());
+ }
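For illustration, a missed-fusion remark produced by this helper would look roughly as follows; the file, line/column, function, and block names here are hypothetical, and the trailing text is whatever description the statistic passed in carries:

  example.c:12:3: [saxpy]: %for.body and %for.body8: <description of the statistic, e.g. why fusion was missed>

Such remarks can typically be surfaced with opt's -pass-remarks / -pass-remarks-missed options (or clang's -Rpass= / -Rpass-missed=) filtered on this pass's DEBUG_TYPE.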
+
+ /// Fuse two guarded fusion candidates, creating a new fused loop.
+ ///
+ /// Fusing guarded loops is handled much the same way as fusing non-guarded
+ /// loops. The rewiring of the CFG is slightly different though, because of
+ /// the presence of the guards around the loops and the exit blocks after the
+ /// loop body. As such, the new loop is rewired as follows:
+ /// 1. Keep the guard branch from FC0 and use the non-loop block target
+ /// from the FC1 guard branch.
+ /// 2. Remove the exit block from FC0 (this exit block should be empty
+ /// right now).
+ /// 3. Remove the guard branch for FC1
+ /// 4. Remove the preheader for FC1.
+ /// The exit block successor for the latch of FC0 is updated to be the header
+ /// of FC1 and the non-exit block successor of the latch of FC1 is updated to
+ /// be the header of FC0, thus creating the fused loop.
+ Loop *fuseGuardedLoops(const FusionCandidate &FC0,
+ const FusionCandidate &FC1) {
+ assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");
+
+ BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();
+ BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
+ BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
+ BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
+
+ assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
+
+ SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Update the Loop Guard
+ ////////////////////////////////////////////////////////////////////////////
+ // The guard for FC0 is updated to guard both FC0 and FC1. This is done by
+ // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.
+ // Thus, one path from the guard goes to the preheader for FC0 (and thus
+ // executes the new fused loop) and the other path goes to the NonLoopBlock
+ // for FC1 (where FC1 guard would have gone if FC1 was not executed).
+ FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
+ FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock,
+ FC1.Header);
+
+ // The guard of FC1 is not necessary anymore.
+ FC1.GuardBranch->eraseFromParent();
+ new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock);
+
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
+
+ assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) &&
+ "Expecting guard block to have no predecessors");
+ assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) &&
+ "Expecting guard block to have no successors");
+
+ // Remember the phi nodes originally in the header of FC0 in order to rewire
+ // them later. However, this is only necessary if the new loop carried
+ // values might not dominate the exiting branch. While we do not generally
+ // test if this is the case but simply insert intermediate phi nodes, we
+ // need to make sure these intermediate phi nodes have different
+ // predecessors. To this end, we filter the special case where the exiting
+ // block is the latch block of the first loop. Nothing needs to be done
+ // anyway as all loop carried values dominate the latch and thereby also the
+ // exiting branch.
+ // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch
+    // (because the loops are rotated). Thus, nothing will ever be added to
+ // OriginalFC0PHIs.
+ SmallVector<PHINode *, 8> OriginalFC0PHIs;
+ if (FC0.ExitingBlock != FC0.Latch)
+ for (PHINode &PHI : FC0.Header->phis())
+ OriginalFC0PHIs.push_back(&PHI);
+
+ assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");
+
+ // Replace incoming blocks for header PHIs first.
+ FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
+ FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
+
+ // The old exiting block of the first loop (FC0) has to jump to the header
+ // of the second as we need to execute the code in the second header block
+ // regardless of the trip count. That is, if the trip count is 0, so the
+ // back edge is never taken, we still have to execute both loop headers,
+ // especially (but not only!) if the second is a do-while style loop.
+ // However, doing so might invalidate the phi nodes of the first loop as
+ // the new values do only need to dominate their latch and not the exiting
+ // predicate. To remedy this potential problem we always introduce phi
+ // nodes in the header of the second loop later that select the loop carried
+ // value, if the second header was reached through an old latch of the
+ // first, or undef otherwise. This is sound as exiting the first implies the
+ // second will exit too, __without__ taking the back-edge (their
+ // trip-counts are equal after all).
+ FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
+ FC1.Header);
+
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
+
+ // Remove FC0 Exit Block
+ // The exit block for FC0 is no longer needed since control will flow
+ // directly to the header of FC1. Since it is an empty block, it can be
+ // removed at this point.
+    // TODO: In the future, we can handle non-empty exit blocks by merging any
+ // instructions from FC0 exit block into FC1 exit block prior to removing
+ // the block.
+ assert(pred_begin(FC0.ExitBlock) == pred_end(FC0.ExitBlock) &&
+ "Expecting exit block to be empty");
+ FC0.ExitBlock->getTerminator()->eraseFromParent();
+ new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
+
+ // Remove FC1 Preheader
+ // The pre-header of L1 is not necessary anymore.
+ assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader));
+ FC1.Preheader->getTerminator()->eraseFromParent();
+ new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC1.Preheader, FC1.Header));
+
+    // Move the phi nodes from the second loop's header block into the first loop's header block.
+ while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
+ if (SE.isSCEVable(PHI->getType()))
+ SE.forgetValue(PHI);
+ if (PHI->hasNUsesOrMore(1))
+ PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
+ else
+ PHI->eraseFromParent();
+ }
+
+ // Introduce new phi nodes in the second loop header to ensure
+ // exiting the first and jumping to the header of the second does not break
+ // the SSA property of the phis originally in the first loop. See also the
+ // comment above.
+ Instruction *L1HeaderIP = &FC1.Header->front();
+ for (PHINode *LCPHI : OriginalFC0PHIs) {
+ int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
+ assert(L1LatchBBIdx >= 0 &&
+ "Expected loop carried value to be rewired at this point!");
+
+ Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
+
+ PHINode *L1HeaderPHI = PHINode::Create(
+ LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
+ L1HeaderPHI->addIncoming(LCV, FC0.Latch);
+ L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
+ FC0.ExitingBlock);
+
+ LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
+ }
+
+ // Update the latches
+
+ // Replace latch terminator destinations.
+ FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
+ FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
+
+ // If FC0.Latch and FC0.ExitingBlock are the same then we have already
+ // performed the updates above.
+ if (FC0.Latch != FC0.ExitingBlock)
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0.Latch, FC1.Header));
+
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+ FC0.Latch, FC0.Header));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
+ FC1.Latch, FC0.Header));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+ FC1.Latch, FC1.Header));
+
+ // All done
+ // Apply the updates to the Dominator Tree and cleanup.
+
+ assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) &&
+ "FC1GuardBlock has successors!!");
+ assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) &&
+ "FC1GuardBlock has predecessors!!");
+
+ // Update DT/PDT
+ DTU.applyUpdates(TreeUpdates);
+
+ LI.removeBlock(FC1.Preheader);
+ DTU.deleteBB(FC1.Preheader);
+ DTU.deleteBB(FC0.ExitBlock);
+ DTU.flush();
+
+ // Is there a way to keep SE up-to-date so we don't need to forget the loops
+ // and rebuild the information in subsequent passes of fusion?
+ SE.forgetLoop(FC1.L);
+ SE.forgetLoop(FC0.L);
+
+ // Merge the loops.
+ SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(),
+ FC1.L->block_end());
+ for (BasicBlock *BB : Blocks) {
+ FC0.L->addBlockEntry(BB);
+ FC1.L->removeBlockFromLoop(BB);
+ if (LI.getLoopFor(BB) != FC1.L)
+ continue;
+ LI.changeLoopFor(BB, FC0.L);
+ }
+ while (!FC1.L->empty()) {
+ const auto &ChildLoopIt = FC1.L->begin();
+ Loop *ChildLoop = *ChildLoopIt;
+ FC1.L->removeChildLoop(ChildLoopIt);
+ FC0.L->addChildLoop(ChildLoop);
+ }
+
+ // Delete the now empty loop L1.
+ LI.erase(FC1.L);
+
+#ifndef NDEBUG
+ assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+ assert(PDT.verify());
+ LI.verify(DT);
+ SE.verify();
+#endif
LLVM_DEBUG(dbgs() << "Fusion done:\n");
@@ -1177,6 +1596,7 @@ struct LoopFuseLegacy : public FunctionPass {
return LF.fuseLoops(F);
}
};
+} // namespace
PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {
auto &LI = AM.getResult<LoopAnalysis>(F);
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index e561494f19cf..dd477e800693 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -41,6 +41,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -77,16 +78,20 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -102,6 +107,7 @@ using namespace llvm;
STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
+STATISTIC(NumBCmp, "Number of memcmp's formed from loop 2xload+eq-compare");
static cl::opt<bool> UseLIRCodeSizeHeurs(
"use-lir-code-size-heurs",
@@ -111,6 +117,26 @@ static cl::opt<bool> UseLIRCodeSizeHeurs(
namespace {
+// FIXME: reinventing the wheel much? Is there a cleaner solution?
+struct PMAbstraction {
+ virtual void markLoopAsDeleted(Loop *L) = 0;
+ virtual ~PMAbstraction() = default;
+};
+struct LegacyPMAbstraction : PMAbstraction {
+ LPPassManager &LPM;
+ LegacyPMAbstraction(LPPassManager &LPM) : LPM(LPM) {}
+ virtual ~LegacyPMAbstraction() = default;
+ void markLoopAsDeleted(Loop *L) override { LPM.markLoopAsDeleted(*L); }
+};
+struct NewPMAbstraction : PMAbstraction {
+ LPMUpdater &Updater;
+ NewPMAbstraction(LPMUpdater &Updater) : Updater(Updater) {}
+ virtual ~NewPMAbstraction() = default;
+ void markLoopAsDeleted(Loop *L) override {
+ Updater.markLoopAsDeleted(*L, L->getName());
+ }
+};
+
class LoopIdiomRecognize {
Loop *CurLoop = nullptr;
AliasAnalysis *AA;
@@ -120,6 +146,7 @@ class LoopIdiomRecognize {
TargetLibraryInfo *TLI;
const TargetTransformInfo *TTI;
const DataLayout *DL;
+ PMAbstraction &LoopDeleter;
OptimizationRemarkEmitter &ORE;
bool ApplyCodeSizeHeuristics;
@@ -128,9 +155,10 @@ public:
LoopInfo *LI, ScalarEvolution *SE,
TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI,
- const DataLayout *DL,
+ const DataLayout *DL, PMAbstraction &LoopDeleter,
OptimizationRemarkEmitter &ORE)
- : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {}
+ : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL),
+ LoopDeleter(LoopDeleter), ORE(ORE) {}
bool runOnLoop(Loop *L);
@@ -144,6 +172,8 @@ private:
bool HasMemset;
bool HasMemsetPattern;
bool HasMemcpy;
+ bool HasMemCmp;
+ bool HasBCmp;
/// Return code for isLegalStore()
enum LegalStoreKind {
@@ -186,6 +216,32 @@ private:
bool runOnNoncountableLoop();
+ struct CmpLoopStructure {
+ Value *BCmpValue, *LatchCmpValue;
+ BasicBlock *HeaderBrEqualBB, *HeaderBrUnequalBB;
+ BasicBlock *LatchBrFinishBB, *LatchBrContinueBB;
+ };
+ bool matchBCmpLoopStructure(CmpLoopStructure &CmpLoop) const;
+ struct CmpOfLoads {
+ ICmpInst::Predicate BCmpPred;
+ Value *LoadSrcA, *LoadSrcB;
+ Value *LoadA, *LoadB;
+ };
+ bool matchBCmpOfLoads(Value *BCmpValue, CmpOfLoads &CmpOfLoads) const;
+ bool recognizeBCmpLoopControlFlow(const CmpOfLoads &CmpOfLoads,
+ CmpLoopStructure &CmpLoop) const;
+ bool recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes, CmpOfLoads &CmpOfLoads,
+ const SCEV *&SrcA, const SCEV *&SrcB,
+ const SCEV *&Iterations) const;
+ bool detectBCmpIdiom(ICmpInst *&BCmpInst, CmpInst *&LatchCmpInst,
+ LoadInst *&LoadA, LoadInst *&LoadB, const SCEV *&SrcA,
+ const SCEV *&SrcB, const SCEV *&NBytes) const;
+ BasicBlock *transformBCmpControlFlow(ICmpInst *ComparedEqual);
+ void transformLoopToBCmp(ICmpInst *BCmpInst, CmpInst *LatchCmpInst,
+ LoadInst *LoadA, LoadInst *LoadB, const SCEV *SrcA,
+ const SCEV *SrcB, const SCEV *NBytes);
+ bool recognizeBCmp();
+
bool recognizePopcount();
void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
PHINode *CntPhi, Value *Var);
@@ -217,18 +273,20 @@ public:
LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ *L->getHeader()->getParent());
const TargetTransformInfo *TTI =
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
*L->getHeader()->getParent());
const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
+ LegacyPMAbstraction LoopDeleter(LPM);
// For the old PM, we can't use OptimizationRemarkEmitter as an analysis
// pass. Function analyses need to be preserved across loop transformations
// but ORE cannot be preserved (see comment before the pass definition).
OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
- LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, ORE);
+ LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, LoopDeleter, ORE);
return LIR.runOnLoop(L);
}
@@ -247,7 +305,7 @@ char LoopIdiomRecognizeLegacyPass::ID = 0;
PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
+ LPMUpdater &Updater) {
const auto *DL = &L.getHeader()->getModule()->getDataLayout();
const auto &FAM =
@@ -261,8 +319,9 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
"LoopIdiomRecognizePass: OptimizationRemarkEmitterAnalysis not cached "
"at a higher level");
+ NewPMAbstraction LoopDeleter(Updater);
LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL,
- *ORE);
+ LoopDeleter, *ORE);
if (!LIR.runOnLoop(&L))
return PreservedAnalyses::all();
@@ -299,7 +358,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
// Disable loop idiom recognition if the function's name is a common idiom.
StringRef Name = L->getHeader()->getParent()->getName();
- if (Name == "memset" || Name == "memcpy")
+ if (Name == "memset" || Name == "memcpy" || Name == "memcmp" ||
+ Name == "bcmp")
return false;
// Determine if code size heuristics need to be applied.
@@ -309,8 +369,10 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
HasMemset = TLI->has(LibFunc_memset);
HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
HasMemcpy = TLI->has(LibFunc_memcpy);
+ HasMemCmp = TLI->has(LibFunc_memcmp);
+ HasBCmp = TLI->has(LibFunc_bcmp);
- if (HasMemset || HasMemsetPattern || HasMemcpy)
+ if (HasMemset || HasMemsetPattern || HasMemcpy || HasMemCmp || HasBCmp)
if (SE->hasLoopInvariantBackedgeTakenCount(L))
return runOnCountableLoop();
@@ -961,7 +1023,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
GlobalValue::PrivateLinkage,
PatternValue, ".memset_pattern");
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
- GV->setAlignment(16);
+ GV->setAlignment(Align(16));
Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
}
@@ -1149,7 +1211,7 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() {
<< "] Noncountable Loop %"
<< CurLoop->getHeader()->getName() << "\n");
- return recognizePopcount() || recognizeAndInsertFFS();
+ return recognizeBCmp() || recognizePopcount() || recognizeAndInsertFFS();
}
/// Check if the given conditional branch is based on the comparison between
@@ -1823,3 +1885,811 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
// loop. The loop would otherwise not be deleted even if it becomes empty.
SE->forgetLoop(CurLoop);
}
+
+bool LoopIdiomRecognize::matchBCmpLoopStructure(
+ CmpLoopStructure &CmpLoop) const {
+ ICmpInst::Predicate BCmpPred;
+
+ // We are looking for the following basic layout:
+ // PreheaderBB: <preheader> ; preds = ???
+ // <...>
+ // br label %LoopHeaderBB
+ // LoopHeaderBB: <header,exiting> ; preds = %PreheaderBB,%LoopLatchBB
+ // <...>
+ // %BCmpValue = icmp <...>
+ // br i1 %BCmpValue, label %LoopLatchBB, label %Successor0
+ // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB
+ // <...>
+ // %LatchCmpValue = <are we done, or do next iteration?>
+ // br i1 %LatchCmpValue, label %Successor1, label %LoopHeaderBB
+ // Successor0: <exit> ; preds = %LoopHeaderBB
+ // <...>
+ // Successor1: <exit> ; preds = %LoopLatchBB
+ // <...>
+ //
+ // Successor0 and Successor1 may or may not be the same basic block.
+
+  // Match the basic framework of this supposed comparison loop.
+ using namespace PatternMatch;
+ if (!match(CurLoop->getHeader()->getTerminator(),
+ m_Br(m_CombineAnd(m_ICmp(BCmpPred, m_Value(), m_Value()),
+ m_Value(CmpLoop.BCmpValue)),
+ CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB)) ||
+ !match(CurLoop->getLoopLatch()->getTerminator(),
+ m_Br(m_CombineAnd(m_Cmp(), m_Value(CmpLoop.LatchCmpValue)),
+ CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB))) {
+ LLVM_DEBUG(dbgs() << "Basic control-flow layout unrecognized.\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "Recognized basic control-flow layout.\n");
+ return true;
+}
+
+bool LoopIdiomRecognize::matchBCmpOfLoads(Value *BCmpValue,
+ CmpOfLoads &CmpOfLoads) const {
+ using namespace PatternMatch;
+ LLVM_DEBUG(dbgs() << "Analyzing header icmp " << *BCmpValue
+ << " as bcmp pattern.\n");
+
+ // Match bcmp-style loop header cmp. It must be an eq-icmp of loads. Example:
+ // %v0 = load <...>, <...>* %LoadSrcA
+ // %v1 = load <...>, <...>* %LoadSrcB
+ // %CmpLoop.BCmpValue = icmp eq <...> %v0, %v1
+  // There won't be any no-op bitcasts between the load and the icmp;
+  // they would have been transformed into a load of a bitcast.
+ // FIXME: {b,mem}cmp() calls have the same semantics as icmp. Match them too.
+ if (!match(BCmpValue,
+ m_ICmp(CmpOfLoads.BCmpPred,
+ m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcA)),
+ m_Value(CmpOfLoads.LoadA)),
+ m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcB)),
+ m_Value(CmpOfLoads.LoadB)))) ||
+ !ICmpInst::isEquality(CmpOfLoads.BCmpPred)) {
+ LLVM_DEBUG(dbgs() << "Loop header icmp did not match bcmp pattern.\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "Recognized header icmp as bcmp pattern with loads:\n\t"
+ << *CmpOfLoads.LoadA << "\n\t" << *CmpOfLoads.LoadB
+ << "\n");
+ // FIXME: handle memcmp pattern?
+ return true;
+}
+
+bool LoopIdiomRecognize::recognizeBCmpLoopControlFlow(
+ const CmpOfLoads &CmpOfLoads, CmpLoopStructure &CmpLoop) const {
+ BasicBlock *LoopHeaderBB = CurLoop->getHeader();
+ BasicBlock *LoopLatchBB = CurLoop->getLoopLatch();
+
+ // Be wary, comparisons can be inverted, canonicalize order.
+ // If this 'element' comparison passed, we expect to proceed to the next elt.
+ if (CmpOfLoads.BCmpPred != ICmpInst::Predicate::ICMP_EQ)
+ std::swap(CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB);
+ // The predicate on loop latch does not matter, just canonicalize some order.
+ if (CmpLoop.LatchBrContinueBB != LoopHeaderBB)
+ std::swap(CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB);
+
+ SmallVector<BasicBlock *, 2> ExitBlocks;
+
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ assert(ExitBlocks.size() <= 2U && "Can't have more than two exit blocks.");
+
+ // Check that control-flow between blocks is as expected.
+ if (CmpLoop.HeaderBrEqualBB != LoopLatchBB ||
+ CmpLoop.LatchBrContinueBB != LoopHeaderBB ||
+ !is_contained(ExitBlocks, CmpLoop.HeaderBrUnequalBB) ||
+ !is_contained(ExitBlocks, CmpLoop.LatchBrFinishBB)) {
+ LLVM_DEBUG(dbgs() << "Loop control-flow not recognized.\n");
+ return false;
+ }
+
+ assert(!is_contained(ExitBlocks, CmpLoop.HeaderBrEqualBB) &&
+ !is_contained(ExitBlocks, CmpLoop.LatchBrContinueBB) &&
+ "Unexpected exit edges.");
+
+ LLVM_DEBUG(dbgs() << "Recognized loop control-flow.\n");
+
+ LLVM_DEBUG(dbgs() << "Performing side-effect analysis on the loop.\n");
+ assert(CurLoop->isLCSSAForm(*DT) && "Should only get LCSSA-form loops here.");
+  // No loop instruction may be used outside of the loop. Since we are in
+  // LCSSA form, we only need to check the successor blocks' PHI nodes' incoming
+ // values for incoming blocks that are the loop basic blocks.
+ for (const BasicBlock *ExitBB : ExitBlocks) {
+ for (const PHINode &PHI : ExitBB->phis()) {
+ for (const BasicBlock *LoopBB :
+ make_filter_range(PHI.blocks(), [this](BasicBlock *PredecessorBB) {
+ return CurLoop->contains(PredecessorBB);
+ })) {
+ const auto *I =
+ dyn_cast<Instruction>(PHI.getIncomingValueForBlock(LoopBB));
+ if (I && CurLoop->contains(I)) {
+ LLVM_DEBUG(dbgs()
+ << "Loop contains instruction " << *I
+ << " which is used outside of the loop in basic block "
+ << ExitBB->getName() << " in phi node " << PHI << "\n");
+ return false;
+ }
+ }
+ }
+ }
+  // Similarly, the loop should not have any observable side-effects
+ // other than the final comparison result.
+ for (BasicBlock *LoopBB : CurLoop->blocks()) {
+ for (Instruction &I : *LoopBB) {
+ if (isa<DbgInfoIntrinsic>(I)) // Ignore dbginfo.
+ continue; // FIXME: anything else? lifetime info?
+ if ((I.mayHaveSideEffects() || I.isAtomic() || I.isFenceLike()) &&
+ &I != CmpOfLoads.LoadA && &I != CmpOfLoads.LoadB) {
+ LLVM_DEBUG(
+ dbgs() << "Loop contains instruction with potential side-effects: "
+ << I << "\n");
+ return false;
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "No loop instructions deemed to have side-effects.\n");
+ return true;
+}
+
+bool LoopIdiomRecognize::recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes,
+ CmpOfLoads &CmpOfLoads,
+ const SCEV *&SrcA,
+ const SCEV *&SrcB,
+ const SCEV *&Iterations) const {
+ // Try to compute SCEV of the loads, for this loop's scope.
+ const auto *ScevForSrcA = dyn_cast<SCEVAddRecExpr>(
+ SE->getSCEVAtScope(CmpOfLoads.LoadSrcA, CurLoop));
+ const auto *ScevForSrcB = dyn_cast<SCEVAddRecExpr>(
+ SE->getSCEVAtScope(CmpOfLoads.LoadSrcB, CurLoop));
+ if (!ScevForSrcA || !ScevForSrcB) {
+ LLVM_DEBUG(dbgs() << "Failed to get SCEV expressions for load sources.\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Got SCEV expressions (at loop scope) for loads:\n\t"
+ << *ScevForSrcA << "\n\t" << *ScevForSrcB << "\n");
+
+  // Loads must have the following SCEV exprs: {%ptr,+,BCmpTyBytes}<%LoopHeaderBB>
+ const SCEV *RecStepForA = ScevForSrcA->getStepRecurrence(*SE);
+ const SCEV *RecStepForB = ScevForSrcB->getStepRecurrence(*SE);
+ if (!ScevForSrcA->isAffine() || !ScevForSrcB->isAffine() ||
+ ScevForSrcA->getLoop() != CurLoop || ScevForSrcB->getLoop() != CurLoop ||
+ RecStepForA != RecStepForB || !isa<SCEVConstant>(RecStepForA) ||
+ cast<SCEVConstant>(RecStepForA)->getAPInt() != BCmpTyBytes) {
+ LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads. Only support "
+ "affine SCEV expressions originating in the loop we "
+ "are analysing with identical constant positive step, "
+ "equal to the count of bytes compared. Got:\n\t"
+ << *RecStepForA << "\n\t" << *RecStepForB << "\n");
+ return false;
+ // FIXME: can support BCmpTyBytes > Step.
+ // But will need to account for the extra bytes compared at the end.
+ }
+
+ SrcA = ScevForSrcA->getStart();
+ SrcB = ScevForSrcB->getStart();
+ LLVM_DEBUG(dbgs() << "Got SCEV expressions for load sources:\n\t" << *SrcA
+ << "\n\t" << *SrcB << "\n");
+
+  // The load sources must be loop invariants that dominate the loop header.
+ if (SrcA == SE->getCouldNotCompute() || SrcB == SE->getCouldNotCompute() ||
+ !SE->isAvailableAtLoopEntry(SrcA, CurLoop) ||
+ !SE->isAvailableAtLoopEntry(SrcB, CurLoop)) {
+ LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads, unavaliable "
+ "prior to loop header.\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "SCEV expressions for loads are acceptable.\n");
+
+  // bcmp / memcmp take the length argument as size_t, so let's conservatively
+  // assume that the iteration count should not be wider than that.
+ Type *CmpFuncSizeTy = DL->getIntPtrType(SE->getContext());
+
+ // For how many iterations is loop guaranteed not to exit via LoopLatch?
+  // This is one less than the maximal number of comparisons, and is: n + -1
+ const SCEV *LoopExitCount =
+ SE->getExitCount(CurLoop, CurLoop->getLoopLatch());
+ LLVM_DEBUG(dbgs() << "Got SCEV expression for loop latch exit count: "
+ << *LoopExitCount << "\n");
+  // The exit count, similarly, must be a loop invariant that dominates the loop header.
+ if (LoopExitCount == SE->getCouldNotCompute() ||
+ !LoopExitCount->getType()->isIntOrPtrTy() ||
+ LoopExitCount->getType()->getScalarSizeInBits() >
+ CmpFuncSizeTy->getScalarSizeInBits() ||
+ !SE->isAvailableAtLoopEntry(LoopExitCount, CurLoop)) {
+ LLVM_DEBUG(dbgs() << "Unsupported SCEV expression for loop latch exit.\n");
+ return false;
+ }
+
+ // LoopExitCount is always one less than the actual count of iterations.
+ // Do this before cast, else we will be stuck with 1 + zext(-1 + n)
+ Iterations = SE->getAddExpr(
+ LoopExitCount, SE->getOne(LoopExitCount->getType()), SCEV::FlagNUW);
+ assert(Iterations != SE->getCouldNotCompute() &&
+ "Shouldn't fail to increment by one.");
+
+ LLVM_DEBUG(dbgs() << "Computed iteration count: " << *Iterations << "\n");
+ return true;
+}
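As a worked example of the arithmetic above, assume a hypothetical loop comparing i32 elements of %a and %b whose latch is taken n-1 times (all names and values here are illustrative, in the same SCEV notation used by the comments above):

  //   Load SCEVs (at loop scope): {%a,+,4}<%loop> and {%b,+,4}<%loop>
  //                               (identical constant step 4 == BCmpTyBytes)
  //   SrcA = %a, SrcB = %b        (loop-invariant, available at loop entry)
  //   LoopExitCount(latch) = (-1 + %n)
  //   Iterations = (1 + (-1 + %n)) = %n
  // The caller (detectBCmpIdiom) then zero-extends Iterations to size_t and
  // multiplies it by BCmpTyBytes, giving NBytes = (4 * %n) bytes to compare.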
+
+/// Return true iff the bcmp idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p BCmpInst is set to the root byte-comparison instruction.
+/// 2) \p LatchCmpInst is set to the comparison that controls the latch.
+/// 3) \p LoadA is set to the first LoadInst.
+/// 4) \p LoadB is set to the second LoadInst.
+/// 5) \p SrcA is set to the first source location that is being compared.
+/// 6) \p SrcB is set to the second source location that is being compared.
+/// 7) \p NBytes is set to the number of bytes to compare.
+bool LoopIdiomRecognize::detectBCmpIdiom(ICmpInst *&BCmpInst,
+ CmpInst *&LatchCmpInst,
+ LoadInst *&LoadA, LoadInst *&LoadB,
+ const SCEV *&SrcA, const SCEV *&SrcB,
+ const SCEV *&NBytes) const {
+ LLVM_DEBUG(dbgs() << "Recognizing bcmp idiom\n");
+
+ // Give up if the loop is not in normal form, or has more than 2 blocks.
+ if (!CurLoop->isLoopSimplifyForm() || CurLoop->getNumBlocks() > 2) {
+ LLVM_DEBUG(dbgs() << "Basic loop structure unrecognized.\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "Recognized basic loop structure.\n");
+
+ CmpLoopStructure CmpLoop;
+ if (!matchBCmpLoopStructure(CmpLoop))
+ return false;
+
+ CmpOfLoads CmpOfLoads;
+ if (!matchBCmpOfLoads(CmpLoop.BCmpValue, CmpOfLoads))
+ return false;
+
+ if (!recognizeBCmpLoopControlFlow(CmpOfLoads, CmpLoop))
+ return false;
+
+ BCmpInst = cast<ICmpInst>(CmpLoop.BCmpValue); // FIXME: is there no
+ LatchCmpInst = cast<CmpInst>(CmpLoop.LatchCmpValue); // way to combine
+ LoadA = cast<LoadInst>(CmpOfLoads.LoadA); // these cast with
+ LoadB = cast<LoadInst>(CmpOfLoads.LoadB); // m_Value() matcher?
+
+ Type *BCmpValTy = BCmpInst->getOperand(0)->getType();
+ LLVMContext &Context = BCmpValTy->getContext();
+ uint64_t BCmpTyBits = DL->getTypeSizeInBits(BCmpValTy);
+ static constexpr uint64_t ByteTyBits = 8;
+
+ LLVM_DEBUG(dbgs() << "Got comparison between values of type " << *BCmpValTy
+ << " of size " << BCmpTyBits
+ << " bits (while byte = " << ByteTyBits << " bits).\n");
+  // The minimal unit of work for bcmp()/memcmp() is a byte. Therefore we must check
+ // that we are dealing with a multiple of a byte here.
+ if (BCmpTyBits % ByteTyBits != 0) {
+ LLVM_DEBUG(dbgs() << "Value size is not a multiple of byte.\n");
+ return false;
+ // FIXME: could still be done under a run-time check that the total bit
+    // count is a multiple of a byte, I guess? Or handle the remainder separately?
+ }
+
+ // Each comparison is done on this many bytes.
+ uint64_t BCmpTyBytes = BCmpTyBits / ByteTyBits;
+ LLVM_DEBUG(dbgs() << "Size is exactly " << BCmpTyBytes
+ << " bytes, eligible for bcmp conversion.\n");
+
+ const SCEV *Iterations;
+ if (!recognizeBCmpLoopSCEV(BCmpTyBytes, CmpOfLoads, SrcA, SrcB, Iterations))
+ return false;
+
+  // bcmp / memcmp take the length argument as size_t, so do the promotion now.
+ Type *CmpFuncSizeTy = DL->getIntPtrType(Context);
+ Iterations = SE->getNoopOrZeroExtend(Iterations, CmpFuncSizeTy);
+ assert(Iterations != SE->getCouldNotCompute() && "Promotion failed.");
+ // Note that it didn't do ptrtoint cast, we will need to do it manually.
+
+ // We will be comparing *bytes*, not BCmpTy, we need to recalculate size.
+ // It's a multiplication, and it *could* overflow. But for it to overflow
+  // we'd want to compare more bytes than could be represented by size_t. But
+  // allocation functions also take size_t. So how'd you produce such a buffer?
+ // FIXME: we likely need to actually check that we know this won't overflow,
+ // via llvm::computeOverflowForUnsignedMul().
+ NBytes = SE->getMulExpr(
+ Iterations, SE->getConstant(CmpFuncSizeTy, BCmpTyBytes), SCEV::FlagNUW);
+  assert(NBytes != SE->getCouldNotCompute() &&
+         "Shouldn't fail to compute the total byte count.");
+
+ LLVM_DEBUG(dbgs() << "Computed total byte count: " << *NBytes << "\n");
+
+ if (LoadA->getPointerAddressSpace() != LoadB->getPointerAddressSpace() ||
+ LoadA->getPointerAddressSpace() != 0 || !LoadA->isSimple() ||
+ !LoadB->isSimple()) {
+ StringLiteral L("Unsupported loads in idiom - only support identical, "
+ "simple loads from address space 0.\n");
+ LLVM_DEBUG(dbgs() << L);
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "BCmpIdiomUnsupportedLoads",
+ BCmpInst->getDebugLoc(),
+ CurLoop->getHeader())
+ << L;
+ });
+ return false; // FIXME: support non-simple loads.
+ }
+
+ LLVM_DEBUG(dbgs() << "Recognized bcmp idiom\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "RecognizedBCmpIdiom",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << "Loop recognized as a bcmp idiom";
+ });
+
+ return true;
+}
+
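For orientation, here is a hypothetical C-level loop of the shape this detection is aimed at (names are illustrative); note that it is non-countable because of the data-dependent early exit, which is why it is reached from runOnNoncountableLoop():

  #include <cstddef>

  // A per-element equality loop over two buffers. After loop rotation, the
  // element compare ends up as the header icmp of two loads (BCmpValue) and
  // the i < n test ends up as the latch compare (LatchCmpValue).
  bool arrays_equal(const int *a, const int *b, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i)
      if (a[i] != b[i])
        return false; // "unequal" exit out of the header
    return true;      // latch-controlled exit after all n elements matched
  }

  // Conceptually, the transformation rewrites this to something like:
  //   return memcmp(a, b, n * sizeof(int)) == 0;   // or bcmp(...) == 0 if available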
+BasicBlock *
+LoopIdiomRecognize::transformBCmpControlFlow(ICmpInst *ComparedEqual) {
+ LLVM_DEBUG(dbgs() << "Transforming control-flow.\n");
+ SmallVector<DominatorTree::UpdateType, 8> DTUpdates;
+
+ BasicBlock *PreheaderBB = CurLoop->getLoopPreheader();
+ BasicBlock *HeaderBB = CurLoop->getHeader();
+ BasicBlock *LoopLatchBB = CurLoop->getLoopLatch();
+ SmallString<32> LoopName = CurLoop->getName();
+ Function *Func = PreheaderBB->getParent();
+ LLVMContext &Context = Func->getContext();
+
+ // Before doing anything, drop SCEV info.
+ SE->forgetLoop(CurLoop);
+
+ // Here we start with: (0/6)
+ // PreheaderBB: <preheader> ; preds = ???
+ // <...>
+ // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+ // %ComparedEqual = icmp eq <...> %memcmp, 0
+ // br label %LoopHeaderBB
+ // LoopHeaderBB: <header,exiting> ; preds = %PreheaderBB,%LoopLatchBB
+ // <...>
+ // br i1 %<...>, label %LoopLatchBB, label %Successor0BB
+ // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB
+ // <...>
+ // br i1 %<...>, label %Successor1BB, label %LoopHeaderBB
+ // Successor0BB: <exit> ; preds = %LoopHeaderBB
+ // %S0PHI = phi <...> [ <...>, %LoopHeaderBB ]
+ // <...>
+ // Successor1BB: <exit> ; preds = %LoopLatchBB
+ // %S1PHI = phi <...> [ <...>, %LoopLatchBB ]
+ // <...>
+ //
+ // Successor0 and Successor1 may or may not be the same basic block.
+
+ // Decouple the edge between loop preheader basic block and loop header basic
+ // block. Thus the loop has become unreachable.
+ assert(cast<BranchInst>(PreheaderBB->getTerminator())->isUnconditional() &&
+ PreheaderBB->getTerminator()->getSuccessor(0) == HeaderBB &&
+ "Preheader bb must end with an unconditional branch to header bb.");
+ PreheaderBB->getTerminator()->eraseFromParent();
+ DTUpdates.push_back({DominatorTree::Delete, PreheaderBB, HeaderBB});
+
+ // Create a new preheader basic block before loop header basic block.
+ auto *PhonyPreheaderBB = BasicBlock::Create(
+ Context, LoopName + ".phonypreheaderbb", Func, HeaderBB);
+ // And insert an unconditional branch from phony preheader basic block to
+ // loop header basic block.
+ IRBuilder<>(PhonyPreheaderBB).CreateBr(HeaderBB);
+ DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB});
+
+ // Create a *single* new empty block that we will substitute as a
+ // successor basic block for the loop's exits. This one is temporary.
+ // Much like phony preheader basic block, it is not connected.
+ auto *PhonySuccessorBB =
+ BasicBlock::Create(Context, LoopName + ".phonysuccessorbb", Func,
+ LoopLatchBB->getNextNode());
+ // That block must have *some* non-PHI instruction, or else deleteDeadLoop()
+ // will mess up cleanup of dbginfo, and verifier will complain.
+ IRBuilder<>(PhonySuccessorBB).CreateUnreachable();
+
+ // Create two new empty blocks that we will use to preserve the original
+ // loop exit control-flow, and preserve the incoming values in the PHI nodes
+  // in the loop's successor exit blocks. These will live on.
+ auto *ComparedUnequalBB =
+ BasicBlock::Create(Context, ComparedEqual->getName() + ".unequalbb", Func,
+ PhonySuccessorBB->getNextNode());
+ auto *ComparedEqualBB =
+ BasicBlock::Create(Context, ComparedEqual->getName() + ".equalbb", Func,
+ PhonySuccessorBB->getNextNode());
+
+ // By now we have: (1/6)
+ // PreheaderBB: ; preds = ???
+ // <...>
+ // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+ // %ComparedEqual = icmp eq <...> %memcmp, 0
+ // [no terminator instruction!]
+ // PhonyPreheaderBB: <preheader> ; No preds, UNREACHABLE!
+ // br label %LoopHeaderBB
+ // LoopHeaderBB: <header,exiting> ; preds = %PhonyPreheaderBB, %LoopLatchBB
+ // <...>
+ // br i1 %<...>, label %LoopLatchBB, label %Successor0BB
+ // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB
+ // <...>
+ // br i1 %<...>, label %Successor1BB, label %LoopHeaderBB
+ // PhonySuccessorBB: ; No preds, UNREACHABLE!
+ // unreachable
+ // EqualBB: ; No preds, UNREACHABLE!
+ // [no terminator instruction!]
+ // UnequalBB: ; No preds, UNREACHABLE!
+ // [no terminator instruction!]
+ // Successor0BB: <exit> ; preds = %LoopHeaderBB
+ // %S0PHI = phi <...> [ <...>, %LoopHeaderBB ]
+ // <...>
+ // Successor1BB: <exit> ; preds = %LoopLatchBB
+ // %S1PHI = phi <...> [ <...>, %LoopLatchBB ]
+ // <...>
+
+ // What is the mapping/replacement basic block for exiting out of the loop
+  // from either of the old loop's basic blocks?
+ auto GetReplacementBB = [this, ComparedEqualBB,
+ ComparedUnequalBB](const BasicBlock *OldBB) {
+ assert(CurLoop->contains(OldBB) && "Only for loop's basic blocks.");
+ if (OldBB == CurLoop->getLoopLatch()) // "all elements compared equal".
+ return ComparedEqualBB;
+ if (OldBB == CurLoop->getHeader()) // "element compared unequal".
+ return ComparedUnequalBB;
+ llvm_unreachable("Only had two basic blocks in loop.");
+ };
+
+ // What are the exits out of this loop?
+ SmallVector<Loop::Edge, 2> LoopExitEdges;
+ CurLoop->getExitEdges(LoopExitEdges);
+  assert(LoopExitEdges.size() == 2 && "Should have only two exit edges.");
+
+ // Populate new basic blocks, update the exiting control-flow, PHI nodes.
+ for (const Loop::Edge &Edge : LoopExitEdges) {
+ auto *OldLoopBB = const_cast<BasicBlock *>(Edge.first);
+ auto *SuccessorBB = const_cast<BasicBlock *>(Edge.second);
+ assert(CurLoop->contains(OldLoopBB) && !CurLoop->contains(SuccessorBB) &&
+ "Unexpected edge.");
+
+ // If we would exit the loop from this loop's basic block,
+ // what semantically would that mean? Did comparison succeed or fail?
+ BasicBlock *NewBB = GetReplacementBB(OldLoopBB);
+ assert(NewBB->empty() && "Should not get same new basic block here twice.");
+ IRBuilder<> Builder(NewBB);
+ Builder.SetCurrentDebugLocation(OldLoopBB->getTerminator()->getDebugLoc());
+ Builder.CreateBr(SuccessorBB);
+ DTUpdates.push_back({DominatorTree::Insert, NewBB, SuccessorBB});
+ // Also, be *REALLY* careful with PHI nodes in successor basic block,
+    // update them to receive the same input value, but not from the current loop's
+ // basic block, but from new basic block instead.
+ SuccessorBB->replacePhiUsesWith(OldLoopBB, NewBB);
+ // Also, change loop control-flow. This loop's basic block shall no longer
+    // exit from the loop to its original successor basic block, but to our new
+    // phony successor basic block. Note that the new successor will be the unique exit.
+ OldLoopBB->getTerminator()->replaceSuccessorWith(SuccessorBB,
+ PhonySuccessorBB);
+ DTUpdates.push_back({DominatorTree::Delete, OldLoopBB, SuccessorBB});
+ DTUpdates.push_back({DominatorTree::Insert, OldLoopBB, PhonySuccessorBB});
+ }
+
+ // Inform DomTree about edge changes. Note that LoopInfo is still out-of-date.
+ assert(DTUpdates.size() == 8 && "Update count prediction failed.");
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ DTU.applyUpdates(DTUpdates);
+ DTUpdates.clear();
+
+ // By now we have: (2/6)
+ // PreheaderBB: ; preds = ???
+ // <...>
+ // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+ // %ComparedEqual = icmp eq <...> %memcmp, 0
+ // [no terminator instruction!]
+ // PhonyPreheaderBB: <preheader> ; No preds, UNREACHABLE!
+ // br label %LoopHeaderBB
+ // LoopHeaderBB: <header,exiting> ; preds = %PhonyPreheaderBB, %LoopLatchBB
+ // <...>
+ // br i1 %<...>, label %LoopLatchBB, label %PhonySuccessorBB
+ // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB
+ // <...>
+ // br i1 %<...>, label %PhonySuccessorBB, label %LoopHeaderBB
+ // PhonySuccessorBB: <uniq. exit> ; preds = %LoopHeaderBB, %LoopLatchBB
+ // unreachable
+ // EqualBB: ; No preds, UNREACHABLE!
+ // br label %Successor1BB
+ // UnequalBB: ; No preds, UNREACHABLE!
+ // br label %Successor0BB
+ // Successor0BB: ; preds = %UnequalBB
+ // %S0PHI = phi <...> [ <...>, %UnequalBB ]
+ // <...>
+ // Successor1BB: ; preds = %EqualBB
+ // %S0PHI = phi <...> [ <...>, %EqualBB ]
+ // <...>
+
+  // *Finally*, zap the original loop. Record its parent loop though.
+ Loop *ParentLoop = CurLoop->getParentLoop();
+ LLVM_DEBUG(dbgs() << "Deleting old loop.\n");
+ LoopDeleter.markLoopAsDeleted(CurLoop); // Mark as deleted *BEFORE* deleting!
+ deleteDeadLoop(CurLoop, DT, SE, LI); // And actually delete the loop.
+ CurLoop = nullptr;
+
+ // By now we have: (3/6)
+ // PreheaderBB: ; preds = ???
+ // <...>
+ // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+ // %ComparedEqual = icmp eq <...> %memcmp, 0
+ // [no terminator instruction!]
+ // PhonyPreheaderBB: ; No preds, UNREACHABLE!
+ // br label %PhonySuccessorBB
+ // PhonySuccessorBB: ; preds = %PhonyPreheaderBB
+ // unreachable
+ // EqualBB: ; No preds, UNREACHABLE!
+ // br label %Successor1BB
+ // UnequalBB: ; No preds, UNREACHABLE!
+ // br label %Successor0BB
+ // Successor0BB: ; preds = %UnequalBB
+ // %S0PHI = phi <...> [ <...>, %UnequalBB ]
+ // <...>
+ // Successor1BB: ; preds = %EqualBB
+ // %S0PHI = phi <...> [ <...>, %EqualBB ]
+ // <...>
+
+ // Now, actually restore the CFG.
+
+ // Insert an unconditional branch from an actual preheader basic block to
+ // phony preheader basic block.
+ IRBuilder<>(PreheaderBB).CreateBr(PhonyPreheaderBB);
+ DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB});
+ // Insert proper conditional branch from phony successor basic block to the
+ // "dispatch" basic blocks, which were used to preserve incoming values in
+ // original loop's successor basic blocks.
+ assert(isa<UnreachableInst>(PhonySuccessorBB->getTerminator()) &&
+ "Yep, that's the one we created to keep deleteDeadLoop() happy.");
+ PhonySuccessorBB->getTerminator()->eraseFromParent();
+ {
+ IRBuilder<> Builder(PhonySuccessorBB);
+ Builder.SetCurrentDebugLocation(ComparedEqual->getDebugLoc());
+ Builder.CreateCondBr(ComparedEqual, ComparedEqualBB, ComparedUnequalBB);
+ }
+ DTUpdates.push_back(
+ {DominatorTree::Insert, PhonySuccessorBB, ComparedEqualBB});
+ DTUpdates.push_back(
+ {DominatorTree::Insert, PhonySuccessorBB, ComparedUnequalBB});
+
+ BasicBlock *DispatchBB = PhonySuccessorBB;
+ DispatchBB->setName(LoopName + ".bcmpdispatchbb");
+
+ assert(DTUpdates.size() == 3 && "Update count prediction failed.");
+ DTU.applyUpdates(DTUpdates);
+ DTUpdates.clear();
+
+ // By now we have: (4/6)
+ // PreheaderBB: ; preds = ???
+ // <...>
+ // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+ // %ComparedEqual = icmp eq <...> %memcmp, 0
+ // br label %PhonyPreheaderBB
+ // PhonyPreheaderBB: ; preds = %PreheaderBB
+ // br label %DispatchBB
+ // DispatchBB: ; preds = %PhonyPreheaderBB
+ // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
+ // EqualBB: ; preds = %DispatchBB
+ // br label %Successor1BB
+ // UnequalBB: ; preds = %DispatchBB
+ // br label %Successor0BB
+ // Successor0BB: ; preds = %UnequalBB
+ // %S0PHI = phi <...> [ <...>, %UnequalBB ]
+ // <...>
+ // Successor1BB: ; preds = %EqualBB
+ // %S0PHI = phi <...> [ <...>, %EqualBB ]
+ // <...>
+
+ // The basic CFG has been restored! Now let's merge redundant basic blocks.
+
+  // Merge the phony successor basic block into its only predecessor, the
+  // phony preheader basic block. It is completely redundant.
+ MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU);
+
+ // By now we have: (5/6)
+ // PreheaderBB: ; preds = ???
+ // <...>
+ // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+ // %ComparedEqual = icmp eq <...> %memcmp, 0
+ // br label %DispatchBB
+ // DispatchBB: ; preds = %PreheaderBB
+ // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
+ // EqualBB: ; preds = %DispatchBB
+ // br label %Successor1BB
+ // UnequalBB: ; preds = %DispatchBB
+ // br label %Successor0BB
+ // Successor0BB: ; preds = %UnequalBB
+ // %S0PHI = phi <...> [ <...>, %UnequalBB ]
+ // <...>
+ // Successor1BB: ; preds = %EqualBB
+ // %S0PHI = phi <...> [ <...>, %EqualBB ]
+ // <...>
+
+ // Was this loop nested?
+ if (!ParentLoop) {
+ // If the loop was *NOT* nested, then let's also merge phony successor
+    // basic block into its only predecessor, the preheader basic block.
+ // Also, here we need to update LoopInfo.
+ LI->removeBlock(PreheaderBB);
+ MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU);
+
+ // By now we have: (6/6)
+ // DispatchBB: ; preds = ???
+ // <...>
+ // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+ // %ComparedEqual = icmp eq <...> %memcmp, 0
+ // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
+ // EqualBB: ; preds = %DispatchBB
+ // br label %Successor1BB
+ // UnequalBB: ; preds = %DispatchBB
+ // br label %Successor0BB
+ // Successor0BB: ; preds = %UnequalBB
+ // %S0PHI = phi <...> [ <...>, %UnequalBB ]
+ // <...>
+ // Successor1BB: ; preds = %EqualBB
+ // %S0PHI = phi <...> [ <...>, %EqualBB ]
+ // <...>
+
+ return DispatchBB;
+ }
+
+ // Otherwise, we need to "preserve" the LoopSimplify form of the deleted loop.
+ // To achieve that, we shall keep the preheader basic block (mainly so that
+ // the loop header block will be guaranteed to have a predecessor outside of
+  // the loop), and create a phony loop with all three of these new basic blocks.
+ Loop *PhonyLoop = LI->AllocateLoop();
+ ParentLoop->addChildLoop(PhonyLoop);
+ PhonyLoop->addBasicBlockToLoop(DispatchBB, *LI);
+ PhonyLoop->addBasicBlockToLoop(ComparedEqualBB, *LI);
+ PhonyLoop->addBasicBlockToLoop(ComparedUnequalBB, *LI);
+
+  // But we only have a preheader basic block, a header basic block and
+ // two exiting basic blocks. For a proper loop we also need a backedge from
+ // non-header basic block to header bb.
+ // Let's just add a never-taken branch from both of the exiting basic blocks.
+ for (BasicBlock *BB : {ComparedEqualBB, ComparedUnequalBB}) {
+ BranchInst *OldTerminator = cast<BranchInst>(BB->getTerminator());
+ assert(OldTerminator->isUnconditional() && "That's the one we created.");
+ BasicBlock *SuccessorBB = OldTerminator->getSuccessor(0);
+
+ IRBuilder<> Builder(OldTerminator);
+ Builder.SetCurrentDebugLocation(OldTerminator->getDebugLoc());
+ Builder.CreateCondBr(ConstantInt::getTrue(Context), SuccessorBB,
+ DispatchBB);
+ OldTerminator->eraseFromParent();
+ // Yes, the backedge will never be taken. The control-flow is redundant.
+ // If it can be simplified further, other passes will take care.
+ DTUpdates.push_back({DominatorTree::Delete, BB, SuccessorBB});
+ DTUpdates.push_back({DominatorTree::Insert, BB, SuccessorBB});
+ DTUpdates.push_back({DominatorTree::Insert, BB, DispatchBB});
+ }
+ assert(DTUpdates.size() == 6 && "Update count prediction failed.");
+ DTU.applyUpdates(DTUpdates);
+ DTUpdates.clear();
+
+ // By now we have: (6/6)
+ // PreheaderBB: <preheader> ; preds = ???
+ // <...>
+ // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+ // %ComparedEqual = icmp eq <...> %memcmp, 0
+ // br label %BCmpDispatchBB
+ // BCmpDispatchBB: <header> ; preds = %PreheaderBB
+ // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
+ // EqualBB: <latch,exiting> ; preds = %BCmpDispatchBB
+ // br i1 %true, label %Successor1BB, label %BCmpDispatchBB
+ // UnequalBB: <latch,exiting> ; preds = %BCmpDispatchBB
+ // br i1 %true, label %Successor0BB, label %BCmpDispatchBB
+ // Successor0BB: ; preds = %UnequalBB
+ // %S0PHI = phi <...> [ <...>, %UnequalBB ]
+ // <...>
+ // Successor1BB: ; preds = %EqualBB
+ // %S0PHI = phi <...> [ <...>, %EqualBB ]
+ // <...>
+
+ // Finally fully DONE!
+ return DispatchBB;
+}
+
+void LoopIdiomRecognize::transformLoopToBCmp(ICmpInst *BCmpInst,
+ CmpInst *LatchCmpInst,
+ LoadInst *LoadA, LoadInst *LoadB,
+ const SCEV *SrcA, const SCEV *SrcB,
+ const SCEV *NBytes) {
+ // We will be inserting before the terminator instruction of preheader block.
+ IRBuilder<> Builder(CurLoop->getLoopPreheader()->getTerminator());
+
+ LLVM_DEBUG(dbgs() << "Transforming bcmp loop idiom into a call.\n");
+ LLVM_DEBUG(dbgs() << "Emitting new instructions.\n");
+
+  // Expand the SCEV expressions for both sources to compare, and produce a
+  // value for the byte length (beware of Iterations potentially being a
+  // pointer, and account for the element size being BCmpTyBytes bytes, which
+  // may not be 1 byte).
+ Value *PtrA, *PtrB, *Len;
+ {
+ SCEVExpander SExp(*SE, *DL, "LoopToBCmp");
+ SExp.setInsertPoint(&*Builder.GetInsertPoint());
+
+ auto HandlePtr = [&SExp](LoadInst *Load, const SCEV *Src) {
+ SExp.SetCurrentDebugLocation(DebugLoc());
+      // If the pointer operand of the original load had a dbgloc, use it.
+ if (const auto *I = dyn_cast<Instruction>(Load->getPointerOperand()))
+ SExp.SetCurrentDebugLocation(I->getDebugLoc());
+ return SExp.expandCodeFor(Src);
+ };
+ PtrA = HandlePtr(LoadA, SrcA);
+ PtrB = HandlePtr(LoadB, SrcB);
+
+    // For the Len calculation, use the dbgloc of the loop's latch condition.
+ Builder.SetCurrentDebugLocation(LatchCmpInst->getDebugLoc());
+ SExp.SetCurrentDebugLocation(LatchCmpInst->getDebugLoc());
+ Len = SExp.expandCodeFor(NBytes);
+
+ Type *CmpFuncSizeTy = DL->getIntPtrType(Builder.getContext());
+ assert(SE->getTypeSizeInBits(Len->getType()) ==
+ DL->getTypeSizeInBits(CmpFuncSizeTy) &&
+ "Len should already have the correct size.");
+
+    // Make sure the iteration count is an integer; insert a ptrtoint cast if
+    // it is a pointer.
+ if (Len->getType()->isPointerTy())
+ Len = Builder.CreatePtrToInt(Len, CmpFuncSizeTy);
+ assert(Len->getType() == CmpFuncSizeTy && "Should have correct type now.");
+
+ Len->setName(Len->getName() + ".bytecount");
+
+    // There is no legality check needed. We want to check that the memory
+    // regions [PtrA, PtrA+Len) and [PtrB, PtrB+Len) are fully identical.
+    // For them to be fully equal, they must match bit-by-bit; and likewise,
+    // for them to *NOT* be fully equal, they only have to differ in a single
+    // bit. The step of the comparison (how many bits are compared at once)
+    // simply does not matter.
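+    // For illustration: a loop that compares two i32 arrays element by
+    // element (4 bytes per iteration) answers the same equality question as
+    // one memcmp/bcmp over NBytes = 4 * iterations, since the first
+    // mismatching bit makes both answers "not equal".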
+ }
+
+  // For the rest of the new instructions, the dbgloc should point at the
+  // value comparison.
+ Builder.SetCurrentDebugLocation(BCmpInst->getDebugLoc());
+
+ // Emit the comparison itself.
+ auto *CmpCall =
+ cast<CallInst>(HasBCmp ? emitBCmp(PtrA, PtrB, Len, Builder, *DL, TLI)
+ : emitMemCmp(PtrA, PtrB, Len, Builder, *DL, TLI));
+ // FIXME: add {B,Mem}CmpInst with MemoryCompareInst
+ // (based on MemIntrinsicBase) as base?
+ // FIXME: propagate metadata from loads? (alignments, AS, TBAA, ...)
+
+  // {b,mem}cmp returns 0 if the memory regions are equal, or non-zero if they
+  // are not.
+ auto *ComparedEqual = cast<ICmpInst>(Builder.CreateICmpEQ(
+ CmpCall, ConstantInt::get(CmpCall->getType(), 0),
+ PtrA->getName() + ".vs." + PtrB->getName() + ".eqcmp"));
+
+ BasicBlock *BB = transformBCmpControlFlow(ComparedEqual);
+ Builder.ClearInsertionPoint();
+
+ // We're done.
+ LLVM_DEBUG(dbgs() << "Transformed loop bcmp idiom into a call.\n");
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "TransformedBCmpIdiomToCall",
+ CmpCall->getDebugLoc(), BB)
+ << "Transformed bcmp idiom into a call to "
+ << ore::NV("NewFunction", CmpCall->getCalledFunction())
+ << "() function";
+ });
+ ++NumBCmp;
+}
+
+/// Recognizes a bcmp idiom in a non-countable loop.
+///
+/// If detected, transforms the relevant code to issue the bcmp (or memcmp)
+/// intrinsic function call, and returns true; otherwise, returns false.
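+///
+/// For illustration, the kind of loop targeted here is, roughly:
+///   do {
+///     if (a[i] != b[i])
+///       break;
+///     ++i;
+///   } while (i != n);
+/// which, when detection succeeds, is rewritten into a single bcmp/memcmp
+/// call over n * sizeof(a[0]) bytes followed by an equality check on the
+/// result, with control flow fixed up by transformBCmpControlFlow().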
+bool LoopIdiomRecognize::recognizeBCmp() {
+ if (!HasMemCmp && !HasBCmp)
+ return false;
+
+ ICmpInst *BCmpInst;
+ CmpInst *LatchCmpInst;
+ LoadInst *LoadA, *LoadB;
+ const SCEV *SrcA, *SrcB, *NBytes;
+ if (!detectBCmpIdiom(BCmpInst, LatchCmpInst, LoadA, LoadB, SrcA, SrcB,
+ NBytes)) {
+ LLVM_DEBUG(dbgs() << "bcmp idiom recognition failed.\n");
+ return false;
+ }
+
+ transformLoopToBCmp(BCmpInst, LatchCmpInst, LoadA, LoadB, SrcA, SrcB, NBytes);
+ return true;
+}
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 31191b52895c..368b9d4e8df1 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -192,7 +192,8 @@ public:
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
*L->getHeader()->getParent());
const TargetLibraryInfo &TLI =
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ *L->getHeader()->getParent());
MemorySSA *MSSA = nullptr;
Optional<MemorySSAUpdater> MSSAU;
if (EnableMSSALoopDependency) {
@@ -233,7 +234,7 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
auto PA = getLoopPassPreservedAnalyses();
PA.preserveSet<CFGAnalyses>();
- if (EnableMSSALoopDependency)
+ if (AR.MSSA)
PA.preserve<MemorySSAAnalysis>();
return PA;
}
diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp
index 9a42365adc1b..1af4b21b432e 100644
--- a/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -410,8 +410,6 @@ public:
void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
private:
- void splitInnerLoopLatch(Instruction *);
- void splitInnerLoopHeader();
bool adjustLoopLinks();
void adjustLoopPreheaders();
bool adjustLoopBranches();
@@ -1226,7 +1224,7 @@ bool LoopInterchangeTransform::transform() {
if (InnerLoop->getSubLoops().empty()) {
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- LLVM_DEBUG(dbgs() << "Calling Split Inner Loop\n");
+ LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
if (!InductionPHI) {
LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
@@ -1242,11 +1240,55 @@ bool LoopInterchangeTransform::transform() {
if (&InductionPHI->getParent()->front() != InductionPHI)
InductionPHI->moveBefore(&InductionPHI->getParent()->front());
- // Split at the place were the induction variable is
- // incremented/decremented.
- // TODO: This splitting logic may not work always. Fix this.
- splitInnerLoopLatch(InnerIndexVar);
- LLVM_DEBUG(dbgs() << "splitInnerLoopLatch done\n");
+ // Create a new latch block for the inner loop. We split at the
+ // current latch's terminator and then move the condition and all
+ // operands that are not either loop-invariant or the induction PHI into the
+ // new latch block.
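+    // For example (illustrative names): if the latch ends in
+    //   %iv.next = add nsw i32 %iv, 1
+    //   %cmp = icmp slt i32 %iv.next, %n
+    //   br i1 %cmp, label %header, label %exit
+    // then %cmp and %iv.next are cloned into the new latch, while the
+    // induction PHI %iv and values defined outside the inner loop, such as
+    // %n, stay where they are.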
+ BasicBlock *NewLatch =
+ SplitBlock(InnerLoop->getLoopLatch(),
+ InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
+
+ SmallSetVector<Instruction *, 4> WorkList;
+ unsigned i = 0;
+ auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() {
+ for (; i < WorkList.size(); i++) {
+        // Duplicate the instruction and move it to the new latch. Update
+        // uses that have been moved.
+ Instruction *NewI = WorkList[i]->clone();
+ NewI->insertBefore(NewLatch->getFirstNonPHI());
+ assert(!NewI->mayHaveSideEffects() &&
+ "Moving instructions with side-effects may change behavior of "
+ "the loop nest!");
+ for (auto UI = WorkList[i]->use_begin(), UE = WorkList[i]->use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ Instruction *UserI = cast<Instruction>(U.getUser());
+ if (!InnerLoop->contains(UserI->getParent()) ||
+ UserI->getParent() == NewLatch || UserI == InductionPHI)
+ U.set(NewI);
+ }
+ // Add operands of moved instruction to the worklist, except if they are
+ // outside the inner loop or are the induction PHI.
+ for (Value *Op : WorkList[i]->operands()) {
+ Instruction *OpI = dyn_cast<Instruction>(Op);
+ if (!OpI ||
+ this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
+ OpI == InductionPHI)
+ continue;
+ WorkList.insert(OpI);
+ }
+ }
+ };
+
+ // FIXME: Should we interchange when we have a constant condition?
+ Instruction *CondI = dyn_cast<Instruction>(
+ cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator())
+ ->getCondition());
+ if (CondI)
+ WorkList.insert(CondI);
+ MoveInstructions();
+ WorkList.insert(cast<Instruction>(InnerIndexVar));
+ MoveInstructions();
// Splits the inner loops phi nodes out into a separate basic block.
BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
@@ -1263,10 +1305,6 @@ bool LoopInterchangeTransform::transform() {
return true;
}
-void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
- SplitBlock(InnerLoop->getLoopLatch(), Inc, DT, LI);
-}
-
/// \brief Move all instructions except the terminator from FromBB right before
/// InsertBefore
static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp
index 2b3d5e0ce9b7..e8dc879a184b 100644
--- a/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -435,7 +435,8 @@ public:
PH->getTerminator());
Value *Initial = new LoadInst(
Cand.Load->getType(), InitialPtr, "load_initial",
- /* isVolatile */ false, Cand.Load->getAlignment(), PH->getTerminator());
+ /* isVolatile */ false, MaybeAlign(Cand.Load->getAlignment()),
+ PH->getTerminator());
PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
&L->getHeader()->front());
diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp
index 507a1e251ca6..885c0e8f4b8b 100644
--- a/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/lib/Transforms/Scalar/LoopPredication.cpp
@@ -543,7 +543,7 @@ bool LoopPredication::isLoopInvariantValue(const SCEV* S) {
if (const auto *LI = dyn_cast<LoadInst>(U->getValue()))
if (LI->isUnordered() && L->hasLoopInvariantOperands(LI))
if (AA->pointsToConstantMemory(LI->getOperand(0)) ||
- LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr)
+ LI->hasMetadata(LLVMContext::MD_invariant_load))
return true;
return false;
}
diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp
index 166b57f20b43..96e2c2a3ac6b 100644
--- a/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -1644,7 +1644,8 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ *L->getHeader()->getParent());
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index e009947690af..94517996df39 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -55,7 +55,7 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
AR.MSSA->verifyMemorySSA();
auto PA = getLoopPassPreservedAnalyses();
- if (EnableMSSALoopDependency)
+ if (AR.MSSA)
PA.preserve<MemorySSAAnalysis>();
return PA;
}
@@ -94,17 +94,15 @@ public:
auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
- auto *SE = SEWP ? &SEWP->getSE() : nullptr;
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
Optional<MemorySSAUpdater> MSSAU;
if (EnableMSSALoopDependency) {
MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
MSSAU = MemorySSAUpdater(MSSA);
}
- return LoopRotation(L, LI, TTI, AC, DT, SE,
+ return LoopRotation(L, LI, TTI, AC, &DT, &SE,
MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ,
false, MaxHeaderSize, false);
}
diff --git a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index 046f4c8af492..299f3fc5fb19 100644
--- a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -690,7 +690,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &LPMU) {
Optional<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency && AR.MSSA)
+ if (AR.MSSA)
MSSAU = MemorySSAUpdater(AR.MSSA);
bool DeleteCurrentLoop = false;
if (!simplifyLoopCFG(L, AR.DT, AR.LI, AR.SE,
@@ -702,7 +702,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
LPMU.markLoopAsDeleted(L, "loop-simplifycfg");
auto PA = getLoopPassPreservedAnalyses();
- if (EnableMSSALoopDependency)
+ if (AR.MSSA)
PA.preserve<MemorySSAAnalysis>();
return PA;
}
diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp
index 975452e13f09..65e0dee0225a 100644
--- a/lib/Transforms/Scalar/LoopSink.cpp
+++ b/lib/Transforms/Scalar/LoopSink.cpp
@@ -230,12 +230,9 @@ static bool sinkInstruction(Loop &L, Instruction &I,
IC->setName(I.getName());
IC->insertBefore(&*N->getFirstInsertionPt());
// Replaces uses of I with IC in N
- for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;) {
- Use &U = *UI++;
- auto *I = cast<Instruction>(U.getUser());
- if (I->getParent() == N)
- U.set(IC);
- }
+ I.replaceUsesWithIf(IC, [N](Use &U) {
+ return cast<Instruction>(U.getUser())->getParent() == N;
+ });
// Replaces uses of I with IC in blocks dominated by N
replaceDominatedUsesWith(&I, IC, DT, N);
LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 59a387a186b8..7f119175c4a8 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1386,7 +1386,9 @@ void Cost::RateFormula(const Formula &F,
// Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
// additional instruction (at least fill).
- unsigned TTIRegNum = TTI->getNumberOfRegisters(false) - 1;
+  // TODO: Do we need to distinguish register classes?
+ unsigned TTIRegNum = TTI->getNumberOfRegisters(
+ TTI->getRegisterClassForType(false, F.getType())) - 1;
if (C.NumRegs > TTIRegNum) {
// Cost already exceeded TTIRegNum, then only newly added register can add
// new instructions.
@@ -3165,6 +3167,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
return;
}
+ assert(IVSrc && "Failed to find IV chain source");
LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
Type *IVTy = IVSrc->getType();
@@ -3265,12 +3268,12 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
// requirements for both N and i at the same time. Limiting this code to
// equality icmps is not a problem because all interesting loops use
// equality icmps, thanks to IndVarSimplify.
- if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst))
+ if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
+ // If CI can be saved in some target, like replaced inside hardware loop
+ // in PowerPC, no need to generate initial formulae for it.
+ if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
+ continue;
if (CI->isEquality()) {
- // If CI can be saved in some target, like replaced inside hardware loop
- // in PowerPC, no need to generate initial formulae for it.
- if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
- continue;
// Swap the operands if needed to put the OperandValToReplace on the
// left, for consistency.
Value *NV = CI->getOperand(1);
@@ -3298,6 +3301,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
Factors.insert(-(uint64_t)Factors[i]);
Factors.insert(-1);
}
+ }
// Get or create an LSRUse.
std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
@@ -4834,6 +4838,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
}
}
}
+ assert(Best && "Failed to find best LSRUse candidate");
LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
<< " will yield profitable reuse.\n");
@@ -5740,7 +5745,8 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
*L->getHeader()->getParent());
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
*L->getHeader()->getParent());
- auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ *L->getHeader()->getParent());
return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, LibInfo);
}
diff --git a/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
index 86891eb451bb..8d88be420314 100644
--- a/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -166,7 +166,7 @@ static bool computeUnrollAndJamCount(
bool UseUpperBound = false;
bool ExplicitUnroll = computeUnrollCount(
L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
- OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
+ /*MaxOrZero*/ false, OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
if (ExplicitUnroll || UseUpperBound) {
// If the user explicitly set the loop as unrolled, dont UnJ it. Leave it
// for the unroller instead.
@@ -293,9 +293,9 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
if (Latch != Exit || SubLoopLatch != SubLoopExit)
return LoopUnrollResult::Unmodified;
- TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
- L, SE, TTI, nullptr, nullptr, OptLevel,
- None, None, None, None, None, None);
+ TargetTransformInfo::UnrollingPreferences UP =
+ gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, OptLevel, None,
+ None, None, None, None, None, None, None);
if (AllowUnrollAndJam.getNumOccurrences() > 0)
UP.UnrollAndJam = AllowUnrollAndJam;
if (UnrollAndJamThreshold.getNumOccurrences() > 0)
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 2fa7436213dd..a6d4164c3645 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -178,7 +178,9 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
- Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling) {
+ Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling,
+ Optional<bool> UserAllowProfileBasedPeeling,
+ Optional<unsigned> UserFullUnrollMaxCount) {
TargetTransformInfo::UnrollingPreferences UP;
// Set up the defaults
@@ -202,6 +204,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
UP.UpperBound = false;
UP.AllowPeeling = true;
UP.UnrollAndJam = false;
+ UP.PeelProfiledIterations = true;
UP.UnrollAndJamInnerLoopThreshold = 60;
// Override with any target specific settings
@@ -257,6 +260,10 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
UP.UpperBound = *UserUpperBound;
if (UserAllowPeeling.hasValue())
UP.AllowPeeling = *UserAllowPeeling;
+ if (UserAllowProfileBasedPeeling.hasValue())
+ UP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;
+ if (UserFullUnrollMaxCount.hasValue())
+ UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
return UP;
}
@@ -730,7 +737,7 @@ bool llvm::computeUnrollCount(
Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
- unsigned &TripMultiple, unsigned LoopSize,
+ bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
// Check for explicit Count.
@@ -781,18 +788,34 @@ bool llvm::computeUnrollCount(
// Also we need to check if we exceed FullUnrollMaxCount.
// If using the upper bound to unroll, TripMultiple should be set to 1 because
// we do not know when loop may exit.
- // MaxTripCount and ExactTripCount cannot both be non zero since we only
+
+ // We can unroll by the upper bound amount if it's generally allowed or if
+ // we know that the loop is executed either the upper bound or zero times.
+ // (MaxOrZero unrolling keeps only the first loop test, so the number of
+ // loop tests remains the same compared to the non-unrolled version, whereas
+ // the generic upper bound unrolling keeps all but the last loop test so the
+ // number of loop tests goes up which may end up being worse on targets with
+ // constrained branch predictor resources so is controlled by an option.)
+ // In addition we only unroll small upper bounds.
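+  // For example, if a loop is known to execute either its upper bound of,
+  // say, 4 iterations or zero times, it can be unrolled by 4 while keeping
+  // only the first loop test; with a plain upper bound of 4, every unrolled
+  // iteration but the last would keep its exit test instead.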
+ unsigned FullUnrollMaxTripCount = MaxTripCount;
+ if (!(UP.UpperBound || MaxOrZero) ||
+ FullUnrollMaxTripCount > UnrollMaxUpperBound)
+ FullUnrollMaxTripCount = 0;
+
+ // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only
// compute the former when the latter is zero.
unsigned ExactTripCount = TripCount;
- assert((ExactTripCount == 0 || MaxTripCount == 0) &&
- "ExtractTripCount and MaxTripCount cannot both be non zero.");
- unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount;
+ assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) &&
+         "ExactTripCount and UnrollByMaxCount cannot both be non zero.");
+
+ unsigned FullUnrollTripCount =
+ ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
UP.Count = FullUnrollTripCount;
if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
// When computing the unrolled size, note that BEInsns are not replicated
// like the rest of the loop body.
if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
- UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+ UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
TripCount = FullUnrollTripCount;
TripMultiple = UP.UpperBound ? 1 : TripMultiple;
return ExplicitUnroll;
@@ -806,7 +829,7 @@ bool llvm::computeUnrollCount(
unsigned Boost =
getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
- UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+ UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
TripCount = FullUnrollTripCount;
TripMultiple = UP.UpperBound ? 1 : TripMultiple;
return ExplicitUnroll;
@@ -882,6 +905,8 @@ bool llvm::computeUnrollCount(
"because "
"unrolled size is too large.";
});
+ LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
+ << "\n");
return ExplicitUnroll;
}
assert(TripCount == 0 &&
@@ -903,6 +928,12 @@ bool llvm::computeUnrollCount(
return false;
}
+ // Don't unroll a small upper bound loop unless user or TTI asked to do so.
+ if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
+ UP.Count = 0;
+ return false;
+ }
+
// Check if the runtime trip count is too small when profile is available.
if (L->getHeader()->getParent()->hasProfileData()) {
if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
@@ -966,7 +997,11 @@ bool llvm::computeUnrollCount(
if (UP.Count > UP.MaxCount)
UP.Count = UP.MaxCount;
- LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
+
+ if (MaxTripCount && UP.Count > MaxTripCount)
+ UP.Count = MaxTripCount;
+
+ LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count
<< "\n");
if (UP.Count < 2)
UP.Count = 0;
@@ -976,13 +1011,14 @@ bool llvm::computeUnrollCount(
static LoopUnrollResult tryToUnrollLoop(
Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
const TargetTransformInfo &TTI, AssumptionCache &AC,
- OptimizationRemarkEmitter &ORE,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- bool PreserveLCSSA, int OptLevel,
+ OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount,
Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
- Optional<bool> ProvidedAllowPeeling) {
+ Optional<bool> ProvidedAllowPeeling,
+ Optional<bool> ProvidedAllowProfileBasedPeeling,
+ Optional<unsigned> ProvidedFullUnrollMaxCount) {
LLVM_DEBUG(dbgs() << "Loop Unroll: F["
<< L->getHeader()->getParent()->getName() << "] Loop %"
<< L->getHeader()->getName() << "\n");
@@ -1007,7 +1043,8 @@ static LoopUnrollResult tryToUnrollLoop(
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
- ProvidedAllowPeeling);
+ ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
+ ProvidedFullUnrollMaxCount);
// Exit early if unrolling is disabled. For OptForSize, we pick the loop size
// as threshold later on.
@@ -1028,10 +1065,10 @@ static LoopUnrollResult tryToUnrollLoop(
return LoopUnrollResult::Unmodified;
}
- // When optimizing for size, use LoopSize as threshold, to (fully) unroll
- // loops, if it does not increase code size.
+ // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
+ // later), to (fully) unroll loops, if it does not increase code size.
if (OptForSize)
- UP.Threshold = std::max(UP.Threshold, LoopSize);
+ UP.Threshold = std::max(UP.Threshold, LoopSize + 1);
if (NumInlineCandidates != 0) {
LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
@@ -1040,7 +1077,6 @@ static LoopUnrollResult tryToUnrollLoop(
// Find trip count and trip multiple if count is not available
unsigned TripCount = 0;
- unsigned MaxTripCount = 0;
unsigned TripMultiple = 1;
// If there are multiple exiting blocks but one of them is the latch, use the
// latch for the trip count estimation. Otherwise insist on a single exiting
@@ -1070,28 +1106,18 @@ static LoopUnrollResult tryToUnrollLoop(
// Try to find the trip count upper bound if we cannot find the exact trip
// count.
+ unsigned MaxTripCount = 0;
bool MaxOrZero = false;
if (!TripCount) {
MaxTripCount = SE.getSmallConstantMaxTripCount(L);
MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
- // We can unroll by the upper bound amount if it's generally allowed or if
- // we know that the loop is executed either the upper bound or zero times.
- // (MaxOrZero unrolling keeps only the first loop test, so the number of
- // loop tests remains the same compared to the non-unrolled version, whereas
- // the generic upper bound unrolling keeps all but the last loop test so the
- // number of loop tests goes up which may end up being worse on targets with
- // constrained branch predictor resources so is controlled by an option.)
- // In addition we only unroll small upper bounds.
- if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) {
- MaxTripCount = 0;
- }
}
// computeUnrollCount() decides whether it is beneficial to use upper bound to
// fully unroll the loop.
bool UseUpperBound = false;
bool IsCountSetExplicitly = computeUnrollCount(
- L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount,
+ L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
TripMultiple, LoopSize, UP, UseUpperBound);
if (!UP.Count)
return LoopUnrollResult::Unmodified;
@@ -1139,7 +1165,7 @@ static LoopUnrollResult tryToUnrollLoop(
// If the loop was peeled, we already "used up" the profile information
// we had, so we don't want to unroll or peel again.
if (UnrollResult != LoopUnrollResult::FullyUnrolled &&
- (IsCountSetExplicitly || UP.PeelCount))
+ (IsCountSetExplicitly || (UP.PeelProfiledIterations && UP.PeelCount)))
L->setLoopAlreadyUnrolled();
return UnrollResult;
@@ -1169,18 +1195,24 @@ public:
Optional<bool> ProvidedRuntime;
Optional<bool> ProvidedUpperBound;
Optional<bool> ProvidedAllowPeeling;
+ Optional<bool> ProvidedAllowProfileBasedPeeling;
+ Optional<unsigned> ProvidedFullUnrollMaxCount;
LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
bool ForgetAllSCEV = false, Optional<unsigned> Threshold = None,
Optional<unsigned> Count = None,
Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
Optional<bool> UpperBound = None,
- Optional<bool> AllowPeeling = None)
+ Optional<bool> AllowPeeling = None,
+ Optional<bool> AllowProfileBasedPeeling = None,
+ Optional<unsigned> ProvidedFullUnrollMaxCount = None)
: LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
- ProvidedAllowPeeling(AllowPeeling) {
+ ProvidedAllowPeeling(AllowPeeling),
+ ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
+ ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
}
@@ -1203,10 +1235,11 @@ public:
bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
LoopUnrollResult Result = tryToUnrollLoop(
- L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr,
- PreserveLCSSA, OptLevel, OnlyWhenForced,
- ForgetAllSCEV, ProvidedCount, ProvidedThreshold, ProvidedAllowPartial,
- ProvidedRuntime, ProvidedUpperBound, ProvidedAllowPeeling);
+ L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
+ OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold,
+ ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
+ ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
+ ProvidedFullUnrollMaxCount);
if (Result == LoopUnrollResult::FullyUnrolled)
LPM.markLoopAsDeleted(*L);
@@ -1283,14 +1316,16 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
std::string LoopName = L.getName();
- bool Changed =
- tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
- /*BFI*/ nullptr, /*PSI*/ nullptr,
- /*PreserveLCSSA*/ true, OptLevel, OnlyWhenForced,
- ForgetSCEV, /*Count*/ None,
- /*Threshold*/ None, /*AllowPartial*/ false,
- /*Runtime*/ false, /*UpperBound*/ false,
- /*AllowPeeling*/ false) != LoopUnrollResult::Unmodified;
+ bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
+ /*BFI*/ nullptr, /*PSI*/ nullptr,
+ /*PreserveLCSSA*/ true, OptLevel,
+ OnlyWhenForced, ForgetSCEV, /*Count*/ None,
+ /*Threshold*/ None, /*AllowPartial*/ false,
+ /*Runtime*/ false, /*UpperBound*/ false,
+ /*AllowPeeling*/ false,
+ /*AllowProfileBasedPeeling*/ false,
+ /*FullUnrollMaxCount*/ None) !=
+ LoopUnrollResult::Unmodified;
if (!Changed)
return PreservedAnalyses::all();
@@ -1430,7 +1465,8 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
/*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
UnrollOpts.ForgetSCEV, /*Count*/ None,
/*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
- UnrollOpts.AllowUpperBound, LocalAllowPeeling);
+ UnrollOpts.AllowUpperBound, LocalAllowPeeling,
+ UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
Changed |= Result != LoopUnrollResult::Unmodified;
// The parent must not be damaged by unrolling!
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index b5b8e720069c..b410df0c5f68 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -420,7 +420,8 @@ enum OperatorChain {
/// cost of creating an entirely new loop.
static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
OperatorChain &ParentChain,
- DenseMap<Value *, Value *> &Cache) {
+ DenseMap<Value *, Value *> &Cache,
+ MemorySSAUpdater *MSSAU) {
auto CacheIt = Cache.find(Cond);
if (CacheIt != Cache.end())
return CacheIt->second;
@@ -438,7 +439,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
// TODO: Handle: br (VARIANT|INVARIANT).
// Hoist simple values out.
- if (L->makeLoopInvariant(Cond, Changed)) {
+ if (L->makeLoopInvariant(Cond, Changed, nullptr, MSSAU)) {
Cache[Cond] = Cond;
return Cond;
}
@@ -478,7 +479,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
// which will cause the branch to go away in one loop and the condition to
// simplify in the other one.
if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed,
- ParentChain, Cache)) {
+ ParentChain, Cache, MSSAU)) {
Cache[Cond] = LHS;
return LHS;
}
@@ -486,7 +487,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
// operand(1).
ParentChain = NewChain;
if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed,
- ParentChain, Cache)) {
+ ParentChain, Cache, MSSAU)) {
Cache[Cond] = RHS;
return RHS;
}
@@ -500,12 +501,12 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
/// an invariant piece, return the invariant along with the operator chain type.
/// Otherwise, return null.
-static std::pair<Value *, OperatorChain> FindLIVLoopCondition(Value *Cond,
- Loop *L,
- bool &Changed) {
+static std::pair<Value *, OperatorChain>
+FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
+ MemorySSAUpdater *MSSAU) {
DenseMap<Value *, Value *> Cache;
OperatorChain OpChain = OC_OpChainNone;
- Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache);
+ Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache, MSSAU);
// In case we do find a LIV, it can not be obtained by walking up a mixed
// operator chain.
@@ -525,7 +526,7 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
if (EnableMSSALoopDependency) {
MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- MSSAU = make_unique<MemorySSAUpdater>(MSSA);
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
assert(DT && "Cannot update MemorySSA without a valid DomTree.");
}
currentLoop = L;
@@ -694,8 +695,9 @@ bool LoopUnswitch::processCurrentLoop() {
}
for (IntrinsicInst *Guard : Guards) {
- Value *LoopCond =
- FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first;
+ Value *LoopCond = FindLIVLoopCondition(Guard->getOperand(0), currentLoop,
+ Changed, MSSAU.get())
+ .first;
if (LoopCond &&
UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
// NB! Unswitching (if successful) could have erased some of the
@@ -735,8 +737,9 @@ bool LoopUnswitch::processCurrentLoop() {
if (BI->isConditional()) {
// See if this, or some part of it, is loop invariant. If so, we can
// unswitch on it if we desire.
- Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
- currentLoop, Changed).first;
+ Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop,
+ Changed, MSSAU.get())
+ .first;
if (LoopCond && !EqualityPropUnSafe(*LoopCond) &&
UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
++NumBranches;
@@ -748,7 +751,7 @@ bool LoopUnswitch::processCurrentLoop() {
Value *LoopCond;
OperatorChain OpChain;
std::tie(LoopCond, OpChain) =
- FindLIVLoopCondition(SC, currentLoop, Changed);
+ FindLIVLoopCondition(SC, currentLoop, Changed, MSSAU.get());
unsigned NumCases = SI->getNumCases();
if (LoopCond && NumCases) {
@@ -808,8 +811,9 @@ bool LoopUnswitch::processCurrentLoop() {
for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end();
BBI != E; ++BBI)
if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
- Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
- currentLoop, Changed).first;
+ Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop,
+ Changed, MSSAU.get())
+ .first;
if (LoopCond && UnswitchIfProfitable(LoopCond,
ConstantInt::getTrue(Context))) {
++NumSelects;
@@ -1123,8 +1127,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
if (!BI->isConditional())
return false;
- Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
- currentLoop, Changed).first;
+ Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop,
+ Changed, MSSAU.get())
+ .first;
// Unswitch only if the trivial condition itself is an LIV (not
// partial LIV which could occur in and/or)
@@ -1157,8 +1162,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
return true;
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
// If this isn't switching on an invariant condition, we can't unswitch it.
- Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
- currentLoop, Changed).first;
+ Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop,
+ Changed, MSSAU.get())
+ .first;
// Unswitch only if the trivial condition itself is an LIV (not
// partial LIV which could occur in and/or)
@@ -1240,6 +1246,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
LoopBlocks.clear();
NewBlocks.clear();
+ if (MSSAU && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
// First step, split the preheader and exit blocks, and add these blocks to
// the LoopBlocks list.
BasicBlock *NewPreheader =
@@ -1607,36 +1616,30 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
// If BI's parent is the only pred of the successor, fold the two blocks
// together.
BasicBlock *Pred = BI->getParent();
+ (void)Pred;
BasicBlock *Succ = BI->getSuccessor(0);
BasicBlock *SinglePred = Succ->getSinglePredecessor();
if (!SinglePred) continue; // Nothing to do.
assert(SinglePred == Pred && "CFG broken");
- LLVM_DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- "
- << Succ->getName() << "\n");
-
- // Resolve any single entry PHI nodes in Succ.
- while (PHINode *PN = dyn_cast<PHINode>(Succ->begin()))
- ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM,
- MSSAU.get());
-
- // If Succ has any successors with PHI nodes, update them to have
- // entries coming from Pred instead of Succ.
- Succ->replaceAllUsesWith(Pred);
-
- // Move all of the successor contents from Succ to Pred.
- Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(),
- Succ->begin(), Succ->end());
- if (MSSAU)
- MSSAU->moveAllAfterMergeBlocks(Succ, Pred, BI);
+ // Make the LPM and Worklist updates specific to LoopUnswitch.
LPM->deleteSimpleAnalysisValue(BI, L);
RemoveFromWorklist(BI, Worklist);
- BI->eraseFromParent();
-
- // Remove Succ from the loop tree.
- LI->removeBlock(Succ);
LPM->deleteSimpleAnalysisValue(Succ, L);
- Succ->eraseFromParent();
+ auto SuccIt = Succ->begin();
+ while (PHINode *PN = dyn_cast<PHINode>(SuccIt++)) {
+ for (unsigned It = 0, E = PN->getNumOperands(); It != E; ++It)
+ if (Instruction *Use = dyn_cast<Instruction>(PN->getOperand(It)))
+ Worklist.push_back(Use);
+ for (User *U : PN->users())
+ Worklist.push_back(cast<Instruction>(U));
+ LPM->deleteSimpleAnalysisValue(PN, L);
+ RemoveFromWorklist(PN, Worklist);
+ ++NumSimplify;
+ }
+ // Merge the block and make the remaining analyses updates.
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ MergeBlockIntoPredecessor(Succ, &DTU, LI, MSSAU.get());
++NumSimplify;
continue;
}
diff --git a/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 896dd8bcb922..2ccb7cae3079 100644
--- a/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -112,37 +112,6 @@ static cl::opt<unsigned> LVLoopDepthThreshold(
"LoopVersioningLICM's threshold for maximum allowed loop nest/depth"),
cl::init(2), cl::Hidden);
-/// Create MDNode for input string.
-static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- Metadata *MDs[] = {
- MDString::get(Context, Name),
- ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
- return MDNode::get(Context, MDs);
-}
-
-/// Set input string into loop metadata by keeping other values intact.
-void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
- unsigned V) {
- SmallVector<Metadata *, 4> MDs(1);
- // If the loop already has metadata, retain it.
- MDNode *LoopID = TheLoop->getLoopID();
- if (LoopID) {
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
- MDs.push_back(Node);
- }
- }
- // Add new metadata.
- MDs.push_back(createStringMetadata(TheLoop, MDString, V));
- // Replace current metadata node with new one.
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- TheLoop->setLoopID(NewLoopID);
-}
-
namespace {
struct LoopVersioningLICM : public LoopPass {
diff --git a/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
new file mode 100644
index 000000000000..d0fcf38b5a7b
--- /dev/null
+++ b/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
@@ -0,0 +1,170 @@
+//===- LowerConstantIntrinsics.cpp - Lower constant intrinsic calls -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers all remaining 'objectsize' and 'is.constant' intrinsic
+// calls and provides constant propagation and basic CFG cleanup on the
+// result.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "lower-is-constant-intrinsic"
+
+STATISTIC(IsConstantIntrinsicsHandled,
+ "Number of 'is.constant' intrinsic calls handled");
+STATISTIC(ObjectSizeIntrinsicsHandled,
+ "Number of 'objectsize' intrinsic calls handled");
+
+static Value *lowerIsConstantIntrinsic(IntrinsicInst *II) {
+ Value *Op = II->getOperand(0);
+
+ return isa<Constant>(Op) ? ConstantInt::getTrue(II->getType())
+ : ConstantInt::getFalse(II->getType());
+}
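+
+// For illustration: a call such as
+//   %c = call i1 @llvm.is.constant.i32(i32 %x)
+// is lowered to 'i1 true' when %x is a Constant and to 'i1 false' otherwise;
+// the driver below then propagates the resulting constant into %c's users.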
+
+static bool replaceConditionalBranchesOnConstant(Instruction *II,
+ Value *NewValue) {
+ bool HasDeadBlocks = false;
+ SmallSetVector<Instruction *, 8> Worklist;
+ replaceAndRecursivelySimplify(II, NewValue, nullptr, nullptr, nullptr,
+ &Worklist);
+ for (auto I : Worklist) {
+ BranchInst *BI = dyn_cast<BranchInst>(I);
+ if (!BI)
+ continue;
+ if (BI->isUnconditional())
+ continue;
+
+ BasicBlock *Target, *Other;
+ if (match(BI->getOperand(0), m_Zero())) {
+ Target = BI->getSuccessor(1);
+ Other = BI->getSuccessor(0);
+ } else if (match(BI->getOperand(0), m_One())) {
+ Target = BI->getSuccessor(0);
+ Other = BI->getSuccessor(1);
+ } else {
+ Target = nullptr;
+ Other = nullptr;
+ }
+ if (Target && Target != Other) {
+ BasicBlock *Source = BI->getParent();
+ Other->removePredecessor(Source);
+ BI->eraseFromParent();
+ BranchInst::Create(Target, Source);
+ if (pred_begin(Other) == pred_end(Other))
+ HasDeadBlocks = true;
+ }
+ }
+ return HasDeadBlocks;
+}
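+
+// For illustration: once %c above has been replaced by 'i1 false', a user
+// such as
+//   br i1 %c, label %then, label %else
+// becomes 'br label %else'; if %then is left without predecessors, dead
+// blocks are reported so the driver below can run removeUnreachableBlocks().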
+
+static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo *TLI) {
+ bool HasDeadBlocks = false;
+ const auto &DL = F.getParent()->getDataLayout();
+ SmallVector<WeakTrackingVH, 8> Worklist;
+
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ for (BasicBlock *BB : RPOT) {
+ for (Instruction &I: *BB) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ continue;
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::is_constant:
+ case Intrinsic::objectsize:
+ Worklist.push_back(WeakTrackingVH(&I));
+ break;
+ }
+ }
+ }
+ for (WeakTrackingVH &VH: Worklist) {
+ // Items on the worklist can be mutated by earlier recursive replaces.
+ // This can remove the intrinsic as dead (VH == null), but also replace
+ // the intrinsic in place.
+ if (!VH)
+ continue;
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*VH);
+ if (!II)
+ continue;
+ Value *NewValue;
+ switch (II->getIntrinsicID()) {
+ default:
+ continue;
+ case Intrinsic::is_constant:
+ NewValue = lowerIsConstantIntrinsic(II);
+ IsConstantIntrinsicsHandled++;
+ break;
+ case Intrinsic::objectsize:
+ NewValue = lowerObjectSizeCall(II, DL, TLI, true);
+ ObjectSizeIntrinsicsHandled++;
+ break;
+ }
+ HasDeadBlocks |= replaceConditionalBranchesOnConstant(II, NewValue);
+ }
+ if (HasDeadBlocks)
+ removeUnreachableBlocks(F);
+ return !Worklist.empty();
+}
+
+PreservedAnalyses
+LowerConstantIntrinsicsPass::run(Function &F, FunctionAnalysisManager &AM) {
+ if (lowerConstantIntrinsics(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+/// Legacy pass for lowering is.constant intrinsics out of the IR.
+///
+/// When this pass is run over a function it converts is.constant intrinsics
+/// into 'true' or 'false'. This complements the normal constant folding
+/// to 'true' done as part of the Instruction Simplify passes.
+class LowerConstantIntrinsics : public FunctionPass {
+public:
+ static char ID;
+ LowerConstantIntrinsics() : FunctionPass(ID) {
+ initializeLowerConstantIntrinsicsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+ return lowerConstantIntrinsics(F, TLI);
+ }
+};
+} // namespace
+
+char LowerConstantIntrinsics::ID = 0;
+INITIALIZE_PASS(LowerConstantIntrinsics, "lower-constant-intrinsics",
+ "Lower constant intrinsics", false, false)
+
+FunctionPass *llvm::createLowerConstantIntrinsicsPass() {
+ return new LowerConstantIntrinsics();
+}
diff --git a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 0d67c0d740ec..d85f20b3f80c 100644
--- a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -26,6 +26,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/MisExpect.h"
using namespace llvm;
@@ -71,15 +72,20 @@ static bool handleSwitchExpect(SwitchInst &SI) {
unsigned n = SI.getNumCases(); // +1 for default case.
SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight);
- if (Case == *SI.case_default())
- Weights[0] = LikelyBranchWeight;
- else
- Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight;
+ uint64_t Index = (Case == *SI.case_default()) ? 0 : Case.getCaseIndex() + 1;
+ Weights[Index] = LikelyBranchWeight;
+
+ SI.setMetadata(
+ LLVMContext::MD_misexpect,
+ MDBuilder(CI->getContext())
+ .createMisExpect(Index, LikelyBranchWeight, UnlikelyBranchWeight));
+
+ SI.setCondition(ArgValue);
+ misexpect::checkFrontendInstrumentation(SI);
SI.setMetadata(LLVMContext::MD_prof,
MDBuilder(CI->getContext()).createBranchWeights(Weights));
- SI.setCondition(ArgValue);
return true;
}
@@ -155,7 +161,7 @@ static void handlePhiDef(CallInst *Expect) {
return Result;
};
- auto *PhiDef = dyn_cast<PHINode>(V);
+ auto *PhiDef = cast<PHINode>(V);
// Get the first dominating conditional branch of the operand
// i's incoming block.
@@ -280,19 +286,28 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
MDBuilder MDB(CI->getContext());
MDNode *Node;
+ MDNode *ExpNode;
if ((ExpectedValue->getZExtValue() == ValueComparedTo) ==
- (Predicate == CmpInst::ICMP_EQ))
+ (Predicate == CmpInst::ICMP_EQ)) {
Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight);
- else
+ ExpNode = MDB.createMisExpect(0, LikelyBranchWeight, UnlikelyBranchWeight);
+ } else {
Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight);
+ ExpNode = MDB.createMisExpect(1, LikelyBranchWeight, UnlikelyBranchWeight);
+ }
- BSI.setMetadata(LLVMContext::MD_prof, Node);
+ BSI.setMetadata(LLVMContext::MD_misexpect, ExpNode);
if (CmpI)
CmpI->setOperand(0, ArgValue);
else
BSI.setCondition(ArgValue);
+
+ misexpect::checkFrontendInstrumentation(BSI);
+
+ BSI.setMetadata(LLVMContext::MD_prof, Node);
+
return true;
}
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 5a055139be4f..2364748efb05 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -69,90 +69,6 @@ STATISTIC(NumMemSetInfer, "Number of memsets inferred");
STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
-static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,
- bool &VariableIdxFound,
- const DataLayout &DL) {
- // Skip over the first indices.
- gep_type_iterator GTI = gep_type_begin(GEP);
- for (unsigned i = 1; i != Idx; ++i, ++GTI)
- /*skip along*/;
-
- // Compute the offset implied by the rest of the indices.
- int64_t Offset = 0;
- for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
- ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i));
- if (!OpC)
- return VariableIdxFound = true;
- if (OpC->isZero()) continue; // No offset.
-
- // Handle struct indices, which add their field offset to the pointer.
- if (StructType *STy = GTI.getStructTypeOrNull()) {
- Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
- continue;
- }
-
- // Otherwise, we have a sequential type like an array or vector. Multiply
- // the index by the ElementSize.
- uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
- Offset += Size*OpC->getSExtValue();
- }
-
- return Offset;
-}
-
-/// Return true if Ptr1 is provably equal to Ptr2 plus a constant offset, and
-/// return that constant offset. For example, Ptr1 might be &A[42], and Ptr2
-/// might be &A[40]. In this case offset would be -8.
-static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
- const DataLayout &DL) {
- Ptr1 = Ptr1->stripPointerCasts();
- Ptr2 = Ptr2->stripPointerCasts();
-
- // Handle the trivial case first.
- if (Ptr1 == Ptr2) {
- Offset = 0;
- return true;
- }
-
- GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1);
- GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2);
-
- bool VariableIdxFound = false;
-
- // If one pointer is a GEP and the other isn't, then see if the GEP is a
- // constant offset from the base, as in "P" and "gep P, 1".
- if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) {
- Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, DL);
- return !VariableIdxFound;
- }
-
- if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) {
- Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, DL);
- return !VariableIdxFound;
- }
-
- // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical
- // base. After that base, they may have some number of common (and
- // potentially variable) indices. After that they handle some constant
- // offset, which determines their offset from each other. At this point, we
- // handle no other case.
- if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0))
- return false;
-
- // Skip any common indices and track the GEP types.
- unsigned Idx = 1;
- for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx)
- if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx))
- break;
-
- int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, DL);
- int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, DL);
- if (VariableIdxFound) return false;
-
- Offset = Offset2-Offset1;
- return true;
-}
-
namespace {
/// Represents a range of memset'd bytes with the ByteVal value.
@@ -419,12 +335,12 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
break;
// Check to see if this store is to a constant offset from the start ptr.
- int64_t Offset;
- if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset,
- DL))
+ Optional<int64_t> Offset =
+ isPointerOffset(StartPtr, NextStore->getPointerOperand(), DL);
+ if (!Offset)
break;
- Ranges.addStore(Offset, NextStore);
+ Ranges.addStore(*Offset, NextStore);
} else {
MemSetInst *MSI = cast<MemSetInst>(BI);
@@ -433,11 +349,11 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
break;
// Check to see if this store is to a constant offset from the start ptr.
- int64_t Offset;
- if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, DL))
+ Optional<int64_t> Offset = isPointerOffset(StartPtr, MSI->getDest(), DL);
+ if (!Offset)
break;
- Ranges.addMemSet(Offset, MSI);
+ Ranges.addMemSet(*Offset, MSI);
}
}
@@ -597,9 +513,13 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
ToLift.push_back(C);
for (unsigned k = 0, e = C->getNumOperands(); k != e; ++k)
- if (auto *A = dyn_cast<Instruction>(C->getOperand(k)))
- if (A->getParent() == SI->getParent())
+ if (auto *A = dyn_cast<Instruction>(C->getOperand(k))) {
+ if (A->getParent() == SI->getParent()) {
+ // Cannot hoist user of P above P
+ if(A == P) return false;
Args.insert(A);
+ }
+ }
}
// We made it, we need to lift
@@ -979,7 +899,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
// If the destination wasn't sufficiently aligned then increase its alignment.
if (!isDestSufficientlyAligned) {
assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
- cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
+ cast<AllocaInst>(cpyDest)->setAlignment(MaybeAlign(srcAlign));
}
// Drop any cached information about the call, because we may have changed
@@ -1516,7 +1436,7 @@ bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
return false;
auto *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
- auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto LookupAliasAnalysis = [this]() -> AliasAnalysis & {
return getAnalysis<AAResultsWrapperPass>().getAAResults();
diff --git a/lib/Transforms/Scalar/MergeICmps.cpp b/lib/Transforms/Scalar/MergeICmps.cpp
index 3d047a193267..98a45b391319 100644
--- a/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/lib/Transforms/Scalar/MergeICmps.cpp
@@ -897,7 +897,7 @@ public:
bool runOnFunction(Function &F) override {
if (skipFunction(F)) return false;
- const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
// MergeICmps does not need the DominatorTree, but we update it if it's
// already available.
diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 30645f4400e3..9799ea7960ec 100644
--- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -14,9 +14,11 @@
// diamond (hammock) and merges them into a single load in the header. Similar
// it sinks and merges two stores to the tail block (footer). The algorithm
// iterates over the instructions of one side of the diamond and attempts to
-// find a matching load/store on the other side. It hoists / sinks when it
-// thinks it safe to do so. This optimization helps with eg. hiding load
-// latencies, triggering if-conversion, and reducing static code size.
+// find a matching load/store on the other side. A new tail/footer block may
+// be inserted if the tail/footer block has more predecessors (not only the
+// two predecessors that form the diamond). It hoists / sinks when it thinks
+// it is safe to do so. This optimization helps with e.g. hiding load
+// latencies, triggering if-conversion, and reducing static code size.
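+//
+// For illustration, the canonical diamond handled here looks like:
+//   if.then:  %a0 = getelementptr %p, %i ; store %v0, %a0
+//   if.else:  %a1 = getelementptr %p, %i ; store %v1, %a1  (identical GEP)
+//   footer:   the two stores and a single copy of the GEP are sunk here,
+//             with a PHI of %v0/%v1 supplying the stored value.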
//
// NOTE: This code no longer performs load hoisting, it is subsumed by GVNHoist.
//
@@ -103,7 +105,9 @@ class MergedLoadStoreMotion {
// Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl.
const int MagicCompileTimeControl = 250;
+ const bool SplitFooterBB;
public:
+ MergedLoadStoreMotion(bool SplitFooterBB) : SplitFooterBB(SplitFooterBB) {}
bool run(Function &F, AliasAnalysis &AA);
private:
@@ -114,7 +118,9 @@ private:
PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
bool isStoreSinkBarrierInRange(const Instruction &Start,
const Instruction &End, MemoryLocation Loc);
- bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst);
+ bool canSinkStoresAndGEPs(StoreInst *S0, StoreInst *S1) const;
+ void sinkStoresAndGEPs(BasicBlock *BB, StoreInst *SinkCand,
+ StoreInst *ElseInst);
bool mergeStores(BasicBlock *BB);
};
} // end anonymous namespace
@@ -217,74 +223,82 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
}
///
+/// Check if 2 stores can be sunk together with corresponding GEPs
+///
+bool MergedLoadStoreMotion::canSinkStoresAndGEPs(StoreInst *S0,
+ StoreInst *S1) const {
+ auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
+ auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
+ return A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
+ (A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
+ (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0);
+}
+
+///
/// Merge two stores to same address and sink into \p BB
///
/// Also sinks GEP instruction computing the store address
///
-bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
- StoreInst *S1) {
+void MergedLoadStoreMotion::sinkStoresAndGEPs(BasicBlock *BB, StoreInst *S0,
+ StoreInst *S1) {
// Only one definition?
auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
- if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
- (A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
- (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) {
- LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
- // Hoist the instruction.
- BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
- // Intersect optional metadata.
- S0->andIRFlags(S1);
- S0->dropUnknownNonDebugMetadata();
-
- // Create the new store to be inserted at the join point.
- StoreInst *SNew = cast<StoreInst>(S0->clone());
- Instruction *ANew = A0->clone();
- SNew->insertBefore(&*InsertPt);
- ANew->insertBefore(SNew);
-
- assert(S0->getParent() == A0->getParent());
- assert(S1->getParent() == A1->getParent());
-
- // New PHI operand? Use it.
- if (PHINode *NewPN = getPHIOperand(BB, S0, S1))
- SNew->setOperand(0, NewPN);
- S0->eraseFromParent();
- S1->eraseFromParent();
- A0->replaceAllUsesWith(ANew);
- A0->eraseFromParent();
- A1->replaceAllUsesWith(ANew);
- A1->eraseFromParent();
- return true;
- }
- return false;
+ LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
+ dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
+ dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
+ // Hoist the instruction.
+ BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
+ // Intersect optional metadata.
+ S0->andIRFlags(S1);
+ S0->dropUnknownNonDebugMetadata();
+
+ // Create the new store to be inserted at the join point.
+ StoreInst *SNew = cast<StoreInst>(S0->clone());
+ Instruction *ANew = A0->clone();
+ SNew->insertBefore(&*InsertPt);
+ ANew->insertBefore(SNew);
+
+ assert(S0->getParent() == A0->getParent());
+ assert(S1->getParent() == A1->getParent());
+
+ // New PHI operand? Use it.
+ if (PHINode *NewPN = getPHIOperand(BB, S0, S1))
+ SNew->setOperand(0, NewPN);
+ S0->eraseFromParent();
+ S1->eraseFromParent();
+ A0->replaceAllUsesWith(ANew);
+ A0->eraseFromParent();
+ A1->replaceAllUsesWith(ANew);
+ A1->eraseFromParent();
}
///
/// True when two stores are equivalent and can sink into the footer
///
-/// Starting from a diamond tail block, iterate over the instructions in one
-/// predecessor block and try to match a store in the second predecessor.
+/// Starting from a diamond head block, iterate over the instructions in one
+/// successor block and try to match a store in the second successor.
///
-bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
+bool MergedLoadStoreMotion::mergeStores(BasicBlock *HeadBB) {
bool MergedStores = false;
- assert(T && "Footer of a diamond cannot be empty");
-
- pred_iterator PI = pred_begin(T), E = pred_end(T);
- assert(PI != E);
- BasicBlock *Pred0 = *PI;
- ++PI;
- BasicBlock *Pred1 = *PI;
- ++PI;
+ BasicBlock *TailBB = getDiamondTail(HeadBB);
+ BasicBlock *SinkBB = TailBB;
+ assert(SinkBB && "Footer of a diamond cannot be empty");
+
+ succ_iterator SI = succ_begin(HeadBB);
+ assert(SI != succ_end(HeadBB) && "Diamond head cannot have zero successors");
+ BasicBlock *Pred0 = *SI;
+ ++SI;
+ assert(SI != succ_end(HeadBB) && "Diamond head cannot have single successor");
+ BasicBlock *Pred1 = *SI;
// tail block of a diamond/hammock?
if (Pred0 == Pred1)
return false; // No.
- if (PI != E)
- return false; // No. More than 2 predecessors.
-
- // #Instructions in Succ1 for Compile Time Control
+ // Bail out early if we cannot merge into the footer BB.
+ if (!SplitFooterBB && TailBB->hasNPredecessorsOrMore(3))
+ return false;
+ // #Instructions in Pred1 for Compile Time Control
auto InstsNoDbg = Pred1->instructionsWithoutDebug();
int Size1 = std::distance(InstsNoDbg.begin(), InstsNoDbg.end());
int NStores = 0;
@@ -304,14 +318,23 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
if (NStores * Size1 >= MagicCompileTimeControl)
break;
if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) {
- bool Res = sinkStore(T, S0, S1);
- MergedStores |= Res;
- // Don't attempt to sink below stores that had to stick around
- // But after removal of a store and some of its feeding
- // instruction search again from the beginning since the iterator
- // is likely stale at this point.
- if (!Res)
+ if (!canSinkStoresAndGEPs(S0, S1))
+ // Don't attempt to sink below stores that had to stick around.
+ // But after removal of a store and some of its feeding
+ // instructions, search again from the beginning since the iterator
+ // is likely stale at this point.
break;
+
+ if (SinkBB == TailBB && TailBB->hasNPredecessorsOrMore(3)) {
+ // We have more than 2 predecessors. Insert a new block
+ // postdominating the two predecessors we're going to sink from.
+ SinkBB = SplitBlockPredecessors(TailBB, {Pred0, Pred1}, ".sink.split");
+ if (!SinkBB)
+ break;
+ }
+
+ MergedStores = true;
+ sinkStoresAndGEPs(SinkBB, S0, S1);
RBI = Pred0->rbegin();
RBE = Pred0->rend();
LLVM_DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
@@ -328,13 +351,15 @@ bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) {
// Merge unconditional branches, allowing PRE to catch more
// optimization opportunities.
+ // This loop doesn't care about newly inserted/split blocks
+ // since they will never be diamond heads.
for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) {
BasicBlock *BB = &*FI++;
// Hoist equivalent loads and sink stores
// outside diamonds when possible
if (isDiamondHead(BB)) {
- Changed |= mergeStores(getDiamondTail(BB));
+ Changed |= mergeStores(BB);
}
}
return Changed;
@@ -342,9 +367,11 @@ bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) {
namespace {
class MergedLoadStoreMotionLegacyPass : public FunctionPass {
+ const bool SplitFooterBB;
public:
static char ID; // Pass identification, replacement for typeid
- MergedLoadStoreMotionLegacyPass() : FunctionPass(ID) {
+ MergedLoadStoreMotionLegacyPass(bool SplitFooterBB = false)
+ : FunctionPass(ID), SplitFooterBB(SplitFooterBB) {
initializeMergedLoadStoreMotionLegacyPassPass(
*PassRegistry::getPassRegistry());
}
@@ -355,13 +382,14 @@ public:
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
- MergedLoadStoreMotion Impl;
+ MergedLoadStoreMotion Impl(SplitFooterBB);
return Impl.run(F, getAnalysis<AAResultsWrapperPass>().getAAResults());
}
private:
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
+ if (!SplitFooterBB)
+ AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
@@ -373,8 +401,8 @@ char MergedLoadStoreMotionLegacyPass::ID = 0;
///
/// createMergedLoadStoreMotionPass - The public interface to this file.
///
-FunctionPass *llvm::createMergedLoadStoreMotionPass() {
- return new MergedLoadStoreMotionLegacyPass();
+FunctionPass *llvm::createMergedLoadStoreMotionPass(bool SplitFooterBB) {
+ return new MergedLoadStoreMotionLegacyPass(SplitFooterBB);
}
INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion",
@@ -385,13 +413,14 @@ INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion",
PreservedAnalyses
MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
- MergedLoadStoreMotion Impl;
+ MergedLoadStoreMotion Impl(Options.SplitFooterBB);
auto &AA = AM.getResult<AAManager>(F);
if (!Impl.run(F, AA))
return PreservedAnalyses::all();
PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
+ if (!Options.SplitFooterBB)
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
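A hedged sketch of how the new SplitFooterBB option is threaded to the pass from a pipeline: the legacy factory now takes a bool and the new-PM pass reads Options.SplitFooterBB; when splitting is enabled the pass may insert ".sink.split" blocks and therefore stops advertising CFG preservation. The setup function below is an assumption for illustration only.

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Transforms/Scalar.h"

    // Sketch only: schedule mldst-motion with footer splitting enabled.
    void addMergedLoadStoreMotion(llvm::legacy::PassManagerBase &PM) {
      PM.add(llvm::createMergedLoadStoreMotionPass(/*SplitFooterBB=*/true));
    }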
diff --git a/lib/Transforms/Scalar/NaryReassociate.cpp b/lib/Transforms/Scalar/NaryReassociate.cpp
index 94436b55752a..1260bd39cdee 100644
--- a/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -170,7 +170,7 @@ bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
return Impl.runImpl(F, AC, DT, SE, TLI, TTI);
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 08ac2b666fce..b213264de557 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -89,6 +89,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
@@ -122,6 +123,7 @@
using namespace llvm;
using namespace llvm::GVNExpression;
using namespace llvm::VNCoercion;
+using namespace llvm::PatternMatch;
#define DEBUG_TYPE "newgvn"
@@ -656,7 +658,7 @@ public:
TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
const DataLayout &DL)
: F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL),
- PredInfo(make_unique<PredicateInfo>(F, *DT, *AC)),
+ PredInfo(std::make_unique<PredicateInfo>(F, *DT, *AC)),
SQ(DL, TLI, DT, AC, /*CtxI=*/nullptr, /*UseInstrInfo=*/false) {}
bool runGVN();
@@ -1332,7 +1334,7 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
E->setOpcode(0);
E->op_push_back(PointerOp);
if (LI)
- E->setAlignment(LI->getAlignment());
+ E->setAlignment(MaybeAlign(LI->getAlignment()));
// TODO: Value number heap versions. We may be able to discover
// things alias analysis can't on it's own (IE that a store and a
@@ -1637,8 +1639,11 @@ const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
if (AA->doesNotAccessMemory(CI)) {
return createCallExpression(CI, TOPClass->getMemoryLeader());
} else if (AA->onlyReadsMemory(CI)) {
- MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(CI);
- return createCallExpression(CI, DefiningAccess);
+ if (auto *MA = MSSA->getMemoryAccess(CI)) {
+ auto *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(MA);
+ return createCallExpression(CI, DefiningAccess);
+ } else // MSSA determined that CI does not access memory.
+ return createCallExpression(CI, TOPClass->getMemoryLeader());
}
return nullptr;
}
@@ -1754,7 +1759,7 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
return true;
});
// If we are left with no operands, it's dead.
- if (empty(Filtered)) {
+ if (Filtered.empty()) {
// If it has undef at this point, it means there are no-non-undef arguments,
// and thus, the value of the phi node must be undef.
if (HasUndef) {
@@ -2464,9 +2469,9 @@ Value *NewGVN::findConditionEquivalence(Value *Cond) const {
// Process the outgoing edges of a block for reachability.
void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) {
// Evaluate reachability of terminator instruction.
- BranchInst *BR;
- if ((BR = dyn_cast<BranchInst>(TI)) && BR->isConditional()) {
- Value *Cond = BR->getCondition();
+ Value *Cond;
+ BasicBlock *TrueSucc, *FalseSucc;
+ if (match(TI, m_Br(m_Value(Cond), TrueSucc, FalseSucc))) {
Value *CondEvaluated = findConditionEquivalence(Cond);
if (!CondEvaluated) {
if (auto *I = dyn_cast<Instruction>(Cond)) {
@@ -2479,8 +2484,6 @@ void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) {
}
}
ConstantInt *CI;
- BasicBlock *TrueSucc = BR->getSuccessor(0);
- BasicBlock *FalseSucc = BR->getSuccessor(1);
if (CondEvaluated && (CI = dyn_cast<ConstantInt>(CondEvaluated))) {
if (CI->isOne()) {
LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI
@@ -4196,7 +4199,7 @@ bool NewGVNLegacyPass::runOnFunction(Function &F) {
return false;
return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F),
&getAnalysis<AAResultsWrapperPass>().getAAResults(),
&getAnalysis<MemorySSAWrapperPass>().getMSSA(),
F.getParent()->getDataLayout())
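A small standalone sketch of the PatternMatch idiom adopted in processOutgoingEdges above: m_Br binds the branch condition and both successors in a single match, so the separate dyn_cast<BranchInst> and getSuccessor() calls are no longer needed. The helper name below is hypothetical.

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;
    using namespace llvm::PatternMatch;

    // Sketch: for a conditional branch, return its condition and bind the
    // successors; for any other terminator, return nullptr.
    static Value *matchCondBranch(Instruction *TI, BasicBlock *&TrueSucc,
                                  BasicBlock *&FalseSucc) {
      Value *Cond;
      if (match(TI, m_Br(m_Value(Cond), TrueSucc, FalseSucc)))
        return Cond;
      return nullptr;
    }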
diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 039123218544..68a0f5151ad5 100644
--- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -161,7 +161,7 @@ public:
return false;
TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
const TargetTransformInfo *TTI =
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
return runPartiallyInlineLibCalls(F, TLI, TTI);
diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp
index b544f0a39ea8..beb299272ed8 100644
--- a/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -131,7 +131,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass {
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
for (Loop *I : *LI) {
runOnLoopAndSubLoops(I);
}
@@ -240,7 +240,7 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,
static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,
BasicBlock *Pred) {
// A conservative bound on the loop as a whole.
- const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L);
+ const SCEV *MaxTrips = SE->getConstantMaxBackedgeTakenCount(L);
if (MaxTrips != SE->getCouldNotCompute() &&
SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(
CountedLoopTripWidth))
@@ -478,7 +478,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) {
return false;
const TargetLibraryInfo &TLI =
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
bool Modified = false;
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index fa8c9e2a5fe4..124f625ef7b6 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -861,7 +861,7 @@ static Value *NegateValue(Value *V, Instruction *BI,
// this use. We do this by moving it to the entry block (if it is a
// non-instruction value) or right after the definition. These negates will
// be zapped by reassociate later, so we don't need much finesse here.
- BinaryOperator *TheNeg = cast<BinaryOperator>(U);
+ Instruction *TheNeg = cast<Instruction>(U);
// Verify that the negate is in this function, V might be a constant expr.
if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
@@ -1938,88 +1938,132 @@ void ReassociatePass::EraseInst(Instruction *I) {
MadeChange = true;
}
-// Canonicalize expressions of the following form:
-// x + (-Constant * y) -> x - (Constant * y)
-// x - (-Constant * y) -> x + (Constant * y)
-Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) {
- if (!I->hasOneUse() || I->getType()->isVectorTy())
- return nullptr;
-
- // Must be a fmul or fdiv instruction.
- unsigned Opcode = I->getOpcode();
- if (Opcode != Instruction::FMul && Opcode != Instruction::FDiv)
- return nullptr;
-
- auto *C0 = dyn_cast<ConstantFP>(I->getOperand(0));
- auto *C1 = dyn_cast<ConstantFP>(I->getOperand(1));
-
- // Both operands are constant, let it get constant folded away.
- if (C0 && C1)
- return nullptr;
-
- ConstantFP *CF = C0 ? C0 : C1;
-
- // Must have one constant operand.
- if (!CF)
- return nullptr;
+/// Recursively analyze an expression to build a list of instructions that have
+/// negative floating-point constant operands. The caller can then transform
+/// the list to create positive constants for better reassociation and CSE.
+static void getNegatibleInsts(Value *V,
+ SmallVectorImpl<Instruction *> &Candidates) {
+ // Handle only one-use instructions. Combining negations does not justify
+ // replicating instructions.
+ Instruction *I;
+ if (!match(V, m_OneUse(m_Instruction(I))))
+ return;
- // Must be a negative ConstantFP.
- if (!CF->isNegative())
- return nullptr;
+ // Handle expressions of multiplications and divisions.
+ // TODO: This could look through floating-point casts.
+ const APFloat *C;
+ switch (I->getOpcode()) {
+ case Instruction::FMul:
+ // Not expecting non-canonical code here. Bail out and wait.
+ if (match(I->getOperand(0), m_Constant()))
+ break;
- // User must be a binary operator with one or more uses.
- Instruction *User = I->user_back();
- if (!isa<BinaryOperator>(User) || User->use_empty())
- return nullptr;
+ if (match(I->getOperand(1), m_APFloat(C)) && C->isNegative()) {
+ Candidates.push_back(I);
+ LLVM_DEBUG(dbgs() << "FMul with negative constant: " << *I << '\n');
+ }
+ getNegatibleInsts(I->getOperand(0), Candidates);
+ getNegatibleInsts(I->getOperand(1), Candidates);
+ break;
+ case Instruction::FDiv:
+ // Not expecting non-canonical code here. Bail out and wait.
+ if (match(I->getOperand(0), m_Constant()) &&
+ match(I->getOperand(1), m_Constant()))
+ break;
- unsigned UserOpcode = User->getOpcode();
- if (UserOpcode != Instruction::FAdd && UserOpcode != Instruction::FSub)
- return nullptr;
+ if ((match(I->getOperand(0), m_APFloat(C)) && C->isNegative()) ||
+ (match(I->getOperand(1), m_APFloat(C)) && C->isNegative())) {
+ Candidates.push_back(I);
+ LLVM_DEBUG(dbgs() << "FDiv with negative constant: " << *I << '\n');
+ }
+ getNegatibleInsts(I->getOperand(0), Candidates);
+ getNegatibleInsts(I->getOperand(1), Candidates);
+ break;
+ default:
+ break;
+ }
+}
- // Subtraction is not commutative. Explicitly, the following transform is
- // not valid: (-Constant * y) - x -> x + (Constant * y)
- if (!User->isCommutative() && User->getOperand(1) != I)
+/// Given an fadd/fsub with an operand that is a one-use instruction
+/// (the fadd/fsub), try to change negative floating-point constants into
+/// positive constants to increase potential for reassociation and CSE.
+Instruction *ReassociatePass::canonicalizeNegFPConstantsForOp(Instruction *I,
+ Instruction *Op,
+ Value *OtherOp) {
+ assert((I->getOpcode() == Instruction::FAdd ||
+ I->getOpcode() == Instruction::FSub) && "Expected fadd/fsub");
+
+ // Collect instructions with negative FP constants from the subtree that ends
+ // in Op.
+ SmallVector<Instruction *, 4> Candidates;
+ getNegatibleInsts(Op, Candidates);
+ if (Candidates.empty())
return nullptr;
// Don't canonicalize x + (-Constant * y) -> x - (Constant * y), if the
// resulting subtract will be broken up later. This can get us into an
// infinite loop during reassociation.
- if (UserOpcode == Instruction::FAdd && ShouldBreakUpSubtract(User))
+ bool IsFSub = I->getOpcode() == Instruction::FSub;
+ bool NeedsSubtract = !IsFSub && Candidates.size() % 2 == 1;
+ if (NeedsSubtract && ShouldBreakUpSubtract(I))
return nullptr;
- // Change the sign of the constant.
- APFloat Val = CF->getValueAPF();
- Val.changeSign();
- I->setOperand(C0 ? 0 : 1, ConstantFP::get(CF->getContext(), Val));
-
- // Canonicalize I to RHS to simplify the next bit of logic. E.g.,
- // ((-Const*y) + x) -> (x + (-Const*y)).
- if (User->getOperand(0) == I && User->isCommutative())
- cast<BinaryOperator>(User)->swapOperands();
-
- Value *Op0 = User->getOperand(0);
- Value *Op1 = User->getOperand(1);
- BinaryOperator *NI;
- switch (UserOpcode) {
- default:
- llvm_unreachable("Unexpected Opcode!");
- case Instruction::FAdd:
- NI = BinaryOperator::CreateFSub(Op0, Op1);
- NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
- break;
- case Instruction::FSub:
- NI = BinaryOperator::CreateFAdd(Op0, Op1);
- NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
- break;
+ for (Instruction *Negatible : Candidates) {
+ const APFloat *C;
+ if (match(Negatible->getOperand(0), m_APFloat(C))) {
+ assert(!match(Negatible->getOperand(1), m_Constant()) &&
+ "Expecting only 1 constant operand");
+ assert(C->isNegative() && "Expected negative FP constant");
+ Negatible->setOperand(0, ConstantFP::get(Negatible->getType(), abs(*C)));
+ MadeChange = true;
+ }
+ if (match(Negatible->getOperand(1), m_APFloat(C))) {
+ assert(!match(Negatible->getOperand(0), m_Constant()) &&
+ "Expecting only 1 constant operand");
+ assert(C->isNegative() && "Expected negative FP constant");
+ Negatible->setOperand(1, ConstantFP::get(Negatible->getType(), abs(*C)));
+ MadeChange = true;
+ }
}
+ assert(MadeChange == true && "Negative constant candidate was not changed");
- NI->insertBefore(User);
- NI->setName(User->getName());
- User->replaceAllUsesWith(NI);
- NI->setDebugLoc(I->getDebugLoc());
+ // Negations cancelled out.
+ if (Candidates.size() % 2 == 0)
+ return I;
+
+ // Negate the final operand in the expression by flipping the opcode of this
+ // fadd/fsub.
+ assert(Candidates.size() % 2 == 1 && "Expected odd number");
+ IRBuilder<> Builder(I);
+ Value *NewInst = IsFSub ? Builder.CreateFAddFMF(OtherOp, Op, I)
+ : Builder.CreateFSubFMF(OtherOp, Op, I);
+ I->replaceAllUsesWith(NewInst);
RedoInsts.insert(I);
- MadeChange = true;
- return NI;
+ return dyn_cast<Instruction>(NewInst);
+}
+
+/// Canonicalize expressions that contain a negative floating-point constant
+/// of the following form:
+/// OtherOp + (subtree) -> OtherOp {+/-} (canonical subtree)
+/// (subtree) + OtherOp -> OtherOp {+/-} (canonical subtree)
+/// OtherOp - (subtree) -> OtherOp {+/-} (canonical subtree)
+///
+/// The fadd/fsub opcode may be switched to allow folding a negation into the
+/// input instruction.
+Instruction *ReassociatePass::canonicalizeNegFPConstants(Instruction *I) {
+ LLVM_DEBUG(dbgs() << "Combine negations for: " << *I << '\n');
+ Value *X;
+ Instruction *Op;
+ if (match(I, m_FAdd(m_Value(X), m_OneUse(m_Instruction(Op)))))
+ if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X))
+ I = R;
+ if (match(I, m_FAdd(m_OneUse(m_Instruction(Op)), m_Value(X))))
+ if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X))
+ I = R;
+ if (match(I, m_FSub(m_Value(X), m_OneUse(m_Instruction(Op)))))
+ if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X))
+ I = R;
+ return I;
}
/// Inspect and optimize the given instruction. Note that erasing
@@ -2042,16 +2086,16 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
I = NI;
}
- // Canonicalize negative constants out of expressions.
- if (Instruction *Res = canonicalizeNegConstExpr(I))
- I = Res;
-
// Commute binary operators, to canonicalize the order of their operands.
// This can potentially expose more CSE opportunities, and makes writing other
// transformations simpler.
if (I->isCommutative())
canonicalizeOperands(I);
+ // Canonicalize negative constants out of expressions.
+ if (Instruction *Res = canonicalizeNegFPConstants(I))
+ I = Res;
+
// Don't optimize floating-point instructions unless they are 'fast'.
if (I->getType()->isFPOrFPVectorTy() && !I->isFast())
return;
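A purely illustrative source-level view of the canonicalization implemented above (a sketch under fast-math assumptions, ignoring signed-zero and NaN corner cases): a subtree that multiplies or divides by a negative constant is rewritten to use the positive constant, and the enclosing fadd/fsub flips to compensate, exposing the constant for reassociation and CSE.

    // Hypothetical before/after pair; the values are equivalent under fast-math.
    float before(float x, float y) { return x + (-42.0f * y); }
    float after(float x, float y)  { return x - (42.0f * y); }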
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index c358258d24cf..48bbdd8d1b33 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -172,8 +172,6 @@ public:
bool runOnModule(Module &M) override {
bool Changed = false;
- const TargetLibraryInfo &TLI =
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
for (Function &F : M) {
// Nothing to do for declarations.
if (F.isDeclaration() || F.empty())
@@ -186,6 +184,8 @@ public:
TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
Changed |= Impl.runOnFunction(F, DT, TTI, TLI);
@@ -2530,7 +2530,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
// statepoints surviving this pass. This makes testing easier and the
// resulting IR less confusing to human readers.
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
- bool MadeChange = removeUnreachableBlocks(F, nullptr, &DTU);
+ bool MadeChange = removeUnreachableBlocks(F, &DTU);
// Flush the Dominator Tree.
DTU.getDomTree();
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 4093e50ce899..10fbdc8aacd2 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -191,7 +191,7 @@ public:
///
class SCCPSolver : public InstVisitor<SCCPSolver> {
const DataLayout &DL;
- const TargetLibraryInfo *TLI;
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI;
SmallPtrSet<BasicBlock *, 8> BBExecutable; // The BBs that are executable.
DenseMap<Value *, LatticeVal> ValueState; // The state each value is in.
// The state each parameter is in.
@@ -268,8 +268,9 @@ public:
return {A->second.DT, A->second.PDT, DomTreeUpdater::UpdateStrategy::Lazy};
}
- SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli)
- : DL(DL), TLI(tli) {}
+ SCCPSolver(const DataLayout &DL,
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI)
+ : DL(DL), GetTLI(std::move(GetTLI)) {}
/// MarkBlockExecutable - This method can be used by clients to mark all of
/// the blocks that are known to be intrinsically live in the processed unit.
@@ -1290,7 +1291,7 @@ CallOverdefined:
// If we can constant fold this, mark the result of the call as a
// constant.
if (Constant *C = ConstantFoldCall(cast<CallBase>(CS.getInstruction()), F,
- Operands, TLI)) {
+ Operands, &GetTLI(*F))) {
// call -> undef.
if (isa<UndefValue>(C))
return;
@@ -1465,7 +1466,24 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
}
LatticeVal &LV = getValueState(&I);
- if (!LV.isUnknown()) continue;
+ if (!LV.isUnknown())
+ continue;
+
+ // There are two reasons a call can have an undef result
+ // 1. It could be tracked.
+ // 2. It could be constant-foldable.
+ // Because of the way we solve return values, tracked calls must
+ // never be marked overdefined in ResolvedUndefsIn.
+ if (CallSite CS = CallSite(&I)) {
+ if (Function *F = CS.getCalledFunction())
+ if (TrackedRetVals.count(F))
+ continue;
+
+ // If the call is constant-foldable, we mark it overdefined because
+ // we do not know what return values are valid.
+ markOverdefined(&I);
+ return true;
+ }
// extractvalue is safe; check here because the argument is a struct.
if (isa<ExtractValueInst>(I))
@@ -1638,19 +1656,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
case Instruction::Call:
case Instruction::Invoke:
case Instruction::CallBr:
- // There are two reasons a call can have an undef result
- // 1. It could be tracked.
- // 2. It could be constant-foldable.
- // Because of the way we solve return values, tracked calls must
- // never be marked overdefined in ResolvedUndefsIn.
- if (Function *F = CallSite(&I).getCalledFunction())
- if (TrackedRetVals.count(F))
- break;
-
- // If the call is constant-foldable, we mark it overdefined because
- // we do not know what return values are valid.
- markOverdefined(&I);
- return true;
+ llvm_unreachable("Call-like instructions should have be handled early");
default:
// If we don't know what should happen here, conservatively mark it
// overdefined.
@@ -1751,7 +1757,7 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
[](const LatticeVal &LV) { return LV.isOverdefined(); }))
return false;
std::vector<Constant *> ConstVals;
- auto *ST = dyn_cast<StructType>(V->getType());
+ auto *ST = cast<StructType>(V->getType());
for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
LatticeVal V = IVs[i];
ConstVals.push_back(V.isConstant()
@@ -1796,7 +1802,8 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
static bool runSCCP(Function &F, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
- SCCPSolver Solver(DL, TLI);
+ SCCPSolver Solver(
+ DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; });
// Mark the first block of the function as being executable.
Solver.MarkBlockExecutable(&F.front());
@@ -1891,7 +1898,7 @@ public:
return false;
const DataLayout &DL = F.getParent()->getDataLayout();
const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
return runSCCP(F, DL, TLI);
}
};
@@ -1924,6 +1931,27 @@ static void findReturnsToZap(Function &F,
return;
}
+ assert(
+ all_of(F.users(),
+ [&Solver](User *U) {
+ if (isa<Instruction>(U) &&
+ !Solver.isBlockExecutable(cast<Instruction>(U)->getParent()))
+ return true;
+ // Non-callsite uses are not impacted by zapping. Also, constant
+ // uses (like blockaddresses) could stuck around, without being
+ // used in the underlying IR, meaning we do not have lattice
+ // values for them.
+ if (!CallSite(U))
+ return true;
+ if (U->getType()->isStructTy()) {
+ return all_of(
+ Solver.getStructLatticeValueFor(U),
+ [](const LatticeVal &LV) { return !LV.isOverdefined(); });
+ }
+ return !Solver.getLatticeValueFor(U).isOverdefined();
+ }) &&
+ "We can only zap functions where all live users have a concrete value");
+
for (BasicBlock &BB : F) {
if (CallInst *CI = BB.getTerminatingMustTailCall()) {
LLVM_DEBUG(dbgs() << "Can't zap return of the block due to present "
@@ -1974,9 +2002,10 @@ static void forceIndeterminateEdge(Instruction* I, SCCPSolver &Solver) {
}
bool llvm::runIPSCCP(
- Module &M, const DataLayout &DL, const TargetLibraryInfo *TLI,
+ Module &M, const DataLayout &DL,
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI,
function_ref<AnalysisResultsForFn(Function &)> getAnalysis) {
- SCCPSolver Solver(DL, TLI);
+ SCCPSolver Solver(DL, GetTLI);
// Loop over all functions, marking arguments to those with their addresses
// taken or that are external as overdefined.
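A hedged sketch of the callback that runIPSCCP now expects instead of a single TargetLibraryInfo pointer, mirroring the lambda used in runSCCP above; the adapter function and its FunctionAnalysisManager argument are assumptions about the caller's context, not taken from the patch.

    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/PassManager.h"

    // Sketch only: build a per-function TLI lookup for the IPSCCP driver.
    auto makeGetTLI(llvm::FunctionAnalysisManager &FAM) {
      return [&FAM](llvm::Function &F) -> const llvm::TargetLibraryInfo & {
        return FAM.getResult<llvm::TargetLibraryAnalysis>(F);
      };
    }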
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 33f90d0b01e4..74b8ff913050 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -959,14 +959,16 @@ private:
std::tie(UsedI, I) = Uses.pop_back_val();
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- Size = std::max(Size, DL.getTypeStoreSize(LI->getType()));
+ Size = std::max(Size,
+ DL.getTypeStoreSize(LI->getType()).getFixedSize());
continue;
}
if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
Value *Op = SI->getOperand(0);
if (Op == UsedI)
return SI;
- Size = std::max(Size, DL.getTypeStoreSize(Op->getType()));
+ Size = std::max(Size,
+ DL.getTypeStoreSize(Op->getType()).getFixedSize());
continue;
}
@@ -1197,7 +1199,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) {
// TODO: Allow recursive phi users.
// TODO: Allow stores.
BasicBlock *BB = PN.getParent();
- unsigned MaxAlign = 0;
+ MaybeAlign MaxAlign;
uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
APInt MaxSize(APWidth, 0);
bool HaveLoad = false;
@@ -1218,8 +1220,8 @@ static bool isSafePHIToSpeculate(PHINode &PN) {
if (BBI->mayWriteToMemory())
return false;
- uint64_t Size = DL.getTypeStoreSizeInBits(LI->getType());
- MaxAlign = std::max(MaxAlign, LI->getAlignment());
+ uint64_t Size = DL.getTypeStoreSize(LI->getType());
+ MaxAlign = std::max(MaxAlign, MaybeAlign(LI->getAlignment()));
MaxSize = MaxSize.ult(Size) ? APInt(APWidth, Size) : MaxSize;
HaveLoad = true;
}
@@ -1266,11 +1268,11 @@ static void speculatePHINodeLoads(PHINode &PN) {
PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
PN.getName() + ".sroa.speculated");
- // Get the AA tags and alignment to use from one of the loads. It doesn't
+ // Get the AA tags and alignment to use from one of the loads. It does not
// matter which one we get and if any differ.
AAMDNodes AATags;
SomeLoad->getAAMetadata(AATags);
- unsigned Align = SomeLoad->getAlignment();
+ const MaybeAlign Align = MaybeAlign(SomeLoad->getAlignment());
// Rewrite all loads of the PN to use the new PHI.
while (!PN.use_empty()) {
@@ -1338,11 +1340,11 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) {
// Both operands to the select need to be dereferenceable, either
// absolutely (e.g. allocas) or at this point because we can see other
// accesses to it.
- if (!isSafeToLoadUnconditionally(TValue, LI->getType(), LI->getAlignment(),
- DL, LI))
+ if (!isSafeToLoadUnconditionally(TValue, LI->getType(),
+ MaybeAlign(LI->getAlignment()), DL, LI))
return false;
- if (!isSafeToLoadUnconditionally(FValue, LI->getType(), LI->getAlignment(),
- DL, LI))
+ if (!isSafeToLoadUnconditionally(FValue, LI->getType(),
+ MaybeAlign(LI->getAlignment()), DL, LI))
return false;
}
@@ -1368,8 +1370,8 @@ static void speculateSelectInstLoads(SelectInst &SI) {
NumLoadsSpeculated += 2;
// Transfer alignment and AA info if present.
- TL->setAlignment(LI->getAlignment());
- FL->setAlignment(LI->getAlignment());
+ TL->setAlignment(MaybeAlign(LI->getAlignment()));
+ FL->setAlignment(MaybeAlign(LI->getAlignment()));
AAMDNodes Tags;
LI->getAAMetadata(Tags);
@@ -1888,6 +1890,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
bool HaveCommonEltTy = true;
auto CheckCandidateType = [&](Type *Ty) {
if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+ // Bail out if this vector type differs in total size (in bits) from the
+ // candidates already collected.
+ if (!CandidateTys.empty()) {
+ VectorType *V = CandidateTys[0];
+ if (DL.getTypeSizeInBits(VTy) != DL.getTypeSizeInBits(V)) {
+ CandidateTys.clear();
+ return;
+ }
+ }
CandidateTys.push_back(VTy);
if (!CommonEltTy)
CommonEltTy = VTy->getElementType();
@@ -3110,7 +3120,7 @@ private:
unsigned LoadAlign = LI->getAlignment();
if (!LoadAlign)
LoadAlign = DL.getABITypeAlignment(LI->getType());
- LI->setAlignment(std::min(LoadAlign, getSliceAlign()));
+ LI->setAlignment(MaybeAlign(std::min(LoadAlign, getSliceAlign())));
continue;
}
if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
@@ -3119,7 +3129,7 @@ private:
Value *Op = SI->getOperand(0);
StoreAlign = DL.getABITypeAlignment(Op->getType());
}
- SI->setAlignment(std::min(StoreAlign, getSliceAlign()));
+ SI->setAlignment(MaybeAlign(std::min(StoreAlign, getSliceAlign())));
continue;
}
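A brief aside on the MaybeAlign(...) wrapping that appears throughout this file's hunks (a sketch, assuming the llvm/Support/Alignment.h semantics of this import): a legacy alignment of 0 means "not specified", and MaybeAlign turns that into an empty optional instead of a bogus zero alignment. The function name below is hypothetical.

    #include "llvm/Support/Alignment.h"

    // Sketch: 0 becomes "no alignment specified"; nonzero becomes a real Align.
    void takeAlignment(unsigned LegacyAlign) {
      llvm::MaybeAlign MA(LegacyAlign); // empty when LegacyAlign == 0
      if (MA)
        (void)MA->value();              // the concrete power-of-two value
    }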
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 869cf00e0a89..1d2e40bf62be 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -79,6 +79,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopVersioningLICMPass(Registry);
initializeLoopIdiomRecognizeLegacyPassPass(Registry);
initializeLowerAtomicLegacyPassPass(Registry);
+ initializeLowerConstantIntrinsicsPass(Registry);
initializeLowerExpectIntrinsicPass(Registry);
initializeLowerGuardIntrinsicLegacyPassPass(Registry);
initializeLowerWidenableConditionLegacyPassPass(Registry);
@@ -123,6 +124,10 @@ void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createAggressiveDCEPass());
}
+void LLVMAddDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDeadCodeEliminationPass());
+}
+
void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createBitTrackingDCEPass());
}
@@ -280,6 +285,10 @@ void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createBasicAAWrapperPass());
}
+void LLVMAddLowerConstantIntrinsicsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerConstantIntrinsicsPass());
+}
+
void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLowerExpectIntrinsicPass());
}
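For context, a minimal sketch of how the two C-API bindings added above might be used by a client, assuming the matching declarations land in llvm-c/Transforms/Scalar.h as part of the same change; the driver function is hypothetical.

    #include "llvm-c/Core.h"
    #include "llvm-c/Transforms/Scalar.h"

    // Sketch only: run the newly exposed passes over a module via the C API.
    void runNewScalarPasses(LLVMModuleRef M) {
      LLVMPassManagerRef PM = LLVMCreatePassManager();
      LLVMAddDCEPass(PM);
      LLVMAddLowerConstantIntrinsicsPass(PM);
      LLVMRunPassManager(PM, M);
      LLVMDisposePassManager(PM);
    }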
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index f6a12fb13142..41554fccdf08 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1121,7 +1121,7 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
bool Changed = false;
for (BasicBlock &B : F) {
for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;)
diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index aeac6f548b32..ac832b9b4567 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -1909,7 +1909,7 @@ static void unswitchNontrivialInvariants(
// We can only unswitch switches, conditional branches with an invariant
// condition, or combining invariant conditions with an instruction.
- assert((SI || BI->isConditional()) &&
+ assert((SI || (BI && BI->isConditional())) &&
"Can only unswitch switches and conditional branch!");
bool FullUnswitch = SI || BI->getCondition() == Invariants[0];
if (FullUnswitch)
@@ -2141,17 +2141,21 @@ static void unswitchNontrivialInvariants(
buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction,
*ClonedPH, *LoopPH);
DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
+
+ if (MSSAU) {
+ DT.applyUpdates(DTUpdates);
+ DTUpdates.clear();
+
+ // Perform MSSA cloning updates.
+ for (auto &VMap : VMaps)
+ MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap,
+ /*IgnoreIncomingWithNoClones=*/true);
+ MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT);
+ }
}
// Apply the updates accumulated above to get an up-to-date dominator tree.
DT.applyUpdates(DTUpdates);
- if (!FullUnswitch && MSSAU) {
- // Update MSSA for partial unswitch, after DT update.
- SmallVector<CFGUpdate, 1> Updates;
- Updates.push_back(
- {cfg::UpdateKind::Insert, SplitBB, ClonedPHs.begin()->second});
- MSSAU->applyInsertUpdates(Updates, DT);
- }
// Now that we have an accurate dominator tree, first delete the dead cloned
// blocks so that we can accurately build any cloned loops. It is important to
@@ -2720,7 +2724,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
return Cost * (SuccessorsCount - 1);
};
Instruction *BestUnswitchTI = nullptr;
- int BestUnswitchCost;
+ int BestUnswitchCost = 0;
ArrayRef<Value *> BestUnswitchInvariants;
for (auto &TerminatorAndInvariants : UnswitchCandidates) {
Instruction &TI = *TerminatorAndInvariants.first;
@@ -2752,6 +2756,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
BestUnswitchInvariants = Invariants;
}
}
+ assert(BestUnswitchTI && "Failed to find loop unswitch candidate");
if (BestUnswitchCost >= UnswitchThreshold) {
LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: "
@@ -2880,7 +2885,7 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast));
auto PA = getLoopPassPreservedAnalyses();
- if (EnableMSSALoopDependency)
+ if (AR.MSSA)
PA.preserve<MemorySSAAnalysis>();
return PA;
}
diff --git a/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
index c13fb3e04516..e6db11f47ead 100644
--- a/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
+++ b/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@@ -777,8 +777,10 @@ static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
// speculation if the predecessor is an invoke. This doesn't seem
// fundamental and we should probably be splitting critical edges
// differently.
- if (isa<IndirectBrInst>(PredBB->getTerminator()) ||
- isa<InvokeInst>(PredBB->getTerminator())) {
+ const auto *TermInst = PredBB->getTerminator();
+ if (isa<IndirectBrInst>(TermInst) ||
+ isa<InvokeInst>(TermInst) ||
+ isa<CallBrInst>(TermInst)) {
LLVM_DEBUG(dbgs() << " Invalid: predecessor terminator: "
<< PredBB->getName() << "\n");
return false;
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index e5400676c7e8..9791cf41f621 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -65,7 +65,7 @@ static cl::opt<bool> ForceSkipUniformRegions(
static cl::opt<bool>
RelaxedUniformRegions("structurizecfg-relaxed-uniform-regions", cl::Hidden,
cl::desc("Allow relaxed uniform region checks"),
- cl::init(false));
+ cl::init(true));
// Definition of the complex types used in this pass.
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index f0b79079d817..b27a36b67d62 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -341,7 +341,7 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
const DataLayout &DL = L->getModule()->getDataLayout();
if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) ||
!isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(),
- L->getAlignment(), DL, L))
+ MaybeAlign(L->getAlignment()), DL, L))
return false;
}
}