author     Dimitry Andric <dim@FreeBSD.org>    2015-01-18 16:17:27 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2015-01-18 16:17:27 +0000
commit     67c32a98315f785a9ec9d531c1f571a0196c7463 (patch)
tree       4abb9cbeecc7901726dd0b4a37369596c852e9ef /lib/Transforms/Scalar
parent     9f61947910e6ab40de38e6b4034751ef1513200f (diff)
Diffstat (limited to 'lib/Transforms/Scalar')
-rw-r--r--  lib/Transforms/Scalar/ADCE.cpp                       |    2
-rw-r--r--  lib/Transforms/Scalar/AlignmentFromAssumptions.cpp   |  428
-rw-r--r--  lib/Transforms/Scalar/CMakeLists.txt                 |    1
-rw-r--r--  lib/Transforms/Scalar/ConstantHoisting.cpp           |    4
-rw-r--r--  lib/Transforms/Scalar/CorrelatedValuePropagation.cpp |   18
-rw-r--r--  lib/Transforms/Scalar/DeadStoreElimination.cpp       |    9
-rw-r--r--  lib/Transforms/Scalar/EarlyCSE.cpp                   |   29
-rw-r--r--  lib/Transforms/Scalar/GVN.cpp                        |  440
-rw-r--r--  lib/Transforms/Scalar/IndVarSimplify.cpp             |  154
-rw-r--r--  lib/Transforms/Scalar/JumpThreading.cpp              |  130
-rw-r--r--  lib/Transforms/Scalar/LICM.cpp                       |   99
-rw-r--r--  lib/Transforms/Scalar/LLVMBuild.txt                  |    2
-rw-r--r--  lib/Transforms/Scalar/LoadCombine.cpp                |   25
-rw-r--r--  lib/Transforms/Scalar/LoopDeletion.cpp               |    5
-rw-r--r--  lib/Transforms/Scalar/LoopInstSimplify.cpp           |   11
-rw-r--r--  lib/Transforms/Scalar/LoopRerollPass.cpp             |   58
-rw-r--r--  lib/Transforms/Scalar/LoopRotation.cpp               |   49
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp         |   72
-rw-r--r--  lib/Transforms/Scalar/LoopUnrollPass.cpp             |  158
-rw-r--r--  lib/Transforms/Scalar/LoopUnswitch.cpp               |   24
-rw-r--r--  lib/Transforms/Scalar/MemCpyOptimizer.cpp            |   62
-rw-r--r--  lib/Transforms/Scalar/MergedLoadStoreMotion.cpp      |  151
-rw-r--r--  lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp    |    4
-rw-r--r--  lib/Transforms/Scalar/Reassociate.cpp                |  473
-rw-r--r--  lib/Transforms/Scalar/Reg2Mem.cpp                    |    2
-rw-r--r--  lib/Transforms/Scalar/SCCP.cpp                       |    8
-rw-r--r--  lib/Transforms/Scalar/SROA.cpp                       | 1824
-rw-r--r--  lib/Transforms/Scalar/SampleProfile.cpp              |  570
-rw-r--r--  lib/Transforms/Scalar/Scalar.cpp                     |   14
-rw-r--r--  lib/Transforms/Scalar/ScalarReplAggregates.cpp       |   52
-rw-r--r--  lib/Transforms/Scalar/Scalarizer.cpp                 |   33
-rw-r--r--  lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp |  333
-rw-r--r--  lib/Transforms/Scalar/SimplifyCFGPass.cpp            |   27
-rw-r--r--  lib/Transforms/Scalar/Sink.cpp                       |   11
-rw-r--r--  lib/Transforms/Scalar/StructurizeCFG.cpp             |    7
-rw-r--r--  lib/Transforms/Scalar/TailRecursionElimination.cpp   |   36
36 files changed, 3531 insertions, 1794 deletions
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index 1a3a4aadce6a..3d9198469bc5 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -73,7 +73,7 @@ bool ADCE::runOnFunction(Function& F) {
for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end();
OI != OE; ++OI)
if (Instruction* Inst = dyn_cast<Instruction>(OI))
- if (alive.insert(Inst))
+ if (alive.insert(Inst).second)
worklist.push_back(Inst);
}
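
The only change to ADCE above adapts to SmallPtrSet::insert() now returning a std::pair<iterator, bool> instead of a plain bool, so the "newly inserted" signal has to be read from .second. A minimal standalone sketch of the worklist idiom (std::set is used so the snippet compiles outside of LLVM; its insert() has the same pair-returning contract):

    #include <set>
    #include <vector>

    // Push an instruction onto the worklist only the first time it is marked alive.
    void markAlive(int Inst, std::set<int> &Alive, std::vector<int> &Worklist) {
      if (Alive.insert(Inst).second)  // .second is true only for a new element
        Worklist.push_back(Inst);
    }
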
diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
new file mode 100644
index 000000000000..f48cefaa4fba
--- /dev/null
+++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -0,0 +1,428 @@
+//===----------------------- AlignmentFromAssumptions.cpp -----------------===//
+// Set Load/Store Alignments From Assumptions
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a ScalarEvolution-based transformation to set
+// the alignments of loads, stores, and memory intrinsics based on the truth
+// expressions of assume intrinsics. The primary motivation is to handle
+// complex alignment assumptions that apply to vector loads and stores that
+// appear after vectorization and unrolling.
+//
+//===----------------------------------------------------------------------===//
+
+#define AA_NAME "alignment-from-assumptions"
+#define DEBUG_TYPE AA_NAME
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(NumLoadAlignChanged,
+ "Number of loads changed by alignment assumptions");
+STATISTIC(NumStoreAlignChanged,
+ "Number of stores changed by alignment assumptions");
+STATISTIC(NumMemIntAlignChanged,
+ "Number of memory intrinsics changed by alignment assumptions");
+
+namespace {
+struct AlignmentFromAssumptions : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ AlignmentFromAssumptions() : FunctionPass(ID) {
+ initializeAlignmentFromAssumptionsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ScalarEvolution>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+
+ AU.setPreservesCFG();
+ AU.addPreserved<LoopInfo>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<ScalarEvolution>();
+ }
+
+ // For memory transfers, we need a common alignment for both the source and
+ // destination. If we have a new alignment for only one operand of a transfer
+ // instruction, save it in these maps. If we reach the other operand through
+ // another assumption later, then we may change the alignment at that point.
+ DenseMap<MemTransferInst *, unsigned> NewDestAlignments, NewSrcAlignments;
+
+ ScalarEvolution *SE;
+ DominatorTree *DT;
+ const DataLayout *DL;
+
+ bool extractAlignmentInfo(CallInst *I, Value *&AAPtr, const SCEV *&AlignSCEV,
+ const SCEV *&OffSCEV);
+ bool processAssumption(CallInst *I);
+};
+}
+
+char AlignmentFromAssumptions::ID = 0;
+static const char aip_name[] = "Alignment from assumptions";
+INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME,
+ aip_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME,
+ aip_name, false, false)
+
+FunctionPass *llvm::createAlignmentFromAssumptionsPass() {
+ return new AlignmentFromAssumptions();
+}
+
+// Given an expression for the (constant) alignment, AlignSCEV, and an
+// expression for the displacement between a pointer and the aligned address,
+// DiffSCEV, compute the alignment of the displaced pointer if it can be reduced
+// to a constant. Using SCEV to compute alignment handles the case where
+// DiffSCEV is a recurrence with constant start such that the aligned offset
+// is constant. e.g. {16,+,32} % 32 -> 16.
+static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
+ const SCEV *AlignSCEV,
+ ScalarEvolution *SE) {
+ // DiffUnits = Diff % int64_t(Alignment)
+ const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV);
+ const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV);
+ const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV);
+
+ DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " <<
+ *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
+
+ if (const SCEVConstant *ConstDUSCEV =
+ dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {
+ int64_t DiffUnits = ConstDUSCEV->getValue()->getSExtValue();
+
+ // If the displacement is an exact multiple of the alignment, then the
+ // displaced pointer has the same alignment as the aligned pointer, so
+ // return the alignment value.
+ if (!DiffUnits)
+ return (unsigned)
+ cast<SCEVConstant>(AlignSCEV)->getValue()->getSExtValue();
+
+ // If the displacement is not an exact multiple, but the remainder is a
+ // constant, then return this remainder (but only if it is a power of 2).
+ uint64_t DiffUnitsAbs = abs64(DiffUnits);
+ if (isPowerOf2_64(DiffUnitsAbs))
+ return (unsigned) DiffUnitsAbs;
+ }
+
+ return 0;
+}
+
+// There is an address given by an offset OffSCEV from AASCEV which has an
+// alignment AlignSCEV. Use that information, if possible, to compute a new
+// alignment for Ptr.
+static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
+ const SCEV *OffSCEV, Value *Ptr,
+ ScalarEvolution *SE) {
+ const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+ const SCEV *DiffSCEV = SE->getMinusSCEV(PtrSCEV, AASCEV);
+
+ // On 32-bit platforms, DiffSCEV might now have type i32 -- we've always
+ // sign-extended OffSCEV to i64, so make sure they agree again.
+ DiffSCEV = SE->getNoopOrSignExtend(DiffSCEV, OffSCEV->getType());
+
+ // What we really want to know is the overall offset to the aligned
+ // address. This address is displaced by the provided offset.
+ DiffSCEV = SE->getMinusSCEV(DiffSCEV, OffSCEV);
+
+ DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to " <<
+ *AlignSCEV << " and offset " << *OffSCEV <<
+ " using diff " << *DiffSCEV << "\n");
+
+ unsigned NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE);
+ DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n");
+
+ if (NewAlignment) {
+ return NewAlignment;
+ } else if (const SCEVAddRecExpr *DiffARSCEV =
+ dyn_cast<SCEVAddRecExpr>(DiffSCEV)) {
+ // The relative offset to the alignment assumption did not yield a constant,
+ // but we should try harder: if we assume that a is 32-byte aligned, then in
+ // for (i = 0; i < 1024; i += 4) r += a[i]; not all of the loads from a are
+ // 32-byte aligned, but instead alternate between 32 and 16-byte alignment.
+ // As a result, the new alignment will not be a constant, but can still
+ // be improved over the default (of 4) to 16.
+
+ const SCEV *DiffStartSCEV = DiffARSCEV->getStart();
+ const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE);
+
+ DEBUG(dbgs() << "\ttrying start/inc alignment using start " <<
+ *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
+
+ // Now compute the new alignment using the displacement to the value in the
+ // first iteration, and also the alignment using the per-iteration delta.
+ // If these are the same, then use that answer. Otherwise, use the smaller
+ // one, but only if it divides the larger one.
+ NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
+ unsigned NewIncAlignment = getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);
+
+ DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n");
+ DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n");
+
+ if (!NewAlignment || !NewIncAlignment) {
+ return 0;
+ } else if (NewAlignment > NewIncAlignment) {
+ if (NewAlignment % NewIncAlignment == 0) {
+ DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+ NewIncAlignment << "\n");
+ return NewIncAlignment;
+ }
+ } else if (NewIncAlignment > NewAlignment) {
+ if (NewIncAlignment % NewAlignment == 0) {
+ DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+ NewAlignment << "\n");
+ return NewAlignment;
+ }
+ } else if (NewIncAlignment == NewAlignment) {
+ DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+ NewAlignment << "\n");
+ return NewAlignment;
+ }
+ }
+
+ return 0;
+}
+
+bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I,
+ Value *&AAPtr, const SCEV *&AlignSCEV,
+ const SCEV *&OffSCEV) {
+ // An alignment assume must be a statement about the least-significant
+ // bits of the pointer being zero, possibly with some offset.
+ ICmpInst *ICI = dyn_cast<ICmpInst>(I->getArgOperand(0));
+ if (!ICI)
+ return false;
+
+ // This must be an expression of the form: x & m == 0.
+ if (ICI->getPredicate() != ICmpInst::ICMP_EQ)
+ return false;
+
+ // Swap things around so that the RHS is 0.
+ Value *CmpLHS = ICI->getOperand(0);
+ Value *CmpRHS = ICI->getOperand(1);
+ const SCEV *CmpLHSSCEV = SE->getSCEV(CmpLHS);
+ const SCEV *CmpRHSSCEV = SE->getSCEV(CmpRHS);
+ if (CmpLHSSCEV->isZero())
+ std::swap(CmpLHS, CmpRHS);
+ else if (!CmpRHSSCEV->isZero())
+ return false;
+
+ BinaryOperator *CmpBO = dyn_cast<BinaryOperator>(CmpLHS);
+ if (!CmpBO || CmpBO->getOpcode() != Instruction::And)
+ return false;
+
+ // Swap things around so that the right operand of the and is a constant
+ // (the mask); we cannot deal with variable masks.
+ Value *AndLHS = CmpBO->getOperand(0);
+ Value *AndRHS = CmpBO->getOperand(1);
+ const SCEV *AndLHSSCEV = SE->getSCEV(AndLHS);
+ const SCEV *AndRHSSCEV = SE->getSCEV(AndRHS);
+ if (isa<SCEVConstant>(AndLHSSCEV)) {
+ std::swap(AndLHS, AndRHS);
+ std::swap(AndLHSSCEV, AndRHSSCEV);
+ }
+
+ const SCEVConstant *MaskSCEV = dyn_cast<SCEVConstant>(AndRHSSCEV);
+ if (!MaskSCEV)
+ return false;
+
+ // The mask must have some trailing ones (otherwise the condition is
+ // trivial and tells us nothing about the alignment of the left operand).
+ unsigned TrailingOnes =
+ MaskSCEV->getValue()->getValue().countTrailingOnes();
+ if (!TrailingOnes)
+ return false;
+
+ // Cap the alignment at the maximum with which LLVM can deal (and make sure
+ // we don't overflow the shift).
+ uint64_t Alignment;
+ TrailingOnes = std::min(TrailingOnes,
+ unsigned(sizeof(unsigned) * CHAR_BIT - 1));
+ Alignment = std::min(1u << TrailingOnes, +Value::MaximumAlignment);
+
+ Type *Int64Ty = Type::getInt64Ty(I->getParent()->getParent()->getContext());
+ AlignSCEV = SE->getConstant(Int64Ty, Alignment);
+
+ // The LHS might be a ptrtoint instruction, or it might be the pointer
+ // with an offset.
+ AAPtr = nullptr;
+ OffSCEV = nullptr;
+ if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(AndLHS)) {
+ AAPtr = PToI->getPointerOperand();
+ OffSCEV = SE->getConstant(Int64Ty, 0);
+ } else if (const SCEVAddExpr* AndLHSAddSCEV =
+ dyn_cast<SCEVAddExpr>(AndLHSSCEV)) {
+ // Try to find the ptrtoint; subtract it and the rest is the offset.
+ for (SCEVAddExpr::op_iterator J = AndLHSAddSCEV->op_begin(),
+ JE = AndLHSAddSCEV->op_end(); J != JE; ++J)
+ if (const SCEVUnknown *OpUnk = dyn_cast<SCEVUnknown>(*J))
+ if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(OpUnk->getValue())) {
+ AAPtr = PToI->getPointerOperand();
+ OffSCEV = SE->getMinusSCEV(AndLHSAddSCEV, *J);
+ break;
+ }
+ }
+
+ if (!AAPtr)
+ return false;
+
+ // Sign extend the offset to 64 bits (so that it is like all of the other
+ // expressions).
+ unsigned OffSCEVBits = OffSCEV->getType()->getPrimitiveSizeInBits();
+ if (OffSCEVBits < 64)
+ OffSCEV = SE->getSignExtendExpr(OffSCEV, Int64Ty);
+ else if (OffSCEVBits > 64)
+ return false;
+
+ AAPtr = AAPtr->stripPointerCasts();
+ return true;
+}
+
+bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) {
+ Value *AAPtr;
+ const SCEV *AlignSCEV, *OffSCEV;
+ if (!extractAlignmentInfo(ACall, AAPtr, AlignSCEV, OffSCEV))
+ return false;
+
+ const SCEV *AASCEV = SE->getSCEV(AAPtr);
+
+ // Apply the assumption to all other users of the specified pointer.
+ SmallPtrSet<Instruction *, 32> Visited;
+ SmallVector<Instruction*, 16> WorkList;
+ for (User *J : AAPtr->users()) {
+ if (J == ACall)
+ continue;
+
+ if (Instruction *K = dyn_cast<Instruction>(J))
+ if (isValidAssumeForContext(ACall, K, DL, DT))
+ WorkList.push_back(K);
+ }
+
+ while (!WorkList.empty()) {
+ Instruction *J = WorkList.pop_back_val();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(J)) {
+ unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ LI->getPointerOperand(), SE);
+
+ if (NewAlignment > LI->getAlignment()) {
+ LI->setAlignment(NewAlignment);
+ ++NumLoadAlignChanged;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(J)) {
+ unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ SI->getPointerOperand(), SE);
+
+ if (NewAlignment > SI->getAlignment()) {
+ SI->setAlignment(NewAlignment);
+ ++NumStoreAlignChanged;
+ }
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) {
+ unsigned NewDestAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ MI->getDest(), SE);
+
+ // For memory transfers, we need a common alignment for both the
+ // source and destination. If we have a new alignment for this
+ // instruction, but only for one operand, save it. If we reach the
+ // other operand through another assumption later, then we may
+ // change the alignment at that point.
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+ unsigned NewSrcAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ MTI->getSource(), SE);
+
+ DenseMap<MemTransferInst *, unsigned>::iterator DI =
+ NewDestAlignments.find(MTI);
+ unsigned AltDestAlignment = (DI == NewDestAlignments.end()) ?
+ 0 : DI->second;
+
+ DenseMap<MemTransferInst *, unsigned>::iterator SI =
+ NewSrcAlignments.find(MTI);
+ unsigned AltSrcAlignment = (SI == NewSrcAlignments.end()) ?
+ 0 : SI->second;
+
+ DEBUG(dbgs() << "\tmem trans: " << NewDestAlignment << " " <<
+ AltDestAlignment << " " << NewSrcAlignment <<
+ " " << AltSrcAlignment << "\n");
+
+ // Of these four alignments, pick the largest possible...
+ unsigned NewAlignment = 0;
+ if (NewDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
+ NewAlignment = std::max(NewAlignment, NewDestAlignment);
+ if (AltDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
+ NewAlignment = std::max(NewAlignment, AltDestAlignment);
+ if (NewSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
+ NewAlignment = std::max(NewAlignment, NewSrcAlignment);
+ if (AltSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
+ NewAlignment = std::max(NewAlignment, AltSrcAlignment);
+
+ if (NewAlignment > MI->getAlignment()) {
+ MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
+ MI->getParent()->getContext()), NewAlignment));
+ ++NumMemIntAlignChanged;
+ }
+
+ NewDestAlignments.insert(std::make_pair(MTI, NewDestAlignment));
+ NewSrcAlignments.insert(std::make_pair(MTI, NewSrcAlignment));
+ } else if (NewDestAlignment > MI->getAlignment()) {
+ assert((!isa<MemIntrinsic>(MI) || isa<MemSetInst>(MI)) &&
+ "Unknown memory intrinsic");
+
+ MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
+ MI->getParent()->getContext()), NewDestAlignment));
+ ++NumMemIntAlignChanged;
+ }
+ }
+
+ // Now that we've updated that use of the pointer, look for other uses of
+ // the pointer to update.
+ Visited.insert(J);
+ for (User *UJ : J->users()) {
+ Instruction *K = cast<Instruction>(UJ);
+ if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DL, DT))
+ WorkList.push_back(K);
+ }
+ }
+
+ return true;
+}
+
+bool AlignmentFromAssumptions::runOnFunction(Function &F) {
+ bool Changed = false;
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ SE = &getAnalysis<ScalarEvolution>();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+ DL = DLP ? &DLP->getDataLayout() : nullptr;
+
+ NewDestAlignments.clear();
+ NewSrcAlignments.clear();
+
+ for (auto &AssumeVH : AC.assumptions())
+ if (AssumeVH)
+ Changed |= processAssumption(cast<CallInst>(AssumeVH));
+
+ return Changed;
+}
+
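
extractAlignmentInfo() above only recognizes assumptions of the shape "(ptrtoint(p) + offset) & mask == 0" feeding @llvm.assume. A hedged sketch of source code that is expected to produce that pattern; the exact IR emitted by the front end may differ, so treat this only as an illustration of the intended use case:

    // __builtin_assume_aligned is typically lowered to ptrtoint/and/icmp
    // feeding @llvm.assume, i.e. the "x & m == 0" form matched above.  After
    // vectorization, the loads from q can then be re-annotated with align 32.
    float sum(const float *p, int n) {
      const float *q =
          static_cast<const float *>(__builtin_assume_aligned(p, 32));
      float s = 0.0f;
      for (int i = 0; i < n; ++i)
        s += q[i];
      return s;
    }
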
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index 261ddda30150..b3ee11ed67cd 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -1,5 +1,6 @@
add_llvm_library(LLVMScalarOpts
ADCE.cpp
+ AlignmentFromAssumptions.cpp
ConstantHoisting.cpp
ConstantProp.cpp
CorrelatedValuePropagation.cpp
diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp
index 763d02b9fcd6..27c177a542e3 100644
--- a/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -91,7 +91,7 @@ struct RebasedConstantInfo {
Constant *Offset;
RebasedConstantInfo(ConstantUseListType &&Uses, Constant *Offset)
- : Uses(Uses), Offset(Offset) { }
+ : Uses(std::move(Uses)), Offset(Offset) { }
};
/// \brief A base constant and all its rebased constants.
@@ -395,7 +395,7 @@ void ConstantHoisting::findAndMakeBaseConstant(ConstCandVecType::iterator S,
ConstInfo.RebasedConstants.push_back(
RebasedConstantInfo(std::move(ConstCand->Uses), Offset));
}
- ConstantVec.push_back(ConstInfo);
+ ConstantVec.push_back(std::move(ConstInfo));
}
/// \brief Finds and combines constant candidates that can be easily
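
Both hunks above replace copies with moves. The constructor change matters because a named rvalue-reference parameter is itself an lvalue, so initializing Uses(Uses) still copies the use list; std::move is needed to actually move it. A small self-contained sketch of the same idiom:

    #include <utility>
    #include <vector>

    struct Info {
      std::vector<int> Uses;
      // Without std::move this would copy: the parameter U is an lvalue even
      // though its declared type is an rvalue reference.
      explicit Info(std::vector<int> &&U) : Uses(std::move(U)) {}
    };
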
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 082946229b35..5a3b5cf34cc3 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -73,7 +73,7 @@ bool CorrelatedValuePropagation::processSelect(SelectInst *S) {
if (S->getType()->isVectorTy()) return false;
if (isa<Constant>(S->getOperand(0))) return false;
- Constant *C = LVI->getConstant(S->getOperand(0), S->getParent());
+ Constant *C = LVI->getConstant(S->getOperand(0), S->getParent(), S);
if (!C) return false;
ConstantInt *CI = dyn_cast<ConstantInt>(C);
@@ -100,7 +100,7 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) {
Value *Incoming = P->getIncomingValue(i);
if (isa<Constant>(Incoming)) continue;
- Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB);
+ Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P);
// Look if the incoming value is a select with a constant but LVI tells us
// that the incoming value can never be that constant. In that case replace
@@ -114,7 +114,7 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) {
if (!C) continue;
if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C,
- P->getIncomingBlock(i), BB) !=
+ P->getIncomingBlock(i), BB, P) !=
LazyValueInfo::False)
continue;
@@ -126,6 +126,7 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) {
Changed = true;
}
+ // FIXME: Provide DL, TLI, DT, AT to SimplifyInstruction.
if (Value *V = SimplifyInstruction(P)) {
P->replaceAllUsesWith(V);
P->eraseFromParent();
@@ -147,7 +148,7 @@ bool CorrelatedValuePropagation::processMemAccess(Instruction *I) {
if (isa<Constant>(Pointer)) return false;
- Constant *C = LVI->getConstant(Pointer, I->getParent());
+ Constant *C = LVI->getConstant(Pointer, I->getParent(), I);
if (!C) return false;
++NumMemAccess;
@@ -173,13 +174,15 @@ bool CorrelatedValuePropagation::processCmp(CmpInst *C) {
if (PI == PE) return false;
LazyValueInfo::Tristate Result = LVI->getPredicateOnEdge(C->getPredicate(),
- C->getOperand(0), Op1, *PI, C->getParent());
+ C->getOperand(0), Op1, *PI,
+ C->getParent(), C);
if (Result == LazyValueInfo::Unknown) return false;
++PI;
while (PI != PE) {
LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(C->getPredicate(),
- C->getOperand(0), Op1, *PI, C->getParent());
+ C->getOperand(0), Op1, *PI,
+ C->getParent(), C);
if (Res != Result) return false;
++PI;
}
@@ -229,7 +232,8 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) {
for (pred_iterator PI = PB; PI != PE; ++PI) {
// Is the switch condition equal to the case value?
LazyValueInfo::Tristate Value = LVI->getPredicateOnEdge(CmpInst::ICMP_EQ,
- Cond, Case, *PI, BB);
+ Cond, Case, *PI,
+ BB, SI);
// Give up on this case if nothing is known.
if (Value == LazyValueInfo::Unknown) {
State = LazyValueInfo::Unknown;
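
Every LazyValueInfo query above now also receives the instruction at which the question is asked (S, P, I, C, or SI). Presumably this context instruction lets LVI use facts, such as dominating @llvm.assume calls, that only hold at that point; that rationale is an assumption here, not something stated in the patch. A hedged, Clang-specific illustration of the kind of code that could benefit:

    // __builtin_assume is a Clang builtin that lowers to @llvm.assume.  With
    // the select itself passed as the query context, LVI can know that x == 4
    // holds there, so the select can fold to 'a'.
    int pick(int x, int a, int b) {
      __builtin_assume(x == 4);
      return (x > 0) ? a : b;
    }
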
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 3af8ee7546fb..a1ddc00da5ba 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -356,15 +356,8 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later,
// If we don't know the sizes of either access, then we can't do a
// comparison.
if (Later.Size == AliasAnalysis::UnknownSize ||
- Earlier.Size == AliasAnalysis::UnknownSize) {
- // If we have no DataLayout information around, then the size of the store
- // is inferrable from the pointee type. If they are the same type, then
- // we know that the store is safe.
- if (DL == nullptr && Later.Ptr->getType() == Earlier.Ptr->getType())
- return OverwriteComplete;
-
+ Earlier.Size == AliasAnalysis::UnknownSize)
return OverwriteUnknown;
- }
// Make sure that the Later size is >= the Earlier size.
if (Later.Size >= Earlier.Size)
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 735f5c194cb5..394b0d3de7bd 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -16,17 +16,21 @@
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/ScopedHashTable.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/RecyclingAllocator.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
-#include <vector>
+#include <deque>
using namespace llvm;
+using namespace llvm::PatternMatch;
#define DEBUG_TYPE "early-cse"
@@ -266,6 +270,7 @@ public:
const DataLayout *DL;
const TargetLibraryInfo *TLI;
DominatorTree *DT;
+ AssumptionCache *AC;
typedef RecyclingAllocator<BumpPtrAllocator,
ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy;
typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>,
@@ -378,6 +383,7 @@ private:
// This transformation requires dominator postdominator info
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfo>();
AU.setPreservesCFG();
@@ -393,6 +399,7 @@ FunctionPass *llvm::createEarlyCSEPass() {
}
INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false)
@@ -431,9 +438,18 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}
+ // Skip assume intrinsics, they don't really have side effects (although
+ // they're marked as such to ensure preservation of control dependencies),
+ // and this pass will not disturb any of the assumption's control
+ // dependencies.
+ if (match(Inst, m_Intrinsic<Intrinsic::assume>())) {
+ DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
+ continue;
+ }
+
// If the instruction can be simplified (e.g. X+0 = X) then replace it with
// its simpler value.
- if (Value *V = SimplifyInstruction(Inst, DL, TLI, DT)) {
+ if (Value *V = SimplifyInstruction(Inst, DL, TLI, DT, AC)) {
DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n');
Inst->replaceAllUsesWith(V);
Inst->eraseFromParent();
@@ -530,7 +546,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
Changed = true;
++NumDSE;
LastStore = nullptr;
- continue;
+ // fallthrough - we can exploit information about this store
}
// Okay, we just invalidated anything we knew about loaded values. Try
@@ -556,12 +572,17 @@ bool EarlyCSE::runOnFunction(Function &F) {
if (skipOptnoneFunction(F))
return false;
- std::vector<StackNode *> nodesToProcess;
+ // Note, deque is being used here because there are significant performance gains
+ // over vector when the container becomes very large due to the specific access
+ // patterns. For more information see the mailing list discussion on this:
+ // http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
+ std::deque<StackNode *> nodesToProcess;
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
DL = DLP ? &DLP->getDataLayout() : nullptr;
TLI = &getAnalysis<TargetLibraryInfo>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
// Tables that the pass uses when walking the domtree.
ScopedHTType AVTable;
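
The early 'continue' added above lets EarlyCSE step over @llvm.assume calls, which are modeled as having side effects only so that their control dependence is preserved. A hedged sketch (using the Clang __builtin_assume builtin) of code where that matters:

    // The two loads below are separated only by an assumption.  Because
    // EarlyCSE now skips @llvm.assume instead of treating it as a clobbering
    // call, the second load can still be CSE'd with the first.
    int twice(int *p, int x) {
      int a = *p;
      __builtin_assume(x > 0);
      int b = *p;  // redundant with 'a'
      return a + b;
    }
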
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 106eba099ca0..b814b2525dca 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -20,10 +20,12 @@
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
@@ -45,6 +47,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <vector>
using namespace llvm;
@@ -590,6 +593,7 @@ namespace {
DominatorTree *DT;
const DataLayout *DL;
const TargetLibraryInfo *TLI;
+ AssumptionCache *AC;
SetVector<BasicBlock *> DeadBlocks;
ValueTable VN;
@@ -679,6 +683,7 @@ namespace {
// This transformation requires dominator postdominator info
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfo>();
if (!NoLoads)
@@ -705,6 +710,7 @@ namespace {
void dump(DenseMap<uint32_t, Value*> &d);
bool iterateOnFunction(Function &F);
bool performPRE(Function &F);
+ bool performScalarPRE(Instruction *I);
Value *findLeader(const BasicBlock *BB, uint32_t num);
void cleanupGlobalSets();
void verifyRemoved(const Instruction *I) const;
@@ -727,6 +733,7 @@ FunctionPass *llvm::createGVNPass(bool NoLoads) {
}
INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
@@ -1616,7 +1623,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// If all preds have a single successor, then we know it is safe to insert
// the load on the pred (?!?), so we can insert code to materialize the
// pointer if it is not available.
- PHITransAddr Address(LI->getPointerOperand(), DL);
+ PHITransAddr Address(LI->getPointerOperand(), DL, AC);
Value *LoadPtr = nullptr;
LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred,
*DT, NewInsts);
@@ -1669,9 +1676,11 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
LI->getAlignment(),
UnavailablePred->getTerminator());
- // Transfer the old load's TBAA tag to the new load.
- if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa))
- NewLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
+ // Transfer the old load's AA tags to the new load.
+ AAMDNodes Tags;
+ LI->getAAMetadata(Tags);
+ if (Tags)
+ NewLoad->setAAMetadata(Tags);
// Transfer DebugLoc.
NewLoad->setDebugLoc(LI->getDebugLoc());
@@ -1700,8 +1709,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
bool GVN::processNonLocalLoad(LoadInst *LI) {
// Step 1: Find the non-local dependencies of the load.
LoadDepVect Deps;
- AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI);
- MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps);
+ MD->getNonLocalPointerDependency(LI, Deps);
// If we had to process more than one hundred blocks to find the
// dependencies, this load isn't worth worrying about. Optimizing
@@ -1722,6 +1730,15 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
return false;
}
+ // If this load follows a GEP, see if we can PRE the indices before analyzing.
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) {
+ for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(),
+ OE = GEP->idx_end();
+ OI != OE; ++OI)
+ if (Instruction *I = dyn_cast<Instruction>(OI->get()))
+ performScalarPRE(I);
+ }
+
// Step 2: Analyze the availability of the load
AvailValInBlkVect ValuesPerBlock;
UnavailBlkVect UnavailableBlocks;
@@ -1774,36 +1791,24 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) {
ReplOp->setHasNoUnsignedWrap(false);
}
if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) {
- SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata;
- ReplInst->getAllMetadataOtherThanDebugLoc(Metadata);
- for (int i = 0, n = Metadata.size(); i < n; ++i) {
- unsigned Kind = Metadata[i].first;
- MDNode *IMD = I->getMetadata(Kind);
- MDNode *ReplMD = Metadata[i].second;
- switch(Kind) {
- default:
- ReplInst->setMetadata(Kind, nullptr); // Remove unknown metadata
- break;
- case LLVMContext::MD_dbg:
- llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg");
- case LLVMContext::MD_tbaa:
- ReplInst->setMetadata(Kind, MDNode::getMostGenericTBAA(IMD, ReplMD));
- break;
- case LLVMContext::MD_range:
- ReplInst->setMetadata(Kind, MDNode::getMostGenericRange(IMD, ReplMD));
- break;
- case LLVMContext::MD_prof:
- llvm_unreachable("MD_prof in a non-terminator instruction");
- break;
- case LLVMContext::MD_fpmath:
- ReplInst->setMetadata(Kind, MDNode::getMostGenericFPMath(IMD, ReplMD));
- break;
- case LLVMContext::MD_invariant_load:
- // Only set the !invariant.load if it is present in both instructions.
- ReplInst->setMetadata(Kind, IMD);
- break;
- }
- }
+ // FIXME: If both the original and replacement value are part of the
+ // same control-flow region (meaning that the execution of one
+ // guarantees the execution of the other), then we can combine the
+ // noalias scopes here and do better than the general conservative
+ // answer used in combineMetadata().
+
+ // In general, GVN unifies expressions over different control-flow
+ // regions, and so we need a conservative combination of the noalias
+ // scopes.
+ unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_range,
+ LLVMContext::MD_fpmath,
+ LLVMContext::MD_invariant_load,
+ };
+ combineMetadata(ReplInst, I, KnownIDs);
}
}
@@ -2101,15 +2106,15 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
std::swap(LHS, RHS);
assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!");
- // If there is no obvious reason to prefer the left-hand side over the right-
- // hand side, ensure the longest lived term is on the right-hand side, so the
- // shortest lived term will be replaced by the longest lived. This tends to
- // expose more simplifications.
+ // If there is no obvious reason to prefer the left-hand side over the
+ // right-hand side, ensure the longest lived term is on the right-hand side,
+ // so the shortest lived term will be replaced by the longest lived.
+ // This tends to expose more simplifications.
uint32_t LVN = VN.lookup_or_add(LHS);
if ((isa<Argument>(LHS) && isa<Argument>(RHS)) ||
(isa<Instruction>(LHS) && isa<Instruction>(RHS))) {
- // Move the 'oldest' value to the right-hand side, using the value number as
- // a proxy for age.
+ // Move the 'oldest' value to the right-hand side, using the value number
+ // as a proxy for age.
uint32_t RVN = VN.lookup_or_add(RHS);
if (LVN < RVN) {
std::swap(LHS, RHS);
@@ -2138,10 +2143,10 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
NumGVNEqProp += NumReplacements;
}
- // Now try to deduce additional equalities from this one. For example, if the
- // known equality was "(A != B)" == "false" then it follows that A and B are
- // equal in the scope. Only boolean equalities with an explicit true or false
- // RHS are currently supported.
+ // Now try to deduce additional equalities from this one. For example, if
+ // the known equality was "(A != B)" == "false" then it follows that A and B
+ // are equal in the scope. Only boolean equalities with an explicit true or
+ // false RHS are currently supported.
if (!RHS->getType()->isIntegerTy(1))
// Not a boolean equality - bail out.
continue;
@@ -2166,7 +2171,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
// If we are propagating an equality like "(A == B)" == "true" then also
// propagate the equality A == B. When propagating a comparison such as
// "(A >= B)" == "true", replace all instances of "A < B" with "false".
- if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) {
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(LHS)) {
Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1);
// If "A == B" is known true, or "A != B" is known false, then replace
@@ -2175,12 +2180,17 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
(isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE))
Worklist.push_back(std::make_pair(Op0, Op1));
+ // Handle the floating point versions of equality comparisons too.
+ if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) ||
+ (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE))
+ Worklist.push_back(std::make_pair(Op0, Op1));
+
// If "A >= B" is known true, replace "A < B" with false everywhere.
CmpInst::Predicate NotPred = Cmp->getInversePredicate();
Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse);
- // Since we don't have the instruction "A < B" immediately to hand, work out
- // the value number that it would have and use that to find an appropriate
- // instruction (if any).
+ // Since we don't have the instruction "A < B" immediately to hand, work
+ // out the value number that it would have and use that to find an
+ // appropriate instruction (if any).
uint32_t NextNum = VN.getNextUnusedValueNumber();
uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1);
// If the number we were assigned was brand new then there is no point in
@@ -2219,7 +2229,7 @@ bool GVN::processInstruction(Instruction *I) {
// to value numbering it. Value numbering often exposes redundancies, for
// example if it determines that %y is equal to %x then the instruction
// "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
- if (Value *V = SimplifyInstruction(I, DL, TLI, DT)) {
+ if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
I->replaceAllUsesWith(V);
if (MD && V->getType()->getScalarType()->isPointerTy())
MD->invalidateCachedPointerInfo(V);
@@ -2339,6 +2349,7 @@ bool GVN::runOnFunction(Function& F) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
DL = DLP ? &DLP->getDataLayout() : nullptr;
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
TLI = &getAnalysis<TargetLibraryInfo>();
VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>());
VN.setMemDep(MD);
@@ -2435,175 +2446,182 @@ bool GVN::processBlock(BasicBlock *BB) {
return ChangedFunction;
}
-/// performPRE - Perform a purely local form of PRE that looks for diamond
-/// control flow patterns and attempts to perform simple PRE at the join point.
-bool GVN::performPRE(Function &F) {
- bool Changed = false;
+bool GVN::performScalarPRE(Instruction *CurInst) {
SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap;
- for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) {
- // Nothing to PRE in the entry block.
- if (CurrentBlock == &F.getEntryBlock()) continue;
- // Don't perform PRE on a landing pad.
- if (CurrentBlock->isLandingPad()) continue;
+ if (isa<AllocaInst>(CurInst) || isa<TerminatorInst>(CurInst) ||
+ isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
+ CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
+ isa<DbgInfoIntrinsic>(CurInst))
+ return false;
- for (BasicBlock::iterator BI = CurrentBlock->begin(),
- BE = CurrentBlock->end(); BI != BE; ) {
- Instruction *CurInst = BI++;
+ // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
+ // sinking the compare again, and it would force the code generator to
+ // move the i1 from processor flags or predicate registers into a general
+ // purpose register.
+ if (isa<CmpInst>(CurInst))
+ return false;
- if (isa<AllocaInst>(CurInst) ||
- isa<TerminatorInst>(CurInst) || isa<PHINode>(CurInst) ||
- CurInst->getType()->isVoidTy() ||
- CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
- isa<DbgInfoIntrinsic>(CurInst))
- continue;
+ // We don't currently value number ANY inline asm calls.
+ if (CallInst *CallI = dyn_cast<CallInst>(CurInst))
+ if (CallI->isInlineAsm())
+ return false;
- // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
- // sinking the compare again, and it would force the code generator to
- // move the i1 from processor flags or predicate registers into a general
- // purpose register.
- if (isa<CmpInst>(CurInst))
- continue;
+ uint32_t ValNo = VN.lookup(CurInst);
+
+ // Look for the predecessors for PRE opportunities. We're
+ // only trying to solve the basic diamond case, where
+ // a value is computed in the successor and one predecessor,
+ // but not the other. We also explicitly disallow cases
+ // where the successor is its own predecessor, because they're
+ // more complicated to get right.
+ unsigned NumWith = 0;
+ unsigned NumWithout = 0;
+ BasicBlock *PREPred = nullptr;
+ BasicBlock *CurrentBlock = CurInst->getParent();
+ predMap.clear();
+
+ for (pred_iterator PI = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock);
+ PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ // We're not interested in PRE where the block is its
+ // own predecessor, or in blocks with predecessors
+ // that are not reachable.
+ if (P == CurrentBlock) {
+ NumWithout = 2;
+ break;
+ } else if (!DT->isReachableFromEntry(P)) {
+ NumWithout = 2;
+ break;
+ }
- // We don't currently value number ANY inline asm calls.
- if (CallInst *CallI = dyn_cast<CallInst>(CurInst))
- if (CallI->isInlineAsm())
- continue;
+ Value *predV = findLeader(P, ValNo);
+ if (!predV) {
+ predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
+ PREPred = P;
+ ++NumWithout;
+ } else if (predV == CurInst) {
+ /* CurInst dominates this predecessor. */
+ NumWithout = 2;
+ break;
+ } else {
+ predMap.push_back(std::make_pair(predV, P));
+ ++NumWith;
+ }
+ }
- uint32_t ValNo = VN.lookup(CurInst);
-
- // Look for the predecessors for PRE opportunities. We're
- // only trying to solve the basic diamond case, where
- // a value is computed in the successor and one predecessor,
- // but not the other. We also explicitly disallow cases
- // where the successor is its own predecessor, because they're
- // more complicated to get right.
- unsigned NumWith = 0;
- unsigned NumWithout = 0;
- BasicBlock *PREPred = nullptr;
- predMap.clear();
-
- for (pred_iterator PI = pred_begin(CurrentBlock),
- PE = pred_end(CurrentBlock); PI != PE; ++PI) {
- BasicBlock *P = *PI;
- // We're not interested in PRE where the block is its
- // own predecessor, or in blocks with predecessors
- // that are not reachable.
- if (P == CurrentBlock) {
- NumWithout = 2;
- break;
- } else if (!DT->isReachableFromEntry(P)) {
- NumWithout = 2;
- break;
- }
+ // Don't do PRE when it might increase code size, i.e. when
+ // we would need to insert instructions in more than one pred.
+ if (NumWithout != 1 || NumWith == 0)
+ return false;
- Value* predV = findLeader(P, ValNo);
- if (!predV) {
- predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
- PREPred = P;
- ++NumWithout;
- } else if (predV == CurInst) {
- /* CurInst dominates this predecessor. */
- NumWithout = 2;
- break;
- } else {
- predMap.push_back(std::make_pair(predV, P));
- ++NumWith;
- }
- }
+ // Don't do PRE across indirect branch.
+ if (isa<IndirectBrInst>(PREPred->getTerminator()))
+ return false;
- // Don't do PRE when it might increase code size, i.e. when
- // we would need to insert instructions in more than one pred.
- if (NumWithout != 1 || NumWith == 0)
- continue;
+ // We can't do PRE safely on a critical edge, so instead we schedule
+ // the edge to be split and perform the PRE the next time we iterate
+ // on the function.
+ unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock);
+ if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) {
+ toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum));
+ return false;
+ }
- // Don't do PRE across indirect branch.
- if (isa<IndirectBrInst>(PREPred->getTerminator()))
- continue;
+ // Instantiate the expression in the predecessor that lacked it.
+ // Because we are going top-down through the block, all value numbers
+ // will be available in the predecessor by the time we need them. Any
+ // that weren't originally present will have been instantiated earlier
+ // in this loop.
+ Instruction *PREInstr = CurInst->clone();
+ bool success = true;
+ for (unsigned i = 0, e = CurInst->getNumOperands(); i != e; ++i) {
+ Value *Op = PREInstr->getOperand(i);
+ if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
+ continue;
- // We can't do PRE safely on a critical edge, so instead we schedule
- // the edge to be split and perform the PRE the next time we iterate
- // on the function.
- unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock);
- if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) {
- toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum));
- continue;
- }
+ if (Value *V = findLeader(PREPred, VN.lookup(Op))) {
+ PREInstr->setOperand(i, V);
+ } else {
+ success = false;
+ break;
+ }
+ }
- // Instantiate the expression in the predecessor that lacked it.
- // Because we are going top-down through the block, all value numbers
- // will be available in the predecessor by the time we need them. Any
- // that weren't originally present will have been instantiated earlier
- // in this loop.
- Instruction *PREInstr = CurInst->clone();
- bool success = true;
- for (unsigned i = 0, e = CurInst->getNumOperands(); i != e; ++i) {
- Value *Op = PREInstr->getOperand(i);
- if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
- continue;
+ // Fail out if we encounter an operand that is not available in
+ // the PRE predecessor. This is typically because of loads which
+ // are not value numbered precisely.
+ if (!success) {
+ DEBUG(verifyRemoved(PREInstr));
+ delete PREInstr;
+ return false;
+ }
- if (Value *V = findLeader(PREPred, VN.lookup(Op))) {
- PREInstr->setOperand(i, V);
- } else {
- success = false;
- break;
- }
- }
+ PREInstr->insertBefore(PREPred->getTerminator());
+ PREInstr->setName(CurInst->getName() + ".pre");
+ PREInstr->setDebugLoc(CurInst->getDebugLoc());
+ VN.add(PREInstr, ValNo);
+ ++NumGVNPRE;
- // Fail out if we encounter an operand that is not available in
- // the PRE predecessor. This is typically because of loads which
- // are not value numbered precisely.
- if (!success) {
- DEBUG(verifyRemoved(PREInstr));
- delete PREInstr;
- continue;
- }
+ // Update the availability map to include the new instruction.
+ addToLeaderTable(ValNo, PREInstr, PREPred);
- PREInstr->insertBefore(PREPred->getTerminator());
- PREInstr->setName(CurInst->getName() + ".pre");
- PREInstr->setDebugLoc(CurInst->getDebugLoc());
- VN.add(PREInstr, ValNo);
- ++NumGVNPRE;
-
- // Update the availability map to include the new instruction.
- addToLeaderTable(ValNo, PREInstr, PREPred);
-
- // Create a PHI to make the value available in this block.
- PHINode* Phi = PHINode::Create(CurInst->getType(), predMap.size(),
- CurInst->getName() + ".pre-phi",
- CurrentBlock->begin());
- for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
- if (Value *V = predMap[i].first)
- Phi->addIncoming(V, predMap[i].second);
- else
- Phi->addIncoming(PREInstr, PREPred);
- }
+ // Create a PHI to make the value available in this block.
+ PHINode *Phi =
+ PHINode::Create(CurInst->getType(), predMap.size(),
+ CurInst->getName() + ".pre-phi", CurrentBlock->begin());
+ for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
+ if (Value *V = predMap[i].first)
+ Phi->addIncoming(V, predMap[i].second);
+ else
+ Phi->addIncoming(PREInstr, PREPred);
+ }
- VN.add(Phi, ValNo);
- addToLeaderTable(ValNo, Phi, CurrentBlock);
- Phi->setDebugLoc(CurInst->getDebugLoc());
- CurInst->replaceAllUsesWith(Phi);
- if (Phi->getType()->getScalarType()->isPointerTy()) {
- // Because we have added a PHI-use of the pointer value, it has now
- // "escaped" from alias analysis' perspective. We need to inform
- // AA of this.
- for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee;
- ++ii) {
- unsigned jj = PHINode::getOperandNumForIncomingValue(ii);
- VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj));
- }
+ VN.add(Phi, ValNo);
+ addToLeaderTable(ValNo, Phi, CurrentBlock);
+ Phi->setDebugLoc(CurInst->getDebugLoc());
+ CurInst->replaceAllUsesWith(Phi);
+ if (Phi->getType()->getScalarType()->isPointerTy()) {
+ // Because we have added a PHI-use of the pointer value, it has now
+ // "escaped" from alias analysis' perspective. We need to inform
+ // AA of this.
+ for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; ++ii) {
+ unsigned jj = PHINode::getOperandNumForIncomingValue(ii);
+ VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj));
+ }
- if (MD)
- MD->invalidateCachedPointerInfo(Phi);
- }
- VN.erase(CurInst);
- removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
+ if (MD)
+ MD->invalidateCachedPointerInfo(Phi);
+ }
+ VN.erase(CurInst);
+ removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
+
+ DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
+ if (MD)
+ MD->removeInstruction(CurInst);
+ DEBUG(verifyRemoved(CurInst));
+ CurInst->eraseFromParent();
+ return true;
+}
+
+/// performPRE - Perform a purely local form of PRE that looks for diamond
+/// control flow patterns and attempts to perform simple PRE at the join point.
+bool GVN::performPRE(Function &F) {
+ bool Changed = false;
+ for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) {
+ // Nothing to PRE in the entry block.
+ if (CurrentBlock == &F.getEntryBlock())
+ continue;
+
+ // Don't perform PRE on a landing pad.
+ if (CurrentBlock->isLandingPad())
+ continue;
- DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
- if (MD) MD->removeInstruction(CurInst);
- DEBUG(verifyRemoved(CurInst));
- CurInst->eraseFromParent();
- Changed = true;
+ for (BasicBlock::iterator BI = CurrentBlock->begin(),
+ BE = CurrentBlock->end();
+ BI != BE;) {
+ Instruction *CurInst = BI++;
+ Changed = performScalarPRE(CurInst);
}
}
@@ -2641,25 +2659,21 @@ bool GVN::iterateOnFunction(Function &F) {
// Top-down walk of the dominator tree
bool Changed = false;
-#if 0
- // Needed for value numbering with phi construction to work.
- ReversePostOrderTraversal<Function*> RPOT(&F);
- for (ReversePostOrderTraversal<Function*>::rpo_iterator RI = RPOT.begin(),
- RE = RPOT.end(); RI != RE; ++RI)
- Changed |= processBlock(*RI);
-#else
// Save the blocks this function have before transformation begins. GVN may
// split critical edge, and hence may invalidate the RPO/DT iterator.
//
std::vector<BasicBlock *> BBVect;
BBVect.reserve(256);
- for (DomTreeNode *x : depth_first(DT->getRootNode()))
- BBVect.push_back(x->getBlock());
+ // Needed for value numbering with phi construction to work.
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ for (ReversePostOrderTraversal<Function *>::rpo_iterator RI = RPOT.begin(),
+ RE = RPOT.end();
+ RI != RE; ++RI)
+ BBVect.push_back(*RI);
for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end();
I != E; I++)
Changed |= processBlock(*I);
-#endif
return Changed;
}
@@ -2802,7 +2816,7 @@ bool GVN::processFoldableCondBr(BranchInst *BI) {
return true;
}
-// performPRE() will trigger assert if it come across an instruciton without
+// performPRE() will trigger assert if it comes across an instruction without
// associated val-num. As it normally has far more live instructions than dead
// instructions, it makes more sense just to "fabricate" a val-number for the
// dead code than checking if instruction involved is dead or not.
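
performScalarPRE() above handles only the basic diamond case described in its comments: the value is available in the block and in one predecessor but not the other. A hedged sketch of the source-level shape being targeted (names are purely illustrative):

    // 'a + b' is computed on one path and again after the join.  PRE inserts
    // 'a + b' into the predecessor that lacked it and joins the two values
    // with a PHI, so the computation after the join becomes redundant.
    int f(bool c, int a, int b) {
      int t = 0;
      if (c)
        t = a + b;         // available in this predecessor only
      // else: PRE materializes a + b here
      return t + (a + b);  // becomes a use of the PHI after PRE
    }
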
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 9cf0ca0912f9..c01f57f26ea9 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -31,6 +31,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -69,11 +70,12 @@ static cl::opt<bool> ReduceLiveIVs("liv-reduce", cl::Hidden,
namespace {
class IndVarSimplify : public LoopPass {
- LoopInfo *LI;
- ScalarEvolution *SE;
- DominatorTree *DT;
- const DataLayout *DL;
- TargetLibraryInfo *TLI;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ DominatorTree *DT;
+ const DataLayout *DL;
+ TargetLibraryInfo *TLI;
+ const TargetTransformInfo *TTI;
SmallVector<WeakVH, 16> DeadInsts;
bool Changed;
@@ -650,7 +652,7 @@ namespace {
struct WideIVInfo {
PHINode *NarrowIV;
Type *WidestNativeType; // Widest integer type created [sz]ext
- bool IsSigned; // Was an sext user seen before a zext?
+ bool IsSigned; // Was a sext user seen before a zext?
WideIVInfo() : NarrowIV(nullptr), WidestNativeType(nullptr),
IsSigned(false) {}
@@ -661,7 +663,7 @@ namespace {
/// extended by this sign or zero extend operation. This is used to determine
/// the final width of the IV before actually widening it.
static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE,
- const DataLayout *DL) {
+ const DataLayout *DL, const TargetTransformInfo *TTI) {
bool IsSigned = Cast->getOpcode() == Instruction::SExt;
if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
return;
@@ -671,6 +673,19 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE,
if (DL && !DL->isLegalInteger(Width))
return;
+ // Cast is either an sext or zext up to this point.
+ // We should not widen an indvar if arithmetic on the wider indvar is more
+ // expensive than on the narrower indvar. We check only the cost of ADD
+ // because at least an ADD is required to increment the induction variable. We
+ // could compute more comprehensively the cost of all instructions on the
+ // induction variable when necessary.
+ if (TTI &&
+ TTI->getArithmeticInstrCost(Instruction::Add, Ty) >
+ TTI->getArithmeticInstrCost(Instruction::Add,
+ Cast->getOperand(0)->getType())) {
+ return;
+ }
+
if (!WI.WidestNativeType) {
WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
WI.IsSigned = IsSigned;
@@ -757,8 +772,13 @@ protected:
const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU);
+ const SCEV *GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
+ unsigned OpCode) const;
+
Instruction *WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter);
+ bool WidenLoopCompare(NarrowIVDefUse DU);
+
void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
};
} // anonymous namespace
@@ -833,18 +853,35 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) {
}
}
+const SCEV *WidenIV::GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
+ unsigned OpCode) const {
+ if (OpCode == Instruction::Add)
+ return SE->getAddExpr(LHS, RHS);
+ if (OpCode == Instruction::Sub)
+ return SE->getMinusSCEV(LHS, RHS);
+ if (OpCode == Instruction::Mul)
+ return SE->getMulExpr(LHS, RHS);
+
+ llvm_unreachable("Unsupported opcode.");
+}
+
/// No-wrap operations can transfer sign extension of their result to their
/// operands. Generate the SCEV value for the widened operation without
/// actually modifying the IR yet. If the expression after extending the
/// operands is an AddRec for this loop, return it.
const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) {
+
// Handle the common case of add<nsw/nuw>
- if (DU.NarrowUse->getOpcode() != Instruction::Add)
+ const unsigned OpCode = DU.NarrowUse->getOpcode();
+ // Only Add/Sub/Mul instructions supported yet.
+ if (OpCode != Instruction::Add && OpCode != Instruction::Sub &&
+ OpCode != Instruction::Mul)
return nullptr;
// One operand (NarrowDef) has already been extended to WideDef. Now determine
// if extending the other will lead to a recurrence.
- unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0;
+ const unsigned ExtendOperIdx =
+ DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0;
assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU");
const SCEV *ExtendOperExpr = nullptr;
@@ -859,13 +896,20 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) {
else
return nullptr;
- // When creating this AddExpr, don't apply the current operations NSW or NUW
+ // When creating this SCEV expr, don't apply the current operation's NSW or NUW
// flags. This instruction may be guarded by control flow that the no-wrap
// behavior depends on. Non-control-equivalent instructions can be mapped to
// the same SCEV expression, and it would be incorrect to transfer NSW/NUW
// semantics to those operations.
- const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(
- SE->getAddExpr(SE->getSCEV(DU.WideDef), ExtendOperExpr));
+ const SCEV *lhs = SE->getSCEV(DU.WideDef);
+ const SCEV *rhs = ExtendOperExpr;
+
+ // Swap the operands back to the original order for the case of non-commutative
+ // operations, such as SUB. See PR21014.
+ if (ExtendOperIdx == 0)
+ std::swap(lhs, rhs);
+ const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(GetSCEVByOpCode(lhs, rhs, OpCode));
if (!AddRec || AddRec->getLoop() != L)
return nullptr;
@@ -908,6 +952,35 @@ static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) {
DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);
}
+/// If the narrow use is a compare instruction, then widen the compare
+/// (and possibly the other operand). The extend operation is hoisted into the
+/// loop preheader as far as possible.
+bool WidenIV::WidenLoopCompare(NarrowIVDefUse DU) {
+ ICmpInst *Cmp = dyn_cast<ICmpInst>(DU.NarrowUse);
+ if (!Cmp)
+ return false;
+
+ // Sign of IV user and compare must match.
+ if (IsSigned != CmpInst::isSigned(Cmp->getPredicate()))
+ return false;
+
+ Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0);
+ unsigned CastWidth = SE->getTypeSizeInBits(Op->getType());
+ unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+ assert (CastWidth <= IVWidth && "Unexpected width while widening compare.");
+
+ // Widen the compare instruction.
+ IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT));
+ DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
+
+ // Widen the other operand of the compare, if necessary.
+ if (CastWidth < IVWidth) {
+ Value *ExtOp = getExtend(Op, WideType, IsSigned, Cmp);
+ DU.NarrowUse->replaceUsesOfWith(Op, ExtOp);
+ }
+ return true;
+}
+
/// WidenIVUse - Determine whether an individual user of the narrow IV can be
/// widened. If so, return the wide clone of the user.
Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
@@ -975,10 +1048,15 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
// Does this user itself evaluate to a recurrence after widening?
const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse);
+ if (!WideAddRec)
+ WideAddRec = GetExtendedOperandRecurrence(DU);
+
if (!WideAddRec) {
- WideAddRec = GetExtendedOperandRecurrence(DU);
- }
- if (!WideAddRec) {
+ // If the use is a loop condition, try to promote the condition instead of
+ // truncating the IV first.
+ if (WidenLoopCompare(DU))
+ return nullptr;
+
// This user does not evaluate to a recurrence after widening, so don't
// follow it. Instead insert a Trunc to kill off the original use,
// eventually isolating the original narrow IV so it can be removed.
@@ -1024,7 +1102,7 @@ void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {
Instruction *NarrowUser = cast<Instruction>(U);
// Handle data flow merges and bizarre phi cycles.
- if (!Widened.insert(NarrowUser))
+ if (!Widened.insert(NarrowUser).second)
continue;
NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUser, WideDef));
@@ -1124,14 +1202,16 @@ namespace {
class IndVarSimplifyVisitor : public IVVisitor {
ScalarEvolution *SE;
const DataLayout *DL;
+ const TargetTransformInfo *TTI;
PHINode *IVPhi;
public:
WideIVInfo WI;
IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV,
- const DataLayout *DL, const DominatorTree *DTree):
- SE(SCEV), DL(DL), IVPhi(IV) {
+ const DataLayout *DL, const TargetTransformInfo *TTI,
+ const DominatorTree *DTree)
+ : SE(SCEV), DL(DL), TTI(TTI), IVPhi(IV) {
DT = DTree;
WI.NarrowIV = IVPhi;
if (ReduceLiveIVs)
@@ -1139,7 +1219,9 @@ namespace {
}
// Implement the interface used by simplifyUsersOfIV.
- void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, DL); }
+ void visitCast(CastInst *Cast) override {
+ visitIVCast(Cast, WI, SE, DL, TTI);
+ }
};
}
@@ -1173,7 +1255,7 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L,
PHINode *CurrIV = LoopPhis.pop_back_val();
// Information about sign/zero extensions of CurrIV.
- IndVarSimplifyVisitor Visitor(CurrIV, SE, DL, DT);
+ IndVarSimplifyVisitor Visitor(CurrIV, SE, DL, TTI, DT);
Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &Visitor);
@@ -1200,9 +1282,9 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L,
/// BackedgeTakenInfo. If these expressions have not been reduced, then
/// expanding them may incur additional cost (albeit in the loop preheader).
static bool isHighCostExpansion(const SCEV *S, BranchInst *BI,
- SmallPtrSet<const SCEV*, 8> &Processed,
+ SmallPtrSetImpl<const SCEV*> &Processed,
ScalarEvolution *SE) {
- if (!Processed.insert(S))
+ if (!Processed.insert(S).second)
return false;
// If the backedge-taken count is a UDiv, it's very likely a UDiv that
@@ -1373,7 +1455,7 @@ static bool needsLFTR(Loop *L, DominatorTree *DT) {
/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
/// down to checking that all operands are constant and listing instructions
/// that may hide undef.
-static bool hasConcreteDefImpl(Value *V, SmallPtrSet<Value*, 8> &Visited,
+static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited,
unsigned Depth) {
if (isa<Constant>(V))
return !isa<UndefValue>(V);
@@ -1393,7 +1475,7 @@ static bool hasConcreteDefImpl(Value *V, SmallPtrSet<Value*, 8> &Visited,
// Optimistically handle other instructions.
for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) {
- if (!Visited.insert(*OI))
+ if (!Visited.insert(*OI).second)
continue;
if (!hasConcreteDefImpl(*OI, Visited, Depth+1))
return false;
@@ -1637,8 +1719,29 @@ LinearFunctionTestReplace(Loop *L,
// FIXME: In theory, SCEV could drop flags even though they exist in IR.
// A more robust solution would involve getting a new expression for
// CmpIndVar by applying non-NSW/NUW AddExprs.
+ auto WrappingFlags =
+ ScalarEvolution::setFlags(SCEV::FlagNUW, SCEV::FlagNSW);
+ const SCEV *IVInit = IncrementedIndvarSCEV->getStart();
+ if (SE->getTypeSizeInBits(IVInit->getType()) >
+ SE->getTypeSizeInBits(IVCount->getType()))
+ IVInit = SE->getTruncateExpr(IVInit, IVCount->getType());
+ unsigned BitWidth = SE->getTypeSizeInBits(IVCount->getType());
+ Type *WideTy = IntegerType::get(SE->getContext(), BitWidth + 1);
+ // Check whether IVInit + BECount + 1 requires sign/zero extension.
+ // If not, clear the corresponding flag from WrappingFlags because those flags
+ // are not needed in the IncrementedIndvarSCEV expression.
+ if (SE->getSignExtendExpr(SE->getAddExpr(IVInit, BackedgeTakenCount),
+ WideTy) ==
+ SE->getAddExpr(SE->getSignExtendExpr(IVInit, WideTy),
+ SE->getSignExtendExpr(BackedgeTakenCount, WideTy)))
+ WrappingFlags = ScalarEvolution::clearFlags(WrappingFlags, SCEV::FlagNSW);
+ if (SE->getZeroExtendExpr(SE->getAddExpr(IVInit, BackedgeTakenCount),
+ WideTy) ==
+ SE->getAddExpr(SE->getZeroExtendExpr(IVInit, WideTy),
+ SE->getZeroExtendExpr(BackedgeTakenCount, WideTy)))
+ WrappingFlags = ScalarEvolution::clearFlags(WrappingFlags, SCEV::FlagNUW);
if (!ScalarEvolution::maskFlags(IncrementedIndvarSCEV->getNoWrapFlags(),
- SCEV::FlagNUW | SCEV::FlagNSW)) {
+ WrappingFlags)) {
// Add one to the "backedge-taken" count to get the trip count.
// This addition may overflow, which is valid as long as the comparison is
// truncated to BackedgeTakenCount->getType().
@@ -1832,6 +1935,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
DL = DLP ? &DLP->getDataLayout() : nullptr;
TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+ TTI = getAnalysisIfAvailable<TargetTransformInfo>();
DeadInsts.clear();
Changed = false;
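
Note: the widening gate added above compares only the cost of an ADD on the narrow and wide types. A minimal sketch of that check in isolation, assuming the LLVM headers of this era; wideAddIsAffordable is a name invented here for illustration, not part of the patch:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Return false when arithmetic on the proposed wide IV type is costlier than
// on the current narrow type, mirroring the ADD-only check in visitIVCast.
static bool wideAddIsAffordable(const TargetTransformInfo *TTI, Type *WideTy,
                                Type *NarrowTy) {
  if (!TTI)
    return true; // No cost model available; keep the pre-patch behavior.
  return TTI->getArithmeticInstrCost(Instruction::Add, WideTy) <=
         TTI->getArithmeticInstrCost(Instruction::Add, NarrowTy);
}
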
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 21f80385cf46..78beb3f98dcd 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -44,7 +45,7 @@ STATISTIC(NumFolds, "Number of terminators folded");
STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi");
static cl::opt<unsigned>
-Threshold("jump-threading-threshold",
+BBDuplicateThreshold("jump-threading-threshold",
cl::desc("Max block size to duplicate for jump threading"),
cl::init(6), cl::Hidden);
@@ -87,6 +88,8 @@ namespace {
#endif
DenseSet<std::pair<Value*, BasicBlock*> > RecursionSet;
+ unsigned BBDupThreshold;
+
// RAII helper for updating the recursion stack.
struct RecursionSetRemover {
DenseSet<std::pair<Value*, BasicBlock*> > &TheSet;
@@ -102,7 +105,8 @@ namespace {
};
public:
static char ID; // Pass identification
- JumpThreading() : FunctionPass(ID) {
+ JumpThreading(int T = -1) : FunctionPass(ID) {
+ BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
}
@@ -123,9 +127,11 @@ namespace {
bool ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,
PredValueInfo &Result,
- ConstantPreference Preference);
+ ConstantPreference Preference,
+ Instruction *CxtI = nullptr);
bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
- ConstantPreference Preference);
+ ConstantPreference Preference,
+ Instruction *CxtI = nullptr);
bool ProcessBranchOnPHI(PHINode *PN);
bool ProcessBranchOnXOR(BinaryOperator *BO);
@@ -144,7 +150,7 @@ INITIALIZE_PASS_END(JumpThreading, "jump-threading",
"Jump Threading", false, false)
// Public interface to the Jump Threading pass
-FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); }
+FunctionPass *llvm::createJumpThreadingPass(int Threshold) { return new JumpThreading(Threshold); }
/// runOnFunction - Top level algorithm.
///
@@ -182,7 +188,7 @@ bool JumpThreading::runOnFunction(Function &F) {
// If the block is trivially dead, zap it. This eliminates the successor
// edges which simplifies the CFG.
- if (pred_begin(BB) == pred_end(BB) &&
+ if (pred_empty(BB) &&
BB != &BB->getParent()->getEntryBlock()) {
DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName()
<< "' with terminator: " << *BB->getTerminator() << '\n');
@@ -339,7 +345,8 @@ static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
///
bool JumpThreading::
ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
- ConstantPreference Preference) {
+ ConstantPreference Preference,
+ Instruction *CxtI) {
// This method walks up use-def chains recursively. Because of this, we could
// get into an infinite loop going around loops in the use-def chain. To
// prevent this, keep track of what (value, block) pairs we've already visited
@@ -381,7 +388,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
BasicBlock *P = *PI;
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
- Constant *PredCst = LVI->getConstantOnEdge(V, P, BB);
+ Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI);
if (Constant *KC = getKnownConstant(PredCst, Preference))
Result.push_back(std::make_pair(KC, P));
}
@@ -397,7 +404,8 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
} else {
Constant *CI = LVI->getConstantOnEdge(InVal,
- PN->getIncomingBlock(i), BB);
+ PN->getIncomingBlock(i),
+ BB, CxtI);
if (Constant *KC = getKnownConstant(CI, Preference))
Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
}
@@ -416,9 +424,9 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
if (I->getOpcode() == Instruction::Or ||
I->getOpcode() == Instruction::And) {
ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
- WantInteger);
+ WantInteger, CxtI);
ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals,
- WantInteger);
+ WantInteger, CxtI);
if (LHSVals.empty() && RHSVals.empty())
return false;
@@ -459,7 +467,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
isa<ConstantInt>(I->getOperand(1)) &&
cast<ConstantInt>(I->getOperand(1))->isOne()) {
ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result,
- WantInteger);
+ WantInteger, CxtI);
if (Result.empty())
return false;
@@ -477,7 +485,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
PredValueInfoTy LHSVals;
ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals,
- WantInteger);
+ WantInteger, CxtI);
// Try to use constant folding to simplify the binary operator.
for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) {
@@ -511,7 +519,8 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
LazyValueInfo::Tristate
ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS,
- cast<Constant>(RHS), PredBB, BB);
+ cast<Constant>(RHS), PredBB, BB,
+ CxtI ? CxtI : Cmp);
if (ResT == LazyValueInfo::Unknown)
continue;
Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
@@ -524,7 +533,6 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
return !Result.empty();
}
-
// If comparing a live-in value against a constant, see if we know the
// live-in value on any predecessors.
if (isa<Constant>(Cmp->getOperand(1)) && Cmp->getType()->isIntegerTy()) {
@@ -538,7 +546,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
// predecessor, use that information to try to thread this block.
LazyValueInfo::Tristate Res =
LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0),
- RHSCst, P, BB);
+ RHSCst, P, BB, CxtI ? CxtI : Cmp);
if (Res == LazyValueInfo::Unknown)
continue;
@@ -554,7 +562,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
if (Constant *CmpConst = dyn_cast<Constant>(Cmp->getOperand(1))) {
PredValueInfoTy LHSVals;
ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
- WantInteger);
+ WantInteger, CxtI);
for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) {
Constant *V = LHSVals[i].first;
@@ -577,7 +585,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
PredValueInfoTy Conds;
if ((TrueVal || FalseVal) &&
ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds,
- WantInteger)) {
+ WantInteger, CxtI)) {
for (unsigned i = 0, e = Conds.size(); i != e; ++i) {
Constant *Cond = Conds[i].first;
@@ -604,7 +612,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
}
// If all else fails, see if LVI can figure out a constant value for us.
- Constant *CI = LVI->getConstant(V, BB);
+ Constant *CI = LVI->getConstant(V, BB, CxtI);
if (Constant *KC = getKnownConstant(CI, Preference)) {
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
Result.push_back(std::make_pair(KC, *PI));
@@ -654,7 +662,7 @@ static bool hasAddressTakenAndUsed(BasicBlock *BB) {
bool JumpThreading::ProcessBlock(BasicBlock *BB) {
// If the block is trivially dead, just return and let the caller nuke it.
// This simplifies other transformations.
- if (pred_begin(BB) == pred_end(BB) &&
+ if (pred_empty(BB) &&
BB != &BB->getParent()->getEntryBlock())
return false;
@@ -744,7 +752,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
// All the rest of our checks depend on the condition being an instruction.
if (!CondInst) {
// FIXME: Unify this with code below.
- if (ProcessThreadableEdges(Condition, BB, Preference))
+ if (ProcessThreadableEdges(Condition, BB, Preference, Terminator))
return true;
return false;
}
@@ -766,13 +774,14 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
// FIXME: We could handle mixed true/false by duplicating code.
LazyValueInfo::Tristate Baseline =
LVI->getPredicateOnEdge(CondCmp->getPredicate(), CondCmp->getOperand(0),
- CondConst, *PI, BB);
+ CondConst, *PI, BB, CondCmp);
if (Baseline != LazyValueInfo::Unknown) {
// Check that all remaining incoming values match the first one.
while (++PI != PE) {
LazyValueInfo::Tristate Ret =
LVI->getPredicateOnEdge(CondCmp->getPredicate(),
- CondCmp->getOperand(0), CondConst, *PI, BB);
+ CondCmp->getOperand(0), CondConst, *PI, BB,
+ CondCmp);
if (Ret != Baseline) break;
}
@@ -787,6 +796,21 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
}
}
+ } else if (CondBr && CondConst && CondBr->isConditional()) {
+ // There might be an invariant in the same block as the conditional
+ // that can determine the predicate.
+
+ LazyValueInfo::Tristate Ret =
+ LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
+ CondConst, CondCmp);
+ if (Ret != LazyValueInfo::Unknown) {
+ unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
+ unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
+ CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true);
+ BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
+ CondBr->eraseFromParent();
+ return true;
+ }
}
if (CondBr && CondConst && TryToUnfoldSelect(CondCmp, BB))
@@ -814,7 +838,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
// a PHI node in the current block. If we can prove that any predecessors
// compute a predictable value based on a PHI node, thread those predecessors.
//
- if (ProcessThreadableEdges(CondInst, BB, Preference))
+ if (ProcessThreadableEdges(CondInst, BB, Preference, Terminator))
return true;
// If this is an otherwise-unfoldable branch on a phi node in the current
@@ -877,6 +901,9 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// If the returned value is the load itself, replace with an undef. This can
// only happen in dead loops.
if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
+ if (AvailableVal->getType() != LI->getType())
+ AvailableVal =
+ CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI);
LI->replaceAllUsesWith(AvailableVal);
LI->eraseFromParent();
return true;
@@ -888,9 +915,10 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (BBIt != LoadBB->begin())
return false;
- // If all of the loads and stores that feed the value have the same TBAA tag,
- // then we can propagate it onto any newly inserted loads.
- MDNode *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa);
+ // If all of the loads and stores that feed the value have the same AA tags,
+ // then we can propagate them onto any newly inserted loads.
+ AAMDNodes AATags;
+ LI->getAAMetadata(AATags);
SmallPtrSet<BasicBlock*, 8> PredsScanned;
typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy;
@@ -904,21 +932,21 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
BasicBlock *PredBB = *PI;
// If we already scanned this predecessor, skip it.
- if (!PredsScanned.insert(PredBB))
+ if (!PredsScanned.insert(PredBB).second)
continue;
// Scan the predecessor to see if the value is available in the pred.
BBIt = PredBB->end();
- MDNode *ThisTBAATag = nullptr;
+ AAMDNodes ThisAATags;
Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6,
- nullptr, &ThisTBAATag);
+ nullptr, &ThisAATags);
if (!PredAvailable) {
OneUnavailablePred = PredBB;
continue;
}
- // If tbaa tags disagree or are not present, forget about them.
- if (TBAATag != ThisTBAATag) TBAATag = nullptr;
+ // If AA tags disagree or are not present, forget about them.
+ if (AATags != ThisAATags) AATags = AAMDNodes();
// If so, this load is partially redundant. Remember this info so that we
// can create a PHI node.
@@ -978,8 +1006,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
LI->getAlignment(),
UnavailablePred->getTerminator());
NewVal->setDebugLoc(LI->getDebugLoc());
- if (TBAATag)
- NewVal->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+ if (AATags)
+ NewVal->setAAMetadata(AATags);
AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
}
@@ -1006,7 +1034,16 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
assert(I != AvailablePreds.end() && I->first == P &&
"Didn't find entry for predecessor!");
- PN->addIncoming(I->second, I->first);
+ // If we have an available predecessor but it requires casting, insert the
+ // cast in the predecessor and use the cast. Note that we have to update the
+ // AvailablePreds vector as we go so that all of the PHI entries for this
+ // predecessor use the same bitcast.
+ Value *&PredV = I->second;
+ if (PredV->getType() != LI->getType())
+ PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "",
+ P->getTerminator());
+
+ PN->addIncoming(PredV, I->first);
}
//cerr << "PRE: " << *LI << *PN << "\n";
@@ -1081,14 +1118,15 @@ FindMostPopularDest(BasicBlock *BB,
}
bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
- ConstantPreference Preference) {
+ ConstantPreference Preference,
+ Instruction *CxtI) {
// If threading this would thread across a loop header, don't even try to
// thread the edge.
if (LoopHeaders.count(BB))
return false;
PredValueInfoTy PredValues;
- if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference))
+ if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference, CxtI))
return false;
assert(!PredValues.empty() &&
@@ -1113,7 +1151,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
for (unsigned i = 0, e = PredValues.size(); i != e; ++i) {
BasicBlock *Pred = PredValues[i].second;
- if (!SeenPreds.insert(Pred))
+ if (!SeenPreds.insert(Pred).second)
continue; // Duplicate predecessor entry.
// If the predecessor ends with an indirect goto, we can't change its
@@ -1253,10 +1291,10 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) {
PredValueInfoTy XorOpValues;
bool isLHS = true;
if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues,
- WantInteger)) {
+ WantInteger, BO)) {
assert(XorOpValues.empty());
if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues,
- WantInteger))
+ WantInteger, BO))
return false;
isLHS = false;
}
@@ -1366,8 +1404,8 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
return false;
}
- unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, Threshold);
- if (JumpThreadCost > Threshold) {
+ unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+ if (JumpThreadCost > BBDupThreshold) {
DEBUG(dbgs() << " Not threading BB '" << BB->getName()
<< "' - Cost is too high: " << JumpThreadCost << "\n");
return false;
@@ -1509,8 +1547,8 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
return false;
}
- unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, Threshold);
- if (DuplicationCost > Threshold) {
+ unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+ if (DuplicationCost > BBDupThreshold) {
DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
<< "' - Cost is too high: " << DuplicationCost << "\n");
return false;
@@ -1672,10 +1710,10 @@ bool JumpThreading::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
// cases will be threaded in any case.
LazyValueInfo::Tristate LHSFolds =
LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
- CondRHS, Pred, BB);
+ CondRHS, Pred, BB, CondCmp);
LazyValueInfo::Tristate RHSFolds =
LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2),
- CondRHS, Pred, BB);
+ CondRHS, Pred, BB, CondCmp);
if ((LHSFolds != LazyValueInfo::Unknown ||
RHSFolds != LazyValueInfo::Unknown) &&
LHSFolds != RHSFolds) {
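
Note: createJumpThreadingPass now takes an optional threshold, so embedders can size the duplication budget per pipeline instead of relying on the -jump-threading-threshold flag alone. A small usage sketch, assuming the legacy pass manager of this era; the value 20 is arbitrary and only for illustration:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;

static void addTunedJumpThreading(legacy::PassManager &PM) {
  // Passing -1 keeps the cl::opt default (6); any other value overrides it.
  PM.add(createJumpThreadingPass(/*Threshold=*/20));
}
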
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index abcceb20050a..e145981846d9 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -120,6 +120,7 @@ namespace {
bool MayThrow; // The current loop contains an instruction which
// may throw, thus preventing code motion of
// instructions with side effects.
+ bool HeaderMayThrow; // Same as previous, but specific to loop header
DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap;
/// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
@@ -130,6 +131,9 @@ namespace {
/// set.
void deleteAnalysisValue(Value *V, Loop *L) override;
+ /// Simple Analysis hook. Delete loop L from alias set map.
+ void deleteAnalysisLoop(Loop *L) override;
+
/// SinkRegion - Walk the specified region of the CFG (defined by all blocks
/// dominated by the specified block, and that are in the current loop) in
/// reverse depth first order w.r.t the DominatorTree. This allows us to
@@ -180,9 +184,9 @@ namespace {
/// store into the memory location pointed to by V.
///
bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
- const MDNode *TBAAInfo) {
+ const AAMDNodes &AAInfo) {
// Check to see if any of the basic blocks in CurLoop invalidate *V.
- return CurAST->getAliasSetForPointer(V, Size, TBAAInfo).isMod();
+ return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod();
}
bool canSinkOrHoistInst(Instruction &I);
@@ -270,7 +274,12 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
CurAST->add(*BB); // Incorporate the specified basic block
}
- MayThrow = false;
+ HeaderMayThrow = false;
+ BasicBlock *Header = L->getHeader();
+ for (BasicBlock::iterator I = Header->begin(), E = Header->end();
+ (I != E) && !HeaderMayThrow; ++I)
+ HeaderMayThrow |= I->mayThrow();
+ MayThrow = HeaderMayThrow;
// TODO: We've already searched for instructions which may throw in subloops.
// We may want to reuse this information.
for (Loop::block_iterator BB = L->block_begin(), BBE = L->block_end();
@@ -313,7 +322,8 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
// SSAUpdater strategy during promotion that was LCSSA aware and reformed
// it as it went.
if (Changed)
- formLCSSARecursively(*L, *DT, getAnalysisIfAvailable<ScalarEvolution>());
+ formLCSSARecursively(*L, *DT, LI,
+ getAnalysisIfAvailable<ScalarEvolution>());
}
// Check that neither this loop nor its parent have had LCSSA broken. LICM is
@@ -441,15 +451,18 @@ bool LICM::canSinkOrHoistInst(Instruction &I) {
// in the same alias set as something that ends up being modified.
if (AA->pointsToConstantMemory(LI->getOperand(0)))
return true;
- if (LI->getMetadata("invariant.load"))
+ if (LI->getMetadata(LLVMContext::MD_invariant_load))
return true;
// Don't hoist loads which have may-aliased stores in loop.
uint64_t Size = 0;
if (LI->getType()->isSized())
Size = AA->getTypeStoreSize(LI->getType());
- return !pointerInvalidatedByLoop(LI->getOperand(0), Size,
- LI->getMetadata(LLVMContext::MD_tbaa));
+
+ AAMDNodes AAInfo;
+ LI->getAAMetadata(AAInfo);
+
+ return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo);
} else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
// Don't sink or hoist dbg info; it's legal, but not useful.
if (isa<DbgInfoIntrinsic>(I))
@@ -594,8 +607,13 @@ void LICM::sink(Instruction &I) {
// PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
// the instruction.
while (!I.use_empty()) {
+ Instruction *User = I.user_back();
+ if (!DT->isReachableFromEntry(User->getParent())) {
+ User->replaceUsesOfWith(&I, UndefValue::get(I.getType()));
+ continue;
+ }
// The user must be a PHI node.
- PHINode *PN = cast<PHINode>(I.user_back());
+ PHINode *PN = cast<PHINode>(User);
BasicBlock *ExitBlock = PN->getParent();
assert(ExitBlockSet.count(ExitBlock) &&
@@ -647,12 +665,7 @@ bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) {
bool LICM::isGuaranteedToExecute(Instruction &Inst) {
- // Somewhere in this loop there is an instruction which may throw and make us
- // exit the loop.
- if (MayThrow)
- return false;
-
- // Otherwise we have to check to make sure that the instruction dominates all
+ // We have to check to make sure that the instruction dominates all
// of the exit blocks. If it doesn't, then there is a path out of the loop
// which does not execute this instruction, so we can't hoist it.
@@ -660,7 +673,14 @@ bool LICM::isGuaranteedToExecute(Instruction &Inst) {
// common), it is always guaranteed to dominate the exit blocks. Since this
// is a common case, and can save some work, check it now.
if (Inst.getParent() == CurLoop->getHeader())
- return true;
+ // If there's a throw in the header block, we can't guarantee we'll reach
+ // Inst.
+ return !HeaderMayThrow;
+
+ // Somewhere in this loop there is an instruction which may throw and make us
+ // exit the loop.
+ if (MayThrow)
+ return false;
// Get the exit blocks for the current loop.
SmallVector<BasicBlock*, 8> ExitBlocks;
@@ -682,7 +702,7 @@ bool LICM::isGuaranteedToExecute(Instruction &Inst) {
namespace {
class LoopPromoter : public LoadAndStorePromoter {
Value *SomePtr; // Designated pointer to store to.
- SmallPtrSet<Value*, 4> &PointerMustAliases;
+ SmallPtrSetImpl<Value*> &PointerMustAliases;
SmallVectorImpl<BasicBlock*> &LoopExitBlocks;
SmallVectorImpl<Instruction*> &LoopInsertPts;
PredIteratorCache &PredCache;
@@ -690,7 +710,7 @@ namespace {
LoopInfo &LI;
DebugLoc DL;
int Alignment;
- MDNode *TBAATag;
+ AAMDNodes AATags;
Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
if (Instruction *I = dyn_cast<Instruction>(V))
@@ -710,14 +730,14 @@ namespace {
public:
LoopPromoter(Value *SP, const SmallVectorImpl<Instruction *> &Insts,
- SSAUpdater &S, SmallPtrSet<Value *, 4> &PMA,
+ SSAUpdater &S, SmallPtrSetImpl<Value *> &PMA,
SmallVectorImpl<BasicBlock *> &LEB,
SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
- MDNode *TBAATag)
+ const AAMDNodes &AATags)
: LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
- LI(li), DL(dl), Alignment(alignment), TBAATag(TBAATag) {}
+ LI(li), DL(dl), Alignment(alignment), AATags(AATags) {}
bool isInstInList(Instruction *I,
const SmallVectorImpl<Instruction*> &) const override {
@@ -743,7 +763,7 @@ namespace {
StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
NewSI->setAlignment(Alignment);
NewSI->setDebugLoc(DL);
- if (TBAATag) NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+ if (AATags) NewSI->setAAMetadata(AATags);
}
}
@@ -798,11 +818,12 @@ void LICM::PromoteAliasSet(AliasSet &AS,
// We start with an alignment of one and try to find instructions that allow
// us to prove better alignment.
unsigned Alignment = 1;
- MDNode *TBAATag = nullptr;
+ AAMDNodes AATags;
+ bool HasDedicatedExits = CurLoop->hasDedicatedExits();
// Check that all of the pointers in the alias set have the same type. We
// cannot (yet) promote a memory location that is loaded and stored in
- // different sizes. While we are at it, collect alignment and TBAA info.
+ // different sizes. While we are at it, collect alignment and AA info.
for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) {
Value *ASIV = ASI->getValue();
PointerMustAliases.insert(ASIV);
@@ -833,6 +854,13 @@ void LICM::PromoteAliasSet(AliasSet &AS,
assert(!store->isVolatile() && "AST broken");
if (!store->isSimple())
return;
+ // Don't sink stores from loops without dedicated block exits. Exits
+ // containing indirect branches are not transformed by loop simplify,
+ // so make sure we catch that. An additional load may be generated in
+ // the preheader for the SSA updater, so also avoid sinking when no
+ // preheader is available.
+ if (!HasDedicatedExits || !Preheader)
+ return;
// Note that we only check GuaranteedToExecute inside the store case
// so that we do not introduce stores where they did not exist before
@@ -855,13 +883,12 @@ void LICM::PromoteAliasSet(AliasSet &AS,
} else
return; // Not a load or store.
- // Merge the TBAA tags.
+ // Merge the AA tags.
if (LoopUses.empty()) {
- // On the first load/store, just take its TBAA tag.
- TBAATag = UI->getMetadata(LLVMContext::MD_tbaa);
- } else if (TBAATag) {
- TBAATag = MDNode::getMostGenericTBAA(TBAATag,
- UI->getMetadata(LLVMContext::MD_tbaa));
+ // On the first load/store, just take its AA tags.
+ UI->getAAMetadata(AATags);
+ } else if (AATags) {
+ UI->getAAMetadata(AATags, /* Merge = */ true);
}
LoopUses.push_back(UI);
@@ -896,7 +923,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
SmallVector<PHINode*, 16> NewPHIs;
SSAUpdater SSA(&NewPHIs);
LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
- InsertPts, PIC, *CurAST, *LI, DL, Alignment, TBAATag);
+ InsertPts, PIC, *CurAST, *LI, DL, Alignment, AATags);
// Set up the preheader to have a definition of the value. It is the live-out
// value from the preheader that uses in the loop will use.
@@ -905,7 +932,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
Preheader->getTerminator());
PreheaderLoad->setAlignment(Alignment);
PreheaderLoad->setDebugLoc(DL);
- if (TBAATag) PreheaderLoad->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+ if (AATags) PreheaderLoad->setAAMetadata(AATags);
SSA.AddAvailableValue(Preheader, PreheaderLoad);
// Rewrite all the loads in the loop and remember all the definitions from
@@ -936,3 +963,13 @@ void LICM::deleteAnalysisValue(Value *V, Loop *L) {
AST->deleteValue(V);
}
+
+/// Simple Analysis hook. Delete loop L from the alias set map.
+void LICM::deleteAnalysisLoop(Loop *L) {
+ AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
+ if (!AST)
+ return;
+
+ delete AST;
+ LoopToAliasSetMap.erase(L);
+}
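
Note: LICM's promotion path now carries full AA metadata (TBAA plus scope/noalias tags) and intersects it across all promoted accesses. A condensed sketch of that merge pattern as a free-standing helper; mergeAATags is a name invented here, the pass does this inline:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

static AAMDNodes mergeAATags(ArrayRef<Instruction *> MemAccesses) {
  AAMDNodes AATags;
  bool First = true;
  for (Instruction *I : MemAccesses) {
    if (First) {
      I->getAAMetadata(AATags);                     // take the first access's tags
      First = false;
    } else if (AATags) {
      I->getAAMetadata(AATags, /* Merge = */ true); // intersect with later ones
    }
  }
  return AATags; // May end up empty if tags are missing or incompatible.
}
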
diff --git a/lib/Transforms/Scalar/LLVMBuild.txt b/lib/Transforms/Scalar/LLVMBuild.txt
index 1f6df7dac7ff..2bb49a3026c9 100644
--- a/lib/Transforms/Scalar/LLVMBuild.txt
+++ b/lib/Transforms/Scalar/LLVMBuild.txt
@@ -20,4 +20,4 @@ type = Library
name = Scalar
parent = Transforms
library_name = ScalarOpts
-required_libraries = Analysis Core IPA InstCombine Support Target TransformUtils
+required_libraries = Analysis Core InstCombine ProfileData Support Target TransformUtils
diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp
index 846aa703c9c3..11e4d7606d96 100644
--- a/lib/Transforms/Scalar/LoadCombine.cpp
+++ b/lib/Transforms/Scalar/LoadCombine.cpp
@@ -15,6 +15,8 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Pass.h"
#include "llvm/IR/DataLayout.h"
@@ -51,13 +53,16 @@ struct LoadPOPPair {
class LoadCombine : public BasicBlockPass {
LLVMContext *C;
const DataLayout *DL;
+ AliasAnalysis *AA;
public:
LoadCombine()
: BasicBlockPass(ID),
- C(nullptr), DL(nullptr) {
+ C(nullptr), DL(nullptr), AA(nullptr) {
initializeSROAPass(*PassRegistry::getPassRegistry());
}
+
+ using llvm::Pass::doInitialization;
bool doInitialization(Function &) override;
bool runOnBasicBlock(BasicBlock &BB) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
@@ -223,19 +228,23 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
if (skipOptnoneFunction(BB) || !DL)
return false;
+ AA = &getAnalysis<AliasAnalysis>();
+
IRBuilder<true, TargetFolder>
TheBuilder(BB.getContext(), TargetFolder(DL));
Builder = &TheBuilder;
DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap;
+ AliasSetTracker AST(*AA);
bool Combined = false;
unsigned Index = 0;
for (auto &I : BB) {
- if (I.mayWriteToMemory() || I.mayThrow()) {
+ if (I.mayThrow() || (I.mayWriteToMemory() && AST.containsUnknown(&I))) {
if (combineLoads(LoadMap))
Combined = true;
LoadMap.clear();
+ AST.clear();
continue;
}
LoadInst *LI = dyn_cast<LoadInst>(&I);
@@ -248,6 +257,7 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
if (!POP.Pointer)
continue;
LoadMap[POP.Pointer].push_back(LoadPOPPair(LI, POP, Index++));
+ AST.add(LI);
}
if (combineLoads(LoadMap))
Combined = true;
@@ -256,6 +266,9 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
+
+ AU.addRequired<AliasAnalysis>();
+ AU.addPreserved<AliasAnalysis>();
}
char LoadCombine::ID = 0;
@@ -264,5 +277,9 @@ BasicBlockPass *llvm::createLoadCombinePass() {
return new LoadCombine();
}
-INITIALIZE_PASS(LoadCombine, "load-combine", "Combine Adjacent Loads", false,
- false)
+INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", "Combine Adjacent Loads",
+ false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(LoadCombine, "load-combine", "Combine Adjacent Loads",
+ false, false)
+
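
Note: with the AliasSetTracker in place, LoadCombine only drops its collected candidates when a write may actually alias one of the tracked loads (or an instruction may throw). A stripped-down sketch of that scan, with the actual combining left out:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void collectCombinableLoads(BasicBlock &BB, AliasAnalysis &AA,
                                   SmallVectorImpl<LoadInst *> &Candidates) {
  AliasSetTracker AST(AA);
  for (Instruction &I : BB) {
    // Throwing instructions, or writes that may alias a tracked load,
    // invalidate everything collected so far.
    if (I.mayThrow() || (I.mayWriteToMemory() && AST.containsUnknown(&I))) {
      Candidates.clear();
      AST.clear();
      continue;
    }
    if (auto *LI = dyn_cast<LoadInst>(&I))
      if (LI->isSimple()) {
        Candidates.push_back(LI);
        AST.add(LI);
      }
  }
}
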
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 5ab686aa831a..1d1f33ae6183 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -239,9 +239,8 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) {
LoopInfo &loopInfo = getAnalysis<LoopInfo>();
SmallPtrSet<BasicBlock*, 8> blocks;
blocks.insert(L->block_begin(), L->block_end());
- for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(),
- E = blocks.end(); I != E; ++I)
- loopInfo.removeBlock(*I);
+ for (BasicBlock *BB : blocks)
+ loopInfo.removeBlock(BB);
// The last step is to inform the loop pass manager that we've
// eliminated this loop.
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index ab1a9393c526..1ac38e0f52a5 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -14,6 +14,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
@@ -41,6 +42,7 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<LoopInfo>();
AU.addRequiredID(LoopSimplifyID);
AU.addPreservedID(LoopSimplifyID);
@@ -54,6 +56,7 @@ namespace {
char LoopInstSimplify::ID = 0;
INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify",
"Simplify instructions in loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
@@ -76,6 +79,8 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
SmallVector<BasicBlock*, 8> ExitBlocks;
L->getUniqueExitBlocks(ExitBlocks);
@@ -116,7 +121,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
// Don't bother simplifying unused instructions.
if (!I->use_empty()) {
- Value *V = SimplifyInstruction(I, DL, TLI, DT);
+ Value *V = SimplifyInstruction(I, DL, TLI, DT, &AC);
if (V && LI->replacementPreservesLCSSAForm(I, V)) {
// Mark all uses for resimplification next time round the loop.
for (User *U : I->users())
@@ -148,7 +153,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE;
++SI) {
BasicBlock *SuccBB = *SI;
- if (!Visited.insert(SuccBB))
+ if (!Visited.insert(SuccBB).second)
continue;
const Loop *SuccLoop = LI->getLoopFor(SuccBB);
@@ -161,7 +166,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) {
BasicBlock *ExitBB = SubLoopExitBlocks[i];
- if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB))
+ if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB).second)
VisitStack.push_back(WorklistItem(ExitBB, false));
}
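
Note: this is the same AssumptionCacheTracker plumbing that LoopRotation and LoopUnroll pick up below: require the tracker, fetch the per-function cache, and forward it to the analysis calls. A bare-bones LoopPass skeleton showing just that plumbing; SimplifySketch is a made-up name and the registration boilerplate is omitted:

#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopPass.h"
using namespace llvm;

namespace {
struct SimplifySketch : public LoopPass {
  static char ID;
  SimplifySketch() : LoopPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>(); // make the cache available
  }

  bool runOnLoop(Loop *L, LPPassManager &) override {
    Function &F = *L->getHeader()->getParent();
    AssumptionCache &AC =
        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    // AC would now be forwarded, e.g. SimplifyInstruction(I, DL, TLI, DT, &AC).
    (void)AC;
    return false;
  }
};
char SimplifySketch::ID = 0;
}
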
diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp
index b6fbb16166dd..8f122041c248 100644
--- a/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -215,9 +215,7 @@ protected:
typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector;
// Add a new possible reduction.
- void addSLR(SimpleLoopReduction &SLR) {
- PossibleReds.push_back(SLR);
- }
+ void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); }
// Setup to track possible reductions corresponding to the provided
// rerolling scale. Only reductions with a number of non-PHI instructions
@@ -225,7 +223,8 @@ protected:
// are filled in:
// - A set of all possible instructions in eligible reductions.
// - A set of all PHIs in eligible reductions
- // - A set of all reduced values (last instructions) in eligible reductions.
+ // - A set of all reduced values (last instructions) in eligible
+ // reductions.
void restrictToScale(uint64_t Scale,
SmallInstructionSet &PossibleRedSet,
SmallInstructionSet &PossibleRedPHISet,
@@ -238,13 +237,12 @@ protected:
if (PossibleReds[i].size() % Scale == 0) {
PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
PossibleRedPHISet.insert(PossibleReds[i].getPHI());
-
+
PossibleRedSet.insert(PossibleReds[i].getPHI());
PossibleRedIdx[PossibleReds[i].getPHI()] = i;
- for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(),
- JE = PossibleReds[i].end(); J != JE; ++J) {
- PossibleRedSet.insert(*J);
- PossibleRedIdx[*J] = i;
+ for (Instruction *J : PossibleReds[i]) {
+ PossibleRedSet.insert(J);
+ PossibleRedIdx[J] = i;
}
}
}
@@ -487,7 +485,7 @@ void LoopReroll::collectInLoopUserSet(Loop *L,
if (PN->getIncomingBlock(U) == L->getHeader())
continue;
}
-
+
if (L->contains(User) && !Exclude.count(User)) {
Queue.push_back(User);
}
@@ -659,16 +657,15 @@ bool LoopReroll::ReductionTracker::validateSelected() {
RI != RIE; ++RI) {
int i = *RI;
int PrevIter = 0, BaseCount = 0, Count = 0;
- for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(),
- JE = PossibleReds[i].end(); J != JE; ++J) {
- // Note that all instructions in the chain must have been found because
- // all instructions in the function must have been assigned to some
- // iteration.
- int Iter = PossibleRedIter[*J];
+ for (Instruction *J : PossibleReds[i]) {
+ // Note that all instructions in the chain must have been found because
+ // all instructions in the function must have been assigned to some
+ // iteration.
+ int Iter = PossibleRedIter[J];
if (Iter != PrevIter && Iter != PrevIter + 1 &&
!PossibleReds[i].getReducedValue()->isAssociative()) {
DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
- *J << "\n");
+ *J << "\n");
return false;
}
@@ -881,7 +878,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
// needed because otherwise isSafeToSpeculativelyExecute returns
// false on PHI nodes.
if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL))
- FutureSideEffects = true;
+ FutureSideEffects = true;
}
++J2;
@@ -952,9 +949,9 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) {
Value *Op2 = J2->getOperand(j);
- // If this is part of a reduction (and the operation is not
- // associatve), then we match all operands, but not those that are
- // part of the reduction.
+ // If this is part of a reduction (and the operation is not
+ // associative), then we match all operands, but not those that are
+ // part of the reduction.
if (InReduction)
if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
if (Reductions.isPairInSame(J2, Op2I))
@@ -968,11 +965,11 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
Op2 = IV;
if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
- // If we've not already decided to swap the matched operands, and
- // we've not already matched our first operand (note that we could
- // have skipped matching the first operand because it is part of a
- // reduction above), and the instruction is commutative, then try
- // the swapped match.
+ // If we've not already decided to swap the matched operands, and
+ // we've not already matched our first operand (note that we could
+ // have skipped matching the first operand because it is part of a
+ // reduction above), and the instruction is commutative, then try
+ // the swapped match.
if (!Swapped && J1->isCommutative() && !SomeOpMatched &&
J1->getOperand(!j) == Op2) {
Swapped = true;
@@ -1069,7 +1066,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
continue;
}
- ++J;
+ ++J;
}
// Insert the new induction variable.
@@ -1110,9 +1107,9 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(),
Preheader->getTerminator());
}
-
- Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1,
- "exitcond");
+
+ Value *Cond =
+ new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond");
BI->setCondition(Cond);
if (BI->getSuccessor(1) != Header)
@@ -1182,4 +1179,3 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
return Changed;
}
-
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 2ce58314f8ef..9164be224654 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -13,6 +13,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopPass.h"
@@ -53,6 +54,7 @@ namespace {
// LCSSA form makes instruction renaming easier.
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfo>();
AU.addPreserved<LoopInfo>();
@@ -72,12 +74,14 @@ namespace {
unsigned MaxHeaderSize;
LoopInfo *LI;
const TargetTransformInfo *TTI;
+ AssumptionCache *AC;
};
}
char LoopRotate::ID = 0;
INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -98,6 +102,8 @@ bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) {
LI = &getAnalysis<LoopInfo>();
TTI = &getAnalysis<TargetTransformInfo>();
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
// Simplify the loop latch before attempting to rotate the header
// upward. Rotation may not be needed if the loop tail can be folded into the
@@ -184,13 +190,18 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
}
}
-/// Determine whether the instructions in this range my be safely and cheaply
+/// Determine whether the instructions in this range may be safely and cheaply
/// speculated. This is not an important enough situation to develop complex
/// heuristics. We handle a single arithmetic instruction along with any type
/// conversions.
static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
- BasicBlock::iterator End) {
+ BasicBlock::iterator End, Loop *L) {
bool seenIncrement = false;
+ bool MultiExitLoop = false;
+
+ if (!L->getExitingBlock())
+ MultiExitLoop = true;
+
for (BasicBlock::iterator I = Begin; I != End; ++I) {
if (!isSafeToSpeculativelyExecute(I))
@@ -214,11 +225,33 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
case Instruction::Xor:
case Instruction::Shl:
case Instruction::LShr:
- case Instruction::AShr:
+ case Instruction::AShr: {
+ Value *IVOpnd = nullptr;
+ if (isa<ConstantInt>(I->getOperand(0)))
+ IVOpnd = I->getOperand(1);
+
+ if (isa<ConstantInt>(I->getOperand(1))) {
+ if (IVOpnd)
+ return false;
+
+ IVOpnd = I->getOperand(0);
+ }
+
+ // If the increment operand is used outside of the loop, this speculation
+ // could cause extra live range interference.
+ if (MultiExitLoop && IVOpnd) {
+ for (User *UseI : IVOpnd->users()) {
+ auto *UserInst = cast<Instruction>(UseI);
+ if (!L->contains(UserInst))
+ return false;
+ }
+ }
+
if (seenIncrement)
return false;
seenIncrement = true;
break;
+ }
case Instruction::Trunc:
case Instruction::ZExt:
case Instruction::SExt:
@@ -232,7 +265,7 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
/// Fold the loop tail into the loop exit by speculating the loop tail
/// instructions. Typically, this is a single post-increment. In the case of a
/// simple 2-block loop, hoisting the increment can be much better than
-/// duplicating the entire loop header. In the cast of loops with early exits,
+/// duplicating the entire loop header. In the case of loops with early exits,
/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
/// canonical form so downstream passes can handle it.
///
@@ -254,7 +287,7 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) {
if (!BI)
return false;
- if (!shouldSpeculateInstrs(Latch->begin(), Jmp))
+ if (!shouldSpeculateInstrs(Latch->begin(), Jmp, L))
return false;
DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
@@ -323,8 +356,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// Check size of original header and reject loop if it is very big or we can't
// duplicate blocks inside it.
{
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
CodeMetrics Metrics;
- Metrics.analyzeBasicBlock(OrigHeader, *TTI);
+ Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
if (Metrics.notDuplicatable) {
DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
<< " instructions: "; L->dump());
@@ -406,6 +442,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// With the operands remapped, see if the instruction constant folds or is
// otherwise simplifyable. This commonly occurs because the entry from PHI
// nodes allows icmps and other instructions to fold.
+ // FIXME: Provide DL, TLI, DT, AC to SimplifyInstruction.
Value *V = SimplifyInstruction(C);
if (V && LI->replacementPreservesLCSSAForm(C, V)) {
// If so, then delete the temporary instruction and stick the folded value
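
Note: header sizing for rotation now ignores ephemeral values, i.e. instructions that only feed llvm.assume. A compact sketch of that sizing decision, assuming L, OrigHeader, TTI, AC and MaxHeaderSize exist as in the pass; headerTooBigToRotate is a name invented for this example:

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

static bool headerTooBigToRotate(Loop *L, BasicBlock *OrigHeader,
                                 const TargetTransformInfo &TTI,
                                 AssumptionCache *AC, unsigned MaxHeaderSize) {
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(L, AC, EphValues); // assume-only values
  CodeMetrics Metrics;
  Metrics.analyzeBasicBlock(OrigHeader, TTI, EphValues); // EphValues not counted
  return Metrics.notDuplicatable || Metrics.NumInsts > MaxHeaderSize;
}
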
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 914b56aa8167..7b60373dc508 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -744,7 +744,7 @@ static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
/// obvious multiple of the UDivExpr.
static bool isHighCostExpansion(const SCEV *S,
- SmallPtrSet<const SCEV*, 8> &Processed,
+ SmallPtrSetImpl<const SCEV*> &Processed,
ScalarEvolution &SE) {
// Zero/One operand expressions
switch (S->getSCEVType()) {
@@ -762,7 +762,7 @@ static bool isHighCostExpansion(const SCEV *S,
Processed, SE);
}
- if (!Processed.insert(S))
+ if (!Processed.insert(S).second)
return false;
if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
@@ -892,34 +892,34 @@ public:
void RateFormula(const TargetTransformInfo &TTI,
const Formula &F,
- SmallPtrSet<const SCEV *, 16> &Regs,
+ SmallPtrSetImpl<const SCEV *> &Regs,
const DenseSet<const SCEV *> &VisitedRegs,
const Loop *L,
const SmallVectorImpl<int64_t> &Offsets,
ScalarEvolution &SE, DominatorTree &DT,
const LSRUse &LU,
- SmallPtrSet<const SCEV *, 16> *LoserRegs = nullptr);
+ SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
void print(raw_ostream &OS) const;
void dump() const;
private:
void RateRegister(const SCEV *Reg,
- SmallPtrSet<const SCEV *, 16> &Regs,
+ SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT);
void RatePrimaryRegister(const SCEV *Reg,
- SmallPtrSet<const SCEV *, 16> &Regs,
+ SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
- SmallPtrSet<const SCEV *, 16> *LoserRegs);
+ SmallPtrSetImpl<const SCEV *> *LoserRegs);
};
}
/// RateRegister - Tally up interesting quantities from the given register.
void Cost::RateRegister(const SCEV *Reg,
- SmallPtrSet<const SCEV *, 16> &Regs,
+ SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT) {
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
@@ -967,15 +967,15 @@ void Cost::RateRegister(const SCEV *Reg,
/// before, rate it. Optional LoserRegs provides a way to declare any formula
/// that refers to one of those regs an instant loser.
void Cost::RatePrimaryRegister(const SCEV *Reg,
- SmallPtrSet<const SCEV *, 16> &Regs,
+ SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
- SmallPtrSet<const SCEV *, 16> *LoserRegs) {
+ SmallPtrSetImpl<const SCEV *> *LoserRegs) {
if (LoserRegs && LoserRegs->count(Reg)) {
Lose();
return;
}
- if (Regs.insert(Reg)) {
+ if (Regs.insert(Reg).second) {
RateRegister(Reg, Regs, L, SE, DT);
if (LoserRegs && isLoser())
LoserRegs->insert(Reg);
@@ -984,13 +984,13 @@ void Cost::RatePrimaryRegister(const SCEV *Reg,
void Cost::RateFormula(const TargetTransformInfo &TTI,
const Formula &F,
- SmallPtrSet<const SCEV *, 16> &Regs,
+ SmallPtrSetImpl<const SCEV *> &Regs,
const DenseSet<const SCEV *> &VisitedRegs,
const Loop *L,
const SmallVectorImpl<int64_t> &Offsets,
ScalarEvolution &SE, DominatorTree &DT,
const LSRUse &LU,
- SmallPtrSet<const SCEV *, 16> *LoserRegs) {
+ SmallPtrSetImpl<const SCEV *> *LoserRegs) {
assert(F.isCanonical() && "Cost is accurate only for canonical formula");
// Tally up the registers.
if (const SCEV *ScaledReg = F.ScaledReg) {
@@ -1337,10 +1337,9 @@ void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
}
// Update the RegTracker.
- for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(),
- E = OldRegs.end(); I != E; ++I)
- if (!Regs.count(*I))
- RegUses.DropRegister(*I, LUIdx);
+ for (const SCEV *S : OldRegs)
+ if (!Regs.count(S))
+ RegUses.DropRegister(S, LUIdx);
}
void LSRUse::print(raw_ostream &OS) const {
@@ -2226,13 +2225,12 @@ LSRInstance::OptimizeLoopTermCond() {
// must dominate all the post-inc comparisons we just set up, and it must
// dominate the loop latch edge.
IVIncInsertPos = L->getLoopLatch()->getTerminator();
- for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(),
- E = PostIncs.end(); I != E; ++I) {
+ for (Instruction *Inst : PostIncs) {
BasicBlock *BB =
DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
- (*I)->getParent());
- if (BB == (*I)->getParent())
- IVIncInsertPos = *I;
+ Inst->getParent());
+ if (BB == Inst->getParent())
+ IVIncInsertPos = Inst;
else if (BB != IVIncInsertPos->getParent())
IVIncInsertPos = BB->getTerminator();
}
@@ -2557,7 +2555,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
///
/// TODO: Consider IVInc free if it's already used in other chains.
static bool
-isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
+isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
ScalarEvolution &SE, const TargetTransformInfo &TTI) {
if (StressIVChain)
return true;
@@ -2567,9 +2565,8 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
if (!Users.empty()) {
DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
- for (SmallPtrSet<Instruction*, 4>::const_iterator I = Users.begin(),
- E = Users.end(); I != E; ++I) {
- dbgs() << " " << **I << "\n";
+ for (Instruction *Inst : Users) {
+ dbgs() << " " << *Inst << "\n";
});
return false;
}
@@ -2805,7 +2802,7 @@ void LSRInstance::CollectChains() {
User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE);
while (IVOpIter != IVOpEnd) {
Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
- if (UniqueOperands.insert(IVOpInst))
+ if (UniqueOperands.insert(IVOpInst).second)
ChainInstruction(I, IVOpInst, ChainUsersVec);
IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
}
@@ -3119,11 +3116,15 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
void
LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
- SmallPtrSet<const SCEV *, 8> Inserted;
+ SmallPtrSet<const SCEV *, 32> Visited;
while (!Worklist.empty()) {
const SCEV *S = Worklist.pop_back_val();
+ // Don't process the same SCEV twice
+ if (!Visited.insert(S).second)
+ continue;
+
if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
Worklist.append(N->op_begin(), N->op_end());
else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
@@ -3132,7 +3133,6 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
Worklist.push_back(D->getLHS());
Worklist.push_back(D->getRHS());
} else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
- if (!Inserted.insert(US)) continue;
const Value *V = US->getValue();
if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
// Look for instructions defined outside the loop.
@@ -3774,7 +3774,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
LUIdx = UsedByIndices.find_next(LUIdx))
// Make a memo of this use, offset, and register tuple.
- if (UniqueItems.insert(std::make_pair(LUIdx, Imm)))
+ if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
}
}
@@ -4302,10 +4302,9 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
// reference that register in order to be considered. This prunes out
// unprofitable searching.
SmallSetVector<const SCEV *, 4> ReqRegs;
- for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
- E = CurRegs.end(); I != E; ++I)
- if (LU.Regs.count(*I))
- ReqRegs.insert(*I);
+ for (const SCEV *S : CurRegs)
+ if (LU.Regs.count(S))
+ ReqRegs.insert(S);
SmallPtrSet<const SCEV *, 16> NewRegs;
Cost NewCost;
@@ -4350,9 +4349,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
} else {
DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
dbgs() << ".\n Regs:";
- for (SmallPtrSet<const SCEV *, 16>::const_iterator
- I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
- dbgs() << ' ' << **I;
+ for (const SCEV *S : NewRegs)
+ dbgs() << ' ' << *S;
dbgs() << '\n');
SolutionCost = NewCost;
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 935f289f040f..fef52107f623 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -13,7 +13,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/FunctionTargetTransformInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -52,7 +54,7 @@ UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::init(false), cl::Hidden,
static cl::opt<unsigned>
PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
- cl::desc("Unrolled size limit for loops with an unroll(enable) or "
+ cl::desc("Unrolled size limit for loops with an unroll(full) or "
"unroll_count pragma."));
namespace {
@@ -102,6 +104,7 @@ namespace {
/// loop preheaders be inserted into the CFG...
///
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<LoopInfo>();
AU.addPreserved<LoopInfo>();
AU.addRequiredID(LoopSimplifyID);
@@ -111,6 +114,7 @@ namespace {
AU.addRequired<ScalarEvolution>();
AU.addPreserved<ScalarEvolution>();
AU.addRequired<TargetTransformInfo>();
+ AU.addRequired<FunctionTargetTransformInfo>();
// FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
// If loop unroll does not preserve dom info then LCSSA pass on next
// loop will receive invalid dom info.
@@ -120,7 +124,7 @@ namespace {
// Fill in the UnrollingPreferences parameter with values from the
// TargetTransformInfo.
- void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI,
+ void getUnrollingPreferences(Loop *L, const FunctionTargetTransformInfo &FTTI,
TargetTransformInfo::UnrollingPreferences &UP) {
UP.Threshold = CurrentThreshold;
UP.OptSizeThreshold = OptSizeUnrollThreshold;
@@ -130,7 +134,7 @@ namespace {
UP.MaxCount = UINT_MAX;
UP.Partial = CurrentAllowPartial;
UP.Runtime = CurrentRuntime;
- TTI.getUnrollingPreferences(L, UP);
+ FTTI.getUnrollingPreferences(L, UP);
}
// Select and return an unroll count based on parameters from
@@ -138,12 +142,11 @@ namespace {
// SetExplicitly is set to true if the unroll count is set by
// the user or a pragma rather than selected heuristically.
unsigned
- selectUnrollCount(const Loop *L, unsigned TripCount, bool HasEnablePragma,
+ selectUnrollCount(const Loop *L, unsigned TripCount, bool PragmaFullUnroll,
unsigned PragmaCount,
const TargetTransformInfo::UnrollingPreferences &UP,
bool &SetExplicitly);
-
// Select threshold values used to limit unrolling based on a
// total unrolled size. Parameters Threshold and PartialThreshold
// are set to the maximum unrolled size for fully and partially
@@ -183,6 +186,8 @@ namespace {
char LoopUnroll::ID = 0;
INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(FunctionTargetTransformInfo)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -201,11 +206,15 @@ Pass *llvm::createSimpleLoopUnrollPass() {
/// ApproximateLoopSize - Approximate the size of the loop.
static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
bool &NotDuplicatable,
- const TargetTransformInfo &TTI) {
+ const TargetTransformInfo &TTI,
+ AssumptionCache *AC) {
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
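+  // Ephemeral values are only used (transitively) by @llvm.assume calls; they
+  // produce no real code after lowering, so they are excluded from the size
+  // estimate below.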
+
CodeMetrics Metrics;
for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
I != E; ++I)
- Metrics.analyzeBasicBlock(*I, TTI);
+ Metrics.analyzeBasicBlock(*I, TTI, EphValues);
NumCalls = Metrics.NumInlineCandidates;
NotDuplicatable = Metrics.notDuplicatable;
@@ -213,19 +222,22 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
// Don't allow an estimate of size zero. This would allow unrolling of loops
// with huge iteration counts, which is a compile time problem even if it's
- // not a problem for code quality.
- if (LoopSize == 0) LoopSize = 1;
+ // not a problem for code quality. Also, the code using this size may assume
+ // that each loop has at least three instructions (likely a conditional
+ // branch, a comparison feeding that branch, and some kind of loop increment
+ // feeding that comparison instruction).
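+  // For instance, even an otherwise empty counted loop still carries a latch
+  // of roughly this shape (a minimal sketch, names hypothetical):
+  //   %iv.next = add i32 %iv, 1
+  //   %exitcond = icmp eq i32 %iv.next, %n
+  //   br i1 %exitcond, label %exit, label %loop
+  // which is why three is a safe lower bound here.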
+ LoopSize = std::max(LoopSize, 3u);
return LoopSize;
}
-// Returns the value associated with the given metadata node name (for
-// example, "llvm.loop.unroll.count"). If no such named metadata node
-// exists, then nullptr is returned.
-static const ConstantInt *GetUnrollMetadataValue(const Loop *L,
- StringRef Name) {
+// Returns the loop hint metadata node with the given name (for example,
+// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
+// returned.
+static const MDNode *GetUnrollMetadata(const Loop *L, StringRef Name) {
MDNode *LoopID = L->getLoopID();
- if (!LoopID) return nullptr;
+ if (!LoopID)
+ return nullptr;
// First operand should refer to the loop id itself.
assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
@@ -233,41 +245,38 @@ static const ConstantInt *GetUnrollMetadataValue(const Loop *L,
for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
- if (!MD) continue;
+ if (!MD)
+ continue;
const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
- if (!S) continue;
+ if (!S)
+ continue;
- if (Name.equals(S->getString())) {
- assert(MD->getNumOperands() == 2 &&
- "Unroll hint metadata should have two operands.");
- return cast<ConstantInt>(MD->getOperand(1));
- }
+ if (Name.equals(S->getString()))
+ return MD;
}
return nullptr;
}
-// Returns true if the loop has an unroll(enable) pragma.
-static bool HasUnrollEnablePragma(const Loop *L) {
- const ConstantInt *EnableValue =
- GetUnrollMetadataValue(L, "llvm.loop.unroll.enable");
- return (EnableValue && EnableValue->getZExtValue());
+// Returns true if the loop has an unroll(full) pragma.
+static bool HasUnrollFullPragma(const Loop *L) {
+ return GetUnrollMetadata(L, "llvm.loop.unroll.full");
}
// Returns true if the loop has an unroll(disable) pragma.
static bool HasUnrollDisablePragma(const Loop *L) {
- const ConstantInt *EnableValue =
- GetUnrollMetadataValue(L, "llvm.loop.unroll.enable");
- return (EnableValue && !EnableValue->getZExtValue());
+ return GetUnrollMetadata(L, "llvm.loop.unroll.disable");
}
// If the loop has an unroll_count pragma, return the (necessarily
// positive) value from the pragma. Otherwise return 0.
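+// For example, "#pragma clang loop unroll_count(4)" is lowered to a loop
+// metadata operand of roughly the form !{!"llvm.loop.unroll.count", i32 4},
+// from which the count of 4 is extracted below.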
static unsigned UnrollCountPragmaValue(const Loop *L) {
- const ConstantInt *CountValue =
- GetUnrollMetadataValue(L, "llvm.loop.unroll.count");
- if (CountValue) {
- unsigned Count = CountValue->getZExtValue();
+ const MDNode *MD = GetUnrollMetadata(L, "llvm.loop.unroll.count");
+ if (MD) {
+ assert(MD->getNumOperands() == 2 &&
+ "Unroll count hint metadata should have two operands.");
+ unsigned Count =
+ mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
assert(Count >= 1 && "Unroll count must be positive.");
return Count;
}
@@ -283,9 +292,9 @@ static void SetLoopAlreadyUnrolled(Loop *L) {
if (!LoopID) return;
// First remove any existing loop unrolling metadata.
- SmallVector<Value *, 4> Vals;
+ SmallVector<Metadata *, 4> MDs;
// Reserve first location for self reference to the LoopID metadata node.
- Vals.push_back(nullptr);
+ MDs.push_back(nullptr);
for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
bool IsUnrollMetadata = false;
MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
@@ -293,26 +302,25 @@ static void SetLoopAlreadyUnrolled(Loop *L) {
const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
}
- if (!IsUnrollMetadata) Vals.push_back(LoopID->getOperand(i));
+ if (!IsUnrollMetadata)
+ MDs.push_back(LoopID->getOperand(i));
}
// Add unroll(disable) metadata to disable future unrolling.
LLVMContext &Context = L->getHeader()->getContext();
- SmallVector<Value *, 2> DisableOperands;
- DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.enable"));
- DisableOperands.push_back(ConstantInt::get(Type::getInt1Ty(Context), 0));
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
MDNode *DisableNode = MDNode::get(Context, DisableOperands);
- Vals.push_back(DisableNode);
+ MDs.push_back(DisableNode);
- MDNode *NewLoopID = MDNode::get(Context, Vals);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
// Set operand 0 to refer to the loop id itself.
NewLoopID->replaceOperandWith(0, NewLoopID);
L->setLoopID(NewLoopID);
- LoopID->replaceAllUsesWith(NewLoopID);
}
unsigned LoopUnroll::selectUnrollCount(
- const Loop *L, unsigned TripCount, bool HasEnablePragma,
+ const Loop *L, unsigned TripCount, bool PragmaFullUnroll,
unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP,
bool &SetExplicitly) {
SetExplicitly = true;
@@ -326,9 +334,7 @@ unsigned LoopUnroll::selectUnrollCount(
if (Count == 0) {
if (PragmaCount) {
Count = PragmaCount;
- } else if (HasEnablePragma) {
- // unroll(enable) pragma without an unroll_count pragma
- // indicates to unroll loop fully.
+ } else if (PragmaFullUnroll) {
Count = TripCount;
}
}
@@ -360,6 +366,10 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
LoopInfo *LI = &getAnalysis<LoopInfo>();
ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
+ const FunctionTargetTransformInfo &FTTI =
+ getAnalysis<FunctionTargetTransformInfo>();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
BasicBlock *Header = L->getHeader();
DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName()
@@ -368,37 +378,43 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
if (HasUnrollDisablePragma(L)) {
return false;
}
- bool HasEnablePragma = HasUnrollEnablePragma(L);
+ bool PragmaFullUnroll = HasUnrollFullPragma(L);
unsigned PragmaCount = UnrollCountPragmaValue(L);
- bool HasPragma = HasEnablePragma || PragmaCount > 0;
+ bool HasPragma = PragmaFullUnroll || PragmaCount > 0;
TargetTransformInfo::UnrollingPreferences UP;
- getUnrollingPreferences(L, TTI, UP);
+ getUnrollingPreferences(L, FTTI, UP);
// Find trip count and trip multiple if count is not available
unsigned TripCount = 0;
unsigned TripMultiple = 1;
- // Find "latch trip count". UnrollLoop assumes that control cannot exit
- // via the loop latch on any iteration prior to TripCount. The loop may exit
- // early via an earlier branch.
- BasicBlock *LatchBlock = L->getLoopLatch();
- if (LatchBlock) {
- TripCount = SE->getSmallConstantTripCount(L, LatchBlock);
- TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock);
+ // If there are multiple exiting blocks but one of them is the latch, use the
+ // latch for the trip count estimation. Otherwise insist on a single exiting
+ // block for the trip count estimation.
+ BasicBlock *ExitingBlock = L->getLoopLatch();
+ if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
+ ExitingBlock = L->getExitingBlock();
+ if (ExitingBlock) {
+ TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
+ TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
}
// Select an initial unroll count. This may be reduced later based
// on size thresholds.
bool CountSetExplicitly;
- unsigned Count = selectUnrollCount(L, TripCount, HasEnablePragma, PragmaCount,
- UP, CountSetExplicitly);
+ unsigned Count = selectUnrollCount(L, TripCount, PragmaFullUnroll,
+ PragmaCount, UP, CountSetExplicitly);
unsigned NumInlineCandidates;
bool notDuplicatable;
unsigned LoopSize =
- ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI);
+ ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, &AC);
DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
- uint64_t UnrolledSize = (uint64_t)LoopSize * Count;
+
+ // When computing the unrolled size, note that the conditional branch on the
+ // backedge and the comparison feeding it are not replicated like the rest of
+ // the loop body (which is why 2 is subtracted).
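+  // As a quick sanity check of the formula below: with LoopSize = 5 and
+  // Count = 4, the replicated body costs (5 - 2) * 4 = 12 and the surviving
+  // compare-and-branch pair adds 2, giving 14 rather than 5 * 4 = 20.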
+ uint64_t UnrolledSize = (uint64_t)(LoopSize-2) * Count + 2;
if (notDuplicatable) {
DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
<< " instructions.\n");
@@ -443,7 +459,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
}
if (PartialThreshold != NoThreshold && UnrolledSize > PartialThreshold) {
// Reduce unroll count to be modulo of TripCount for partial unrolling.
- Count = PartialThreshold / LoopSize;
+ Count = (std::max(PartialThreshold, 3u)-2) / (LoopSize-2);
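+      // For example, with PartialThreshold = 150 and LoopSize = 12 this gives
+      // (150 - 2) / (12 - 2) = 14, which the loop below then reduces until it
+      // evenly divides TripCount.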
while (Count != 0 && TripCount % Count != 0)
Count--;
}
@@ -457,7 +473,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
// the original count which satisfies the threshold limit.
while (Count != 0 && UnrolledSize > PartialThreshold) {
Count >>= 1;
- UnrolledSize = LoopSize * Count;
+ UnrolledSize = (LoopSize-2) * Count + 2;
}
if (Count > UP.MaxCount)
Count = UP.MaxCount;
@@ -465,25 +481,26 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
}
if (HasPragma) {
- // Mark loop as unrolled to prevent unrolling beyond that
- // requested by the pragma.
- SetLoopAlreadyUnrolled(L);
+ if (PragmaCount != 0)
+ // If loop has an unroll count pragma mark loop as unrolled to prevent
+ // unrolling beyond that requested by the pragma.
+ SetLoopAlreadyUnrolled(L);
// Emit optimization remarks if we are unable to unroll the loop
// as directed by a pragma.
DebugLoc LoopLoc = L->getStartLoc();
Function *F = Header->getParent();
LLVMContext &Ctx = F->getContext();
- if (HasEnablePragma && PragmaCount == 0) {
+ if (PragmaFullUnroll && PragmaCount == 0) {
if (TripCount && Count != TripCount) {
emitOptimizationRemarkMissed(
Ctx, DEBUG_TYPE, *F, LoopLoc,
- "Unable to fully unroll loop as directed by unroll(enable) pragma "
+ "Unable to fully unroll loop as directed by unroll(full) pragma "
"because unrolled size is too large.");
} else if (!TripCount) {
emitOptimizationRemarkMissed(
Ctx, DEBUG_TYPE, *F, LoopLoc,
- "Unable to fully unroll loop as directed by unroll(enable) pragma "
+ "Unable to fully unroll loop as directed by unroll(full) pragma "
"because loop has a runtime trip count.");
}
} else if (PragmaCount > 0 && Count != OriginalCount) {
@@ -501,7 +518,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
}
// Unroll the loop.
- if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this, &LPM))
+ if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this,
+ &LPM, &AC))
return false;
return true;
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 977c53a3bc63..9f4c12270d76 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -30,6 +30,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -103,7 +104,8 @@ namespace {
// Analyze loop. Check its size and determine whether it is possible to
// unswitch it. Returns true if we can unswitch this loop.
- bool countLoop(const Loop *L, const TargetTransformInfo &TTI);
+ bool countLoop(const Loop *L, const TargetTransformInfo &TTI,
+ AssumptionCache *AC);
// Clean all data related to given loop.
void forgetLoop(const Loop *L);
@@ -126,6 +128,7 @@ namespace {
class LoopUnswitch : public LoopPass {
LoopInfo *LI; // Loop information
LPPassManager *LPM;
+ AssumptionCache *AC;
// LoopProcessWorklist - Used to check if second loop needs processing
// after RewriteLoopBodyWithConditionConstant rewrites first loop.
@@ -164,6 +167,7 @@ namespace {
/// loop preheaders be inserted into the CFG.
///
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequiredID(LoopSimplifyID);
AU.addPreservedID(LoopSimplifyID);
AU.addRequired<LoopInfo>();
@@ -212,7 +216,8 @@ namespace {
// Analyze loop. Check its size and determine whether it is possible to
// unswitch it. Returns true if we can unswitch this loop.
-bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) {
+bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
+ AssumptionCache *AC) {
LoopPropsMapIt PropsIt;
bool Inserted;
@@ -229,13 +234,16 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) {
// large numbers of branches which cause loop unswitching to go crazy.
// This is a very ad-hoc heuristic.
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
// FIXME: This is overly conservative because it does not take into
// consideration code simplification opportunities and code that can
// be shared by the resultant unswitched loops.
CodeMetrics Metrics;
for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
I != E; ++I)
- Metrics.analyzeBasicBlock(*I, TTI);
+ Metrics.analyzeBasicBlock(*I, TTI, EphValues);
Props.SizeEstimation = std::min(Metrics.NumInsts, Metrics.NumBlocks * 5);
Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation);
@@ -326,6 +334,7 @@ char LoopUnswitch::ID = 0;
INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
false, false)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -376,6 +385,8 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
if (skipOptnoneFunction(L))
return false;
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
LI = &getAnalysis<LoopInfo>();
LPM = &LPM_Ref;
DominatorTreeWrapperPass *DTWP =
@@ -421,7 +432,8 @@ bool LoopUnswitch::processCurrentLoop() {
// We have probably reached the branch quota for this loop. If so, stop
// unswitching.
- if (!BranchesInfo.countLoop(currentLoop, getAnalysis<TargetTransformInfo>()))
+ if (!BranchesInfo.countLoop(currentLoop, getAnalysis<TargetTransformInfo>(),
+ AC))
return false;
// Loop over all of the basic blocks in the loop. If we find an interior
@@ -823,6 +835,10 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
F->getBasicBlockList().splice(NewPreheader, F->getBasicBlockList(),
NewBlocks[0], F->end());
+ // FIXME: We could register any cloned assumptions instead of clearing the
+ // whole function's cache.
+ AC->clear();
+
// Now we create the new Loop object for the versioned loop.
Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 7c184a4ad2c3..33b5f9df5a27 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
@@ -329,6 +330,7 @@ namespace {
// This transformation requires dominator postdominator info
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<MemoryDependenceAnalysis>();
AU.addRequired<AliasAnalysis>();
@@ -361,6 +363,7 @@ FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }
INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
@@ -631,22 +634,24 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
if (destSize < srcSize)
return false;
} else if (Argument *A = dyn_cast<Argument>(cpyDest)) {
- // If the destination is an sret parameter then only accesses that are
- // outside of the returned struct type can trap.
- if (!A->hasStructRetAttr())
- return false;
+ if (A->getDereferenceableBytes() < srcSize) {
+ // If the destination is an sret parameter then only accesses that are
+ // outside of the returned struct type can trap.
+ if (!A->hasStructRetAttr())
+ return false;
- Type *StructTy = cast<PointerType>(A->getType())->getElementType();
- if (!StructTy->isSized()) {
- // The call may never return and hence the copy-instruction may never
- // be executed, and therefore it's not safe to say "the destination
- // has at least <cpyLen> bytes, as implied by the copy-instruction",
- return false;
- }
+ Type *StructTy = cast<PointerType>(A->getType())->getElementType();
+ if (!StructTy->isSized()) {
+ // The call may never return and hence the copy-instruction may never
+ // be executed, and therefore it's not safe to say "the destination
+ // has at least <cpyLen> bytes, as implied by the copy-instruction",
+ return false;
+ }
- uint64_t destSize = DL->getTypeAllocSize(StructTy);
- if (destSize < srcSize)
- return false;
+ uint64_t destSize = DL->getTypeAllocSize(StructTy);
+ if (destSize < srcSize)
+ return false;
+ }
} else {
return false;
}
@@ -673,15 +678,23 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
for (User *UU : U->users())
srcUseList.push_back(UU);
- } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
- if (G->hasAllZeroIndices())
- for (User *UU : U->users())
- srcUseList.push_back(UU);
- else
+ continue;
+ }
+ if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
+ if (!G->hasAllZeroIndices())
return false;
- } else if (U != C && U != cpy) {
- return false;
+
+ for (User *UU : U->users())
+ srcUseList.push_back(UU);
+ continue;
}
+ if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
+ if (IT->getIntrinsicID() == Intrinsic::lifetime_start ||
+ IT->getIntrinsicID() == Intrinsic::lifetime_end)
+ continue;
+
+ if (U != C && U != cpy)
+ return false;
}
// Check that src isn't captured by the called function since the
@@ -969,8 +982,13 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
// If it is greater than the memcpy, then we check to see if we can force the
// source of the memcpy to the alignment we need. If we fail, we bail out.
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *CS->getParent()->getParent());
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
if (MDep->getAlignment() < ByValAlign &&
- getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, DL) < ByValAlign)
+ getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &AC,
+ CS.getInstruction(), &DT) < ByValAlign)
return false;
// Verify that the copied-from memory doesn't change in between the memcpy and
diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index c2467fecb5eb..8509713b3367 100644
--- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -97,9 +97,6 @@ using namespace llvm;
//===----------------------------------------------------------------------===//
// MergedLoadStoreMotion Pass
//===----------------------------------------------------------------------===//
-static cl::opt<bool>
-EnableMLSM("mlsm", cl::desc("Enable motion of merged load and store"),
- cl::init(true));
namespace {
class MergedLoadStoreMotion : public FunctionPass {
@@ -134,7 +131,9 @@ private:
BasicBlock *getDiamondTail(BasicBlock *BB);
bool isDiamondHead(BasicBlock *BB);
// Routines for hoisting loads
- bool isLoadHoistBarrier(Instruction *Inst);
+ bool isLoadHoistBarrierInRange(const Instruction& Start,
+ const Instruction& End,
+ LoadInst* LI);
LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI);
void hoistInstruction(BasicBlock *BB, Instruction *HoistCand,
Instruction *ElseInst);
@@ -144,7 +143,9 @@ private:
// Routines for sinking stores
StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
- bool isStoreSinkBarrier(Instruction *Inst);
+ bool isStoreSinkBarrierInRange(const Instruction& Start,
+ const Instruction& End,
+ AliasAnalysis::Location Loc);
bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst);
bool mergeStores(BasicBlock *BB);
// The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
@@ -235,27 +236,12 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
/// being loaded or protect against the load from happening
/// it is considered a hoist barrier.
///
-bool MergedLoadStoreMotion::isLoadHoistBarrier(Instruction *Inst) {
- // FIXME: A call with no side effects should not be a barrier.
- // Aren't all such calls covered by mayHaveSideEffects() below?
- // Then this check can be removed.
- if (isa<CallInst>(Inst))
- return true;
- if (isa<TerminatorInst>(Inst))
- return true;
- // FIXME: Conservatively let a store instruction block the load.
- // Use alias analysis instead.
- if (isa<StoreInst>(Inst))
- return true;
- // Note: mayHaveSideEffects covers all instructions that could
- // trigger a change to state. Eg. in-flight stores have to be executed
- // before ordered loads or fences, calls could invoke functions that store
- // data to memory etc.
- if (Inst->mayHaveSideEffects()) {
- return true;
- }
- DEBUG(dbgs() << "No Hoist Barrier\n");
- return false;
+
+bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start,
+ const Instruction& End,
+ LoadInst* LI) {
+ AliasAnalysis::Location Loc = AA->getLocation(LI);
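+  // Any instruction between Start and End that may write (Mod) the loaded
+  // location, such as an aliasing store or a call with side effects, makes
+  // hoisting unsafe.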
+ return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Mod);
}
///
@@ -265,33 +251,29 @@ bool MergedLoadStoreMotion::isLoadHoistBarrier(Instruction *Inst) {
/// and it can be hoisted from \p BB, return that load.
/// Otherwise return Null.
///
-LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB,
- LoadInst *LI) {
- LoadInst *I = nullptr;
- assert(isa<LoadInst>(LI));
- if (LI->isUsedOutsideOfBlock(LI->getParent()))
- return nullptr;
+LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1,
+ LoadInst *Load0) {
- for (BasicBlock::iterator BBI = BB->begin(), BBE = BB->end(); BBI != BBE;
+ for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE;
++BBI) {
Instruction *Inst = BBI;
// Only merge and hoist loads when their result is used only in BB
- if (isLoadHoistBarrier(Inst))
- break;
- if (!isa<LoadInst>(Inst))
- continue;
- if (Inst->isUsedOutsideOfBlock(Inst->getParent()))
+ if (!isa<LoadInst>(Inst) || Inst->isUsedOutsideOfBlock(BB1))
continue;
- AliasAnalysis::Location LocLI = AA->getLocation(LI);
- AliasAnalysis::Location LocInst = AA->getLocation((LoadInst *)Inst);
- if (AA->isMustAlias(LocLI, LocInst) && LI->getType() == Inst->getType()) {
- I = (LoadInst *)Inst;
- break;
+ LoadInst *Load1 = dyn_cast<LoadInst>(Inst);
+ BasicBlock *BB0 = Load0->getParent();
+
+ AliasAnalysis::Location Loc0 = AA->getLocation(Load0);
+ AliasAnalysis::Location Loc1 = AA->getLocation(Load1);
+ if (AA->isMustAlias(Loc0, Loc1) && Load0->isSameOperationAs(Load1) &&
+ !isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1) &&
+ !isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0)) {
+ return Load1;
}
}
- return I;
+ return nullptr;
}
///
@@ -388,15 +370,10 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {
Instruction *I = BBI;
++BBI;
- if (isLoadHoistBarrier(I))
- break;
// Only move simple (non-atomic, non-volatile) loads.
- if (!isa<LoadInst>(I))
- continue;
-
- LoadInst *L0 = (LoadInst *)I;
- if (!L0->isSimple())
+ LoadInst *L0 = dyn_cast<LoadInst>(I);
+ if (!L0 || !L0->isSimple() || L0->isUsedOutsideOfBlock(Succ0))
continue;
++NLoads;
@@ -414,26 +391,19 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {
}
///
-/// \brief True when instruction is sink barrier for a store
-///
-bool MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) {
- // FIXME: Conservatively let a load instruction block the store.
- // Use alias analysis instead.
- if (isa<LoadInst>(Inst))
- return true;
- if (isa<CallInst>(Inst))
- return true;
- if (isa<TerminatorInst>(Inst) && !isa<BranchInst>(Inst))
- return true;
- // Note: mayHaveSideEffects covers all instructions that could
- // trigger a change to state. Eg. in-flight stores have to be executed
- // before ordered loads or fences, calls could invoke functions that store
- // data to memory etc.
- if (!isa<StoreInst>(Inst) && Inst->mayHaveSideEffects()) {
- return true;
- }
- DEBUG(dbgs() << "No Sink Barrier\n");
- return false;
+/// \brief True when instruction is a sink barrier for a store
+/// located at \p Loc
+///
+/// Whenever an instruction could possibly read or modify the
+/// value being stored, or otherwise prevent the store from
+/// happening, it is considered a sink barrier.
+///
+
+bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction& Start,
+ const Instruction& End,
+ AliasAnalysis::Location
+ Loc) {
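+  // Any instruction between Start and End that may read (Ref) the stored
+  // location would observe the old value, so the store cannot be sunk past it.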
+ return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Ref);
}
///
@@ -441,27 +411,28 @@ bool MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) {
///
/// \return The store in \p BB1 when it is safe to sink it. Otherwise return null.
///
-StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB,
- StoreInst *SI) {
- StoreInst *I = 0;
- DEBUG(dbgs() << "can Sink? : "; SI->dump(); dbgs() << "\n");
- for (BasicBlock::reverse_iterator RBI = BB->rbegin(), RBE = BB->rend();
+StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
+ StoreInst *Store0) {
+ DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
+ for (BasicBlock::reverse_iterator RBI = BB1->rbegin(), RBE = BB1->rend();
RBI != RBE; ++RBI) {
Instruction *Inst = &*RBI;
- // Only move loads if they are used in the block.
- if (isStoreSinkBarrier(Inst))
- break;
- if (isa<StoreInst>(Inst)) {
- AliasAnalysis::Location LocSI = AA->getLocation(SI);
- AliasAnalysis::Location LocInst = AA->getLocation((StoreInst *)Inst);
- if (AA->isMustAlias(LocSI, LocInst)) {
- I = (StoreInst *)Inst;
- break;
- }
+ if (!isa<StoreInst>(Inst))
+ continue;
+
+ StoreInst *Store1 = cast<StoreInst>(Inst);
+ BasicBlock *BB0 = Store0->getParent();
+
+ AliasAnalysis::Location Loc0 = AA->getLocation(Store0);
+ AliasAnalysis::Location Loc1 = AA->getLocation(Store1);
+ if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) &&
+ !isStoreSinkBarrierInRange(*Store1, BB1->back(), Loc1) &&
+ !isStoreSinkBarrierInRange(*Store0, BB0->back(), Loc0)) {
+ return Store1;
}
}
- return I;
+ return nullptr;
}
///
@@ -573,8 +544,7 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
Instruction *I = &*RBI;
++RBI;
- if (isStoreSinkBarrier(I))
- break;
+
// Don't sink non-simple (atomic, volatile) stores
if (!isa<StoreInst>(I))
continue;
@@ -611,8 +581,6 @@ bool MergedLoadStoreMotion::runOnFunction(Function &F) {
AA = &getAnalysis<AliasAnalysis>();
bool Changed = false;
- if (!EnableMLSM)
- return false;
DEBUG(dbgs() << "Instruction Merger\n");
// Merge unconditional branches, allowing PRE to catch more
@@ -622,7 +590,6 @@ bool MergedLoadStoreMotion::runOnFunction(Function &F) {
// Hoist equivalent loads and sink stores
// outside diamonds when possible
- // Run outside core GVN
if (isDiamondHead(BB)) {
Changed |= mergeLoads(BB);
Changed |= mergeStores(getDiamondTail(BB));
diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 7cce89e0627e..5c8bed585b64 100644
--- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -108,6 +108,10 @@ bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call,
if (Call->onlyReadsMemory())
return false;
+ // The call must have the expected result type.
+ if (!Call->getType()->isFloatingPointTy())
+ return false;
+
// Do the following transformation:
//
// (before)
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index ea2cf7cf9b5f..4e022556f9cc 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -176,6 +176,7 @@ namespace {
private:
void BuildRankMap(Function &F);
unsigned getRank(Value *V);
+ void canonicalizeOperands(Instruction *I);
void ReassociateExpression(BinaryOperator *I);
void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
Value *OptimizeExpression(BinaryOperator *I,
@@ -194,6 +195,7 @@ namespace {
Value *RemoveFactorFromExpression(Value *V, Value *Factor);
void EraseInst(Instruction *I);
void OptimizeInst(Instruction *I);
+ Instruction *canonicalizeNegConstExpr(Instruction *I);
};
}
@@ -235,7 +237,20 @@ FunctionPass *llvm::createReassociatePass() { return new Reassociate(); }
/// opcode and if it only has one use.
static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
if (V->hasOneUse() && isa<Instruction>(V) &&
- cast<Instruction>(V)->getOpcode() == Opcode)
+ cast<Instruction>(V)->getOpcode() == Opcode &&
+ (!isa<FPMathOperator>(V) ||
+ cast<Instruction>(V)->hasUnsafeAlgebra()))
+ return cast<BinaryOperator>(V);
+ return nullptr;
+}
+
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
+ unsigned Opcode2) {
+ if (V->hasOneUse() && isa<Instruction>(V) &&
+ (cast<Instruction>(V)->getOpcode() == Opcode1 ||
+ cast<Instruction>(V)->getOpcode() == Opcode2) &&
+ (!isa<FPMathOperator>(V) ||
+ cast<Instruction>(V)->hasUnsafeAlgebra()))
return cast<BinaryOperator>(V);
return nullptr;
}
@@ -264,9 +279,11 @@ static bool isUnmovableInstruction(Instruction *I) {
void Reassociate::BuildRankMap(Function &F) {
unsigned i = 2;
- // Assign distinct ranks to function arguments
- for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I)
+ // Assign distinct ranks to function arguments.
+ for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
ValueRankMap[&*I] = ++i;
+ DEBUG(dbgs() << "Calculated Rank[" << I->getName() << "] = " << i << "\n");
+ }
ReversePostOrderTraversal<Function*> RPOT(&F);
for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(),
@@ -304,24 +321,78 @@ unsigned Reassociate::getRank(Value *V) {
// If this is a not or neg instruction, do not count it for rank. This
// assures us that X and ~X will have the same rank.
- if (!I->getType()->isIntegerTy() ||
- (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I)))
+ Type *Ty = V->getType();
+ if ((!Ty->isIntegerTy() && !Ty->isFloatingPointTy()) ||
+ (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
+ !BinaryOperator::isFNeg(I)))
++Rank;
- //DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = "
- // << Rank << "\n");
+ DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n");
return ValueRankMap[I] = Rank;
}
+// Canonicalize constants to RHS. Otherwise, sort the operands by rank.
+void Reassociate::canonicalizeOperands(Instruction *I) {
+ assert(isa<BinaryOperator>(I) && "Expected binary operator.");
+ assert(I->isCommutative() && "Expected commutative operator.");
+
+ Value *LHS = I->getOperand(0);
+ Value *RHS = I->getOperand(1);
+ unsigned LHSRank = getRank(LHS);
+ unsigned RHSRank = getRank(RHS);
+
+ if (isa<Constant>(RHS))
+ return;
+
+ if (isa<Constant>(LHS) || RHSRank < LHSRank)
+ cast<BinaryOperator>(I)->swapOperands();
+}
+
+static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
+ if (S1->getType()->isIntegerTy())
+ return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore);
+ else {
+ BinaryOperator *Res =
+ BinaryOperator::CreateFAdd(S1, S2, Name, InsertBefore);
+ Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+ return Res;
+ }
+}
+
+static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
+ if (S1->getType()->isIntegerTy())
+ return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore);
+ else {
+ BinaryOperator *Res =
+ BinaryOperator::CreateFMul(S1, S2, Name, InsertBefore);
+ Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+ return Res;
+ }
+}
+
+static BinaryOperator *CreateNeg(Value *S1, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
+ if (S1->getType()->isIntegerTy())
+ return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
+ else {
+ BinaryOperator *Res = BinaryOperator::CreateFNeg(S1, Name, InsertBefore);
+ Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+ return Res;
+ }
+}
+
/// LowerNegateToMultiply - Replace 0-X with X*-1.
///
static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
- Constant *Cst = Constant::getAllOnesValue(Neg->getType());
+ Type *Ty = Neg->getType();
+ Constant *NegOne = Ty->isIntegerTy() ? ConstantInt::getAllOnesValue(Ty)
+ : ConstantFP::get(Ty, -1.0);
- BinaryOperator *Res =
- BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg);
- Neg->setOperand(1, Constant::getNullValue(Neg->getType())); // Drop use of op.
+ BinaryOperator *Res = CreateMul(Neg->getOperand(1), NegOne, "", Neg, Neg);
+ Neg->setOperand(1, Constant::getNullValue(Ty)); // Drop use of op.
Res->takeName(Neg);
Neg->replaceAllUsesWith(Res);
Res->setDebugLoc(Neg->getDebugLoc());
@@ -377,13 +448,14 @@ static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
LHS = 0; // 1 + 1 === 0 modulo 2.
return;
}
- if (Opcode == Instruction::Add) {
+ if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) {
// TODO: Reduce the weight by exploiting nsw/nuw?
LHS += RHS;
return;
}
- assert(Opcode == Instruction::Mul && "Unknown associative operation!");
+ assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
+ "Unknown associative operation!");
unsigned Bitwidth = LHS.getBitWidth();
// If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
// can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth
@@ -499,8 +571,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
unsigned Opcode = I->getOpcode();
- assert(Instruction::isAssociative(Opcode) &&
- Instruction::isCommutative(Opcode) &&
+ assert(I->isAssociative() && I->isCommutative() &&
"Expected an associative and commutative operation!");
// Visit all operands of the expression, keeping track of their weight (the
@@ -515,7 +586,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
// ways to get to it.
SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight)
Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1)));
- bool MadeChange = false;
+ bool Changed = false;
// Leaves of the expression are values that either aren't the right kind of
// operation (eg: a constant, or a multiply in an add tree), or are, but have
@@ -552,7 +623,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
// If this is a binary operation of the right kind with only one use then
// add its operands to the expression.
if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
- assert(Visited.insert(Op) && "Not first visit!");
+ assert(Visited.insert(Op).second && "Not first visit!");
DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
Worklist.push_back(std::make_pair(BO, Weight));
continue;
@@ -562,7 +633,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
LeafMap::iterator It = Leaves.find(Op);
if (It == Leaves.end()) {
// Not in the leaf map. Must be the first time we saw this operand.
- assert(Visited.insert(Op) && "Not first visit!");
+ assert(Visited.insert(Op).second && "Not first visit!");
if (!Op->hasOneUse()) {
// This value has uses not accounted for by the expression, so it is
// not safe to modify. Mark it as being a leaf.
@@ -584,7 +655,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
// exactly one such use, drop this new use of the leaf.
assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
I->setOperand(OpIdx, UndefValue::get(I->getType()));
- MadeChange = true;
+ Changed = true;
// If the leaf is a binary operation of the right kind and we now see
// that its multiple original uses were in fact all by nodes belonging
@@ -613,21 +684,24 @@ static bool LinearizeExprTree(BinaryOperator *I,
// expression. This means that it can safely be modified. See if we
// can usefully morph it into an expression of the right kind.
assert((!isa<Instruction>(Op) ||
- cast<Instruction>(Op)->getOpcode() != Opcode) &&
+ cast<Instruction>(Op)->getOpcode() != Opcode
+ || (isa<FPMathOperator>(Op) &&
+ !cast<Instruction>(Op)->hasUnsafeAlgebra())) &&
"Should have been handled above!");
assert(Op->hasOneUse() && "Has uses outside the expression tree!");
// If this is a multiply expression, turn any internal negations into
// multiplies by -1 so they can be reassociated.
- BinaryOperator *BO = dyn_cast<BinaryOperator>(Op);
- if (Opcode == Instruction::Mul && BO && BinaryOperator::isNeg(BO)) {
- DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
- BO = LowerNegateToMultiply(BO);
- DEBUG(dbgs() << *BO << 'n');
- Worklist.push_back(std::make_pair(BO, Weight));
- MadeChange = true;
- continue;
- }
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op))
+ if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) ||
+ (Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) {
+ DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
+ BO = LowerNegateToMultiply(BO);
+ DEBUG(dbgs() << *BO << '\n');
+ Worklist.push_back(std::make_pair(BO, Weight));
+ Changed = true;
+ continue;
+ }
// Failed to morph into an expression of the right type. This really is
// a leaf.
@@ -665,7 +739,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
Ops.push_back(std::make_pair(Identity, APInt(Bitwidth, 1)));
}
- return MadeChange;
+ return Changed;
}
// RewriteExprTree - Now that the operands for this expression tree are
@@ -798,6 +872,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I,
Constant *Undef = UndefValue::get(I->getType());
NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
Undef, Undef, "", I);
+ if (NewOp->getType()->isFloatingPointTy())
+ NewOp->setFastMathFlags(I->getFastMathFlags());
} else {
NewOp = NodesToRewrite.pop_back_val();
}
@@ -817,7 +893,14 @@ void Reassociate::RewriteExprTree(BinaryOperator *I,
// expression tree is dominated by all of Ops.
if (ExpressionChanged)
do {
- ExpressionChanged->clearSubclassOptionalData();
+ // Preserve FastMathFlags.
+ if (isa<FPMathOperator>(I)) {
+ FastMathFlags Flags = I->getFastMathFlags();
+ ExpressionChanged->clearSubclassOptionalData();
+ ExpressionChanged->setFastMathFlags(Flags);
+ } else
+ ExpressionChanged->clearSubclassOptionalData();
+
if (ExpressionChanged == I)
break;
ExpressionChanged->moveBefore(I);
@@ -834,6 +917,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I,
/// version of the value is returned, and BI is left pointing at the instruction
/// that should be processed next by the reassociation pass.
static Value *NegateValue(Value *V, Instruction *BI) {
+ if (ConstantFP *C = dyn_cast<ConstantFP>(V))
+ return ConstantExpr::getFNeg(C);
if (Constant *C = dyn_cast<Constant>(V))
return ConstantExpr::getNeg(C);
@@ -846,7 +931,8 @@ static Value *NegateValue(Value *V, Instruction *BI) {
// the constants. We assume that instcombine will clean up the mess later if
// we introduce tons of unnecessary negation instructions.
//
- if (BinaryOperator *I = isReassociableOp(V, Instruction::Add)) {
+ if (BinaryOperator *I =
+ isReassociableOp(V, Instruction::Add, Instruction::FAdd)) {
// Push the negates through the add.
I->setOperand(0, NegateValue(I->getOperand(0), BI));
I->setOperand(1, NegateValue(I->getOperand(1), BI));
@@ -864,7 +950,8 @@ static Value *NegateValue(Value *V, Instruction *BI) {
// Okay, we need to materialize a negated version of V with an instruction.
// Scan the use lists of V to see if we have one already.
for (User *U : V->users()) {
- if (!BinaryOperator::isNeg(U)) continue;
+ if (!BinaryOperator::isNeg(U) && !BinaryOperator::isFNeg(U))
+ continue;
// We found one! Now we have to make sure that the definition dominates
// this use. We do this by moving it to the entry block (if it is a
@@ -894,27 +981,34 @@ static Value *NegateValue(Value *V, Instruction *BI) {
// Insert a 'neg' instruction that subtracts the value from zero to get the
// negation.
- return BinaryOperator::CreateNeg(V, V->getName() + ".neg", BI);
+ return CreateNeg(V, V->getName() + ".neg", BI, BI);
}
/// ShouldBreakUpSubtract - Return true if we should break up this subtract of
/// X-Y into (X + -Y).
static bool ShouldBreakUpSubtract(Instruction *Sub) {
// If this is a negation, we can't split it up!
- if (BinaryOperator::isNeg(Sub))
+ if (BinaryOperator::isNeg(Sub) || BinaryOperator::isFNeg(Sub))
+ return false;
+
+ // Don't breakup X - undef.
+ if (isa<UndefValue>(Sub->getOperand(1)))
return false;
// Don't bother to break this up unless either operand is a reassociable add
// or subtract, or its only use is a reassociable add or subtract.
- if (isReassociableOp(Sub->getOperand(0), Instruction::Add) ||
- isReassociableOp(Sub->getOperand(0), Instruction::Sub))
+ Value *V0 = Sub->getOperand(0);
+ if (isReassociableOp(V0, Instruction::Add, Instruction::FAdd) ||
+ isReassociableOp(V0, Instruction::Sub, Instruction::FSub))
return true;
- if (isReassociableOp(Sub->getOperand(1), Instruction::Add) ||
- isReassociableOp(Sub->getOperand(1), Instruction::Sub))
+ Value *V1 = Sub->getOperand(1);
+ if (isReassociableOp(V1, Instruction::Add, Instruction::FAdd) ||
+ isReassociableOp(V1, Instruction::Sub, Instruction::FSub))
return true;
+ Value *VB = Sub->user_back();
if (Sub->hasOneUse() &&
- (isReassociableOp(Sub->user_back(), Instruction::Add) ||
- isReassociableOp(Sub->user_back(), Instruction::Sub)))
+ (isReassociableOp(VB, Instruction::Add, Instruction::FAdd) ||
+ isReassociableOp(VB, Instruction::Sub, Instruction::FSub)))
return true;
return false;
@@ -931,8 +1025,7 @@ static BinaryOperator *BreakUpSubtract(Instruction *Sub) {
// and set it as the RHS of the add instruction we just made.
//
Value *NegVal = NegateValue(Sub->getOperand(1), Sub);
- BinaryOperator *New =
- BinaryOperator::CreateAdd(Sub->getOperand(0), NegVal, "", Sub);
+ BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub);
Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.
Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op.
New->takeName(Sub);
@@ -956,8 +1049,19 @@ static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op.
Mul->takeName(Shl);
+
+ // Everyone now refers to the mul instruction.
Shl->replaceAllUsesWith(Mul);
Mul->setDebugLoc(Shl->getDebugLoc());
+
+ // We can safely preserve the nuw flag in all cases. It's also safe to turn a
+ // nuw nsw shl into a nuw nsw mul. However, nsw in isolation requires special
+ // handling.
+ bool NSW = cast<BinaryOperator>(Shl)->hasNoSignedWrap();
+ bool NUW = cast<BinaryOperator>(Shl)->hasNoUnsignedWrap();
+ if (NSW && NUW)
+ Mul->setHasNoSignedWrap(true);
+ Mul->setHasNoUnsignedWrap(NUW);
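+  // For example, "shl nuw nsw i32 %x, 3" becomes "mul nuw nsw i32 %x, 8",
+  // while a shl carrying only nsw becomes a plain "mul" with no wrap flags.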
return Mul;
}
@@ -969,13 +1073,23 @@ static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i,
Value *X) {
unsigned XRank = Ops[i].Rank;
unsigned e = Ops.size();
- for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j)
+ for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) {
if (Ops[j].Op == X)
return j;
+ if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
+ if (Instruction *I2 = dyn_cast<Instruction>(X))
+ if (I1->isIdenticalTo(I2))
+ return j;
+ }
// Scan backwards.
- for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j)
+ for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) {
if (Ops[j].Op == X)
return j;
+ if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
+ if (Instruction *I2 = dyn_cast<Instruction>(X))
+ if (I1->isIdenticalTo(I2))
+ return j;
+ }
return i;
}
@@ -988,15 +1102,16 @@ static Value *EmitAddTreeOfValues(Instruction *I,
Value *V1 = Ops.back();
Ops.pop_back();
Value *V2 = EmitAddTreeOfValues(I, Ops);
- return BinaryOperator::CreateAdd(V2, V1, "tmp", I);
+ return CreateAdd(V2, V1, "tmp", I, I);
}
/// RemoveFactorFromExpression - If V is an expression tree that is a
/// multiplication sequence, and if this sequence contains a multiply by Factor,
/// remove Factor from the tree and return the new tree.
Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
- BinaryOperator *BO = isReassociableOp(V, Instruction::Mul);
- if (!BO) return nullptr;
+ BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
+ if (!BO)
+ return nullptr;
SmallVector<RepeatedValue, 8> Tree;
MadeChange |= LinearizeExprTree(BO, Tree);
@@ -1018,13 +1133,25 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
}
// If this is a negative version of this factor, remove it.
- if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor))
+ if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) {
if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
if (FC1->getValue() == -FC2->getValue()) {
FoundFactor = NeedsNegate = true;
Factors.erase(Factors.begin()+i);
break;
}
+ } else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) {
+ if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) {
+ APFloat F1(FC1->getValueAPF());
+ APFloat F2(FC2->getValueAPF());
+ F2.changeSign();
+ if (F1.compare(F2) == APFloat::cmpEqual) {
+ FoundFactor = NeedsNegate = true;
+ Factors.erase(Factors.begin() + i);
+ break;
+ }
+ }
+ }
}
if (!FoundFactor) {
@@ -1046,7 +1173,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
}
if (NeedsNegate)
- V = BinaryOperator::CreateNeg(V, "neg", InsertPt);
+ V = CreateNeg(V, "neg", InsertPt, BO);
return V;
}
@@ -1058,7 +1185,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
static void FindSingleUseMultiplyFactors(Value *V,
SmallVectorImpl<Value*> &Factors,
const SmallVectorImpl<ValueEntry> &Ops) {
- BinaryOperator *BO = isReassociableOp(V, Instruction::Mul);
+ BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
if (!BO) {
Factors.push_back(V);
return;
@@ -1385,17 +1512,19 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
++NumFound;
} while (i != Ops.size() && Ops[i].Op == TheOp);
- DEBUG(errs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
+ DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
++NumFactor;
// Insert a new multiply.
- Value *Mul = ConstantInt::get(cast<IntegerType>(I->getType()), NumFound);
- Mul = BinaryOperator::CreateMul(TheOp, Mul, "factor", I);
+ Type *Ty = TheOp->getType();
+ Constant *C = Ty->isIntegerTy() ? ConstantInt::get(Ty, NumFound)
+ : ConstantFP::get(Ty, NumFound);
+ Instruction *Mul = CreateMul(TheOp, C, "factor", I, I);
// Now that we have inserted a multiply, optimize it. This allows us to
// handle cases that require multiple factoring steps, such as this:
// (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
- RedoInsts.insert(cast<Instruction>(Mul));
+ RedoInsts.insert(Mul);
// If every add operand was a duplicate, return the multiply.
if (Ops.empty())
@@ -1412,11 +1541,12 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
}
// Check for X and -X or X and ~X in the operand list.
- if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isNot(TheOp))
+ if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isFNeg(TheOp) &&
+ !BinaryOperator::isNot(TheOp))
continue;
Value *X = nullptr;
- if (BinaryOperator::isNeg(TheOp))
+ if (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp))
X = BinaryOperator::getNegArgument(TheOp);
else if (BinaryOperator::isNot(TheOp))
X = BinaryOperator::getNotArgument(TheOp);
@@ -1426,7 +1556,8 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
continue;
// Remove X and -X from the operand list.
- if (Ops.size() == 2 && BinaryOperator::isNeg(TheOp))
+ if (Ops.size() == 2 &&
+ (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp)))
return Constant::getNullValue(X->getType());
// Remove X and ~X from the operand list.
@@ -1463,7 +1594,8 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
unsigned MaxOcc = 0;
Value *MaxOccVal = nullptr;
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
- BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul);
+ BinaryOperator *BOp =
+ isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
if (!BOp)
continue;
@@ -1476,40 +1608,65 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
SmallPtrSet<Value*, 8> Duplicates;
for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
Value *Factor = Factors[i];
- if (!Duplicates.insert(Factor)) continue;
+ if (!Duplicates.insert(Factor).second)
+ continue;
unsigned Occ = ++FactorOccurrences[Factor];
- if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; }
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
// If Factor is a negative constant, add the negated value as a factor
// because we can percolate the negate out. Watch for minint, which
// cannot be positivified.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor))
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) {
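+      // For example, for an i32 factor of -8 we also count +8, but
+      // INT32_MIN (-2147483648) has no positive i32 counterpart and is
+      // therefore skipped by the isMinValue check below.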
if (CI->isNegative() && !CI->isMinValue(true)) {
Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
assert(!Duplicates.count(Factor) &&
"Shouldn't have two constant factors, missed a canonicalize");
-
unsigned Occ = ++FactorOccurrences[Factor];
- if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; }
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+ }
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(Factor)) {
+ if (CF->isNegative()) {
+ APFloat F(CF->getValueAPF());
+ F.changeSign();
+ Factor = ConstantFP::get(CF->getContext(), F);
+ assert(!Duplicates.count(Factor) &&
+ "Shouldn't have two constant factors, missed a canonicalize");
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
}
+ }
}
}
// If any factor occurred more than one time, we can pull it out.
if (MaxOcc > 1) {
- DEBUG(errs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
+ DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
++NumFactor;
// Create a new instruction that uses the MaxOccVal twice. If we don't do
// this, we could run into situations where removing a factor
// from an expression will drop a use of maxocc, and this can cause
// RemoveFactorFromExpression on successive values to behave differently.
- Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal);
+ Instruction *DummyInst =
+ I->getType()->isIntegerTy()
+ ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
+ : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal);
+
SmallVector<WeakVH, 4> NewMulOps;
for (unsigned i = 0; i != Ops.size(); ++i) {
// Only try to remove factors from expressions we're allowed to.
- BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul);
+ BinaryOperator *BOp =
+ isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
if (!BOp)
continue;
@@ -1542,7 +1699,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
RedoInsts.insert(VI);
// Create the multiply.
- Instruction *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I);
+ Instruction *V2 = CreateMul(V, MaxOccVal, "tmp", I, I);
// Rerun associate on the multiply in case the inner expression turned into
// a multiply. We want to make sure that we keep things in canonical form.
@@ -1632,7 +1789,10 @@ static Value *buildMultiplyTree(IRBuilder<> &Builder,
Value *LHS = Ops.pop_back_val();
do {
- LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
+ if (LHS->getType()->isIntegerTy())
+ LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
+ else
+ LHS = Builder.CreateFMul(LHS, Ops.pop_back_val());
} while (!Ops.empty());
return LHS;
@@ -1765,11 +1925,13 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I,
break;
case Instruction::Add:
+ case Instruction::FAdd:
if (Value *Result = OptimizeAdd(I, Ops))
return Result;
break;
case Instruction::Mul:
+ case Instruction::FMul:
if (Value *Result = OptimizeMul(I, Ops))
return Result;
break;
@@ -1797,12 +1959,104 @@ void Reassociate::EraseInst(Instruction *I) {
// and add that since that's where optimization actually happens.
unsigned Opcode = Op->getOpcode();
while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
- Visited.insert(Op))
+ Visited.insert(Op).second)
Op = Op->user_back();
RedoInsts.insert(Op);
}
}
+// Canonicalize expressions of the following form:
+// x + (-Constant * y) -> x - (Constant * y)
+// x - (-Constant * y) -> x + (Constant * y)
+Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) {
+ if (!I->hasOneUse() || I->getType()->isVectorTy())
+ return nullptr;
+
+ // Must be a mul, fmul, or fdiv instruction.
+ unsigned Opcode = I->getOpcode();
+ if (Opcode != Instruction::Mul && Opcode != Instruction::FMul &&
+ Opcode != Instruction::FDiv)
+ return nullptr;
+
+ // Must have at least one constant operand.
+ Constant *C0 = dyn_cast<Constant>(I->getOperand(0));
+ Constant *C1 = dyn_cast<Constant>(I->getOperand(1));
+ if (!C0 && !C1)
+ return nullptr;
+
+ // Must be a negative ConstantInt or ConstantFP.
+ Constant *C = C0 ? C0 : C1;
+ unsigned ConstIdx = C0 ? 0 : 1;
+ if (auto *CI = dyn_cast<ConstantInt>(C)) {
+ if (!CI->isNegative())
+ return nullptr;
+ } else if (auto *CF = dyn_cast<ConstantFP>(C)) {
+ if (!CF->isNegative())
+ return nullptr;
+ } else
+ return nullptr;
+
+ // User must be a binary operator with one or more uses.
+ Instruction *User = I->user_back();
+ if (!isa<BinaryOperator>(User) || !User->getNumUses())
+ return nullptr;
+
+ unsigned UserOpcode = User->getOpcode();
+ if (UserOpcode != Instruction::Add && UserOpcode != Instruction::FAdd &&
+ UserOpcode != Instruction::Sub && UserOpcode != Instruction::FSub)
+ return nullptr;
+
+ // Subtraction is not commutative. Explicitly, the following transform is
+ // not valid: (-Constant * y) - x -> x + (Constant * y)
+ if (!User->isCommutative() && User->getOperand(1) != I)
+ return nullptr;
+
+ // Change the sign of the constant.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(C))
+ I->setOperand(ConstIdx, ConstantInt::get(CI->getContext(), -CI->getValue()));
+ else {
+ ConstantFP *CF = cast<ConstantFP>(C);
+ APFloat Val = CF->getValueAPF();
+ Val.changeSign();
+ I->setOperand(ConstIdx, ConstantFP::get(CF->getContext(), Val));
+ }
+
+ // Canonicalize I to RHS to simplify the next bit of logic. E.g.,
+ // ((-Const*y) + x) -> (x + (-Const*y)).
+ if (User->getOperand(0) == I && User->isCommutative())
+ cast<BinaryOperator>(User)->swapOperands();
+
+ Value *Op0 = User->getOperand(0);
+ Value *Op1 = User->getOperand(1);
+ BinaryOperator *NI;
+ switch(UserOpcode) {
+ default:
+ llvm_unreachable("Unexpected Opcode!");
+ case Instruction::Add:
+ NI = BinaryOperator::CreateSub(Op0, Op1);
+ break;
+ case Instruction::Sub:
+ NI = BinaryOperator::CreateAdd(Op0, Op1);
+ break;
+ case Instruction::FAdd:
+ NI = BinaryOperator::CreateFSub(Op0, Op1);
+ NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
+ break;
+ case Instruction::FSub:
+ NI = BinaryOperator::CreateFAdd(Op0, Op1);
+ NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
+ break;
+ }
+
+ NI->insertBefore(User);
+ NI->setName(User->getName());
+ User->replaceAllUsesWith(NI);
+ NI->setDebugLoc(I->getDebugLoc());
+ RedoInsts.insert(I);
+ MadeChange = true;
+ return NI;
+}
+
/// OptimizeInst - Inspect and optimize the given instruction. Note that erasing
/// instructions is not allowed.
void Reassociate::OptimizeInst(Instruction *I) {
@@ -1810,8 +2064,7 @@ void Reassociate::OptimizeInst(Instruction *I) {
if (!isa<BinaryOperator>(I))
return;
- if (I->getOpcode() == Instruction::Shl &&
- isa<ConstantInt>(I->getOperand(1)))
+ if (I->getOpcode() == Instruction::Shl && isa<ConstantInt>(I->getOperand(1)))
// If an operand of this shift is a reassociable multiply, or if the shift
// is used by a reassociable multiply or add, turn into a multiply.
if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
@@ -1824,29 +2077,23 @@ void Reassociate::OptimizeInst(Instruction *I) {
I = NI;
}
- // Floating point binary operators are not associative, but we can still
- // commute (some) of them, to canonicalize the order of their operands.
- // This can potentially expose more CSE opportunities, and makes writing
- // other transformations simpler.
- if ((I->getType()->isFloatingPointTy() || I->getType()->isVectorTy())) {
- // FAdd and FMul can be commuted.
- if (I->getOpcode() != Instruction::FMul &&
- I->getOpcode() != Instruction::FAdd)
- return;
+ // Canonicalize negative constants out of expressions.
+ if (Instruction *Res = canonicalizeNegConstExpr(I))
+ I = Res;
- Value *LHS = I->getOperand(0);
- Value *RHS = I->getOperand(1);
- unsigned LHSRank = getRank(LHS);
- unsigned RHSRank = getRank(RHS);
+ // Commute binary operators, to canonicalize the order of their operands.
+ // This can potentially expose more CSE opportunities, and makes writing other
+ // transformations simpler.
+ if (I->isCommutative())
+ canonicalizeOperands(I);
- // Sort the operands by rank.
- if (RHSRank < LHSRank) {
- I->setOperand(0, RHS);
- I->setOperand(1, LHS);
- }
+ // Don't optimize vector instructions.
+ if (I->getType()->isVectorTy())
+ return;
+ // Don't optimize floating point instructions that don't have unsafe algebra.
+ if (I->getType()->isFloatingPointTy() && !I->hasUnsafeAlgebra())
return;
- }
// Do not reassociate boolean (i1) expressions. We want to preserve the
// original order of evaluation for short-circuited comparisons that
@@ -1877,6 +2124,24 @@ void Reassociate::OptimizeInst(Instruction *I) {
I = NI;
}
}
+ } else if (I->getOpcode() == Instruction::FSub) {
+ if (ShouldBreakUpSubtract(I)) {
+ Instruction *NI = BreakUpSubtract(I);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ } else if (BinaryOperator::isFNeg(I)) {
+ // Otherwise, this is a negation. See if the operand is a multiply tree
+ // and if this is not an inner node of a multiply tree.
+ if (isReassociableOp(I->getOperand(1), Instruction::FMul) &&
+ (!I->hasOneUse() ||
+ !isReassociableOp(I->user_back(), Instruction::FMul))) {
+ Instruction *NI = LowerNegateToMultiply(I);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+ }
}
// If this instruction is an associative binary operator, process it.
@@ -1894,11 +2159,16 @@ void Reassociate::OptimizeInst(Instruction *I) {
if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub)
return;
+ if (BO->hasOneUse() && BO->getOpcode() == Instruction::FAdd &&
+ cast<Instruction>(BO->user_back())->getOpcode() == Instruction::FSub)
+ return;
ReassociateExpression(BO);
}
void Reassociate::ReassociateExpression(BinaryOperator *I) {
+ assert(!I->getType()->isVectorTy() &&
+ "Reassociation of vector instructions is not supported.");
// First, walk the expression tree, linearizing the tree, collecting the
// operand information.
@@ -1943,12 +2213,21 @@ void Reassociate::ReassociateExpression(BinaryOperator *I) {
// this is a multiply tree used only by an add, and the immediate is a -1.
// In this case we reassociate to put the negation on the outside so that we
// can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
- if (I->getOpcode() == Instruction::Mul && I->hasOneUse() &&
- cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
- isa<ConstantInt>(Ops.back().Op) &&
- cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) {
- ValueEntry Tmp = Ops.pop_back_val();
- Ops.insert(Ops.begin(), Tmp);
+ if (I->hasOneUse()) {
+ if (I->getOpcode() == Instruction::Mul &&
+ cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
+ isa<ConstantInt>(Ops.back().Op) &&
+ cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) {
+ ValueEntry Tmp = Ops.pop_back_val();
+ Ops.insert(Ops.begin(), Tmp);
+ } else if (I->getOpcode() == Instruction::FMul &&
+ cast<Instruction>(I->user_back())->getOpcode() ==
+ Instruction::FAdd &&
+ isa<ConstantFP>(Ops.back().Op) &&
+ cast<ConstantFP>(Ops.back().Op)->isExactlyValue(-1.0)) {
+ ValueEntry Tmp = Ops.pop_back_val();
+ Ops.insert(Ops.begin(), Tmp);
+ }
}
DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp
index b6023e2ce789..1b46727c17bb 100644
--- a/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -73,7 +73,7 @@ bool RegToMem::runOnFunction(Function &F) {
// Insert all new allocas into entry block.
BasicBlock *BBEntry = &F.getEntryBlock();
- assert(pred_begin(BBEntry) == pred_end(BBEntry) &&
+ assert(pred_empty(BBEntry) &&
"Entry block to function must not have predecessors!");
// Find first non-alloca instruction and create insertion point. This is
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 90c3520c8323..cfc9a8e89fa0 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -214,7 +214,8 @@ public:
///
/// This returns true if the block was not considered live before.
bool MarkBlockExecutable(BasicBlock *BB) {
- if (!BBExecutable.insert(BB)) return false;
+ if (!BBExecutable.insert(BB).second)
+ return false;
DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
BBWorkList.push_back(BB); // Add the block to the work list!
return true;
@@ -1010,7 +1011,7 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
}
Constant *Ptr = Operands[0];
- ArrayRef<Constant *> Indices(Operands.begin() + 1, Operands.end());
+ auto Indices = makeArrayRef(Operands.begin() + 1, Operands.end());
markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, Indices));
}
@@ -1107,6 +1108,9 @@ CallOverdefined:
Operands.push_back(State.getConstant());
}
+ if (getValueState(I).isOverdefined())
+ return;
+
// If we can constant fold this, mark the result of the call as a
// constant.
if (Constant *C = ConstantFoldCall(F, Operands, TLI))
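A pattern repeated throughout this commit (ADCE, Reassociate, SCCP above, and SROA below) is the switch from `if (Set.insert(X))` to `if (Set.insert(X).second)`, reflecting that the set's insert() now returns a pair whose bool member reports whether the element was newly inserted. A minimal standalone sketch of the idiom, shown here with std::unordered_set (which has the same pair-returning insert); the data is made up:

#include <cstdio>
#include <unordered_set>
#include <vector>

int main() {
  std::unordered_set<int> Visited;
  std::vector<int> Worklist = {1, 2, 2, 3, 1};
  // .second is true only the first time a value is inserted, so each
  // element is processed exactly once even if it is enqueued repeatedly.
  for (int V : Worklist)
    if (Visited.insert(V).second)
      std::printf("processing %d\n", V); // prints 1, 2, 3 once each
  return 0;
}
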
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index f902eb23cbcf..ed161fd4af3e 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -28,6 +28,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -78,8 +79,8 @@ STATISTIC(NumVectorized, "Number of vectorized aggregates");
/// Hidden option to force the pass to not use DomTree and mem2reg, instead
/// forming SSA values through the SSAUpdater infrastructure.
-static cl::opt<bool>
-ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden);
+static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false),
+ cl::Hidden);
/// Hidden option to enable randomly shuffling the slices to help uncover
/// instability in their order.
@@ -88,15 +89,15 @@ static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
/// Hidden option to experiment with completely strict handling of inbounds
/// GEPs.
-static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds",
- cl::init(false), cl::Hidden);
+static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
+ cl::Hidden);
namespace {
/// \brief A custom IRBuilder inserter which prefixes all names if they are
/// preserved.
template <bool preserveNames = true>
-class IRBuilderPrefixedInserter :
- public IRBuilderDefaultInserter<preserveNames> {
+class IRBuilderPrefixedInserter
+ : public IRBuilderDefaultInserter<preserveNames> {
std::string Prefix;
public:
@@ -112,19 +113,19 @@ protected:
// Specialization for not preserving the name is trivial.
template <>
-class IRBuilderPrefixedInserter<false> :
- public IRBuilderDefaultInserter<false> {
+class IRBuilderPrefixedInserter<false>
+ : public IRBuilderDefaultInserter<false> {
public:
void SetNamePrefix(const Twine &P) {}
};
/// \brief Provide a typedef for IRBuilder that drops names in release builds.
#ifndef NDEBUG
-typedef llvm::IRBuilder<true, ConstantFolder,
- IRBuilderPrefixedInserter<true> > IRBuilderTy;
+typedef llvm::IRBuilder<true, ConstantFolder, IRBuilderPrefixedInserter<true>>
+ IRBuilderTy;
#else
-typedef llvm::IRBuilder<false, ConstantFolder,
- IRBuilderPrefixedInserter<false> > IRBuilderTy;
+typedef llvm::IRBuilder<false, ConstantFolder, IRBuilderPrefixedInserter<false>>
+ IRBuilderTy;
#endif
}
@@ -170,10 +171,14 @@ public:
/// decreasing. Thus the spanning range comes first in a cluster with the
/// same start position.
bool operator<(const Slice &RHS) const {
- if (beginOffset() < RHS.beginOffset()) return true;
- if (beginOffset() > RHS.beginOffset()) return false;
- if (isSplittable() != RHS.isSplittable()) return !isSplittable();
- if (endOffset() > RHS.endOffset()) return true;
+ if (beginOffset() < RHS.beginOffset())
+ return true;
+ if (beginOffset() > RHS.beginOffset())
+ return false;
+ if (isSplittable() != RHS.isSplittable())
+ return !isSplittable();
+ if (endOffset() > RHS.endOffset())
+ return true;
return false;
}
@@ -197,9 +202,7 @@ public:
namespace llvm {
template <typename T> struct isPodLike;
-template <> struct isPodLike<Slice> {
- static const bool value = true;
-};
+template <> struct isPodLike<Slice> { static const bool value = true; };
}
namespace {
@@ -224,36 +227,318 @@ public:
/// \brief Support for iterating over the slices.
/// @{
typedef SmallVectorImpl<Slice>::iterator iterator;
+ typedef iterator_range<iterator> range;
iterator begin() { return Slices.begin(); }
iterator end() { return Slices.end(); }
typedef SmallVectorImpl<Slice>::const_iterator const_iterator;
+ typedef iterator_range<const_iterator> const_range;
const_iterator begin() const { return Slices.begin(); }
const_iterator end() const { return Slices.end(); }
/// @}
- /// \brief Allow iterating the dead users for this alloca.
+ /// \brief Erase a range of slices.
+ void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
+
+ /// \brief Insert new slices for this alloca.
///
- /// These are instructions which will never actually use the alloca as they
- /// are outside the allocated range. They are safe to replace with undef and
- /// delete.
- /// @{
- typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator;
- dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); }
- dead_user_iterator dead_user_end() const { return DeadUsers.end(); }
- /// @}
+ /// This moves the slices into the alloca's slices collection, and re-sorts
+ /// everything so that the usual ordering properties of the alloca's slices
+ /// hold.
+ void insert(ArrayRef<Slice> NewSlices) {
+ int OldSize = Slices.size();
+ std::move(NewSlices.begin(), NewSlices.end(), std::back_inserter(Slices));
+ auto SliceI = Slices.begin() + OldSize;
+ std::sort(SliceI, Slices.end());
+ std::inplace_merge(Slices.begin(), SliceI, Slices.end());
+ }
+
+ // Forward declare an iterator to befriend it.
+ class partition_iterator;
+
+ /// \brief A partition of the slices.
+ ///
+ /// An ephemeral representation for a range of slices which can be viewed as
+ /// a partition of the alloca. This range represents a span of the alloca's
+ /// memory which cannot be split, and provides access to all of the slices
+ /// overlapping some part of the partition.
+ ///
+ /// Objects of this type are produced by traversing the alloca's slices, but
+ /// are only ephemeral and not persistent.
+ class Partition {
+ private:
+ friend class AllocaSlices;
+ friend class AllocaSlices::partition_iterator;
+
+    /// \brief The beginning and ending offsets of the alloca for this partition.
+ uint64_t BeginOffset, EndOffset;
+
+    /// \brief The start and end iterators of this partition.
+ iterator SI, SJ;
+
+ /// \brief A collection of split slice tails overlapping the partition.
+ SmallVector<Slice *, 4> SplitTails;
+
+ /// \brief Raw constructor builds an empty partition starting and ending at
+ /// the given iterator.
+ Partition(iterator SI) : SI(SI), SJ(SI) {}
+
+ public:
+ /// \brief The start offset of this partition.
+ ///
+ /// All of the contained slices start at or after this offset.
+ uint64_t beginOffset() const { return BeginOffset; }
- /// \brief Allow iterating the dead expressions referring to this alloca.
+ /// \brief The end offset of this partition.
+ ///
+ /// All of the contained slices end at or before this offset.
+ uint64_t endOffset() const { return EndOffset; }
+
+ /// \brief The size of the partition.
+ ///
+ /// Note that this can never be zero.
+ uint64_t size() const {
+ assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
+ return EndOffset - BeginOffset;
+ }
+
+ /// \brief Test whether this partition contains no slices, and merely spans
+ /// a region occupied by split slices.
+ bool empty() const { return SI == SJ; }
+
+ /// \name Iterate slices that start within the partition.
+ /// These may be splittable or unsplittable. They have a begin offset >= the
+ /// partition begin offset.
+ /// @{
+ // FIXME: We should probably define a "concat_iterator" helper and use that
+ // to stitch together pointee_iterators over the split tails and the
+ // contiguous iterators of the partition. That would give a much nicer
+ // interface here. We could then additionally expose filtered iterators for
+    // split, unsplit, and unsplittable slices based on the usage patterns.
+ iterator begin() const { return SI; }
+ iterator end() const { return SJ; }
+ /// @}
+
+ /// \brief Get the sequence of split slice tails.
+ ///
+ /// These tails are of slices which start before this partition but are
+ /// split and overlap into the partition. We accumulate these while forming
+ /// partitions.
+ ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
+ };
+
+ /// \brief An iterator over partitions of the alloca's slices.
+ ///
+ /// This iterator implements the core algorithm for partitioning the alloca's
+ /// slices. It is a forward iterator as we don't support backtracking for
+ /// efficiency reasons, and re-use a single storage area to maintain the
+ /// current set of split slices.
+ ///
+ /// It is templated on the slice iterator type to use so that it can operate
+ /// with either const or non-const slice iterators.
+ class partition_iterator
+ : public iterator_facade_base<partition_iterator,
+ std::forward_iterator_tag, Partition> {
+ friend class AllocaSlices;
+
+ /// \brief Most of the state for walking the partitions is held in a class
+ /// with a nice interface for examining them.
+ Partition P;
+
+ /// \brief We need to keep the end of the slices to know when to stop.
+ AllocaSlices::iterator SE;
+
+ /// \brief We also need to keep track of the maximum split end offset seen.
+ /// FIXME: Do we really?
+ uint64_t MaxSplitSliceEndOffset;
+
+    /// \brief Sets the partition to be empty at the given iterator, and sets the
+ /// end iterator.
+ partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
+ : P(SI), SE(SE), MaxSplitSliceEndOffset(0) {
+ // If not already at the end, advance our state to form the initial
+ // partition.
+ if (SI != SE)
+ advance();
+ }
+
+ /// \brief Advance the iterator to the next partition.
+ ///
+ /// Requires that the iterator not be at the end of the slices.
+ void advance() {
+ assert((P.SI != SE || !P.SplitTails.empty()) &&
+ "Cannot advance past the end of the slices!");
+
+ // Clear out any split uses which have ended.
+ if (!P.SplitTails.empty()) {
+ if (P.EndOffset >= MaxSplitSliceEndOffset) {
+ // If we've finished all splits, this is easy.
+ P.SplitTails.clear();
+ MaxSplitSliceEndOffset = 0;
+ } else {
+ // Remove the uses which have ended in the prior partition. This
+ // cannot change the max split slice end because we just checked that
+ // the prior partition ended prior to that max.
+ P.SplitTails.erase(
+ std::remove_if(
+ P.SplitTails.begin(), P.SplitTails.end(),
+ [&](Slice *S) { return S->endOffset() <= P.EndOffset; }),
+ P.SplitTails.end());
+ assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(),
+ [&](Slice *S) {
+ return S->endOffset() == MaxSplitSliceEndOffset;
+ }) &&
+ "Could not find the current max split slice offset!");
+ assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(),
+ [&](Slice *S) {
+ return S->endOffset() <= MaxSplitSliceEndOffset;
+ }) &&
+ "Max split slice end offset is not actually the max!");
+ }
+ }
+
+ // If P.SI is already at the end, then we've cleared the split tail and
+ // now have an end iterator.
+ if (P.SI == SE) {
+ assert(P.SplitTails.empty() && "Failed to clear the split slices!");
+ return;
+ }
+
+ // If we had a non-empty partition previously, set up the state for
+ // subsequent partitions.
+ if (P.SI != P.SJ) {
+ // Accumulate all the splittable slices which started in the old
+ // partition into the split list.
+ for (Slice &S : P)
+ if (S.isSplittable() && S.endOffset() > P.EndOffset) {
+ P.SplitTails.push_back(&S);
+ MaxSplitSliceEndOffset =
+ std::max(S.endOffset(), MaxSplitSliceEndOffset);
+ }
+
+ // Start from the end of the previous partition.
+ P.SI = P.SJ;
+
+ // If P.SI is now at the end, we at most have a tail of split slices.
+ if (P.SI == SE) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = MaxSplitSliceEndOffset;
+ return;
+ }
+
+      // If we have split slices and the next slice is after a gap and is
+      // not splittable, immediately form an empty partition for the split
+ // slices up until the next slice begins.
+ if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
+ !P.SI->isSplittable()) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = P.SI->beginOffset();
+ return;
+ }
+ }
+
+ // OK, we need to consume new slices. Set the end offset based on the
+ // current slice, and step SJ past it. The beginning offset of the
+      // partition is the beginning offset of the next slice unless we have
+ // pre-existing split slices that are continuing, in which case we begin
+ // at the prior end offset.
+ P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
+ P.EndOffset = P.SI->endOffset();
+ ++P.SJ;
+
+ // There are two strategies to form a partition based on whether the
+ // partition starts with an unsplittable slice or a splittable slice.
+ if (!P.SI->isSplittable()) {
+ // When we're forming an unsplittable region, it must always start at
+ // the first slice and will extend through its end.
+ assert(P.BeginOffset == P.SI->beginOffset());
+
+ // Form a partition including all of the overlapping slices with this
+ // unsplittable slice.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ if (!P.SJ->isSplittable())
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+ // We have a partition across a set of overlapping unsplittable
+        // slices.
+ return;
+ }
+
+ // If we're starting with a splittable slice, then we need to form
+ // a synthetic partition spanning it and any other overlapping splittable
+      // slices.
+ assert(P.SI->isSplittable() && "Forming a splittable partition!");
+
+ // Collect all of the overlapping splittable slices.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
+ P.SJ->isSplittable()) {
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+      // Back up P.EndOffset if we ended the span early when encountering an
+ // unsplittable slice. This synthesizes the early end offset of
+ // a partition spanning only splittable slices.
+ if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ assert(!P.SJ->isSplittable());
+ P.EndOffset = P.SJ->beginOffset();
+ }
+ }
+
+ public:
+ bool operator==(const partition_iterator &RHS) const {
+ assert(SE == RHS.SE &&
+ "End iterators don't match between compared partition iterators!");
+
+    // The observed positions of partitions are marked by the P.SI iterator and
+    // the emptiness of the split slices. The latter is only relevant when
+ // P.SI == SE, as the end iterator will additionally have an empty split
+ // slices list, but the prior may have the same P.SI and a tail of split
+ // slices.
+ if (P.SI == RHS.P.SI &&
+ P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
+ assert(P.SJ == RHS.P.SJ &&
+ "Same set of slices formed two different sized partitions!");
+ assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
+ "Same slice position with differently sized non-empty split "
+ "slice tails!");
+ return true;
+ }
+ return false;
+ }
+
+ partition_iterator &operator++() {
+ advance();
+ return *this;
+ }
+
+ Partition &operator*() { return P; }
+ };
+
+ /// \brief A forward range over the partitions of the alloca's slices.
+ ///
+ /// This accesses an iterator range over the partitions of the alloca's
+ /// slices. It computes these partitions on the fly based on the overlapping
+ /// offsets of the slices and the ability to split them. It will visit "empty"
+ /// partitions to cover regions of the alloca only accessed via split
+ /// slices.
+ iterator_range<partition_iterator> partitions() {
+ return make_range(partition_iterator(begin(), end()),
+ partition_iterator(end(), end()));
+ }
+
+ /// \brief Access the dead users for this alloca.
+ ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
+
+ /// \brief Access the dead operands referring to this alloca.
///
 /// These are operands which cannot actually be used to refer to the
/// alloca as they are outside its range and the user doesn't correct for
/// that. These mostly consist of PHI node inputs and the like which we just
/// need to replace with undef.
- /// @{
- typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator;
- dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); }
- dead_op_iterator dead_op_end() const { return DeadOperands.end(); }
- /// @}
+ ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
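The new AllocaSlices::Partition and partition_iterator in the hunk above walk the sorted slice list and carve it into either maximal regions seeded by an unsplittable slice or runs of overlapping splittable slices cut short at the next unsplittable one. Below is a toy standalone sketch of that core loop, deliberately ignoring the split-slice-tail bookkeeping and the empty partitions it can produce (illustrative only, not LLVM code; for this particular input the omitted bookkeeping does not change the result):

#include <algorithm>
#include <cstdio>
#include <vector>

struct Slice { unsigned Begin, End; bool Splittable; };

int main() {
  // Already sorted by begin offset, as AllocaSlices guarantees.
  std::vector<Slice> S = {
      {0, 8, false}, {0, 8, true}, {8, 16, true}, {10, 16, true}, {12, 20, false}};
  size_t I = 0;
  while (I != S.size()) {
    unsigned Begin = S[I].Begin, End = S[I].End;
    size_t J = I + 1;
    if (!S[I].Splittable) {
      // Unsplittable seed: absorb every overlapping slice; only other
      // unsplittable slices may push the end offset further out.
      while (J != S.size() && S[J].Begin < End) {
        if (!S[J].Splittable)
          End = std::max(End, S[J].End);
        ++J;
      }
    } else {
      // Splittable seed: absorb the run of overlapping splittable slices,
      // then stop short of the first overlapping unsplittable slice.
      while (J != S.size() && S[J].Begin < End && S[J].Splittable) {
        End = std::max(End, S[J].End);
        ++J;
      }
      if (J != S.size() && S[J].Begin < End)
        End = S[J].Begin;
    }
    std::printf("partition [%u, %u)\n", Begin, End); // [0,8) [8,12) [12,20)
    I = J;
  }
  return 0;
}
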
@@ -317,13 +602,22 @@ static Value *foldSelectInst(SelectInst &SI) {
// being selected between, fold the select. Yes this does (rarely) happen
// early on.
if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
- return SI.getOperand(1+CI->isZero());
+ return SI.getOperand(1 + CI->isZero());
if (SI.getOperand(1) == SI.getOperand(2))
return SI.getOperand(1);
return nullptr;
}
+/// \brief A helper that folds a PHI node or a select.
+static Value *foldPHINodeOrSelectInst(Instruction &I) {
+ if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+ // If PN merges together the same value, return that value.
+ return PN->hasConstantValue();
+ }
+ return foldSelectInst(cast<SelectInst>(I));
+}
+
/// \brief Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
@@ -334,7 +628,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
typedef PtrUseVisitor<SliceBuilder> Base;
const uint64_t AllocSize;
- AllocaSlices &S;
+ AllocaSlices &AS;
SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
@@ -343,14 +637,14 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
public:
- SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &S)
+ SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
: PtrUseVisitor<SliceBuilder>(DL),
- AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), S(S) {}
+ AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), AS(AS) {}
private:
void markAsDead(Instruction &I) {
- if (VisitedDeadInsts.insert(&I))
- S.DeadUsers.push_back(&I);
+ if (VisitedDeadInsts.insert(&I).second)
+ AS.DeadUsers.push_back(&I);
}
void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
@@ -361,7 +655,7 @@ private:
DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
<< " which has zero size or starts outside of the "
<< AllocSize << " byte alloca:\n"
- << " alloca: " << S.AI << "\n"
+ << " alloca: " << AS.AI << "\n"
<< " use: " << I << "\n");
return markAsDead(I);
}
@@ -379,12 +673,12 @@ private:
if (Size > AllocSize - BeginOffset) {
DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
<< " to remain within the " << AllocSize << " byte alloca:\n"
- << " alloca: " << S.AI << "\n"
+ << " alloca: " << AS.AI << "\n"
<< " use: " << I << "\n");
EndOffset = AllocSize;
}
- S.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
+ AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
}
void visitBitCastInst(BitCastInst &BC) {
@@ -421,7 +715,8 @@ private:
GEPOffset +=
APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
} else {
- // For array or vector indices, scale the index by the size of the type.
+ // For array or vector indices, scale the index by the size of the
+ // type.
APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
GEPOffset += Index * APInt(Offset.getBitWidth(),
DL.getTypeAllocSize(GTI.getIndexedType()));
@@ -440,16 +735,10 @@ private:
void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
uint64_t Size, bool IsVolatile) {
- // We allow splitting of loads and stores where the type is an integer type
- // and cover the entire alloca. This prevents us from splitting over
- // eagerly.
- // FIXME: In the great blue eventually, we should eagerly split all integer
- // loads and stores, and then have a separate step that merges adjacent
- // alloca partitions into a single partition suitable for integer widening.
- // Or we should skip the merge step and rely on GVN and other passes to
- // merge adjacent loads and stores that survive mem2reg.
- bool IsSplittable =
- Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize;
+ // We allow splitting of non-volatile loads and stores where the type is an
+ // integer type. These may be used to implement 'memcpy' or other "transfer
+ // of bits" patterns.
+ bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
insertUse(I, Offset, Size, IsSplittable);
}
@@ -485,7 +774,7 @@ private:
DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
<< " which extends past the end of the " << AllocSize
<< " byte alloca:\n"
- << " alloca: " << S.AI << "\n"
+ << " alloca: " << AS.AI << "\n"
<< " use: " << SI << "\n");
return markAsDead(SI);
}
@@ -495,7 +784,6 @@ private:
handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
}
-
void visitMemSetInst(MemSetInst &II) {
assert(II.getRawDest() == *U && "Pointer use is not the destination?");
ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
@@ -507,9 +795,8 @@ private:
if (!IsOffsetKnown)
return PI.setAborted(&II);
- insertUse(II, Offset,
- Length ? Length->getLimitedValue()
- : AllocSize - Offset.getLimitedValue(),
+ insertUse(II, Offset, Length ? Length->getLimitedValue()
+ : AllocSize - Offset.getLimitedValue(),
(bool)Length);
}
@@ -533,15 +820,15 @@ private:
// FIXME: Yet another place we really should bypass this when
// instrumenting for ASan.
if (Offset.uge(AllocSize)) {
- SmallDenseMap<Instruction *, unsigned>::iterator MTPI = MemTransferSliceMap.find(&II);
+ SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
+ MemTransferSliceMap.find(&II);
if (MTPI != MemTransferSliceMap.end())
- S.Slices[MTPI->second].kill();
+ AS.Slices[MTPI->second].kill();
return markAsDead(II);
}
uint64_t RawOffset = Offset.getLimitedValue();
- uint64_t Size = Length ? Length->getLimitedValue()
- : AllocSize - RawOffset;
+ uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
// Check for the special case where the same exact value is used for both
// source and dest.
@@ -558,10 +845,10 @@ private:
bool Inserted;
SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
std::tie(MTPI, Inserted) =
- MemTransferSliceMap.insert(std::make_pair(&II, S.Slices.size()));
+ MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
unsigned PrevIdx = MTPI->second;
if (!Inserted) {
- Slice &PrevP = S.Slices[PrevIdx];
+ Slice &PrevP = AS.Slices[PrevIdx];
// Check if the begin offsets match and this is a non-volatile transfer.
// In that case, we can completely elide the transfer.
@@ -579,7 +866,7 @@ private:
insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
// Check that we ended up with a valid index in the map.
- assert(S.Slices[PrevIdx].getUse()->getUser() == &II &&
+ assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
"Map index doesn't point back to a slice with this user.");
}
@@ -639,64 +926,47 @@ private:
}
for (User *U : I->users())
- if (Visited.insert(cast<Instruction>(U)))
+ if (Visited.insert(cast<Instruction>(U)).second)
Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
} while (!Uses.empty());
return nullptr;
}
- void visitPHINode(PHINode &PN) {
- if (PN.use_empty())
- return markAsDead(PN);
- if (!IsOffsetKnown)
- return PI.setAborted(&PN);
-
- // See if we already have computed info on this node.
- uint64_t &PHISize = PHIOrSelectSizes[&PN];
- if (!PHISize) {
- // This is a new PHI node, check for an unsafe use of the PHI node.
- if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHISize))
- return PI.setAborted(UnsafeI);
- }
-
- // For PHI and select operands outside the alloca, we can't nuke the entire
- // phi or select -- the other side might still be relevant, so we special
- // case them here and use a separate structure to track the operands
- // themselves which should be replaced with undef.
- // FIXME: This should instead be escaped in the event we're instrumenting
- // for address sanitization.
- if (Offset.uge(AllocSize)) {
- S.DeadOperands.push_back(U);
- return;
- }
-
- insertUse(PN, Offset, PHISize);
- }
+ void visitPHINodeOrSelectInst(Instruction &I) {
+ assert(isa<PHINode>(I) || isa<SelectInst>(I));
+ if (I.use_empty())
+ return markAsDead(I);
- void visitSelectInst(SelectInst &SI) {
- if (SI.use_empty())
- return markAsDead(SI);
- if (Value *Result = foldSelectInst(SI)) {
+ // TODO: We could use SimplifyInstruction here to fold PHINodes and
+    // SelectInsts. However, doing so requires changing the current
+ // dead-operand-tracking mechanism. For instance, suppose neither loading
+ // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
+ // trap either. However, if we simply replace %U with undef using the
+ // current dead-operand-tracking mechanism, "load (select undef, undef,
+ // %other)" may trap because the select may return the first operand
+ // "undef".
+ if (Value *Result = foldPHINodeOrSelectInst(I)) {
if (Result == *U)
// If the result of the constant fold will be the pointer, recurse
- // through the select as if we had RAUW'ed it.
- enqueueUsers(SI);
+ // through the PHI/select as if we had RAUW'ed it.
+ enqueueUsers(I);
else
- // Otherwise the operand to the select is dead, and we can replace it
- // with undef.
- S.DeadOperands.push_back(U);
+ // Otherwise the operand to the PHI/select is dead, and we can replace
+ // it with undef.
+ AS.DeadOperands.push_back(U);
return;
}
+
if (!IsOffsetKnown)
- return PI.setAborted(&SI);
+ return PI.setAborted(&I);
// See if we already have computed info on this node.
- uint64_t &SelectSize = PHIOrSelectSizes[&SI];
- if (!SelectSize) {
- // This is a new Select, check for an unsafe use of it.
- if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectSize))
+ uint64_t &Size = PHIOrSelectSizes[&I];
+ if (!Size) {
+ // This is a new PHI/Select, check for an unsafe use of it.
+ if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
return PI.setAborted(UnsafeI);
}
@@ -707,17 +977,19 @@ private:
// FIXME: This should instead be escaped in the event we're instrumenting
// for address sanitization.
if (Offset.uge(AllocSize)) {
- S.DeadOperands.push_back(U);
+ AS.DeadOperands.push_back(U);
return;
}
- insertUse(SI, Offset, SelectSize);
+ insertUse(I, Offset, Size);
}
+ void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
+
+ void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
+
/// \brief Disable SROA entirely if there are unhandled users of the alloca.
- void visitInstruction(Instruction &I) {
- PI.setAborted(&I);
- }
+ void visitInstruction(Instruction &I) { PI.setAborted(&I); }
};
AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
@@ -738,7 +1010,9 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
}
Slices.erase(std::remove_if(Slices.begin(), Slices.end(),
- std::mem_fun_ref(&Slice::isDead)),
+ [](const Slice &S) {
+ return S.isDead();
+ }),
Slices.end());
#if __cplusplus >= 201103L && !defined(NDEBUG)
@@ -758,6 +1032,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
void AllocaSlices::print(raw_ostream &OS, const_iterator I,
StringRef Indent) const {
printSlice(OS, I, Indent);
+ OS << "\n";
printUse(OS, I, Indent);
}
@@ -765,7 +1040,7 @@ void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
StringRef Indent) const {
OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
<< " slice #" << (I - begin())
- << (I->isSplittable() ? " (splittable)" : "") << "\n";
+ << (I->isSplittable() ? " (splittable)" : "");
}
void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
@@ -813,15 +1088,17 @@ public:
AllocaInst &AI, DIBuilder &DIB)
: LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
- void run(const SmallVectorImpl<Instruction*> &Insts) {
+ void run(const SmallVectorImpl<Instruction *> &Insts) {
// Retain the debug information attached to the alloca for use when
// rewriting loads and stores.
- if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) {
- for (User *U : DebugNode->users())
- if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
- DDIs.push_back(DDI);
- else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
- DVIs.push_back(DVI);
+ if (auto *L = LocalAsMetadata::getIfExists(&AI)) {
+ if (auto *DebugNode = MetadataAsValue::getIfExists(AI.getContext(), L)) {
+ for (User *U : DebugNode->users())
+ if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
+ DDIs.push_back(DDI);
+ else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
+ DVIs.push_back(DVI);
+ }
}
LoadAndStorePromoter::run(Insts);
@@ -834,8 +1111,9 @@ public:
DVIs.pop_back_val()->eraseFromParent();
}
- bool isInstInList(Instruction *I,
- const SmallVectorImpl<Instruction*> &Insts) const override {
+ bool
+ isInstInList(Instruction *I,
+ const SmallVectorImpl<Instruction *> &Insts) const override {
Value *Ptr;
if (LoadInst *LI = dyn_cast<LoadInst>(I))
Ptr = LI->getOperand(0);
@@ -857,23 +1135,18 @@ public:
else
return false;
- } while (Visited.insert(Ptr));
+ } while (Visited.insert(Ptr).second);
return false;
}
void updateDebugInfo(Instruction *Inst) const override {
- for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(),
- E = DDIs.end(); I != E; ++I) {
- DbgDeclareInst *DDI = *I;
+ for (DbgDeclareInst *DDI : DDIs)
if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
- }
- for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(),
- E = DVIs.end(); I != E; ++I) {
- DbgValueInst *DVI = *I;
+ for (DbgValueInst *DVI : DVIs) {
Value *Arg = nullptr;
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
// If an argument is zero extended then use argument directly. The ZExt
@@ -890,15 +1163,14 @@ public:
continue;
}
Instruction *DbgVal =
- DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
- Inst);
+ DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
+ DIExpression(DVI->getExpression()), Inst);
DbgVal->setDebugLoc(DVI->getDebugLoc());
}
}
};
} // end anon namespace
-
namespace {
/// \brief An optimization pass providing Scalar Replacement of Aggregates.
///
@@ -924,6 +1196,7 @@ class SROA : public FunctionPass {
LLVMContext *C;
const DataLayout *DL;
DominatorTree *DT;
+ AssumptionCache *AC;
/// \brief Worklist of alloca instructions to simplify.
///
@@ -932,12 +1205,12 @@ class SROA : public FunctionPass {
/// directly promoted. Finally, each time we rewrite a use of an alloca other
/// the one being actively rewritten, we add it back onto the list if not
/// already present to ensure it is re-visited.
- SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist;
+ SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> Worklist;
/// \brief A collection of instructions to delete.
/// We try to batch deletions to simplify code and make things a bit more
/// efficient.
- SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts;
+ SetVector<Instruction *, SmallVector<Instruction *, 8>> DeadInsts;
/// \brief Post-promotion worklist.
///
@@ -947,7 +1220,7 @@ class SROA : public FunctionPass {
///
/// Note that we have to be very careful to clear allocas out of this list in
/// the event they are deleted.
- SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist;
+ SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> PostPromotionWorklist;
/// \brief A collection of alloca instructions we can directly promote.
std::vector<AllocaInst *> PromotableAllocas;
@@ -957,7 +1230,7 @@ class SROA : public FunctionPass {
/// All of these PHIs have been checked for the safety of speculation and by
/// being speculated will allow promoting allocas currently in the promotable
/// queue.
- SetVector<PHINode *, SmallVector<PHINode *, 2> > SpeculatablePHIs;
+ SetVector<PHINode *, SmallVector<PHINode *, 2>> SpeculatablePHIs;
/// \brief A worklist of select instructions to speculate prior to promoting
/// allocas.
@@ -965,12 +1238,12 @@ class SROA : public FunctionPass {
/// All of these select instructions have been checked for the safety of
/// speculation and by being speculated will allow promoting allocas
/// currently in the promotable queue.
- SetVector<SelectInst *, SmallVector<SelectInst *, 2> > SpeculatableSelects;
+ SetVector<SelectInst *, SmallVector<SelectInst *, 2>> SpeculatableSelects;
public:
SROA(bool RequiresDomTree = true)
- : FunctionPass(ID), RequiresDomTree(RequiresDomTree),
- C(nullptr), DL(nullptr), DT(nullptr) {
+ : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr),
+ DL(nullptr), DT(nullptr) {
initializeSROAPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
@@ -983,14 +1256,13 @@ private:
friend class PHIOrSelectSpeculator;
friend class AllocaSliceRewriter;
- bool rewritePartition(AllocaInst &AI, AllocaSlices &S,
- AllocaSlices::iterator B, AllocaSlices::iterator E,
- int64_t BeginOffset, int64_t EndOffset,
- ArrayRef<AllocaSlices::iterator> SplitUses);
- bool splitAlloca(AllocaInst &AI, AllocaSlices &S);
+ bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
+ bool rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+ AllocaSlices::Partition &P);
+ bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
bool runOnAlloca(AllocaInst &AI);
void clobberUse(Use &U);
- void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas);
+ void deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
bool promoteAllocas(Function &F);
};
}
@@ -1001,11 +1273,12 @@ FunctionPass *llvm::createSROAPass(bool RequiresDomTree) {
return new SROA(RequiresDomTree);
}
-INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates",
- false, false)
+INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates",
- false, false)
+INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false,
+ false)
/// Walk the range of a partitioning looking for a common type to cover this
/// sequence of slices.
@@ -1076,8 +1349,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
///
/// FIXME: This should be hoisted into a generic utility, likely in
/// Transforms/Util/Local.h
-static bool isSafePHIToSpeculate(PHINode &PN,
- const DataLayout *DL = nullptr) {
+static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) {
// For now, we can only do this promotion if the load is in the same block
// as the PHI, and if there are no stores between the phi and load.
// TODO: Allow recursive phi users.
@@ -1148,10 +1420,12 @@ static void speculatePHINodeLoads(PHINode &PN) {
PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
PN.getName() + ".sroa.speculated");
- // Get the TBAA tag and alignment to use from one of the loads. It doesn't
+ // Get the AA tags and alignment to use from one of the loads. It doesn't
// matter which one we get and if any differ.
LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
- MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
+
+ AAMDNodes AATags;
+ SomeLoad->getAAMetadata(AATags);
unsigned Align = SomeLoad->getAlignment();
// Rewrite all loads of the PN to use the new PHI.
@@ -1172,8 +1446,8 @@ static void speculatePHINodeLoads(PHINode &PN) {
InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
++NumLoadsSpeculated;
Load->setAlignment(Align);
- if (TBAATag)
- Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+ if (AATags)
+ Load->setAAMetadata(AATags);
NewPN->addIncoming(Load, Pred);
}
@@ -1238,12 +1512,15 @@ static void speculateSelectInstLoads(SelectInst &SI) {
IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false");
NumLoadsSpeculated += 2;
- // Transfer alignment and TBAA info if present.
+ // Transfer alignment and AA info if present.
TL->setAlignment(LI->getAlignment());
FL->setAlignment(LI->getAlignment());
- if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
- TL->setMetadata(LLVMContext::MD_tbaa, Tag);
- FL->setMetadata(LLVMContext::MD_tbaa, Tag);
+
+ AAMDNodes Tags;
+ LI->getAAMetadata(Tags);
+ if (Tags) {
+ TL->setAAMetadata(Tags);
+ FL->setAAMetadata(Tags);
}
Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
@@ -1332,7 +1609,8 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
SmallVectorImpl<Value *> &Indices,
Twine NamePrefix) {
if (Offset == 0)
- return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, NamePrefix);
+ return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
+ NamePrefix);
// We can't recurse through pointer types.
if (Ty->isPointerTy())
@@ -1440,8 +1718,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
/// a single GEP as possible, thus making each GEP more independent of the
/// surrounding code.
static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
- APInt Offset, Type *PointerTy,
- Twine NamePrefix) {
+ APInt Offset, Type *PointerTy, Twine NamePrefix) {
// Even though we don't look through PHI nodes, we could be called on an
// instruction in an unreachable block, which may be on a cycle.
SmallPtrSet<Value *, 4> Visited;
@@ -1450,8 +1727,9 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
// We may end up computing an offset pointer that has the wrong type. If we
// never are able to compute one directly that has the correct type, we'll
- // fall back to it, so keep it around here.
+ // fall back to it, so keep it and the base it was computed from around here.
Value *OffsetPtr = nullptr;
+ Value *OffsetBasePtr;
// Remember any i8 pointer we come across to re-use if we need to do a raw
// byte offset.
@@ -1468,7 +1746,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
break;
Offset += GEPOffset;
Ptr = GEP->getPointerOperand();
- if (!Visited.insert(Ptr))
+ if (!Visited.insert(Ptr).second)
break;
}
@@ -1476,16 +1754,19 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
Indices.clear();
if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
Indices, NamePrefix)) {
- if (P->getType() == PointerTy) {
- // Zap any offset pointer that we ended up computing in previous rounds.
- if (OffsetPtr && OffsetPtr->use_empty())
- if (Instruction *I = dyn_cast<Instruction>(OffsetPtr))
- I->eraseFromParent();
+ // If we have a new natural pointer at the offset, clear out any old
+ // offset pointer we computed. Unless it is the base pointer or
+ // a non-instruction, we built a GEP we don't need. Zap it.
+ if (OffsetPtr && OffsetPtr != OffsetBasePtr)
+ if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
+        assert(I->use_empty() && "Built a GEP with uses somehow!");
+ I->eraseFromParent();
+ }
+ OffsetPtr = P;
+ OffsetBasePtr = Ptr;
+ // If we also found a pointer of the right type, we're done.
+ if (P->getType() == PointerTy)
return P;
- }
- if (!OffsetPtr) {
- OffsetPtr = P;
- }
}
// Stash this pointer if we've found an i8*.
@@ -1505,7 +1786,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
break;
}
assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
- } while (Visited.insert(Ptr));
+ } while (Visited.insert(Ptr).second);
if (!OffsetPtr) {
if (!Int8Ptr) {
@@ -1515,9 +1796,10 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
Int8PtrOffset = Offset;
}
- OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
- IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
- NamePrefix + "sroa_raw_idx");
+ OffsetPtr = Int8PtrOffset == 0
+ ? Int8Ptr
+ : IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
+ NamePrefix + "sroa_raw_idx");
}
Ptr = OffsetPtr;
@@ -1528,6 +1810,27 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
return Ptr;
}
+/// \brief Compute the adjusted alignment for a load or store from an offset.
+static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
+ const DataLayout &DL) {
+ unsigned Alignment;
+ Type *Ty;
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ Alignment = LI->getAlignment();
+ Ty = LI->getType();
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ Alignment = SI->getAlignment();
+ Ty = SI->getValueOperand()->getType();
+ } else {
+ llvm_unreachable("Only loads and stores are allowed!");
+ }
+
+ if (!Alignment)
+ Alignment = DL.getABITypeAlignment(Ty);
+
+ return MinAlign(Alignment, Offset);
+}
+
/// \brief Test whether we can convert a value from the old to the new type.
///
/// This predicate should be used to guard calls to convertValue in order to
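The getAdjustedAlignment helper added above takes the access's own alignment (falling back to the ABI alignment of its type) and clamps it with MinAlign for the offset at which the rewritten access lands. A standalone arithmetic sketch of that clamping, assuming MinAlign(A, B) returns the largest power of two dividing both values (the lowest set bit of A | B):

#include <cstdint>
#include <cstdio>

// Assumed behaviour of llvm::MinAlign: greatest power of two dividing both.
static uint64_t minAlign(uint64_t A, uint64_t B) {
  uint64_t Or = A | B;
  return Or & (~Or + 1); // isolate the lowest set bit
}

int main() {
  // A 16-byte-aligned base accessed at offset 4 is only 4-byte aligned...
  std::printf("%llu\n", (unsigned long long)minAlign(16, 4)); // 4
  // ...while offset 0 keeps the full original alignment.
  std::printf("%llu\n", (unsigned long long)minAlign(16, 0)); // 16
  return 0;
}
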
@@ -1621,39 +1924,43 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
///
/// This function is called to test each entry in a partitioning which is slated
/// for a single slice.
-static bool isVectorPromotionViableForSlice(
- const DataLayout &DL, AllocaSlices &S, uint64_t SliceBeginOffset,
- uint64_t SliceEndOffset, VectorType *Ty, uint64_t ElementSize,
- AllocaSlices::const_iterator I) {
+static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P,
+ const Slice &S, VectorType *Ty,
+ uint64_t ElementSize,
+ const DataLayout &DL) {
// First validate the slice offsets.
uint64_t BeginOffset =
- std::max(I->beginOffset(), SliceBeginOffset) - SliceBeginOffset;
+ std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
uint64_t BeginIndex = BeginOffset / ElementSize;
if (BeginIndex * ElementSize != BeginOffset ||
BeginIndex >= Ty->getNumElements())
return false;
uint64_t EndOffset =
- std::min(I->endOffset(), SliceEndOffset) - SliceBeginOffset;
+ std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
uint64_t EndIndex = EndOffset / ElementSize;
if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements())
return false;
assert(EndIndex > BeginIndex && "Empty vector!");
uint64_t NumElements = EndIndex - BeginIndex;
- Type *SliceTy =
- (NumElements == 1) ? Ty->getElementType()
- : VectorType::get(Ty->getElementType(), NumElements);
+ Type *SliceTy = (NumElements == 1)
+ ? Ty->getElementType()
+ : VectorType::get(Ty->getElementType(), NumElements);
Type *SplitIntTy =
Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
- Use *U = I->getUse();
+ Use *U = S.getUse();
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
if (MI->isVolatile())
return false;
- if (!I->isSplittable())
+ if (!S.isSplittable())
return false; // Skip any unsplittable intrinsics.
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+ if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+ II->getIntrinsicID() != Intrinsic::lifetime_end)
+ return false;
} else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
// Disable vector promotion when there are loads or stores of an FCA.
return false;
@@ -1661,8 +1968,7 @@ static bool isVectorPromotionViableForSlice(
if (LI->isVolatile())
return false;
Type *LTy = LI->getType();
- if (SliceBeginOffset > I->beginOffset() ||
- SliceEndOffset < I->endOffset()) {
+ if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
assert(LTy->isIntegerTy());
LTy = SplitIntTy;
}
@@ -1672,8 +1978,7 @@ static bool isVectorPromotionViableForSlice(
if (SI->isVolatile())
return false;
Type *STy = SI->getValueOperand()->getType();
- if (SliceBeginOffset > I->beginOffset() ||
- SliceEndOffset < I->endOffset()) {
+ if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
assert(STy->isIntegerTy());
STy = SplitIntTy;
}
@@ -1695,65 +2000,137 @@ static bool isVectorPromotionViableForSlice(
/// SSA value. We only can ensure this for a limited set of operations, and we
/// don't want to do the rewrites unless we are confident that the result will
/// be promotable, so we have an early test here.
-static bool
-isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, AllocaSlices &S,
- uint64_t SliceBeginOffset, uint64_t SliceEndOffset,
- AllocaSlices::const_iterator I,
- AllocaSlices::const_iterator E,
- ArrayRef<AllocaSlices::iterator> SplitUses) {
- VectorType *Ty = dyn_cast<VectorType>(AllocaTy);
- if (!Ty)
- return false;
+static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P,
+ const DataLayout &DL) {
+ // Collect the candidate types for vector-based promotion. Also track whether
+ // we have different element types.
+ SmallVector<VectorType *, 4> CandidateTys;
+ Type *CommonEltTy = nullptr;
+ bool HaveCommonEltTy = true;
+ auto CheckCandidateType = [&](Type *Ty) {
+ if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+ CandidateTys.push_back(VTy);
+ if (!CommonEltTy)
+ CommonEltTy = VTy->getElementType();
+ else if (CommonEltTy != VTy->getElementType())
+ HaveCommonEltTy = false;
+ }
+ };
+ // Consider any loads or stores that are the exact size of the slice.
+ for (const Slice &S : P)
+ if (S.beginOffset() == P.beginOffset() &&
+ S.endOffset() == P.endOffset()) {
+ if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
+ CheckCandidateType(LI->getType());
+ else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
+ CheckCandidateType(SI->getValueOperand()->getType());
+ }
- uint64_t ElementSize = DL.getTypeSizeInBits(Ty->getScalarType());
+ // If we didn't find a vector type, nothing to do here.
+ if (CandidateTys.empty())
+ return nullptr;
- // While the definition of LLVM vectors is bitpacked, we don't support sizes
- // that aren't byte sized.
- if (ElementSize % 8)
- return false;
- assert((DL.getTypeSizeInBits(Ty) % 8) == 0 &&
- "vector size not a multiple of element size?");
- ElementSize /= 8;
+ // Remove non-integer vector types if we had multiple common element types.
+ // FIXME: It'd be nice to replace them with integer vector types, but we can't
+ // do that until all the backends are known to produce good code for all
+ // integer vector types.
+ if (!HaveCommonEltTy) {
+ CandidateTys.erase(std::remove_if(CandidateTys.begin(), CandidateTys.end(),
+ [](VectorType *VTy) {
+ return !VTy->getElementType()->isIntegerTy();
+ }),
+ CandidateTys.end());
+
+ // If there were no integer vector types, give up.
+ if (CandidateTys.empty())
+ return nullptr;
- for (; I != E; ++I)
- if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset,
- SliceEndOffset, Ty, ElementSize, I))
- return false;
+ // Rank the remaining candidate vector types. This is easy because we know
+ // they're all integer vectors. We sort by ascending number of elements.
+ auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+ assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) &&
+ "Cannot have vector types of different sizes!");
+ assert(RHSTy->getElementType()->isIntegerTy() &&
+ "All non-integer types eliminated!");
+ assert(LHSTy->getElementType()->isIntegerTy() &&
+ "All non-integer types eliminated!");
+ return RHSTy->getNumElements() < LHSTy->getNumElements();
+ };
+ std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
+ CandidateTys.erase(
+ std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
+ CandidateTys.end());
+ } else {
+ // The only way to have the same element type in every vector type is to
+ // have the same vector type. Check that and remove all but one.
+#ifndef NDEBUG
+ for (VectorType *VTy : CandidateTys) {
+ assert(VTy->getElementType() == CommonEltTy &&
+ "Unaccounted for element type!");
+ assert(VTy == CandidateTys[0] &&
+ "Different vector types with the same element type!");
+ }
+#endif
+ CandidateTys.resize(1);
+ }
- for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(),
- SUE = SplitUses.end();
- SUI != SUE; ++SUI)
- if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset,
- SliceEndOffset, Ty, ElementSize, *SUI))
+ // Try each vector type, and return the one which works.
+ auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
+ uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType());
+
+ // While LLVM vectors are defined to be bit-packed, we don't support element
+ // sizes that aren't byte sized.
+ if (ElementSize % 8)
return false;
+ assert((DL.getTypeSizeInBits(VTy) % 8) == 0 &&
+ "vector size not a multiple of element size?");
+ ElementSize /= 8;
- return true;
+ for (const Slice &S : P)
+ if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+ return false;
+
+ for (const Slice *S : P.splitSliceTails())
+ if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+ return false;
+
+ return true;
+ };
+ for (VectorType *VTy : CandidateTys)
+ if (CheckVectorTypeForPromotion(VTy))
+ return VTy;
+
+ return nullptr;
}
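To make the ranking step above concrete, here is a small standalone model of sorting and de-duplicating candidate vector types by element count. VecTy below is an illustrative stand-in, not llvm::VectorType, and the equality predicate is deliberately simplified; it only sketches the intent of the sort/unique pass.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct VecTy {
      unsigned NumElements;
      unsigned EltBits; // all candidates share the same total bit width
    };

    int main() {
      std::vector<VecTy> Candidates = {{4, 32}, {16, 8}, {4, 32}, {8, 16}};
      // Rank by ascending element count, then drop duplicates.
      auto Rank = [](const VecTy &L, const VecTy &R) {
        return L.NumElements < R.NumElements;
      };
      auto Same = [](const VecTy &L, const VecTy &R) {
        return L.NumElements == R.NumElements;
      };
      std::sort(Candidates.begin(), Candidates.end(), Rank);
      Candidates.erase(std::unique(Candidates.begin(), Candidates.end(), Same),
                       Candidates.end());
      for (const VecTy &V : Candidates)
        std::printf("<%u x i%u>\n", V.NumElements, V.EltBits);
      // Prints: <4 x i32>, <8 x i16>, <16 x i8>
      return 0;
    }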
/// \brief Test whether a slice of an alloca is valid for integer widening.
///
/// This implements the necessary checking for the \c isIntegerWideningViable
/// test below on a single slice of the alloca.
-static bool isIntegerWideningViableForSlice(const DataLayout &DL,
- Type *AllocaTy,
+static bool isIntegerWideningViableForSlice(const Slice &S,
uint64_t AllocBeginOffset,
- uint64_t Size, AllocaSlices &S,
- AllocaSlices::const_iterator I,
+ Type *AllocaTy,
+ const DataLayout &DL,
bool &WholeAllocaOp) {
- uint64_t RelBegin = I->beginOffset() - AllocBeginOffset;
- uint64_t RelEnd = I->endOffset() - AllocBeginOffset;
+ uint64_t Size = DL.getTypeStoreSize(AllocaTy);
+
+ uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
+ uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
// We can't reasonably handle cases where the load or store extends past
// the end of the alloca's type and into its padding.
if (RelEnd > Size)
return false;
- Use *U = I->getUse();
+ Use *U = S.getUse();
if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
if (LI->isVolatile())
return false;
- if (RelBegin == 0 && RelEnd == Size)
+ // Note that we don't count vector loads or stores as whole-alloca
+ // operations which enable integer widening because we would prefer to use
+ // vector widening instead.
+ if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
WholeAllocaOp = true;
if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
@@ -1768,7 +2145,10 @@ static bool isIntegerWideningViableForSlice(const DataLayout &DL,
Type *ValueTy = SI->getValueOperand()->getType();
if (SI->isVolatile())
return false;
- if (RelBegin == 0 && RelEnd == Size)
+ // Note that we don't count vector loads or stores as whole-alloca
+ // operations which enable integer widening because we would prefer to use
+ // vector widening instead.
+ if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
WholeAllocaOp = true;
if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
@@ -1782,7 +2162,7 @@ static bool isIntegerWideningViableForSlice(const DataLayout &DL,
} else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
return false;
- if (!I->isSplittable())
+ if (!S.isSplittable())
return false; // Skip any unsplittable intrinsics.
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
@@ -1801,12 +2181,8 @@ static bool isIntegerWideningViableForSlice(const DataLayout &DL,
/// This is a quick test to check whether we can rewrite the integer loads and
/// stores to a particular alloca into wider loads and stores and be able to
/// promote the resulting alloca.
-static bool
-isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy,
- uint64_t AllocBeginOffset, AllocaSlices &S,
- AllocaSlices::const_iterator I,
- AllocaSlices::const_iterator E,
- ArrayRef<AllocaSlices::iterator> SplitUses) {
+static bool isIntegerWideningViable(AllocaSlices::Partition &P, Type *AllocaTy,
+ const DataLayout &DL) {
uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy);
// Don't create integer types larger than the maximum bitwidth.
if (SizeInBits > IntegerType::MAX_INT_BITS)
@@ -1824,25 +2200,24 @@ isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy,
!canConvertValue(DL, IntTy, AllocaTy))
return false;
- uint64_t Size = DL.getTypeStoreSize(AllocaTy);
-
// While examining uses, we ensure that the alloca has a covering load or
// store. We don't want to widen the integer operations only to fail to
// promote due to some other unsplittable entry (which we may make splittable
// later). However, if there are only splittable uses, go ahead and assume
// that we cover the alloca.
- bool WholeAllocaOp = (I != E) ? false : DL.isLegalInteger(SizeInBits);
-
- for (; I != E; ++I)
- if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size,
- S, I, WholeAllocaOp))
+ // FIXME: We shouldn't consider split slices that happen to start in the
+ // partition here...
+ bool WholeAllocaOp =
+ P.begin() != P.end() ? false : DL.isLegalInteger(SizeInBits);
+
+ for (const Slice &S : P)
+ if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
+ WholeAllocaOp))
return false;
- for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(),
- SUE = SplitUses.end();
- SUI != SUE; ++SUI)
- if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size,
- S, *SUI, WholeAllocaOp))
+ for (const Slice *S : P.splitSliceTails())
+ if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
+ WholeAllocaOp))
return false;
return WholeAllocaOp;
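A toy model of the covering-access requirement that WholeAllocaOp tracks above: widening is only worthwhile if some plain (non-vector) load or store spans the entire alloca. ToySlice and the helper below are illustrative stand-ins, not the SROA data structures.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct ToySlice {
      uint64_t Begin, End;
      bool IsVectorAccess;
    };

    // True if some non-vector load/store covers [0, AllocaSize) exactly.
    static bool hasCoveringScalarAccess(const std::vector<ToySlice> &Slices,
                                        uint64_t AllocaSize) {
      for (const ToySlice &S : Slices)
        if (!S.IsVectorAccess && S.Begin == 0 && S.End == AllocaSize)
          return true;
      return false;
    }

    int main() {
      std::vector<ToySlice> Slices = {{0, 4, false}, {4, 8, false}, {0, 8, false}};
      std::printf("%d\n", hasCoveringScalarAccess(Slices, 8));  // 1: the 8-byte access covers it
      std::printf("%d\n", hasCoveringScalarAccess(Slices, 16)); // 0: nothing covers 16 bytes
      return 0;
    }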
@@ -1855,9 +2230,9 @@ static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
IntegerType *IntTy = cast<IntegerType>(V->getType());
assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
"Element extends past full value");
- uint64_t ShAmt = 8*Offset;
+ uint64_t ShAmt = 8 * Offset;
if (DL.isBigEndian())
- ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
if (ShAmt) {
V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
DEBUG(dbgs() << " shifted: " << *V << "\n");
@@ -1884,9 +2259,9 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
}
assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
"Element store outside of alloca store");
- uint64_t ShAmt = 8*Offset;
+ uint64_t ShAmt = 8 * Offset;
if (DL.isBigEndian())
- ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
if (ShAmt) {
V = IRB.CreateShl(V, ShAmt, Name + ".shift");
DEBUG(dbgs() << " shifted: " << *V << "\n");
@@ -1902,9 +2277,8 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
return V;
}
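As a standalone illustration of the endianness handling shared by extractInteger and insertInteger above (a sketch with made-up names, not the LLVM helpers): a field at byte Offset inside a wider integer sits at bit 8*Offset on little-endian targets, and is counted from the opposite end on big-endian targets.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Shift placing/extracting a FieldBytes-sized value at byte Offset of a
    // WideBytes-sized integer.
    static uint64_t fieldShift(uint64_t WideBytes, uint64_t FieldBytes,
                               uint64_t Offset, bool BigEndian) {
      assert(FieldBytes + Offset <= WideBytes && "field extends past wide value");
      return BigEndian ? 8 * (WideBytes - FieldBytes - Offset) : 8 * Offset;
    }

    int main() {
      // The 2-byte field at byte offset 2 of an 8-byte integer:
      std::printf("LE shift = %llu\n",
                  (unsigned long long)fieldShift(8, 2, 2, /*BigEndian=*/false)); // 16
      std::printf("BE shift = %llu\n",
                  (unsigned long long)fieldShift(8, 2, 2, /*BigEndian=*/true));  // 32
      return 0;
    }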
-static Value *extractVector(IRBuilderTy &IRB, Value *V,
- unsigned BeginIndex, unsigned EndIndex,
- const Twine &Name) {
+static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
+ unsigned EndIndex, const Twine &Name) {
VectorType *VecTy = cast<VectorType>(V->getType());
unsigned NumElements = EndIndex - BeginIndex;
assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
@@ -1919,13 +2293,12 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V,
return V;
}
- SmallVector<Constant*, 8> Mask;
+ SmallVector<Constant *, 8> Mask;
Mask.reserve(NumElements);
for (unsigned i = BeginIndex; i != EndIndex; ++i)
Mask.push_back(IRB.getInt32(i));
V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
- ConstantVector::get(Mask),
- Name + ".extract");
+ ConstantVector::get(Mask), Name + ".extract");
DEBUG(dbgs() << " shuffle: " << *V << "\n");
return V;
}
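A minimal model of the shuffle masks that extractVector above and insertVector below construct, using plain index vectors with -1 standing in for undef lanes rather than llvm::Constant values; the helper names are illustrative only.

    #include <cstdio>
    #include <vector>

    // Mask selecting lanes [Begin, End) of the source vector.
    static std::vector<int> extractMask(unsigned Begin, unsigned End) {
      std::vector<int> Mask;
      for (unsigned i = Begin; i != End; ++i)
        Mask.push_back((int)i);
      return Mask;
    }

    // Mask widening a NumSub-lane value to NumElts lanes, placed at
    // [Begin, Begin + NumSub) with the remaining lanes undef (-1).
    static std::vector<int> expandMask(unsigned NumElts, unsigned Begin,
                                       unsigned NumSub) {
      std::vector<int> Mask(NumElts, -1);
      for (unsigned i = 0; i != NumSub; ++i)
        Mask[Begin + i] = (int)i;
      return Mask;
    }

    int main() {
      for (int i : extractMask(1, 3)) // pull lanes 1..2 out of a wider vector
        std::printf("%d ", i);        // prints: 1 2
      std::printf("\n");
      for (int i : expandMask(4, 1, 2)) // widen them back into a 4-lane slot
        std::printf("%d ", i);          // prints: -1 0 1 -1
      std::printf("\n");
      return 0;
    }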
@@ -1940,7 +2313,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
// Single element to insert.
V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
Name + ".insert");
- DEBUG(dbgs() << " insert: " << *V << "\n");
+ DEBUG(dbgs() << " insert: " << *V << "\n");
return V;
}
@@ -1956,7 +2329,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
// use a shuffle vector to widen it with undef elements, and then
// a second shuffle vector to select between the loaded vector and the
// incoming vector.
- SmallVector<Constant*, 8> Mask;
+ SmallVector<Constant *, 8> Mask;
Mask.reserve(VecTy->getNumElements());
for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
if (i >= BeginIndex && i < EndIndex)
@@ -1964,8 +2337,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
else
Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
- ConstantVector::get(Mask),
- Name + ".expand");
+ ConstantVector::get(Mask), Name + ".expand");
DEBUG(dbgs() << " shuffle: " << *V << "\n");
Mask.clear();
@@ -1991,12 +2363,18 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base;
const DataLayout &DL;
- AllocaSlices &S;
+ AllocaSlices &AS;
SROA &Pass;
AllocaInst &OldAI, &NewAI;
const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
Type *NewAllocaTy;
+ // This is a convenience and flag variable that will be null unless the new
+ // alloca's integer operations should be widened to this integer type due to
+ // passing isIntegerWideningViable above. If it is non-null, the desired
+ // integer type will be stored here for easy access during rewriting.
+ IntegerType *IntTy;
+
// If we are rewriting an alloca partition which can be written as pure
// vector operations, we stash extra information here. When VecTy is
// non-null, we have some strict guarantees about the rewritten alloca:
@@ -2010,12 +2388,6 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
Type *ElementTy;
uint64_t ElementSize;
- // This is a convenience and flag variable that will be null unless the new
- // alloca's integer operations should be widened to this integer type due to
- // passing isIntegerWideningViable above. If it is non-null, the desired
- // integer type will be stored here for easy access during rewriting.
- IntegerType *IntTy;
-
// The original offset of the slice currently being rewritten relative to
// the original alloca.
uint64_t BeginOffset, EndOffset;
@@ -2038,25 +2410,25 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
IRBuilderTy IRB;
public:
- AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &S, SROA &Pass,
+ AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
AllocaInst &OldAI, AllocaInst &NewAI,
uint64_t NewAllocaBeginOffset,
- uint64_t NewAllocaEndOffset, bool IsVectorPromotable,
- bool IsIntegerPromotable,
+ uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
+ VectorType *PromotableVecTy,
SmallPtrSetImpl<PHINode *> &PHIUsers,
SmallPtrSetImpl<SelectInst *> &SelectUsers)
- : DL(DL), S(S), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
+ : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
NewAllocaBeginOffset(NewAllocaBeginOffset),
NewAllocaEndOffset(NewAllocaEndOffset),
NewAllocaTy(NewAI.getAllocatedType()),
- VecTy(IsVectorPromotable ? cast<VectorType>(NewAllocaTy) : nullptr),
- ElementTy(VecTy ? VecTy->getElementType() : nullptr),
- ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
IntTy(IsIntegerPromotable
? Type::getIntNTy(
NewAI.getContext(),
DL.getTypeSizeInBits(NewAI.getAllocatedType()))
: nullptr),
+ VecTy(PromotableVecTy),
+ ElementTy(VecTy ? VecTy->getElementType() : nullptr),
+ ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
IRB(NewAI.getContext(), ConstantFolder()) {
@@ -2065,8 +2437,7 @@ public:
"Only multiple-of-8 sized vector elements are viable");
++NumVectorized;
}
- assert((!IsVectorPromotable && !IsIntegerPromotable) ||
- IsVectorPromotable != IsIntegerPromotable);
+ assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
}
bool visit(AllocaSlices::const_iterator I) {
@@ -2076,6 +2447,9 @@ public:
IsSplittable = I->isSplittable();
IsSplit =
BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
+ DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
+ DEBUG(AS.printSlice(dbgs(), I, ""));
+ DEBUG(dbgs() << "\n");
// Compute the intersecting offset range.
assert(BeginOffset < NewAllocaEndOffset);
@@ -2146,7 +2520,8 @@ private:
);
}
- /// \brief Compute suitable alignment to access this slice of the *new* alloca.
+ /// \brief Compute suitable alignment to access this slice of the *new*
+ /// alloca.
///
/// You can optionally pass a type to this routine and if that type's ABI
/// alignment is itself suitable, this will return zero.
@@ -2154,7 +2529,8 @@ private:
unsigned NewAIAlign = NewAI.getAlignment();
if (!NewAIAlign)
NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType());
- unsigned Align = MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset);
+ unsigned Align =
+ MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset);
return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align;
}
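A rough sketch of the slice-alignment computation above: the alignment guaranteed at a byte offset into the new alloca is the largest power of two dividing both the alloca's alignment and the offset. min_align below is a from-memory reimplementation of llvm::MinAlign, and the numbers are only an example.

    #include <cstdint>
    #include <cstdio>

    static uint64_t min_align(uint64_t A, uint64_t B) {
      // Lowest set bit of (A | B): the largest power of two dividing both.
      uint64_t C = A | B;
      return C & (~C + 1);
    }

    int main() {
      // A 16-byte-aligned alloca accessed at byte offset 4 only guarantees
      // 4-byte alignment for the rewritten load or store.
      std::printf("%llu\n", (unsigned long long)min_align(16, 4));  // 4
      std::printf("%llu\n", (unsigned long long)min_align(16, 32)); // 16
      return 0;
    }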
@@ -2178,16 +2554,14 @@ private:
unsigned EndIndex = getIndex(NewEndOffset);
assert(EndIndex > BeginIndex && "Empty vector!");
- Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "load");
+ Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
}
Value *rewriteIntegerLoad(LoadInst &LI) {
assert(IntTy && "We cannot insert an integer to the alloca");
assert(!LI.isVolatile());
- Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "load");
+ Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
V = convertValue(DL, IRB, V, IntTy);
assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
@@ -2212,8 +2586,8 @@ private:
V = rewriteIntegerLoad(LI);
} else if (NewBeginOffset == NewAllocaBeginOffset &&
canConvertValue(DL, NewAllocaTy, LI.getType())) {
- V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- LI.isVolatile(), LI.getName());
+ V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(),
+ LI.getName());
} else {
Type *LTy = TargetTy->getPointerTo();
V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
@@ -2230,7 +2604,7 @@ private:
assert(SliceSize < DL.getTypeStoreSize(LI.getType()) &&
"Split load isn't smaller than original load");
assert(LI.getType()->getIntegerBitWidth() ==
- DL.getTypeStoreSizeInBits(LI.getType()) &&
+ DL.getTypeStoreSizeInBits(LI.getType()) &&
"Non-byte-multiple bit width");
// Move the insertion point just past the load so that we can refer to it.
IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI)));
@@ -2238,9 +2612,9 @@ private:
// basis for the new value. This allows us to replace the uses of LI with
// the computed value, and then replace the placeholder with LI, leaving
// LI only used for this computation.
- Value *Placeholder
- = new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
- V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset,
+ Value *Placeholder =
+ new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
+ V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
"insert");
LI.replaceAllUsesWith(V);
Placeholder->replaceAllUsesWith(&LI);
@@ -2262,15 +2636,14 @@ private:
assert(EndIndex > BeginIndex && "Empty vector!");
unsigned NumElements = EndIndex - BeginIndex;
assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
- Type *SliceTy =
- (NumElements == 1) ? ElementTy
- : VectorType::get(ElementTy, NumElements);
+ Type *SliceTy = (NumElements == 1)
+ ? ElementTy
+ : VectorType::get(ElementTy, NumElements);
if (V->getType() != SliceTy)
V = convertValue(DL, IRB, V, SliceTy);
// Mix in the existing elements.
- Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "load");
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
V = insertVector(IRB, Old, V, BeginIndex, "vec");
}
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
@@ -2285,13 +2658,12 @@ private:
assert(IntTy && "We cannot extract an integer from the alloca");
assert(!SI.isVolatile());
if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
- Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "oldload");
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
Old = convertValue(DL, IRB, Old, IntTy);
assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
- V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset,
- "insert");
+ V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
}
V = convertValue(DL, IRB, V, NewAllocaTy);
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
@@ -2319,10 +2691,10 @@ private:
assert(V->getType()->isIntegerTy() &&
"Only integer type loads and stores are split");
assert(V->getType()->getIntegerBitWidth() ==
- DL.getTypeStoreSizeInBits(V->getType()) &&
+ DL.getTypeStoreSizeInBits(V->getType()) &&
"Non-byte-multiple bit width");
IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
- V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset,
+ V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
"extract");
}
@@ -2367,14 +2739,14 @@ private:
if (Size == 1)
return V;
- Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8);
- V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, "zext"),
- ConstantExpr::getUDiv(
- Constant::getAllOnesValue(SplatIntTy),
- ConstantExpr::getZExt(
- Constant::getAllOnesValue(V->getType()),
- SplatIntTy)),
- "isplat");
+ Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
+ V = IRB.CreateMul(
+ IRB.CreateZExt(V, SplatIntTy, "zext"),
+ ConstantExpr::getUDiv(
+ Constant::getAllOnesValue(SplatIntTy),
+ ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
+ SplatIntTy)),
+ "isplat");
return V;
}
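The multiplication building the "isplat" value above is the usual 0x0101...01 trick: zero-extend the value and multiply by all-ones divided by the all-ones pattern of the source width. A hedged standalone sketch of the same arithmetic, assuming a single byte is being splatted:

    #include <cstdint>
    #include <cstdio>

    // Splat byte V across SizeInBytes bytes by multiplying with 0x0101...01.
    static uint64_t splatByte(uint8_t V, unsigned SizeInBytes) {
      uint64_t AllOnes =
          SizeInBytes == 8 ? ~0ULL : ((1ULL << (8 * SizeInBytes)) - 1);
      uint64_t Ones = AllOnes / 0xFF; // 0x0101...01 with SizeInBytes bytes set
      return (uint64_t)V * Ones;
    }

    int main() {
      std::printf("0x%llx\n", (unsigned long long)splatByte(0xAB, 4)); // 0xabababab
      std::printf("0x%llx\n", (unsigned long long)splatByte(0x07, 2)); // 0x707
      return 0;
    }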
@@ -2411,11 +2783,11 @@ private:
// If this doesn't map cleanly onto the alloca type, and that type isn't
// a single value type, just emit a memset.
if (!VecTy && !IntTy &&
- (BeginOffset > NewAllocaBeginOffset ||
- EndOffset < NewAllocaEndOffset ||
+ (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
+ SliceSize != DL.getTypeStoreSize(AllocaTy) ||
!AllocaTy->isSingleValueType() ||
!DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) ||
- DL.getTypeSizeInBits(ScalarTy)%8 != 0)) {
+ DL.getTypeSizeInBits(ScalarTy) % 8 != 0)) {
Type *SizeTy = II.getLength()->getType();
Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
CallInst *New = IRB.CreateMemSet(
@@ -2449,8 +2821,8 @@ private:
if (NumElements > 1)
Splat = getVectorSplat(Splat, NumElements);
- Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "oldload");
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
} else if (IntTy) {
// If this is a memset on an alloca where we can widen stores, insert the
@@ -2462,8 +2834,8 @@ private:
if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
EndOffset != NewAllocaBeginOffset)) {
- Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "oldload");
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
Old = convertValue(DL, IRB, Old, IntTy);
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
V = insertInteger(DL, IRB, Old, V, Offset, "insert");
@@ -2535,10 +2907,11 @@ private:
// If this doesn't map cleanly onto the alloca type, and that type isn't
// a single value type, just emit a memcpy.
- bool EmitMemCpy
- = !VecTy && !IntTy && (BeginOffset > NewAllocaBeginOffset ||
- EndOffset < NewAllocaEndOffset ||
- !NewAI.getAllocatedType()->isSingleValueType());
+ bool EmitMemCpy =
+ !VecTy && !IntTy &&
+ (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
+ SliceSize != DL.getTypeStoreSize(NewAI.getAllocatedType()) ||
+ !NewAI.getAllocatedType()->isSingleValueType());
// If we're just going to emit a memcpy, the alloca hasn't changed, and the
// size hasn't been shrunk based on analysis of the viable range, this is
@@ -2559,8 +2932,8 @@ private:
// Strip all inbounds GEPs and pointer casts to try to dig out any root
// alloca that should be re-examined after rewriting this instruction.
Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
- if (AllocaInst *AI
- = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
+ if (AllocaInst *AI =
+ dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
assert(AI != &OldAI && AI != &NewAI &&
"Splittable transfers cannot reach the same alloca on both ends.");
Pass.Worklist.insert(AI);
@@ -2599,8 +2972,8 @@ private:
unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
unsigned NumElements = EndIndex - BeginIndex;
- IntegerType *SubIntTy
- = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : nullptr;
+ IntegerType *SubIntTy =
+ IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
// Reset the other pointer type to match the register type we're going to
// use, but using the address space of the original other pointer.
@@ -2629,27 +3002,25 @@ private:
Value *Src;
if (VecTy && !IsWholeAlloca && !IsDest) {
- Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "load");
+ Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
} else if (IntTy && !IsWholeAlloca && !IsDest) {
- Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "load");
+ Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
Src = convertValue(DL, IRB, Src, IntTy);
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
} else {
- Src = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(),
- "copyload");
+ Src =
+ IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");
}
if (VecTy && !IsWholeAlloca && IsDest) {
- Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "oldload");
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
} else if (IntTy && !IsWholeAlloca && IsDest) {
- Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "oldload");
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
Old = convertValue(DL, IRB, Old, IntTy);
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
@@ -2672,8 +3043,8 @@ private:
// Record this instruction for deletion.
Pass.DeadInsts.insert(&II);
- ConstantInt *Size
- = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
+ ConstantInt *Size =
+ ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
NewEndOffset - NewBeginOffset);
Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
Value *New;
@@ -2740,7 +3111,6 @@ private:
SelectUsers.insert(&SI);
return true;
}
-
};
}
@@ -2787,7 +3157,7 @@ private:
/// This uses a set to de-duplicate users.
void enqueueUsers(Instruction &I) {
for (Use &U : I.uses())
- if (Visited.insert(U.getUser()))
+ if (Visited.insert(U.getUser()).second)
Queue.push_back(&U);
}
@@ -2795,8 +3165,7 @@ private:
bool visitInstruction(Instruction &I) { return false; }
/// \brief Generic recursive split emission class.
- template <typename Derived>
- class OpSplitter {
+ template <typename Derived> class OpSplitter {
protected:
/// The builder used to form new instructions.
IRBuilderTy IRB;
@@ -2813,7 +3182,7 @@ private:
/// Initialize the splitter with an insertion point, Ptr and start with a
/// single zero GEP index.
OpSplitter(Instruction *InsertionPoint, Value *Ptr)
- : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
+ : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
public:
/// \brief Generic recursive split emission routine.
@@ -2869,7 +3238,7 @@ private:
struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
- : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
+ : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
/// Emit a leaf load of a single value. This is called at the leaves of the
/// recursive emission to actually load values.
@@ -2900,7 +3269,7 @@ private:
struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
- : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
+ : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
/// Emit a leaf store of a single value. This is called at the leaves of the
/// recursive emission to actually produce stores.
@@ -2908,8 +3277,8 @@ private:
assert(Ty->isSingleValueType());
// Extract the single value and store it using the indices.
Value *Store = IRB.CreateStore(
- IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
- IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
+ IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
+ IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
(void)Store;
DEBUG(dbgs() << " to: " << *Store << "\n");
}
@@ -2995,8 +3364,8 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
/// when the size or offset cause either end of type-based partition to be off.
/// Also, this is a best-effort routine. It is reasonable to give up and not
/// return a type if necessary.
-static Type *getTypePartition(const DataLayout &DL, Type *Ty,
- uint64_t Offset, uint64_t Size) {
+static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
+ uint64_t Size) {
if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size)
return stripAggregateTypeWrapping(DL, Ty);
if (Offset > DL.getTypeAllocSize(Ty) ||
@@ -3088,8 +3457,8 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty,
}
// Try to build up a sub-structure.
- StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE),
- STy->isPacked());
+ StructType *SubTy =
+ StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
const StructLayout *SubSL = DL.getStructLayout(SubTy);
if (Size != SubSL->getSizeInBytes())
return nullptr; // The sub-struct doesn't have quite the size needed.
@@ -3097,6 +3466,494 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty,
return SubTy;
}
+/// \brief Pre-split loads and stores to simplify rewriting.
+///
+/// We want to break up the splittable load+store pairs as much as
+/// possible. This is important to do as a preprocessing step, as once we
+/// start rewriting the accesses to partitions of the alloca we lose the
+/// necessary information to correctly split apart paired loads and stores
+/// which both point into this alloca. The case to consider is something like
+/// the following:
+///
+/// %a = alloca [12 x i8]
+/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
+/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
+/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
+/// %iptr1 = bitcast i8* %gep1 to i64*
+/// %iptr2 = bitcast i8* %gep2 to i64*
+/// %fptr1 = bitcast i8* %gep1 to float*
+/// %fptr2 = bitcast i8* %gep2 to float*
+/// %fptr3 = bitcast i8* %gep3 to float*
+/// store float 0.0, float* %fptr1
+/// store float 1.0, float* %fptr2
+/// %v = load i64* %iptr1
+/// store i64 %v, i64* %iptr2
+/// %f1 = load float* %fptr2
+/// %f2 = load float* %fptr3
+///
+/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
+/// promote everything so we recover the 2 SSA values that should have been
+/// there all along.
+///
+/// \returns true if any changes are made.
+bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
+ DEBUG(dbgs() << "Pre-splitting loads and stores\n");
+
+ // Track the loads and stores which are candidates for pre-splitting here, in
+ // the order they first appear during the partition scan. These give stable
+ // iteration order and a basis for tracking which loads and stores we
+ // actually split.
+ SmallVector<LoadInst *, 4> Loads;
+ SmallVector<StoreInst *, 4> Stores;
+
+ // We need to accumulate the splits required of each load or store where we
+ // can find them via a direct lookup. This is important to cross-check loads
+ // and stores against each other. We also track the slice so that we can kill
+ // all the slices that end up split.
+ struct SplitOffsets {
+ Slice *S;
+ std::vector<uint64_t> Splits;
+ };
+ SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
+
+ // Track loads out of this alloca which cannot, for any reason, be pre-split.
+ // This is important as we also cannot pre-split stores of those loads!
+ // FIXME: This is all pretty gross. It means that we can be more aggressive
+ // in pre-splitting when the load feeding the store happens to come from
+ // a separate alloca. Put another way, the effectiveness of SROA would be
+ // decreased by a frontend which just concatenated all of its local allocas
+ // into one big flat alloca. But defeating such patterns is exactly the job
+ // SROA is tasked with! Sadly, to not have this discrepancy we would have to
+ // change store pre-splitting to actually force pre-splitting of the load
+ // that feeds it *and all stores*. That makes pre-splitting much harder, but
+ // maybe it would make it more principled?
+ SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
+
+ DEBUG(dbgs() << " Searching for candidate loads and stores\n");
+ for (auto &P : AS.partitions()) {
+ for (Slice &S : P) {
+ Instruction *I = cast<Instruction>(S.getUse()->getUser());
+ if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
+ // If this was a load we have to track that it can't participate in any
+ // pre-splitting!
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ UnsplittableLoads.insert(LI);
+ continue;
+ }
+ assert(P.endOffset() > S.beginOffset() &&
+ "Empty or backwards partition!");
+
+ // Determine if this is a pre-splittable slice.
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ assert(!LI->isVolatile() && "Cannot split volatile loads!");
+
+ // The load must be used exclusively to store into other pointers for
+ // us to be able to arbitrarily pre-split it. The stores must also be
+ // simple to avoid changing semantics.
+ auto IsLoadSimplyStored = [](LoadInst *LI) {
+ for (User *LU : LI->users()) {
+ auto *SI = dyn_cast<StoreInst>(LU);
+ if (!SI || !SI->isSimple())
+ return false;
+ }
+ return true;
+ };
+ if (!IsLoadSimplyStored(LI)) {
+ UnsplittableLoads.insert(LI);
+ continue;
+ }
+
+ Loads.push_back(LI);
+ } else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) {
+ if (!SI ||
+ S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
+ continue;
+ auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
+ if (!StoredLoad || !StoredLoad->isSimple())
+ continue;
+ assert(!SI->isVolatile() && "Cannot split volatile stores!");
+
+ Stores.push_back(SI);
+ } else {
+ // Other uses cannot be pre-split.
+ continue;
+ }
+
+ // Record the initial split.
+ DEBUG(dbgs() << " Candidate: " << *I << "\n");
+ auto &Offsets = SplitOffsetsMap[I];
+ assert(Offsets.Splits.empty() &&
+ "Should not have splits the first time we see an instruction!");
+ Offsets.S = &S;
+ Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
+ }
+
+ // Now scan the already split slices, and add a split for any of them which
+ // we're going to pre-split.
+ for (Slice *S : P.splitSliceTails()) {
+ auto SplitOffsetsMapI =
+ SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
+ if (SplitOffsetsMapI == SplitOffsetsMap.end())
+ continue;
+ auto &Offsets = SplitOffsetsMapI->second;
+
+ assert(Offsets.S == S && "Found a mismatched slice!");
+ assert(!Offsets.Splits.empty() &&
+ "Cannot have an empty set of splits on the second partition!");
+ assert(Offsets.Splits.back() ==
+ P.beginOffset() - Offsets.S->beginOffset() &&
+ "Previous split does not end where this one begins!");
+
+ // Record each split. The last partition's end isn't needed as the size
+ // of the slice dictates that.
+ if (S->endOffset() > P.endOffset())
+ Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
+ }
+ }
+
+ // We may have split loads where some of their stores are split stores. For
+ // such loads and stores, we can only pre-split them if their splits exactly
+ // match relative to their starting offset. We have to verify this prior to
+ // any rewriting.
+ Stores.erase(
+ std::remove_if(Stores.begin(), Stores.end(),
+ [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
+ // Lookup the load we are storing in our map of split
+ // offsets.
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ // If it was completely unsplittable, then we're done,
+ // and this store can't be pre-split.
+ if (UnsplittableLoads.count(LI))
+ return true;
+
+ auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+ if (LoadOffsetsI == SplitOffsetsMap.end())
+ return false; // Unrelated loads are definitely safe.
+ auto &LoadOffsets = LoadOffsetsI->second;
+
+ // Now lookup the store's offsets.
+ auto &StoreOffsets = SplitOffsetsMap[SI];
+
+ // If the relative offsets of each split in the load and
+ // store match exactly, then we can split them and we
+ // don't need to remove them here.
+ if (LoadOffsets.Splits == StoreOffsets.Splits)
+ return false;
+
+ DEBUG(dbgs()
+ << " Mismatched splits for load and store:\n"
+ << " " << *LI << "\n"
+ << " " << *SI << "\n");
+
+ // We've found a store and load that we need to split
+ // with mismatched relative splits. Just give up on them
+ // and remove both instructions from our list of
+ // candidates.
+ UnsplittableLoads.insert(LI);
+ return true;
+ }),
+ Stores.end());
+ // Now we have to go *back* through all the stores, because a later store may
+ // have caused an earlier store's load to become unsplittable and if it is
+ // unsplittable for the later store, then we can't rely on it being split in
+ // the earlier store either.
+ Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
+ [&UnsplittableLoads](StoreInst *SI) {
+ auto *LI =
+ cast<LoadInst>(SI->getValueOperand());
+ return UnsplittableLoads.count(LI);
+ }),
+ Stores.end());
+ // Once we've established all the loads that can't be split for some reason,
+ // filter any that made it into our list out.
+ Loads.erase(std::remove_if(Loads.begin(), Loads.end(),
+ [&UnsplittableLoads](LoadInst *LI) {
+ return UnsplittableLoads.count(LI);
+ }),
+ Loads.end());
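A compact model of the store-side filtering just performed: a store whose relative splits disagree with its load's is dropped and the load is marked unsplittable, and a second pass then drops any other store of such a load. Every name below is a toy stand-in for the real types.

    #include <algorithm>
    #include <cstdio>
    #include <set>
    #include <vector>

    int main() {
      using Splits = std::vector<unsigned>;
      struct St { int Load; Splits LoadSplits, StoreSplits; };
      std::vector<St> Stores = {{0, {4, 8}, {4, 8}},  // matches: keep
                                {1, {4, 8}, {6}},     // mismatch: drop, taint load 1
                                {1, {4, 8}, {4, 8}}}; // matches, but load 1 is tainted
      std::set<int> UnsplittableLoads;

      // First pass: drop mismatched stores and record their loads.
      Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
                                  [&](const St &S) {
                                    if (S.LoadSplits == S.StoreSplits)
                                      return false;
                                    UnsplittableLoads.insert(S.Load);
                                    return true;
                                  }),
                   Stores.end());
      // Second pass: drop stores whose load became unsplittable above.
      Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
                                  [&](const St &S) {
                                    return UnsplittableLoads.count(S.Load) != 0;
                                  }),
                   Stores.end());

      std::printf("stores kept: %zu\n", Stores.size()); // 1
      return 0;
    }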
+
+ // If no loads or stores are left, there is no pre-splitting to be done for
+ // this alloca.
+ if (Loads.empty() && Stores.empty())
+ return false;
+
+ // From here on, we can't fail and will be building new accesses, so rig up
+ // an IR builder.
+ IRBuilderTy IRB(&AI);
+
+ // Collect the new slices which we will merge into the alloca slices.
+ SmallVector<Slice, 4> NewSlices;
+
+ // Track any allocas we end up splitting loads and stores for so we iterate
+ // on them.
+ SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
+
+ // At this point, we have collected all of the loads and stores we can
+ // pre-split, and the specific splits needed for them. We actually do the
+ // splitting in a specific order to handle the case where one of the loads
+ // is in the value operand of one of the stores.
+ //
+ // First, we rewrite all of the split loads, and just accumulate each split
+ // load in a parallel structure. We also build the slices for them and append
+ // them to the alloca slices.
+ SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
+ std::vector<LoadInst *> SplitLoads;
+ for (LoadInst *LI : Loads) {
+ SplitLoads.clear();
+
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t LoadSize = Ty->getBitWidth() / 8;
+ assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
+
+ auto &Offsets = SplitOffsetsMap[LI];
+ assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + LoadSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
+ IRB.SetInsertPoint(BasicBlock::iterator(LI));
+
+ DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ LoadInst *PLoad = IRB.CreateAlignedLoad(
+ getAdjustedPtr(IRB, *DL, BasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset),
+ PartPtrTy, BasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+ LI->getName());
+
+ // Append this load onto the list of split loads so we can find it later
+ // to rewrite the stores.
+ SplitLoads.push_back(PLoad);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
+ /*IsSplittable*/ false));
+ DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset() << "): " << *PLoad
+ << "\n");
+
+ // See if we've handled all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
+ }
+
+ // Now that we have the split loads, do the slow walk over all uses of the
+ // load and rewrite them as split stores, or save the split loads to use
+ // below if the store is going to be split there anyways.
+ bool DeferredStores = false;
+ for (User *LU : LI->users()) {
+ StoreInst *SI = cast<StoreInst>(LU);
+ if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
+ DeferredStores = true;
+ DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n");
+ continue;
+ }
+
+ Value *StoreBasePtr = SI->getPointerOperand();
+ IRB.SetInsertPoint(BasicBlock::iterator(SI));
+
+ DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
+
+ for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
+ LoadInst *PLoad = SplitLoads[Idx];
+ uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
+ auto *PartPtrTy =
+ PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
+
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+ (void)PStore;
+ DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this store, and we have to track any promotable alloca (indicated by
+ // a direct store) as needing to be resplit because it is no longer
+ // promotable.
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ StoreBasePtr->stripInBoundsOffsets())) {
+ Worklist.insert(OtherAI);
+ }
+
+ // Mark the original store as dead.
+ DeadInsts.insert(SI);
+ }
+
+ // Save the split loads if there are deferred stores among the users.
+ if (DeferredStores)
+ SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
+
+ // Mark the original load as dead and kill the original slice.
+ DeadInsts.insert(LI);
+ Offsets.S->kill();
+ }
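The PartOffset/PartSize bookkeeping used in the load-splitting loop above (and again in the store-splitting loop below) can be seen in isolation in this sketch, which walks a 12-byte access with splits at +4 and +8 as in the doc comment's example; the names are illustrative only.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      const uint64_t LoadSize = 12;                // e.g. a 12-byte access
      const std::vector<uint64_t> Splits = {4, 8}; // interior cut points
      uint64_t PartOffset = 0, PartSize = Splits.front();
      for (size_t Idx = 0, Size = Splits.size();;) {
        std::printf("part at +%llu, %llu bytes\n",
                    (unsigned long long)PartOffset, (unsigned long long)PartSize);
        if (Idx >= Size)
          break;
        // The next part starts at the cut point; the last part ends at LoadSize.
        PartOffset = Splits[Idx++];
        PartSize = (Idx < Size ? Splits[Idx] : LoadSize) - PartOffset;
      }
      // Prints three parts: +0/4, +4/4, +8/4.
      return 0;
    }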
+
+ // Second, we rewrite all of the split stores. At this point, we know that
+ // all loads from this alloca have been split already. For stores of such
+ // loads, we can simply look up the pre-existing split loads. For stores of
+ // other loads, we split those loads first and then write split stores of
+ // them.
+ for (StoreInst *SI : Stores) {
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t StoreSize = Ty->getBitWidth() / 8;
+ assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
+
+ auto &Offsets = SplitOffsetsMap[SI];
+ assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + StoreSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Value *LoadBasePtr = LI->getPointerOperand();
+ Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
+
+ DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
+
+ // Check whether we have an already split load.
+ auto SplitLoadsMapI = SplitLoadsMap.find(LI);
+ std::vector<LoadInst *> *SplitLoads = nullptr;
+ if (SplitLoadsMapI != SplitLoadsMap.end()) {
+ SplitLoads = &SplitLoadsMapI->second;
+ assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
+ "Too few split loads for the number of splits in the store!");
+ } else {
+ DEBUG(dbgs() << " of load: " << *LI << "\n");
+ }
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+
+ // Either lookup a split load or create one.
+ LoadInst *PLoad;
+ if (SplitLoads) {
+ PLoad = (*SplitLoads)[Idx];
+ } else {
+ IRB.SetInsertPoint(BasicBlock::iterator(LI));
+ PLoad = IRB.CreateAlignedLoad(
+ getAdjustedPtr(IRB, *DL, LoadBasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset),
+ PartPtrTy, LoadBasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+ LI->getName());
+ }
+
+ // And store this partition.
+ IRB.SetInsertPoint(BasicBlock::iterator(SI));
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PStore->getOperandUse(PStore->getPointerOperandIndex()),
+ /*IsSplittable*/ false));
+ DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset() << "): " << *PStore
+ << "\n");
+ if (!SplitLoads) {
+ DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
+ }
+
+ // See if we've finished all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this load, which is only relevant if it isn't a load of this alloca and
+ // thus we didn't already split the loads above. We also have to keep track
+ // of any promotable allocas we split loads on as they can no longer be
+ // promoted.
+ if (!SplitLoads) {
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ LoadBasePtr->stripInBoundsOffsets())) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ Worklist.insert(OtherAI);
+ }
+ }
+
+ // Mark the original store as dead now that we've split it up and kill its
+ // slice. Note that we leave the original load in place unless this store
+ // was its only use. It may in turn be split up if it is an alloca load
+ // for some other alloca, but it may be a normal load. This may introduce
+ // redundant loads, but where those can be merged the rest of the optimizer
+ // should handle the merging, and this uncovers SSA splits which is more
+ // important. In practice, the original loads will almost always be fully
+ // split and removed eventually, and the splits will be merged by any
+ // trivial CSE, including instcombine.
+ if (LI->hasOneUse()) {
+ assert(*LI->user_begin() == SI && "Single use isn't this store!");
+ DeadInsts.insert(LI);
+ }
+ DeadInsts.insert(SI);
+ Offsets.S->kill();
+ }
+
+ // Remove the killed slices that have been pre-split.
+ AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) {
+ return S.isDead();
+ }), AS.end());
+
+ // Insert our new slices. This will sort and merge them into the sorted
+ // sequence.
+ AS.insert(NewSlices);
+
+ DEBUG(dbgs() << " Pre-split slices:\n");
+#ifndef NDEBUG
+ for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
+ DEBUG(AS.print(dbgs(), I, " "));
+#endif
+
+ // Finally, don't try to promote any allocas that now require re-splitting.
+ // They have already been added to the worklist above.
+ PromotableAllocas.erase(
+ std::remove_if(
+ PromotableAllocas.begin(), PromotableAllocas.end(),
+ [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
+ PromotableAllocas.end());
+
+ return true;
+}
+
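As a standalone sketch of how the first loop in this function derives each access's split list (toy types, not the AllocaSlices classes): every partition boundary that falls strictly inside a splittable access contributes its offset relative to the access's start, and the final boundary is implied by the access size.

    #include <cstdint>
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
      // A 12-byte load at alloca offset 0, with the alloca carved into the
      // partitions [0,4), [4,8), [8,12) from the doc comment's example.
      const uint64_t SliceBegin = 0, SliceEnd = 12;
      const std::vector<std::pair<uint64_t, uint64_t>> Partitions = {
          {0, 4}, {4, 8}, {8, 12}};

      std::vector<uint64_t> Splits;
      for (const auto &P : Partitions)
        if (SliceBegin < P.second && SliceEnd > P.second)
          Splits.push_back(P.second - SliceBegin); // boundary inside the access

      for (uint64_t S : Splits)
        std::printf("split at +%llu\n", (unsigned long long)S); // +4, then +8
      return 0;
    }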
/// \brief Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
@@ -3107,38 +3964,33 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty,
/// appropriate new offsets. It also evaluates how successful the rewrite was
/// at enabling promotion and if it was successful queues the alloca to be
/// promoted.
-bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S,
- AllocaSlices::iterator B, AllocaSlices::iterator E,
- int64_t BeginOffset, int64_t EndOffset,
- ArrayRef<AllocaSlices::iterator> SplitUses) {
- assert(BeginOffset < EndOffset);
- uint64_t SliceSize = EndOffset - BeginOffset;
-
+bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+ AllocaSlices::Partition &P) {
// Try to compute a friendly type for this partition of the alloca. This
// won't always succeed, in which case we fall back to a legal integer type
// or an i8 array of an appropriate size.
Type *SliceTy = nullptr;
- if (Type *CommonUseTy = findCommonType(B, E, EndOffset))
- if (DL->getTypeAllocSize(CommonUseTy) >= SliceSize)
+ if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
+ if (DL->getTypeAllocSize(CommonUseTy) >= P.size())
SliceTy = CommonUseTy;
if (!SliceTy)
if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(),
- BeginOffset, SliceSize))
+ P.beginOffset(), P.size()))
SliceTy = TypePartitionTy;
if ((!SliceTy || (SliceTy->isArrayTy() &&
SliceTy->getArrayElementType()->isIntegerTy())) &&
- DL->isLegalInteger(SliceSize * 8))
- SliceTy = Type::getIntNTy(*C, SliceSize * 8);
+ DL->isLegalInteger(P.size() * 8))
+ SliceTy = Type::getIntNTy(*C, P.size() * 8);
if (!SliceTy)
- SliceTy = ArrayType::get(Type::getInt8Ty(*C), SliceSize);
- assert(DL->getTypeAllocSize(SliceTy) >= SliceSize);
+ SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
+ assert(DL->getTypeAllocSize(SliceTy) >= P.size());
- bool IsVectorPromotable = isVectorPromotionViable(
- *DL, SliceTy, S, BeginOffset, EndOffset, B, E, SplitUses);
+ bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, *DL);
- bool IsIntegerPromotable =
- !IsVectorPromotable &&
- isIntegerWideningViable(*DL, SliceTy, BeginOffset, S, B, E, SplitUses);
+ VectorType *VecTy =
+ IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, *DL);
+ if (VecTy)
+ SliceTy = VecTy;
// Check for the case where we're going to rewrite to a new alloca of the
// exact same type as the original, and with the same access offsets. In that
@@ -3146,7 +3998,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S,
// perform phi and select speculation.
AllocaInst *NewAI;
if (SliceTy == AI.getAllocatedType()) {
- assert(BeginOffset == 0 &&
+ assert(P.beginOffset() == 0 &&
"Non-zero begin offset but same alloca type");
NewAI = &AI;
// FIXME: We should be able to bail at this point with "nothing changed".
@@ -3159,19 +4011,20 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S,
// type.
Alignment = DL->getABITypeAlignment(AI.getAllocatedType());
}
- Alignment = MinAlign(Alignment, BeginOffset);
+ Alignment = MinAlign(Alignment, P.beginOffset());
// If we will get at least this much alignment from the type alone, leave
// the alloca's alignment unconstrained.
if (Alignment <= DL->getABITypeAlignment(SliceTy))
Alignment = 0;
- NewAI = new AllocaInst(SliceTy, nullptr, Alignment,
- AI.getName() + ".sroa." + Twine(B - S.begin()), &AI);
+ NewAI = new AllocaInst(
+ SliceTy, nullptr, Alignment,
+ AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
++NumNewAllocas;
}
DEBUG(dbgs() << "Rewriting alloca partition "
- << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI
- << "\n");
+ << "[" << P.beginOffset() << "," << P.endOffset()
+ << ") to: " << *NewAI << "\n");
// Track the high watermark on the worklist as it is only relevant for
// promoted allocas. We will reset it to this point if the alloca is not in
@@ -3181,22 +4034,16 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S,
SmallPtrSet<PHINode *, 8> PHIUsers;
SmallPtrSet<SelectInst *, 8> SelectUsers;
- AllocaSliceRewriter Rewriter(*DL, S, *this, AI, *NewAI, BeginOffset,
- EndOffset, IsVectorPromotable,
- IsIntegerPromotable, PHIUsers, SelectUsers);
+ AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, P.beginOffset(),
+ P.endOffset(), IsIntegerPromotable, VecTy,
+ PHIUsers, SelectUsers);
bool Promotable = true;
- for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(),
- SUE = SplitUses.end();
- SUI != SUE; ++SUI) {
- DEBUG(dbgs() << " rewriting split ");
- DEBUG(S.printSlice(dbgs(), *SUI, ""));
- Promotable &= Rewriter.visit(*SUI);
+ for (Slice *S : P.splitSliceTails()) {
+ Promotable &= Rewriter.visit(S);
++NumUses;
}
- for (AllocaSlices::iterator I = B; I != E; ++I) {
- DEBUG(dbgs() << " rewriting ");
- DEBUG(S.printSlice(dbgs(), I, ""));
- Promotable &= Rewriter.visit(I);
+ for (Slice &S : P) {
+ Promotable &= Rewriter.visit(&S);
++NumUses;
}
@@ -3233,14 +4080,10 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S,
// If we have either PHIs or Selects to speculate, add them to those
// worklists and re-queue the new alloca so that we promote it on the
// next iteration.
- for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
- E = PHIUsers.end();
- I != E; ++I)
- SpeculatablePHIs.insert(*I);
- for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
- E = SelectUsers.end();
- I != E; ++I)
- SpeculatableSelects.insert(*I);
+ for (PHINode *PHIUser : PHIUsers)
+ SpeculatablePHIs.insert(PHIUser);
+ for (SelectInst *SelectUser : SelectUsers)
+ SpeculatableSelects.insert(SelectUser);
Worklist.insert(NewAI);
}
} else {
@@ -3258,136 +4101,46 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S,
return true;
}
-static void
-removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses,
- uint64_t &MaxSplitUseEndOffset, uint64_t Offset) {
- if (Offset >= MaxSplitUseEndOffset) {
- SplitUses.clear();
- MaxSplitUseEndOffset = 0;
- return;
- }
-
- size_t SplitUsesOldSize = SplitUses.size();
- SplitUses.erase(std::remove_if(SplitUses.begin(), SplitUses.end(),
- [Offset](const AllocaSlices::iterator &I) {
- return I->endOffset() <= Offset;
- }),
- SplitUses.end());
- if (SplitUsesOldSize == SplitUses.size())
- return;
-
- // Recompute the max. While this is linear, so is remove_if.
- MaxSplitUseEndOffset = 0;
- for (SmallVectorImpl<AllocaSlices::iterator>::iterator
- SUI = SplitUses.begin(),
- SUE = SplitUses.end();
- SUI != SUE; ++SUI)
- MaxSplitUseEndOffset = std::max((*SUI)->endOffset(), MaxSplitUseEndOffset);
-}
-
/// \brief Walks the slices of an alloca and form partitions based on them,
/// rewriting each of their uses.
-bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) {
- if (S.begin() == S.end())
+bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
+ if (AS.begin() == AS.end())
return false;
unsigned NumPartitions = 0;
bool Changed = false;
- SmallVector<AllocaSlices::iterator, 4> SplitUses;
- uint64_t MaxSplitUseEndOffset = 0;
-
- uint64_t BeginOffset = S.begin()->beginOffset();
-
- for (AllocaSlices::iterator SI = S.begin(), SJ = std::next(SI), SE = S.end();
- SI != SE; SI = SJ) {
- uint64_t MaxEndOffset = SI->endOffset();
-
- if (!SI->isSplittable()) {
- // When we're forming an unsplittable region, it must always start at the
- // first slice and will extend through its end.
- assert(BeginOffset == SI->beginOffset());
-
- // Form a partition including all of the overlapping slices with this
- // unsplittable slice.
- while (SJ != SE && SJ->beginOffset() < MaxEndOffset) {
- if (!SJ->isSplittable())
- MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset());
- ++SJ;
- }
- } else {
- assert(SI->isSplittable()); // Established above.
- // Collect all of the overlapping splittable slices.
- while (SJ != SE && SJ->beginOffset() < MaxEndOffset &&
- SJ->isSplittable()) {
- MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset());
- ++SJ;
- }
+ // First try to pre-split loads and stores.
+ Changed |= presplitLoadsAndStores(AI, AS);
- // Back up MaxEndOffset and SJ if we ended the span early when
- // encountering an unsplittable slice.
- if (SJ != SE && SJ->beginOffset() < MaxEndOffset) {
- assert(!SJ->isSplittable());
- MaxEndOffset = SJ->beginOffset();
- }
- }
-
- // Check if we have managed to move the end offset forward yet. If so,
- // we'll have to rewrite uses and erase old split uses.
- if (BeginOffset < MaxEndOffset) {
- // Rewrite a sequence of overlapping slices.
- Changed |=
- rewritePartition(AI, S, SI, SJ, BeginOffset, MaxEndOffset, SplitUses);
- ++NumPartitions;
-
- removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, MaxEndOffset);
- }
-
- // Accumulate all the splittable slices from the [SI,SJ) region which
- // overlap going forward.
- for (AllocaSlices::iterator SK = SI; SK != SJ; ++SK)
- if (SK->isSplittable() && SK->endOffset() > MaxEndOffset) {
- SplitUses.push_back(SK);
- MaxSplitUseEndOffset = std::max(SK->endOffset(), MaxSplitUseEndOffset);
- }
-
- // If we're already at the end and we have no split uses, we're done.
- if (SJ == SE && SplitUses.empty())
- break;
-
- // If we have no split uses or no gap in offsets, we're ready to move to
- // the next slice.
- if (SplitUses.empty() || (SJ != SE && MaxEndOffset == SJ->beginOffset())) {
- BeginOffset = SJ->beginOffset();
+ // Now that we have identified any pre-splitting opportunities, mark any
+ // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
+ // to split these during pre-splitting, we want to force them to be
+ // rewritten into a partition.
+ bool IsSorted = true;
+ for (Slice &S : AS) {
+ if (!S.isSplittable())
continue;
- }
-
- // Even if we have split slices, if the next slice is splittable and the
- // split slices reach it, we can simply set up the beginning offset of the
- // next iteration to bridge between them.
- if (SJ != SE && SJ->isSplittable() &&
- MaxSplitUseEndOffset > SJ->beginOffset()) {
- BeginOffset = MaxEndOffset;
+ // FIXME: We currently leave whole-alloca splittable loads and stores. These
+ // used to be the only splittable loads and stores, and we need to be
+ // confident that the above handling of splittable loads and stores is
+ // completely sufficient before we forcibly disable the remaining handling.
+ if (S.beginOffset() == 0 &&
+ S.endOffset() >= DL->getTypeAllocSize(AI.getAllocatedType()))
continue;
+ if (isa<LoadInst>(S.getUse()->getUser()) ||
+ isa<StoreInst>(S.getUse()->getUser())) {
+ S.makeUnsplittable();
+ IsSorted = false;
}
+ }
+ if (!IsSorted)
+ std::sort(AS.begin(), AS.end());
- // Otherwise, we have a tail of split slices. Rewrite them with an empty
- // range of slices.
- uint64_t PostSplitEndOffset =
- SJ == SE ? MaxSplitUseEndOffset : SJ->beginOffset();
-
- Changed |= rewritePartition(AI, S, SJ, SJ, MaxEndOffset, PostSplitEndOffset,
- SplitUses);
+ // Rewrite each partition.
+ for (auto &P : AS.partitions()) {
+ Changed |= rewritePartition(AI, AS, P);
++NumPartitions;
-
- if (SJ == SE)
- break; // Skip the rest, we don't need to do any cleanup.
-
- removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset,
- PostSplitEndOffset);
-
- // Now just reset the begin offset for the next iteration.
- BeginOffset = SJ->beginOffset();
}
NumAllocaPartitions += NumPartitions;
@@ -3440,38 +4193,34 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
Changed |= AggRewriter.rewrite(AI);
// Build the slices using a recursive instruction-visiting builder.
- AllocaSlices S(*DL, AI);
- DEBUG(S.print(dbgs()));
- if (S.isEscaped())
+ AllocaSlices AS(*DL, AI);
+ DEBUG(AS.print(dbgs()));
+ if (AS.isEscaped())
return Changed;
// Delete all the dead users of this alloca before splitting and rewriting it.
- for (AllocaSlices::dead_user_iterator DI = S.dead_user_begin(),
- DE = S.dead_user_end();
- DI != DE; ++DI) {
+ for (Instruction *DeadUser : AS.getDeadUsers()) {
// Free up everything used by this instruction.
- for (Use &DeadOp : (*DI)->operands())
+ for (Use &DeadOp : DeadUser->operands())
clobberUse(DeadOp);
// Now replace the uses of this instruction.
- (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType()));
+ DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
// And mark it for deletion.
- DeadInsts.insert(*DI);
+ DeadInsts.insert(DeadUser);
Changed = true;
}
- for (AllocaSlices::dead_op_iterator DO = S.dead_op_begin(),
- DE = S.dead_op_end();
- DO != DE; ++DO) {
- clobberUse(**DO);
+ for (Use *DeadOp : AS.getDeadOperands()) {
+ clobberUse(*DeadOp);
Changed = true;
}
// No slices to split. Leave the dead alloca for a later pass to clean up.
- if (S.begin() == S.end())
+ if (AS.begin() == AS.end())
return Changed;
- Changed |= splitAlloca(AI, S);
+ Changed |= splitAlloca(AI, AS);
DEBUG(dbgs() << " Speculating PHIs\n");
while (!SpeculatablePHIs.empty())
@@ -3493,7 +4242,8 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
-void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
+void SROA::deleteDeadInstructions(
+ SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
while (!DeadInsts.empty()) {
Instruction *I = DeadInsts.pop_back_val();
DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
@@ -3518,9 +4268,9 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
static void enqueueUsersInWorklist(Instruction &I,
SmallVectorImpl<Instruction *> &Worklist,
- SmallPtrSet<Instruction *, 8> &Visited) {
+ SmallPtrSetImpl<Instruction *> &Visited) {
for (User *U : I.users())
- if (Visited.insert(cast<Instruction>(U)))
+ if (Visited.insert(cast<Instruction>(U)).second)
Worklist.push_back(cast<Instruction>(U));
}
@@ -3540,14 +4290,14 @@ bool SROA::promoteAllocas(Function &F) {
if (DT && !ForceSSAUpdater) {
DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
- PromoteMemToReg(PromotableAllocas, *DT);
+ PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
PromotableAllocas.clear();
return true;
}
DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
SSAUpdater SSA;
- DIBuilder DIB(*F.getParent());
+ DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
SmallVector<Instruction *, 64> Insts;
// We need a worklist to walk the uses of each alloca.
@@ -3622,6 +4372,7 @@ bool SROA::runOnFunction(Function &F) {
DominatorTreeWrapperPass *DTWP =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
BasicBlock &EntryBB = F.getEntryBlock();
for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
@@ -3642,9 +4393,7 @@ bool SROA::runOnFunction(Function &F) {
// Remove the deleted allocas from various lists so that we don't try to
// continue processing them.
if (!DeletedAllocas.empty()) {
- auto IsInSet = [&](AllocaInst *AI) {
- return DeletedAllocas.count(AI);
- };
+ auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
Worklist.remove_if(IsInSet);
PostPromotionWorklist.remove_if(IsInSet);
PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
@@ -3665,6 +4414,7 @@ bool SROA::runOnFunction(Function &F) {
}
void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AssumptionCacheTracker>();
if (RequiresDomTree)
AU.addRequired<DominatorTreeWrapperPass>();
AU.setPreservesCFG();
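
A pattern that recurs throughout this diff: SmallPtrSet::insert now returns std::pair<iterator, bool> rather than bool, so every visit-once check becomes insert(X).second. A minimal sketch of the idiom, with an illustrative helper name that is not part of this commit:

// Illustrative only: the visit-once worklist pattern used throughout this
// diff, written against the updated SmallPtrSet::insert contract.
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instruction.h"

static void enqueueOnce(llvm::Instruction *I,
                        llvm::SmallPtrSetImpl<llvm::Instruction *> &Visited,
                        llvm::SmallVectorImpl<llvm::Instruction *> &Worklist) {
  // insert() returns std::pair<iterator, bool>; .second is true only the
  // first time I is seen, so each instruction is enqueued at most once.
  if (Visited.insert(I).second)
    Worklist.push_back(I);
}
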
diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp
index 73c97ffeef4f..179bbf78366d 100644
--- a/lib/Transforms/Scalar/SampleProfile.cpp
+++ b/lib/Transforms/Scalar/SampleProfile.cpp
@@ -26,7 +26,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/PostDominators.h"
@@ -42,15 +41,14 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
+#include "llvm/ProfileData/SampleProfReader.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/LineIterator.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Regex.h"
#include "llvm/Support/raw_ostream.h"
#include <cctype>
using namespace llvm;
+using namespace sampleprof;
#define DEBUG_TYPE "sample-profile"
@@ -65,76 +63,48 @@ static cl::opt<unsigned> SampleProfileMaxPropagateIterations(
"sample block/edge weights through the CFG."));
namespace {
-/// \brief Represents the relative location of an instruction.
-///
-/// Instruction locations are specified by the line offset from the
-/// beginning of the function (marked by the line where the function
-/// header is) and the discriminator value within that line.
-///
-/// The discriminator value is useful to distinguish instructions
-/// that are on the same line but belong to different basic blocks
-/// (e.g., the two post-increment instructions in "if (p) x++; else y++;").
-struct InstructionLocation {
- InstructionLocation(int L, unsigned D) : LineOffset(L), Discriminator(D) {}
- int LineOffset;
- unsigned Discriminator;
-};
-}
-
-namespace llvm {
-template <> struct DenseMapInfo<InstructionLocation> {
- typedef DenseMapInfo<int> OffsetInfo;
- typedef DenseMapInfo<unsigned> DiscriminatorInfo;
- static inline InstructionLocation getEmptyKey() {
- return InstructionLocation(OffsetInfo::getEmptyKey(),
- DiscriminatorInfo::getEmptyKey());
- }
- static inline InstructionLocation getTombstoneKey() {
- return InstructionLocation(OffsetInfo::getTombstoneKey(),
- DiscriminatorInfo::getTombstoneKey());
- }
- static inline unsigned getHashValue(InstructionLocation Val) {
- return DenseMapInfo<std::pair<int, unsigned>>::getHashValue(
- std::pair<int, unsigned>(Val.LineOffset, Val.Discriminator));
- }
- static inline bool isEqual(InstructionLocation LHS, InstructionLocation RHS) {
- return LHS.LineOffset == RHS.LineOffset &&
- LHS.Discriminator == RHS.Discriminator;
- }
-};
-}
-
-namespace {
-typedef DenseMap<InstructionLocation, unsigned> BodySampleMap;
typedef DenseMap<BasicBlock *, unsigned> BlockWeightMap;
typedef DenseMap<BasicBlock *, BasicBlock *> EquivalenceClassMap;
typedef std::pair<BasicBlock *, BasicBlock *> Edge;
typedef DenseMap<Edge, unsigned> EdgeWeightMap;
typedef DenseMap<BasicBlock *, SmallVector<BasicBlock *, 8>> BlockEdgeMap;
-/// \brief Representation of the runtime profile for a function.
+/// \brief Sample profile pass.
///
-/// This data structure contains the runtime profile for a given
-/// function. It contains the total number of samples collected
-/// in the function and a map of samples collected in every statement.
-class SampleFunctionProfile {
+/// This pass reads profile data from the file specified by
+/// -sample-profile-file and annotates every affected function with the
+/// profile information found in that file.
+class SampleProfileLoader : public FunctionPass {
public:
- SampleFunctionProfile()
- : TotalSamples(0), TotalHeadSamples(0), HeaderLineno(0), DT(nullptr),
- PDT(nullptr), LI(nullptr), Ctx(nullptr) {}
+ // Class identification, replacement for typeinfo
+ static char ID;
+
+ SampleProfileLoader(StringRef Name = SampleProfileFile)
+ : FunctionPass(ID), DT(nullptr), PDT(nullptr), LI(nullptr), Ctx(nullptr),
+ Reader(), Samples(nullptr), Filename(Name), ProfileIsValid(false) {
+ initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Module &M) override;
+
+ void dump() { Reader->dump(); }
+
+ const char *getPassName() const override { return "Sample profile pass"; }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<LoopInfo>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTree>();
+ }
+protected:
unsigned getFunctionLoc(Function &F);
- bool emitAnnotations(Function &F, DominatorTree *DomTree,
- PostDominatorTree *PostDomTree, LoopInfo *Loops);
+ bool emitAnnotations(Function &F);
unsigned getInstWeight(Instruction &I);
- unsigned getBlockWeight(BasicBlock *B);
- void addTotalSamples(unsigned Num) { TotalSamples += Num; }
- void addHeadSamples(unsigned Num) { TotalHeadSamples += Num; }
- void addBodySamples(int LineOffset, unsigned Discriminator, unsigned Num) {
- assert(LineOffset >= 0);
- BodySamples[InstructionLocation(LineOffset, Discriminator)] += Num;
- }
- void print(raw_ostream &OS);
+ unsigned getBlockWeight(BasicBlock *BB);
void printEdgeWeight(raw_ostream &OS, Edge E);
void printBlockWeight(raw_ostream &OS, BasicBlock *BB);
void printBlockEquivalence(raw_ostream &OS, BasicBlock *BB);
@@ -147,32 +117,11 @@ public:
unsigned visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
void buildEdges(Function &F);
bool propagateThroughEdges(Function &F);
- bool empty() { return BodySamples.empty(); }
-protected:
- /// \brief Total number of samples collected inside this function.
- ///
- /// Samples are cumulative, they include all the samples collected
- /// inside this function and all its inlined callees.
- unsigned TotalSamples;
-
- /// \brief Total number of samples collected at the head of the function.
- /// FIXME: Use head samples to estimate a cold/hot attribute for the function.
- unsigned TotalHeadSamples;
-
- /// \brief Line number for the function header. Used to compute relative
- /// line numbers from the absolute line LOCs found in instruction locations.
- /// The relative line numbers are needed to address the samples from the
- /// profile file.
+ /// \brief Line number for the function header. Used to compute absolute
+ /// line numbers from the relative line numbers found in the profile.
unsigned HeaderLineno;
- /// \brief Map line offsets to collected samples.
- ///
- /// Each entry in this map contains the number of samples
- /// collected at the corresponding line offset. All line locations
- /// are an offset from the start of the function.
- BodySampleMap BodySamples;
-
/// \brief Map basic blocks to their computed weights.
///
/// The weight of a basic block is defined to be the maximum
@@ -212,105 +161,12 @@ protected:
/// \brief LLVM context holding the debug data we need.
LLVMContext *Ctx;
-};
-
-/// \brief Sample-based profile reader.
-///
-/// Each profile contains sample counts for all the functions
-/// executed. Inside each function, statements are annotated with the
-/// collected samples on all the instructions associated with that
-/// statement.
-///
-/// For this to produce meaningful data, the program needs to be
-/// compiled with some debug information (at minimum, line numbers:
-/// -gline-tables-only). Otherwise, it will be impossible to match IR
-/// instructions to the line numbers collected by the profiler.
-///
-/// From the profile file, we are interested in collecting the
-/// following information:
-///
-/// * A list of functions included in the profile (mangled names).
-///
-/// * For each function F:
-/// 1. The total number of samples collected in F.
-///
-/// 2. The samples collected at each line in F. To provide some
-/// protection against source code shuffling, line numbers should
-/// be relative to the start of the function.
-class SampleModuleProfile {
-public:
- SampleModuleProfile(const Module &M, StringRef F)
- : Profiles(0), Filename(F), M(M) {}
-
- void dump();
- bool loadText();
- void loadNative() { llvm_unreachable("not implemented"); }
- void printFunctionProfile(raw_ostream &OS, StringRef FName);
- void dumpFunctionProfile(StringRef FName);
- SampleFunctionProfile &getProfile(const Function &F) {
- return Profiles[F.getName()];
- }
- /// \brief Report a parse error message.
- void reportParseError(int64_t LineNumber, Twine Msg) const {
- DiagnosticInfoSampleProfile Diag(Filename.data(), LineNumber, Msg);
- M.getContext().diagnose(Diag);
- }
-
-protected:
- /// \brief Map every function to its associated profile.
- ///
- /// The profile of every function executed at runtime is collected
- /// in the structure SampleFunctionProfile. This maps function objects
- /// to their corresponding profiles.
- StringMap<SampleFunctionProfile> Profiles;
-
- /// \brief Path name to the file holding the profile data.
- ///
- /// The format of this file is defined by each profiler
- /// independently. If possible, the profiler should have a text
- /// version of the profile format to be used in constructing test
- /// cases and debugging.
- StringRef Filename;
-
- /// \brief Module being compiled. Used mainly to access the current
- /// LLVM context for diagnostics.
- const Module &M;
-};
-
-/// \brief Sample profile pass.
-///
-/// This pass reads profile data from the file specified by
-/// -sample-profile-file and annotates every affected function with the
-/// profile information found in that file.
-class SampleProfileLoader : public FunctionPass {
-public:
- // Class identification, replacement for typeinfo
- static char ID;
-
- SampleProfileLoader(StringRef Name = SampleProfileFile)
- : FunctionPass(ID), Profiler(), Filename(Name), ProfileIsValid(false) {
- initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry());
- }
-
- bool doInitialization(Module &M) override;
-
- void dump() { Profiler->dump(); }
-
- const char *getPassName() const override { return "Sample profile pass"; }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<LoopInfo>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTree>();
- }
-
-protected:
/// \brief Profile reader object.
- std::unique_ptr<SampleModuleProfile> Profiler;
+ std::unique_ptr<SampleProfileReader> Reader;
+
+ /// \brief Samples collected for the body of this function.
+ FunctionSamples *Samples;
/// \brief Name of the profile file to load.
StringRef Filename;
@@ -320,26 +176,11 @@ protected:
};
}
-/// \brief Print this function profile on stream \p OS.
-///
-/// \param OS Stream to emit the output to.
-void SampleFunctionProfile::print(raw_ostream &OS) {
- OS << TotalSamples << ", " << TotalHeadSamples << ", " << BodySamples.size()
- << " sampled lines\n";
- for (BodySampleMap::const_iterator SI = BodySamples.begin(),
- SE = BodySamples.end();
- SI != SE; ++SI)
- OS << "\tline offset: " << SI->first.LineOffset
- << ", discriminator: " << SI->first.Discriminator
- << ", number of samples: " << SI->second << "\n";
- OS << "\n";
-}
-
/// \brief Print the weight of edge \p E on stream \p OS.
///
/// \param OS Stream to emit the output to.
/// \param E Edge to print.
-void SampleFunctionProfile::printEdgeWeight(raw_ostream &OS, Edge E) {
+void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) {
OS << "weight[" << E.first->getName() << "->" << E.second->getName()
<< "]: " << EdgeWeights[E] << "\n";
}
@@ -348,8 +189,8 @@ void SampleFunctionProfile::printEdgeWeight(raw_ostream &OS, Edge E) {
///
/// \param OS Stream to emit the output to.
/// \param BB Block to print.
-void SampleFunctionProfile::printBlockEquivalence(raw_ostream &OS,
- BasicBlock *BB) {
+void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS,
+ BasicBlock *BB) {
BasicBlock *Equiv = EquivalenceClass[BB];
OS << "equivalence[" << BB->getName()
<< "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";
@@ -359,174 +200,10 @@ void SampleFunctionProfile::printBlockEquivalence(raw_ostream &OS,
///
/// \param OS Stream to emit the output to.
/// \param BB Block to print.
-void SampleFunctionProfile::printBlockWeight(raw_ostream &OS, BasicBlock *BB) {
+void SampleProfileLoader::printBlockWeight(raw_ostream &OS, BasicBlock *BB) {
OS << "weight[" << BB->getName() << "]: " << BlockWeights[BB] << "\n";
}
-/// \brief Print the function profile for \p FName on stream \p OS.
-///
-/// \param OS Stream to emit the output to.
-/// \param FName Name of the function to print.
-void SampleModuleProfile::printFunctionProfile(raw_ostream &OS,
- StringRef FName) {
- OS << "Function: " << FName << ":\n";
- Profiles[FName].print(OS);
-}
-
-/// \brief Dump the function profile for \p FName.
-///
-/// \param FName Name of the function to print.
-void SampleModuleProfile::dumpFunctionProfile(StringRef FName) {
- printFunctionProfile(dbgs(), FName);
-}
-
-/// \brief Dump all the function profiles found.
-void SampleModuleProfile::dump() {
- for (StringMap<SampleFunctionProfile>::const_iterator I = Profiles.begin(),
- E = Profiles.end();
- I != E; ++I)
- dumpFunctionProfile(I->getKey());
-}
-
-/// \brief Load samples from a text file.
-///
-/// The file contains a list of samples for every function executed at
-/// runtime. Each function profile has the following format:
-///
-/// function1:total_samples:total_head_samples
-/// offset1[.discriminator]: number_of_samples [fn1:num fn2:num ... ]
-/// offset2[.discriminator]: number_of_samples [fn3:num fn4:num ... ]
-/// ...
-/// offsetN[.discriminator]: number_of_samples [fn5:num fn6:num ... ]
-///
-/// Function names must be mangled in order for the profile loader to
-/// match them in the current translation unit. The two numbers in the
-/// function header specify how many total samples were accumulated in
-/// the function (first number), and the total number of samples accumulated
-/// at the prologue of the function (second number). This head sample
-/// count provides an indicator of how frequent is the function invoked.
-///
-/// Each sampled line may contain several items. Some are optional
-/// (marked below):
-///
-/// a- Source line offset. This number represents the line number
-/// in the function where the sample was collected. The line number
-/// is always relative to the line where symbol of the function
-/// is defined. So, if the function has its header at line 280,
-/// the offset 13 is at line 293 in the file.
-///
-/// b- [OPTIONAL] Discriminator. This is used if the sampled program
-/// was compiled with DWARF discriminator support
-/// (http://wiki.dwarfstd.org/index.php?title=Path_Discriminators)
-///
-/// c- Number of samples. This is the number of samples collected by
-/// the profiler at this source location.
-///
-/// d- [OPTIONAL] Potential call targets and samples. If present, this
-/// line contains a call instruction. This models both direct and
-/// indirect calls. Each called target is listed together with the
-/// number of samples. For example,
-///
-/// 130: 7 foo:3 bar:2 baz:7
-///
-/// The above means that at relative line offset 130 there is a
-/// call instruction that calls one of foo(), bar() and baz(). With
-/// baz() being the relatively more frequent call target.
-///
-/// FIXME: This is currently unhandled, but it has a lot of
-/// potential for aiding the inliner.
-///
-///
-/// Since this is a flat profile, a function that shows up more than
-/// once gets all its samples aggregated across all its instances.
-///
-/// FIXME: flat profiles are too imprecise to provide good optimization
-/// opportunities. Convert them to context-sensitive profile.
-///
-/// This textual representation is useful to generate unit tests and
-/// for debugging purposes, but it should not be used to generate
-/// profiles for large programs, as the representation is extremely
-/// inefficient.
-///
-/// \returns true if the file was loaded successfully, false otherwise.
-bool SampleModuleProfile::loadText() {
- ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
- MemoryBuffer::getFile(Filename);
- if (std::error_code EC = BufferOrErr.getError()) {
- std::string Msg(EC.message());
- M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg));
- return false;
- }
- std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get());
- line_iterator LineIt(*Buffer, '#');
-
- // Read the profile of each function. Since each function may be
- // mentioned more than once, and we are collecting flat profiles,
- // accumulate samples as we parse them.
- Regex HeadRE("^([^0-9].*):([0-9]+):([0-9]+)$");
- Regex LineSample("^([0-9]+)\\.?([0-9]+)?: ([0-9]+)(.*)$");
- while (!LineIt.is_at_eof()) {
- // Read the header of each function.
- //
- // Note that for function identifiers we are actually expecting
- // mangled names, but we may not always get them. This happens when
- // the compiler decides not to emit the function (e.g., it was inlined
- // and removed). In this case, the binary will not have the linkage
- // name for the function, so the profiler will emit the function's
- // unmangled name, which may contain characters like ':' and '>' in its
- // name (member functions, templates, etc).
- //
- // The only requirement we place on the identifier, then, is that it
- // should not begin with a number.
- SmallVector<StringRef, 3> Matches;
- if (!HeadRE.match(*LineIt, &Matches)) {
- reportParseError(LineIt.line_number(),
- "Expected 'mangled_name:NUM:NUM', found " + *LineIt);
- return false;
- }
- assert(Matches.size() == 4);
- StringRef FName = Matches[1];
- unsigned NumSamples, NumHeadSamples;
- Matches[2].getAsInteger(10, NumSamples);
- Matches[3].getAsInteger(10, NumHeadSamples);
- Profiles[FName] = SampleFunctionProfile();
- SampleFunctionProfile &FProfile = Profiles[FName];
- FProfile.addTotalSamples(NumSamples);
- FProfile.addHeadSamples(NumHeadSamples);
- ++LineIt;
-
- // Now read the body. The body of the function ends when we reach
- // EOF or when we see the start of the next function.
- while (!LineIt.is_at_eof() && isdigit((*LineIt)[0])) {
- if (!LineSample.match(*LineIt, &Matches)) {
- reportParseError(
- LineIt.line_number(),
- "Expected 'NUM[.NUM]: NUM[ mangled_name:NUM]*', found " + *LineIt);
- return false;
- }
- assert(Matches.size() == 5);
- unsigned LineOffset, NumSamples, Discriminator = 0;
- Matches[1].getAsInteger(10, LineOffset);
- if (Matches[2] != "")
- Matches[2].getAsInteger(10, Discriminator);
- Matches[3].getAsInteger(10, NumSamples);
-
- // FIXME: Handle called targets (in Matches[4]).
-
- // When dealing with instruction weights, we use the value
- // zero to indicate the absence of a sample. If we read an
- // actual zero from the profile file, return it as 1 to
- // avoid the confusion later on.
- if (NumSamples == 0)
- NumSamples = 1;
- FProfile.addBodySamples(LineOffset, Discriminator, NumSamples);
- ++LineIt;
- }
- }
-
- return true;
-}
-
/// \brief Get the weight for an instruction.
///
/// The "weight" of an instruction \p Inst is the number of samples
@@ -538,7 +215,7 @@ bool SampleModuleProfile::loadText() {
/// \param Inst Instruction to query.
///
/// \returns The profiled weight of I.
-unsigned SampleFunctionProfile::getInstWeight(Instruction &Inst) {
+unsigned SampleProfileLoader::getInstWeight(Instruction &Inst) {
DebugLoc DLoc = Inst.getDebugLoc();
unsigned Lineno = DLoc.getLine();
if (Lineno < HeaderLineno)
@@ -547,8 +224,7 @@ unsigned SampleFunctionProfile::getInstWeight(Instruction &Inst) {
DILocation DIL(DLoc.getAsMDNode(*Ctx));
int LOffset = Lineno - HeaderLineno;
unsigned Discriminator = DIL.getDiscriminator();
- unsigned Weight =
- BodySamples.lookup(InstructionLocation(LOffset, Discriminator));
+ unsigned Weight = Samples->samplesAt(LOffset, Discriminator);
DEBUG(dbgs() << " " << Lineno << "." << Discriminator << ":" << Inst
<< " (line offset: " << LOffset << "." << Discriminator
<< " - weight: " << Weight << ")\n");
@@ -557,24 +233,24 @@ unsigned SampleFunctionProfile::getInstWeight(Instruction &Inst) {
/// \brief Compute the weight of a basic block.
///
-/// The weight of basic block \p B is the maximum weight of all the
-/// instructions in B. The weight of \p B is computed and cached in
+/// The weight of basic block \p BB is the maximum weight of all the
+/// instructions in BB. The weight of \p BB is computed and cached in
/// the BlockWeights map.
///
-/// \param B The basic block to query.
+/// \param BB The basic block to query.
///
-/// \returns The computed weight of B.
-unsigned SampleFunctionProfile::getBlockWeight(BasicBlock *B) {
- // If we've computed B's weight before, return it.
+/// \returns The computed weight of BB.
+unsigned SampleProfileLoader::getBlockWeight(BasicBlock *BB) {
+ // If we've computed BB's weight before, return it.
std::pair<BlockWeightMap::iterator, bool> Entry =
- BlockWeights.insert(std::make_pair(B, 0));
+ BlockWeights.insert(std::make_pair(BB, 0));
if (!Entry.second)
return Entry.first->second;
- // Otherwise, compute and cache B's weight.
+ // Otherwise, compute and cache BB's weight.
unsigned Weight = 0;
- for (BasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) {
- unsigned InstWeight = getInstWeight(*I);
+ for (auto &I : BB->getInstList()) {
+ unsigned InstWeight = getInstWeight(I);
if (InstWeight > Weight)
Weight = InstWeight;
}
@@ -588,13 +264,13 @@ unsigned SampleFunctionProfile::getBlockWeight(BasicBlock *B) {
/// the weights of every basic block in the CFG.
///
/// \param F The function to query.
-bool SampleFunctionProfile::computeBlockWeights(Function &F) {
+bool SampleProfileLoader::computeBlockWeights(Function &F) {
bool Changed = false;
DEBUG(dbgs() << "Block weights\n");
- for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) {
- unsigned Weight = getBlockWeight(B);
+ for (auto &BB : F) {
+ unsigned Weight = getBlockWeight(&BB);
Changed |= (Weight > 0);
- DEBUG(printBlockWeight(dbgs(), B));
+ DEBUG(printBlockWeight(dbgs(), &BB));
}
return Changed;
@@ -623,16 +299,13 @@ bool SampleFunctionProfile::computeBlockWeights(Function &F) {
/// \param DomTree Opposite dominator tree. If \p Descendants is filled
/// with blocks from \p BB1's dominator tree, then
/// this is the post-dominator tree, and vice versa.
-void SampleFunctionProfile::findEquivalencesFor(
+void SampleProfileLoader::findEquivalencesFor(
BasicBlock *BB1, SmallVector<BasicBlock *, 8> Descendants,
DominatorTreeBase<BasicBlock> *DomTree) {
- for (SmallVectorImpl<BasicBlock *>::iterator I = Descendants.begin(),
- E = Descendants.end();
- I != E; ++I) {
- BasicBlock *BB2 = *I;
+ for (auto *BB2 : Descendants) {
bool IsDomParent = DomTree->dominates(BB2, BB1);
bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2);
- if (BB1 != BB2 && VisitedBlocks.insert(BB2) && IsDomParent &&
+ if (BB1 != BB2 && VisitedBlocks.insert(BB2).second && IsDomParent &&
IsInSameLoop) {
EquivalenceClass[BB2] = BB1;
@@ -660,12 +333,12 @@ void SampleFunctionProfile::findEquivalencesFor(
/// dominates B2, B2 post-dominates B1 and both are in the same loop.
///
/// \param F The function to query.
-void SampleFunctionProfile::findEquivalenceClasses(Function &F) {
+void SampleProfileLoader::findEquivalenceClasses(Function &F) {
SmallVector<BasicBlock *, 8> DominatedBBs;
DEBUG(dbgs() << "\nBlock equivalence classes\n");
// Find equivalence sets based on dominance and post-dominance information.
- for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) {
- BasicBlock *BB1 = B;
+ for (auto &BB : F) {
+ BasicBlock *BB1 = &BB;
// Compute BB1's equivalence class once.
if (EquivalenceClass.count(BB1)) {
@@ -712,8 +385,8 @@ void SampleFunctionProfile::findEquivalenceClasses(Function &F) {
// each equivalence class has the largest weight, assign that weight
// to all the blocks in that equivalence class.
DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n");
- for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) {
- BasicBlock *BB = B;
+ for (auto &BI : F) {
+ BasicBlock *BB = &BI;
BasicBlock *EquivBB = EquivalenceClass[BB];
if (BB != EquivBB)
BlockWeights[BB] = BlockWeights[EquivBB];
@@ -731,8 +404,8 @@ void SampleFunctionProfile::findEquivalenceClasses(Function &F) {
/// \param UnknownEdge Set if E has not been visited before.
///
/// \returns E's weight, if known. Otherwise, return 0.
-unsigned SampleFunctionProfile::visitEdge(Edge E, unsigned *NumUnknownEdges,
- Edge *UnknownEdge) {
+unsigned SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
+ Edge *UnknownEdge) {
if (!VisitedEdges.count(E)) {
(*NumUnknownEdges)++;
*UnknownEdge = E;
@@ -753,11 +426,11 @@ unsigned SampleFunctionProfile::visitEdge(Edge E, unsigned *NumUnknownEdges,
/// \param F Function to process.
///
/// \returns True if new weights were assigned to edges or blocks.
-bool SampleFunctionProfile::propagateThroughEdges(Function &F) {
+bool SampleProfileLoader::propagateThroughEdges(Function &F) {
bool Changed = false;
DEBUG(dbgs() << "\nPropagation through edges\n");
- for (Function::iterator BI = F.begin(), EI = F.end(); BI != EI; ++BI) {
- BasicBlock *BB = BI;
+ for (auto &BI : F) {
+ BasicBlock *BB = &BI;
// Visit all the predecessor and successor edges to determine
// which ones have a weight assigned already. Note that it doesn't
@@ -771,16 +444,16 @@ bool SampleFunctionProfile::propagateThroughEdges(Function &F) {
if (i == 0) {
// First, visit all predecessor edges.
- for (size_t I = 0; I < Predecessors[BB].size(); I++) {
- Edge E = std::make_pair(Predecessors[BB][I], BB);
+ for (auto *Pred : Predecessors[BB]) {
+ Edge E = std::make_pair(Pred, BB);
TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
if (E.first == E.second)
SelfReferentialEdge = E;
}
} else {
// On the second round, visit all successor edges.
- for (size_t I = 0; I < Successors[BB].size(); I++) {
- Edge E = std::make_pair(BB, Successors[BB][I]);
+ for (auto *Succ : Successors[BB]) {
+ Edge E = std::make_pair(BB, Succ);
TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
}
}
@@ -821,7 +494,7 @@ bool SampleFunctionProfile::propagateThroughEdges(Function &F) {
<< " known. Set weight for block: ";
printBlockWeight(dbgs(), BB););
}
- if (VisitedBlocks.insert(BB))
+ if (VisitedBlocks.insert(BB).second)
Changed = true;
} else if (NumUnknownEdges == 1 && VisitedBlocks.count(BB)) {
// If there is a single unknown edge and the block has been
@@ -857,9 +530,9 @@ bool SampleFunctionProfile::propagateThroughEdges(Function &F) {
///
/// We are interested in unique edges. If a block B1 has multiple
/// edges to another block B2, we only add a single B1->B2 edge.
-void SampleFunctionProfile::buildEdges(Function &F) {
- for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
- BasicBlock *B1 = I;
+void SampleProfileLoader::buildEdges(Function &F) {
+ for (auto &BI : F) {
+ BasicBlock *B1 = &BI;
// Add predecessors for B1.
SmallPtrSet<BasicBlock *, 16> Visited;
@@ -867,7 +540,7 @@ void SampleFunctionProfile::buildEdges(Function &F) {
llvm_unreachable("Found a stale predecessors list in a basic block.");
for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) {
BasicBlock *B2 = *PI;
- if (Visited.insert(B2))
+ if (Visited.insert(B2).second)
Predecessors[B1].push_back(B2);
}
@@ -877,7 +550,7 @@ void SampleFunctionProfile::buildEdges(Function &F) {
llvm_unreachable("Found a stale successors list in a basic block.");
for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) {
BasicBlock *B2 = *SI;
- if (Visited.insert(B2))
+ if (Visited.insert(B2).second)
Successors[B1].push_back(B2);
}
}
@@ -885,22 +558,22 @@ void SampleFunctionProfile::buildEdges(Function &F) {
/// \brief Propagate weights into edges
///
-/// The following rules are applied to every block B in the CFG:
+/// The following rules are applied to every block BB in the CFG:
///
-/// - If B has a single predecessor/successor, then the weight
+/// - If BB has a single predecessor/successor, then the weight
/// of that edge is the weight of the block.
///
/// - If all incoming or outgoing edges are known except one, and the
/// weight of the block is already known, the weight of the unknown
/// edge will be the weight of the block minus the sum of all the known
-/// edges. If the sum of all the known edges is larger than B's weight,
+/// edges. If the sum of all the known edges is larger than BB's weight,
/// we set the unknown edge weight to zero.
///
/// - If there is a self-referential edge, and the weight of the block is
/// known, the weight for that edge is set to the weight of the block
/// minus the weight of the other incoming edges to that block (if
/// known).
-void SampleFunctionProfile::propagateWeights(Function &F) {
+void SampleProfileLoader::propagateWeights(Function &F) {
bool Changed = true;
unsigned i = 0;
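
The propagation rules documented above reduce to simple unsigned bookkeeping. A minimal sketch of the single-unknown-edge case; the helper is illustrative and not part of this commit:

// Illustrative helper for the rule above: when the block weight and all but
// one incoming (or outgoing) edge weight are known, the remaining edge gets
// the difference, clamped at zero so weights never go negative.
static unsigned inferLastEdgeWeight(unsigned BlockWeight,
                                    unsigned SumOfKnownEdgeWeights) {
  return BlockWeight > SumOfKnownEdgeWeights
             ? BlockWeight - SumOfKnownEdgeWeights
             : 0;
}
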
@@ -920,9 +593,9 @@ void SampleFunctionProfile::propagateWeights(Function &F) {
// edge weights computed during propagation.
DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
MDBuilder MDB(F.getContext());
- for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
- BasicBlock *B = I;
- TerminatorInst *TI = B->getTerminator();
+ for (auto &BI : F) {
+ BasicBlock *BB = &BI;
+ TerminatorInst *TI = BB->getTerminator();
if (TI->getNumSuccessors() == 1)
continue;
if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
@@ -934,7 +607,7 @@ void SampleFunctionProfile::propagateWeights(Function &F) {
bool AllWeightsZero = true;
for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
BasicBlock *Succ = TI->getSuccessor(I);
- Edge E = std::make_pair(B, Succ);
+ Edge E = std::make_pair(BB, Succ);
unsigned Weight = EdgeWeights[E];
DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
Weights.push_back(Weight);
@@ -965,22 +638,17 @@ void SampleFunctionProfile::propagateWeights(Function &F) {
///
/// \returns the line number where \p F is defined. If it returns 0,
/// it means that there is no debug information available for \p F.
-unsigned SampleFunctionProfile::getFunctionLoc(Function &F) {
- NamedMDNode *CUNodes = F.getParent()->getNamedMetadata("llvm.dbg.cu");
- if (CUNodes) {
- for (unsigned I = 0, E1 = CUNodes->getNumOperands(); I != E1; ++I) {
- DICompileUnit CU(CUNodes->getOperand(I));
- DIArray Subprograms = CU.getSubprograms();
- for (unsigned J = 0, E2 = Subprograms.getNumElements(); J != E2; ++J) {
- DISubprogram Subprogram(Subprograms.getElement(J));
- if (Subprogram.describes(&F))
- return Subprogram.getLineNumber();
- }
- }
- }
+unsigned SampleProfileLoader::getFunctionLoc(Function &F) {
+ DISubprogram S = getDISubprogram(&F);
+ if (S.isSubprogram())
+ return S.getLineNumber();
+ // If we could not find the start of \p F, emit a diagnostic to inform the user
+ // about the missed opportunity.
F.getContext().diagnose(DiagnosticInfoSampleProfile(
- "No debug information found in function " + F.getName()));
+ "No debug information found in function " + F.getName() +
+ ": Function profile not used",
+ DS_Warning));
return 0;
}
@@ -1002,15 +670,15 @@ unsigned SampleFunctionProfile::getFunctionLoc(Function &F) {
///
/// 3- Propagation of block weights into edges. This uses a simple
/// propagation heuristic. The following rules are applied to every
-/// block B in the CFG:
+/// block BB in the CFG:
///
-/// - If B has a single predecessor/successor, then the weight
+/// - If BB has a single predecessor/successor, then the weight
/// of that edge is the weight of the block.
///
/// - If all the edges are known except one, and the weight of the
/// block is already known, the weight of the unknown edge will
/// be the weight of the block minus the sum of all the known
-/// edges. If the sum of all the known edges is larger than B's weight,
+/// edges. If the sum of all the known edges is larger than BB's weight,
/// we set the unknown edge weight to zero.
///
/// - If there is a self-referential edge, and the weight of the block is
@@ -1028,14 +696,12 @@ unsigned SampleFunctionProfile::getFunctionLoc(Function &F) {
/// work here.
///
/// Once all the branch weights are computed, we emit the MD_prof
-/// metadata on B using the computed values for each of its branches.
+/// metadata on BB using the computed values for each of its branches.
///
/// \param F The function to query.
///
/// \returns true if \p F was modified. Returns false, otherwise.
-bool SampleFunctionProfile::emitAnnotations(Function &F, DominatorTree *DomTree,
- PostDominatorTree *PostDomTree,
- LoopInfo *Loops) {
+bool SampleProfileLoader::emitAnnotations(Function &F) {
bool Changed = false;
// Initialize invariants used during computation and propagation.
@@ -1045,10 +711,6 @@ bool SampleFunctionProfile::emitAnnotations(Function &F, DominatorTree *DomTree,
DEBUG(dbgs() << "Line number for the first instruction in " << F.getName()
<< ": " << HeaderLineno << "\n");
- DT = DomTree;
- PDT = PostDomTree;
- LI = Loops;
- Ctx = &F.getParent()->getContext();
// Compute basic block weights.
Changed |= computeBlockWeights(F);
@@ -1075,8 +737,14 @@ INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile",
"Sample Profile loader", false, false)
bool SampleProfileLoader::doInitialization(Module &M) {
- Profiler.reset(new SampleModuleProfile(M, Filename));
- ProfileIsValid = Profiler->loadText();
+ auto ReaderOrErr = SampleProfileReader::create(Filename, M.getContext());
+ if (std::error_code EC = ReaderOrErr.getError()) {
+ std::string Msg = "Could not open profile: " + EC.message();
+ M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg));
+ return false;
+ }
+ Reader = std::move(ReaderOrErr.get());
+ ProfileIsValid = (Reader->read() == sampleprof_error::success);
return true;
}
@@ -1091,11 +759,13 @@ FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) {
bool SampleProfileLoader::runOnFunction(Function &F) {
if (!ProfileIsValid)
return false;
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- PostDominatorTree *PDT = &getAnalysis<PostDominatorTree>();
- LoopInfo *LI = &getAnalysis<LoopInfo>();
- SampleFunctionProfile &FunctionProfile = Profiler->getProfile(F);
- if (!FunctionProfile.empty())
- return FunctionProfile.emitAnnotations(F, DT, PDT, LI);
+
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ PDT = &getAnalysis<PostDominatorTree>();
+ LI = &getAnalysis<LoopInfo>();
+ Ctx = &F.getParent()->getContext();
+ Samples = Reader->getSamplesFor(F);
+ if (!Samples->empty())
+ return emitAnnotations(F);
return false;
}
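
For orientation, the reader workflow that doInitialization, runOnFunction and getInstWeight now share can be strung together as below. Only the calls taken from the hunks above (SampleProfileReader::create, read, getSamplesFor, samplesAt) come from this commit; the standalone helper and its null checks are illustrative assumptions.

// Sketch only: load a sample profile and query the weight of one source
// location, mirroring the calls used by SampleProfileLoader above.
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/ProfileData/SampleProfReader.h"

using namespace llvm;
using namespace sampleprof;

static unsigned querySamples(StringRef Filename, LLVMContext &Ctx, Function &F,
                             int LineOffset, unsigned Discriminator) {
  auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx);
  if (ReaderOrErr.getError())
    return 0; // the pass emits a diagnostic here; the sketch just bails out
  std::unique_ptr<SampleProfileReader> Reader = std::move(ReaderOrErr.get());
  if (Reader->read() != sampleprof_error::success)
    return 0;
  FunctionSamples *Samples = Reader->getSamplesFor(F);
  if (!Samples || Samples->empty())
    return 0;
  return Samples->samplesAt(LineOffset, Discriminator);
}
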
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index de724d419a48..a16e9e29a1f1 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -28,6 +28,7 @@ using namespace llvm;
/// ScalarOpts library.
void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeADCEPass(Registry);
+ initializeAlignmentFromAssumptionsPass(Registry);
initializeSampleProfileLoaderPass(Registry);
initializeConstantHoistingPass(Registry);
initializeConstantPropagationPass(Registry);
@@ -38,6 +39,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeDSEPass(Registry);
initializeGVNPass(Registry);
initializeEarlyCSEPass(Registry);
+ initializeFlattenCFGPassPass(Registry);
initializeIndVarSimplifyPass(Registry);
initializeJumpThreadingPass(Registry);
initializeLICMPass(Registry);
@@ -77,6 +79,10 @@ void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createAggressiveDCEPass());
}
+void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAlignmentFromAssumptionsPass());
+}
+
void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createCFGSimplificationPass());
}
@@ -145,6 +151,10 @@ void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createPartiallyInlineLibCallsPass());
}
+void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerSwitchPass());
+}
+
void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createPromoteMemoryToRegisterPass());
}
@@ -203,6 +213,10 @@ void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createTypeBasedAliasAnalysisPass());
}
+void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createScopedNoAliasAAPass());
+}
+
void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createBasicAliasAnalysisPass());
}
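
The new C API entry points defined above (LLVMAddAlignmentFromAssumptionsPass, LLVMAddScopedNoAliasAAPass, LLVMAddLowerSwitchPass) slot into a pass manager like the existing bindings. A hedged sketch only: the driver function is an assumption, and the declarations are assumed to live in llvm-c/Transforms/Scalar.h next to the ones already exported there.

// Illustrative wiring of the newly exported C bindings; not from this commit.
#include "llvm-c/Core.h"
#include "llvm-c/Transforms/Scalar.h"

static void runNewScalarCBindings(LLVMModuleRef M) {
  LLVMPassManagerRef PM = LLVMCreatePassManager();
  LLVMAddAlignmentFromAssumptionsPass(PM); // added in this change
  LLVMAddScopedNoAliasAAPass(PM);          // added in this change
  LLVMAddLowerSwitchPass(PM);              // added in this change
  LLVMRunPassManager(PM, M);
  LLVMDisposePassManager(PM);
}
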
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index e2a24a7fd4a7..5c49a5504b47 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
@@ -197,6 +198,7 @@ namespace {
// getAnalysisUsage - This pass does not require any passes, but we know it
// will not alter the CFG, so say so.
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.setPreservesCFG();
}
@@ -214,6 +216,7 @@ namespace {
// getAnalysisUsage - This pass does not require any passes, but we know it
// will not alter the CFG, so say so.
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
AU.setPreservesCFG();
}
};
@@ -225,12 +228,14 @@ char SROA_SSAUp::ID = 0;
INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl",
"Scalar Replacement of Aggregates (DT)", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(SROA_DT, "scalarrepl",
"Scalar Replacement of Aggregates (DT)", false, false)
INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa",
"Scalar Replacement of Aggregates (SSAUp)", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa",
"Scalar Replacement of Aggregates (SSAUp)", false, false)
@@ -1063,12 +1068,14 @@ public:
void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) {
// Remember which alloca we're promoting (for isInstInList).
this->AI = AI;
- if (MDNode *DebugNode = MDNode::getIfExists(AI->getContext(), AI)) {
- for (User *U : DebugNode->users())
- if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
- DDIs.push_back(DDI);
- else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
- DVIs.push_back(DVI);
+ if (auto *L = LocalAsMetadata::getIfExists(AI)) {
+ if (auto *DebugNode = MetadataAsValue::getIfExists(AI->getContext(), L)) {
+ for (User *U : DebugNode->users())
+ if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
+ DDIs.push_back(DDI);
+ else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
+ DVIs.push_back(DVI);
+ }
}
LoadAndStorePromoter::run(Insts);
@@ -1119,9 +1126,9 @@ public:
} else {
continue;
}
- Instruction *DbgVal =
- DIB->insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
- Inst);
+ Instruction *DbgVal = DIB->insertDbgValueIntrinsic(
+ Arg, 0, DIVariable(DVI->getVariable()),
+ DIExpression(DVI->getExpression()), Inst);
DbgVal->setDebugLoc(DVI->getDebugLoc());
}
}
@@ -1333,12 +1340,15 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) {
LoadInst *FalseLoad =
Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f");
- // Transfer alignment and TBAA info if present.
+ // Transfer alignment and AA info if present.
TrueLoad->setAlignment(LI->getAlignment());
FalseLoad->setAlignment(LI->getAlignment());
- if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
- TrueLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
- FalseLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
+
+ AAMDNodes Tags;
+ LI->getAAMetadata(Tags);
+ if (Tags) {
+ TrueLoad->setAAMetadata(Tags);
+ FalseLoad->setAAMetadata(Tags);
}
Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad);
@@ -1364,10 +1374,12 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) {
PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(),
PN->getName()+".ld", PN);
- // Get the TBAA tag and alignment to use from one of the loads. It doesn't
+ // Get the AA tags and alignment to use from one of the loads. It doesn't
// matter which one we get and if any differ, it doesn't matter.
LoadInst *SomeLoad = cast<LoadInst>(PN->user_back());
- MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
+
+ AAMDNodes AATags;
+ SomeLoad->getAAMetadata(AATags);
unsigned Align = SomeLoad->getAlignment();
// Rewrite all loads of the PN to use the new PHI.
@@ -1389,7 +1401,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) {
PN->getName() + "." + Pred->getName(),
Pred->getTerminator());
Load->setAlignment(Align);
- if (TBAATag) Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+ if (AATags) Load->setAAMetadata(AATags);
}
NewPN->addIncoming(Load, Pred);
@@ -1407,9 +1419,11 @@ bool SROA::performPromotion(Function &F) {
DominatorTree *DT = nullptr;
if (HasDomTree)
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function
- DIBuilder DIB(*F.getParent());
+ DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
bool Changed = false;
SmallVector<Instruction*, 64> Insts;
while (1) {
@@ -1425,7 +1439,7 @@ bool SROA::performPromotion(Function &F) {
if (Allocas.empty()) break;
if (HasDomTree)
- PromoteMemToReg(Allocas, *DT);
+ PromoteMemToReg(Allocas, *DT, nullptr, &AC);
else {
SSAUpdater SSA;
for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
@@ -1658,7 +1672,7 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
AllocaInfo &Info) {
// If we've already checked this PHI, don't do it again.
if (PHINode *PN = dyn_cast<PHINode>(I))
- if (!Info.CheckedPHIs.insert(PN))
+ if (!Info.CheckedPHIs.insert(PN).second)
return;
for (User *U : I->users()) {
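
The scalarrepl hunks above replace the TBAA-only metadata copy with the full AA metadata bundle (tbaa, alias.scope, noalias). A minimal sketch of that pattern, using the same two calls as the diff but an invented helper name:

// Illustrative helper: transfer all AA metadata from one memory access to a
// rewritten one, instead of copying only the !tbaa node.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"

static void copyAAMetadata(const llvm::LoadInst &From, llvm::LoadInst &To) {
  llvm::AAMDNodes Tags;
  From.getAAMetadata(Tags); // collects tbaa, alias.scope and noalias in one go
  if (Tags)                 // AAMDNodes converts to bool: any tag present?
    To.setAAMetadata(Tags);
}
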
diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp
index 7a73f113b1d9..6036c099be0e 100644
--- a/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/lib/Transforms/Scalar/Scalarizer.cpp
@@ -150,6 +150,16 @@ public:
bool visitLoadInst(LoadInst &);
bool visitStoreInst(StoreInst &);
+ static void registerOptions() {
+ // This is disabled by default because having separate loads and stores
+ // makes it more likely that the -combiner-alias-analysis limits will be
+ // reached.
+ OptionRegistry::registerOption<bool, Scalarizer,
+ &Scalarizer::ScalarizeLoadStore>(
+ "scalarize-load-store",
+ "Allow the scalarizer pass to scalarize loads and store", false);
+ }
+
private:
Scatterer scatter(Instruction *, Value *);
void gather(Instruction *, const ValueVector &);
@@ -164,19 +174,14 @@ private:
GatherList Gathered;
unsigned ParallelLoopAccessMDKind;
const DataLayout *DL;
+ bool ScalarizeLoadStore;
};
char Scalarizer::ID = 0;
} // end anonymous namespace
-// This is disabled by default because having separate loads and stores makes
-// it more likely that the -combiner-alias-analysis limits will be reached.
-static cl::opt<bool> ScalarizeLoadStore
- ("scalarize-load-store", cl::Hidden, cl::init(false),
- cl::desc("Allow the scalarizer pass to scalarize loads and store"));
-
-INITIALIZE_PASS(Scalarizer, "scalarizer", "Scalarize vector operations",
- false, false)
+INITIALIZE_PASS_WITH_OPTIONS(Scalarizer, "scalarizer",
+ "Scalarize vector operations", false, false)
Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
ValueVector *cachePtr)
@@ -236,7 +241,9 @@ Value *Scatterer::operator[](unsigned I) {
bool Scalarizer::doInitialization(Module &M) {
ParallelLoopAccessMDKind =
- M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
+ M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
+ ScalarizeLoadStore =
+ M.getContext().getOption<bool, Scalarizer, &Scalarizer::ScalarizeLoadStore>();
return false;
}
@@ -312,6 +319,8 @@ bool Scalarizer::canTransferMetadata(unsigned Tag) {
|| Tag == LLVMContext::MD_fpmath
|| Tag == LLVMContext::MD_tbaa_struct
|| Tag == LLVMContext::MD_invariant_load
+ || Tag == LLVMContext::MD_alias_scope
+ || Tag == LLVMContext::MD_noalias
|| Tag == ParallelLoopAccessMDKind);
}
@@ -322,8 +331,10 @@ void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) {
Op->getAllMetadataOtherThanDebugLoc(MDs);
for (unsigned I = 0, E = CV.size(); I != E; ++I) {
if (Instruction *New = dyn_cast<Instruction>(CV[I])) {
- for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator
- MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI)
+ for (SmallVectorImpl<std::pair<unsigned, MDNode *>>::iterator
+ MI = MDs.begin(),
+ ME = MDs.end();
+ MI != ME; ++MI)
if (canTransferMetadata(MI->first))
New->setMetadata(MI->first, MI->second);
New->setDebugLoc(Op->getDebugLoc());
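
The Scalarizer change above moves a global cl::opt into the per-context OptionRegistry. A hedged sketch of the same pattern with an invented class; the registerOption and getOption calls mirror the hunks, everything else is illustrative:

// Illustrative only: a pass-like class that registers a boolean option with
// the OptionRegistry and later reads it back through its LLVMContext.
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Options.h"

struct OptionDemo {
  bool ScalarizeLoadStore = false;

  // Hooked up via INITIALIZE_PASS_WITH_OPTIONS in the real pass.
  static void registerOptions() {
    llvm::OptionRegistry::registerOption<bool, OptionDemo,
                                         &OptionDemo::ScalarizeLoadStore>(
        "demo-scalarize-load-store", "Illustrative per-context flag", false);
  }

  void readOption(llvm::LLVMContext &Ctx) {
    ScalarizeLoadStore =
        Ctx.getOption<bool, OptionDemo, &OptionDemo::ScalarizeLoadStore>();
  }
};
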
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 6557ce4575dd..6157746af48c 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -79,6 +79,81 @@
// ld.global.f32 %f3, [%rl6+128]; // much better
// ld.global.f32 %f4, [%rl6+132]; // much better
//
+// Another improvement enabled by the LowerGEP flag is to lower a GEP with
+// multiple indices to either multiple GEPs with a single index or arithmetic
+// operations (depending on whether the target uses alias analysis in codegen).
+// Such a transformation can have the following benefits:
+// (1) It can always extract constants in the indices of structure type.
+// (2) After such Lowering, there are more optimization opportunities such as
+// CSE, LICM and CGP.
+//
+// E.g. The following GEPs have multiple indices:
+// BB1:
+// %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
+// load %p
+// ...
+// BB2:
+// %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 2
+// load %p2
+// ...
+//
+// We cannot do CSE for the common part related to index "i64 %i". Lowering
+// GEPs can achieve such goals.
+// If the target does not use alias analysis in codegen, this pass will
+// lower a GEP with multiple indices into arithmetic operations:
+// BB1:
+// %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
+// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %3 = add i64 %1, %2 ; CSE opportunity
+// %4 = mul i64 %j1, length_of_struct
+// %5 = add i64 %3, %4
+// %6 = add i64 %3, struct_field_3 ; Constant offset
+// %p = inttoptr i64 %6 to i32*
+// load %p
+// ...
+// BB2:
+// %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
+// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %9 = add i64 %7, %8 ; CSE opportunity
+// %10 = mul i64 %j2, length_of_struct
+// %11 = add i64 %9, %10
+// %12 = add i64 %11, struct_field_2 ; Constant offset
+// %p = inttoptr i64 %12 to i32*
+// load %p2
+// ...
+//
+// If the target uses alias analysis in codegen, this pass will lower a GEP
+// with multiple indices into multiple GEPs with a single index:
+// BB1:
+// %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
+// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity
+// %4 = mul i64 %j1, length_of_struct
+// %5 = getelementptr i8* %3, i64 %4
+// %6 = getelementptr i8* %5, struct_field_3 ; Constant offset
+// %p = bitcast i8* %6 to i32*
+// load %p
+// ...
+// BB2:
+// %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
+// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity
+// %10 = mul i64 %j2, length_of_struct
+// %11 = getelementptr i8* %9, i64 %10
+// %12 = getelementptr i8* %11, struct_field_2 ; Constant offset
+// %p2 = bitcast i8* %12 to i32*
+// load %p2
+// ...
+//
+// Lowering GEPs can also benefit other passes such as LICM and CGP.
+// LICM (Loop Invariant Code Motion) cannot hoist/sink a GEP with multiple
+// indices if one of the indices is variant. If we lower such a GEP into
+// invariant parts and variant parts, LICM can hoist/sink those invariant parts.
+// CGP (CodeGen Prepare) tries to sink address calculations that match the
+// target's addressing modes. A GEP with multiple indices may not match and will
+// not be sunk. If we lower such a GEP into smaller parts, CGP may sink some of
+// them. So we end up with a better addressing mode.
+//
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -92,6 +167,9 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/IR/IRBuilder.h"
using namespace llvm;
@@ -117,18 +195,17 @@ namespace {
/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
class ConstantOffsetExtractor {
public:
- /// Extracts a constant offset from the given GEP index. It outputs the
- /// numeric value of the extracted constant offset (0 if failed), and a
+ /// Extracts a constant offset from the given GEP index. It returns the
/// new index representing the remainder (equal to the original index minus
- /// the constant offset).
+ /// the constant offset), or nullptr if we cannot extract a constant offset.
/// \p Idx The given GEP index
- /// \p NewIdx The new index to replace (output)
/// \p DL The datalayout of the module
/// \p GEP The given GEP
- static int64_t Extract(Value *Idx, Value *&NewIdx, const DataLayout *DL,
- GetElementPtrInst *GEP);
- /// Looks for a constant offset without extracting it. The meaning of the
- /// arguments and the return value are the same as Extract.
+ static Value *Extract(Value *Idx, const DataLayout *DL,
+ GetElementPtrInst *GEP);
+ /// Looks for a constant offset from the given GEP index without extracting
+ /// it. It returns the numeric value of the extracted constant offset (0 if
+ /// failed). The meaning of the arguments is the same as in Extract.
static int64_t Find(Value *Idx, const DataLayout *DL, GetElementPtrInst *GEP);
private:
@@ -228,7 +305,9 @@ class ConstantOffsetExtractor {
class SeparateConstOffsetFromGEP : public FunctionPass {
public:
static char ID;
- SeparateConstOffsetFromGEP() : FunctionPass(ID) {
+ SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr,
+ bool LowerGEP = false)
+ : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) {
initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
}
@@ -251,10 +330,29 @@ class SeparateConstOffsetFromGEP : public FunctionPass {
/// Tries to split the given GEP into a variadic base and a constant offset,
/// and returns true if the splitting succeeds.
bool splitGEP(GetElementPtrInst *GEP);
- /// Finds the constant offset within each index, and accumulates them. This
- /// function only inspects the GEP without changing it. The output
- /// NeedsExtraction indicates whether we can extract a non-zero constant
- /// offset from any index.
+ /// Lower a GEP with multiple indices into multiple GEPs with a single index.
+ /// Function splitGEP already split the original GEP into a variadic part and
+ /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+ /// variadic part into a set of GEPs with a single index and applies
+ /// AccumulativeByteOffset to it.
+ /// \p Variadic The variadic part of the original GEP.
+ /// \p AccumulativeByteOffset The constant offset.
+ void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset);
+  /// Lower a GEP with multiple indices into ptrtoint+arithmetic+inttoptr form.
+ /// Function splitGEP already split the original GEP into a variadic part and
+ /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+ /// variadic part into a set of arithmetic operations and applies
+ /// AccumulativeByteOffset to it.
+ /// \p Variadic The variadic part of the original GEP.
+ /// \p AccumulativeByteOffset The constant offset.
+ void lowerToArithmetics(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset);
+  /// Finds the constant offset within each index and accumulates them. If
+  /// LowerGEP is true, it inspects indices of both sequential and structure
+  /// types; otherwise it only inspects sequential indices. The output
+  /// NeedsExtraction indicates whether we successfully found a non-zero
+  /// constant offset.
int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
/// Canonicalize array indices to pointer-size integers. This helps to
/// simplify the logic of splitting a GEP. For example, if a + b is a
@@ -274,6 +372,10 @@ class SeparateConstOffsetFromGEP : public FunctionPass {
bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
const DataLayout *DL;
+ const TargetMachine *TM;
+ /// Whether to lower a GEP with multiple indices into arithmetic operations or
+ /// multiple GEPs with a single index.
+ bool LowerGEP;
};
} // anonymous namespace
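A hypothetical C++ analogue of the overall splitGEP result for an access like p[i + 5], with plain pointers standing in for GEPs: the variadic base p + i no longer contains the constant, and the constant offset is applied as a separate trailing step that CSE can share across accesses.

  #include <cstdint>

  int32_t loadSplit(const int32_t *p, int64_t i) {
    const int32_t *variadicBase = p + i;  // variadic part, constant removed
    return *(variadicBase + 5);           // constant offset applied last
  }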
@@ -289,8 +391,10 @@ INITIALIZE_PASS_END(
"Split GEPs to a variadic base and a constant offset for better CSE", false,
false)
-FunctionPass *llvm::createSeparateConstOffsetFromGEPPass() {
- return new SeparateConstOffsetFromGEP();
+FunctionPass *
+llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM,
+ bool LowerGEP) {
+ return new SeparateConstOffsetFromGEP(TM, LowerGEP);
}
bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
@@ -519,8 +623,13 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
//
// Replacing the "or" with "add" is fine, because
// a | (b + 5) = a + (b + 5) = (a + b) + 5
- return BinaryOperator::CreateAdd(BO->getOperand(0), BO->getOperand(1),
- BO->getName(), IP);
+ if (OpNo == 0) {
+ return BinaryOperator::CreateAdd(NextInChain, TheOther, BO->getName(),
+ IP);
+ } else {
+ return BinaryOperator::CreateAdd(TheOther, NextInChain, BO->getName(),
+ IP);
+ }
}
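A quick standalone check of the identity relied on above (illustrative values only): when the two operands of the "or" share no set bits, a | c equals a + c, so rewriting the "or" as an "add" preserves the value.

  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t a = 0xF0;         // low bits known zero
    uint64_t b = 0x3;
    uint64_t c = b + 5;        // c = 8, still disjoint from a's bits
    assert((a & c) == 0);      // precondition: no common bits
    assert((a | c) == a + c);  // so the "or" can become an "add"
    return 0;
  }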
// We can reuse BO in this case, because the new expression shares the same
@@ -537,19 +646,17 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
return BO;
}
-int64_t ConstantOffsetExtractor::Extract(Value *Idx, Value *&NewIdx,
- const DataLayout *DL,
- GetElementPtrInst *GEP) {
+Value *ConstantOffsetExtractor::Extract(Value *Idx, const DataLayout *DL,
+ GetElementPtrInst *GEP) {
ConstantOffsetExtractor Extractor(DL, GEP);
// Find a non-zero constant offset first.
APInt ConstantOffset =
Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
GEP->isInBounds());
- if (ConstantOffset != 0) {
- // Separates the constant offset from the GEP index.
- NewIdx = Extractor.rebuildWithoutConstOffset();
- }
- return ConstantOffset.getSExtValue();
+ if (ConstantOffset == 0)
+ return nullptr;
+ // Separates the constant offset from the GEP index.
+ return Extractor.rebuildWithoutConstOffset();
}
int64_t ConstantOffsetExtractor::Find(Value *Idx, const DataLayout *DL,
@@ -615,11 +722,116 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
AccumulativeByteOffset +=
ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
}
+ } else if (LowerGEP) {
+ StructType *StTy = cast<StructType>(*GTI);
+ uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
+ // Skip field 0 as the offset is always 0.
+ if (Field != 0) {
+ NeedsExtraction = true;
+ AccumulativeByteOffset +=
+ DL->getStructLayout(StTy)->getElementOffset(Field);
+ }
}
}
return AccumulativeByteOffset;
}
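For the structure-index branch above, the accumulated quantity corresponds to what offsetof yields in plain C++; a small hypothetical sketch (the struct and function are illustrative only):

  #include <cstddef>
  #include <cstdint>

  struct S { int32_t a; int64_t b; int32_t c; };

  // Byte offset of element i's field c inside an array of S, accumulated the
  // same way: sequential part (i * sizeof(S)) plus the constant field offset.
  std::size_t byteOffset(std::size_t i) {
    return i * sizeof(S) + offsetof(S, c);
  }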
+void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
+ GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
+ IRBuilder<> Builder(Variadic);
+ Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+ Type *I8PtrTy =
+ Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
+ Value *ResultPtr = Variadic->getOperand(0);
+ if (ResultPtr->getType() != I8PtrTy)
+ ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+
+ gep_type_iterator GTI = gep_type_begin(*Variadic);
+ // Create an ugly GEP for each sequential index. We don't create GEPs for
+ // structure indices, as they are accumulated in the constant offset index.
+ for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
+ if (isa<SequentialType>(*GTI)) {
+ Value *Idx = Variadic->getOperand(I);
+ // Skip zero indices.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
+ if (CI->isZero())
+ continue;
+
+ APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+ DL->getTypeAllocSize(GTI.getIndexedType()));
+ // Scale the index by element size.
+ if (ElementSize != 1) {
+ if (ElementSize.isPowerOf2()) {
+ Idx = Builder.CreateShl(
+ Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+ } else {
+ Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+ }
+ }
+ // Create an ugly GEP with a single index for each index.
+ ResultPtr = Builder.CreateGEP(ResultPtr, Idx, "uglygep");
+ }
+ }
+
+ // Create a GEP with the constant offset index.
+ if (AccumulativeByteOffset != 0) {
+ Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
+ ResultPtr = Builder.CreateGEP(ResultPtr, Offset, "uglygep");
+ }
+ if (ResultPtr->getType() != Variadic->getType())
+ ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());
+
+ Variadic->replaceAllUsesWith(ResultPtr);
+ Variadic->eraseFromParent();
+}
+
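A small standalone check of the scaling rule used in lowerToSingleIndexGEPs (and again in lowerToArithmetics below): when the element size is a power of two, shifting by log2(size) matches multiplying by the size.

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t size = 16;     // power of two
    const unsigned log2size = 4;  // log2(16)
    for (uint64_t idx = 0; idx < 1000; ++idx)
      assert((idx << log2size) == idx * size);
    return 0;
  }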
+void
+SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset) {
+ IRBuilder<> Builder(Variadic);
+ Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+ Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
+ gep_type_iterator GTI = gep_type_begin(*Variadic);
+  // Create ADD/SHL/MUL arithmetic operations for each sequential index. We
+  // don't create arithmetic operations for structure indices, as they are
+  // accumulated in the constant offset index.
+ for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
+ if (isa<SequentialType>(*GTI)) {
+ Value *Idx = Variadic->getOperand(I);
+ // Skip zero indices.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
+ if (CI->isZero())
+ continue;
+
+ APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+ DL->getTypeAllocSize(GTI.getIndexedType()));
+ // Scale the index by element size.
+ if (ElementSize != 1) {
+ if (ElementSize.isPowerOf2()) {
+ Idx = Builder.CreateShl(
+ Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+ } else {
+ Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+ }
+ }
+ // Create an ADD for each index.
+ ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
+ }
+ }
+
+ // Create an ADD for the constant offset index.
+ if (AccumulativeByteOffset != 0) {
+ ResultPtr = Builder.CreateAdd(
+ ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
+ }
+
+ ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
+ Variadic->replaceAllUsesWith(ResultPtr);
+ Variadic->eraseFromParent();
+}
+
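Outside LLVM, the ptrtoint/add/inttoptr form produced by lowerToArithmetics corresponds to computing the address as plain integer arithmetic and converting back to a pointer at the end; a hypothetical C++ sketch:

  #include <cstdint>

  const int32_t *addrArith(const int32_t *base, int64_t i, int64_t constBytes) {
    std::uintptr_t p = reinterpret_cast<std::uintptr_t>(base);
    p += static_cast<std::uintptr_t>(i) * sizeof(int32_t);  // scaled sequential index
    p += static_cast<std::uintptr_t>(constBytes);           // accumulated constant offset
    return reinterpret_cast<const int32_t *>(p);
  }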
bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
// Skip vector GEPs.
if (GEP->getType()->isVectorTy())
@@ -637,32 +849,42 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
if (!NeedsExtraction)
return Changed;
- // Before really splitting the GEP, check whether the backend supports the
- // addressing mode we are about to produce. If no, this splitting probably
- // won't be beneficial.
- TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
- if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(),
- /*BaseGV=*/nullptr, AccumulativeByteOffset,
- /*HasBaseReg=*/true, /*Scale=*/0)) {
- return Changed;
+  // If LowerGEP is disabled, before really splitting the GEP, check whether the
+  // backend supports the addressing mode we are about to produce. If not, this
+  // splitting probably won't be beneficial.
+  // If LowerGEP is enabled, even if the extracted constant offset cannot match
+  // the addressing mode, we can still optimize the other lowered parts of the
+  // variable indices. Therefore, we don't check for addressing modes in that
+  // case.
+ if (!LowerGEP) {
+ TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
+ if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(),
+ /*BaseGV=*/nullptr, AccumulativeByteOffset,
+ /*HasBaseReg=*/true, /*Scale=*/0)) {
+ return Changed;
+ }
}
- // Remove the constant offset in each GEP index. The resultant GEP computes
- // the variadic base.
+ // Remove the constant offset in each sequential index. The resultant GEP
+ // computes the variadic base.
+ // Notice that we don't remove struct field indices here. If LowerGEP is
+ // disabled, a structure index is not accumulated and we still use the old
+ // one. If LowerGEP is enabled, a structure index is accumulated in the
+  // constant offset. lowerToSingleIndexGEPs or lowerToArithmetics will later
+ // handle the constant offset and won't need a new structure index.
gep_type_iterator GTI = gep_type_begin(*GEP);
for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
if (isa<SequentialType>(*GTI)) {
- Value *NewIdx = nullptr;
- // Tries to extract a constant offset from this GEP index.
- int64_t ConstantOffset =
- ConstantOffsetExtractor::Extract(GEP->getOperand(I), NewIdx, DL, GEP);
- if (ConstantOffset != 0) {
- assert(NewIdx != nullptr &&
- "ConstantOffset != 0 implies NewIdx is set");
+ // Splits this GEP index into a variadic part and a constant offset, and
+ // uses the variadic part as the new index.
+ Value *NewIdx =
+ ConstantOffsetExtractor::Extract(GEP->getOperand(I), DL, GEP);
+ if (NewIdx != nullptr) {
GEP->setOperand(I, NewIdx);
}
}
}
+
// Clear the inbounds attribute because the new index may be off-bound.
// e.g.,
//
@@ -684,6 +906,21 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
// possible. GEPs with inbounds are more friendly to alias analysis.
GEP->setIsInBounds(false);
+ // Lowers a GEP to either GEPs with a single index or arithmetic operations.
+ if (LowerGEP) {
+    // Because BasicAA does not currently analyze ptrtoint/inttoptr, do not
+    // lower to arithmetic operations if the target uses alias analysis in
+    // codegen.
+ if (TM && TM->getSubtarget<TargetSubtargetInfo>().useAA())
+ lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
+ else
+ lowerToArithmetics(GEP, AccumulativeByteOffset);
+ return true;
+ }
+
+ // No need to create another GEP if the accumulative byte offset is 0.
+ if (AccumulativeByteOffset == 0)
+ return true;
+
// Offsets the base with the accumulative byte offset.
//
// %gep ; the base
@@ -715,16 +952,16 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
Instruction *NewGEP = GEP->clone();
NewGEP->insertBefore(GEP);
- uint64_t ElementTypeSizeOfGEP =
- DL->getTypeAllocSize(GEP->getType()->getElementType());
+  // Per the ANSI C standard, signed / unsigned = unsigned and signed %
+  // unsigned = unsigned. Therefore, we cast ElementTypeSizeOfGEP to signed
+  // because it is used with signed integers (such as AccumulativeByteOffset)
+  // later.
+ int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
+ DL->getTypeAllocSize(GEP->getType()->getElementType()));
Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
    // Very likely. As long as %gep is naturally aligned, the byte offset we
// extracted should be a multiple of sizeof(*%gep).
- // Per ANSI C standard, signed / unsigned = unsigned. Therefore, we
- // cast ElementTypeSizeOfGEP to signed.
- int64_t Index =
- AccumulativeByteOffset / static_cast<int64_t>(ElementTypeSizeOfGEP);
+ int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
NewGEP = GetElementPtrInst::Create(
NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP);
} else {
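The cast discussed above guards against the usual arithmetic conversions; a standalone C++ demonstration of the pitfall (hypothetical values):

  #include <cstdint>
  #include <iostream>

  int main() {
    int64_t offset = -8;
    uint64_t sizeUnsigned = 4;
    int64_t sizeSigned = static_cast<int64_t>(sizeUnsigned);
    std::cout << offset / sizeUnsigned << "\n";  // huge value: unsigned division
    std::cout << offset / sizeSigned << "\n";    // -2: the intended result
    return 0;
  }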
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 5d5606ba47b0..2e317f9d0999 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CFG.h"
@@ -34,22 +35,30 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
#define DEBUG_TYPE "simplifycfg"
+static cl::opt<unsigned>
+UserBonusInstThreshold("bonus-inst-threshold", cl::Hidden, cl::init(1),
+ cl::desc("Control the number of bonus instructions (default = 1)"));
+
STATISTIC(NumSimpl, "Number of blocks simplified");
namespace {
struct CFGSimplifyPass : public FunctionPass {
static char ID; // Pass identification, replacement for typeid
- CFGSimplifyPass() : FunctionPass(ID) {
+ unsigned BonusInstThreshold;
+ CFGSimplifyPass(int T = -1) : FunctionPass(ID) {
+ BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfo>();
}
};
@@ -59,12 +68,13 @@ char CFGSimplifyPass::ID = 0;
INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
false)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
false)
// Public interface to the CFGSimplification pass
-FunctionPass *llvm::createCFGSimplificationPass() {
- return new CFGSimplifyPass();
+FunctionPass *llvm::createCFGSimplificationPass(int Threshold) {
+ return new CFGSimplifyPass(Threshold);
}
/// mergeEmptyReturnBlocks - If we have more than one empty (other than phi
@@ -146,7 +156,8 @@ static bool mergeEmptyReturnBlocks(Function &F) {
/// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function,
/// iterating until no more changes are made.
static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
- const DataLayout *DL) {
+ const DataLayout *DL, AssumptionCache *AC,
+ unsigned BonusInstThreshold) {
bool Changed = false;
bool LocalChange = true;
while (LocalChange) {
@@ -155,7 +166,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
// Loop over all of the basic blocks and remove them if they are unneeded...
//
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
- if (SimplifyCFG(BBIt++, TTI, DL)) {
+ if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, DL, AC)) {
LocalChange = true;
++NumSimpl;
}
@@ -172,12 +183,14 @@ bool CFGSimplifyPass::runOnFunction(Function &F) {
if (skipOptnoneFunction(F))
return false;
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
bool EverChanged = removeUnreachableBlocks(F);
EverChanged |= mergeEmptyReturnBlocks(F);
- EverChanged |= iterativelySimplifyCFG(F, TTI, DL);
+ EverChanged |= iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold);
// If neither pass changed anything, we're done.
if (!EverChanged) return false;
@@ -191,7 +204,7 @@ bool CFGSimplifyPass::runOnFunction(Function &F) {
return true;
do {
- EverChanged = iterativelySimplifyCFG(F, TTI, DL);
+ EverChanged = iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold);
EverChanged |= removeUnreachableBlocks(F);
} while (EverChanged);
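The driver added above follows the usual fixed-point pattern; a minimal standalone sketch with illustrative names (Block and simplifyOne are stand-ins, not LLVM types):

  #include <vector>

  struct Block { bool Simplified = false; };

  // One local step: returns true if it changed anything.
  static bool simplifyOne(Block &B) {
    bool Changed = !B.Simplified;
    B.Simplified = true;
    return Changed;
  }

  bool iterativelySimplify(std::vector<Block> &Blocks) {
    bool Changed = false;
    bool LocalChange = true;
    while (LocalChange) {  // fixed point reached when a full pass changes nothing
      LocalChange = false;
      for (Block &B : Blocks)
        if (simplifyOne(B))
          LocalChange = true;
      Changed |= LocalChange;
    }
    return Changed;
  }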
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 7348c45c5d37..903b675fdd56 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -56,7 +56,7 @@ namespace {
}
private:
bool ProcessBlock(BasicBlock &BB);
- bool SinkInstruction(Instruction *I, SmallPtrSet<Instruction *, 8> &Stores);
+ bool SinkInstruction(Instruction *I, SmallPtrSetImpl<Instruction*> &Stores);
bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const;
bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo) const;
};
@@ -157,7 +157,7 @@ bool Sinking::ProcessBlock(BasicBlock &BB) {
}
static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
- SmallPtrSet<Instruction *, 8> &Stores) {
+ SmallPtrSetImpl<Instruction *> &Stores) {
if (Inst->mayWriteToMemory()) {
Stores.insert(Inst);
@@ -166,9 +166,8 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
AliasAnalysis::Location Loc = AA->getLocation(L);
- for (SmallPtrSet<Instruction *, 8>::iterator I = Stores.begin(),
- E = Stores.end(); I != E; ++I)
- if (AA->getModRefInfo(*I, Loc) & AliasAnalysis::Mod)
+ for (Instruction *S : Stores)
+ if (AA->getModRefInfo(S, Loc) & AliasAnalysis::Mod)
return false;
}
@@ -220,7 +219,7 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst,
/// SinkInstruction - Determine whether it is safe to sink the specified machine
/// instruction out of its current block into a successor.
bool Sinking::SinkInstruction(Instruction *Inst,
- SmallPtrSet<Instruction *, 8> &Stores) {
+ SmallPtrSetImpl<Instruction *> &Stores) {
// Don't sink static alloca instructions. CodeGen assumes allocas outside the
// entry block are dynamically sized stack objects.
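The isSafeToMove logic above boils down to this: a load may only move past the stores collected so far if no store may modify the loaded location. A hypothetical standalone sketch, with a conservative overlap test standing in for the alias-analysis query:

  #include <cstddef>
  #include <vector>

  struct MemOp { const void *Ptr; std::size_t Size; };

  static bool mayAlias(const MemOp &A, const MemOp &B) {
    const char *PA = static_cast<const char *>(A.Ptr);
    const char *PB = static_cast<const char *>(B.Ptr);
    return PA < PB + B.Size && PB < PA + A.Size;  // conservative overlap test
  }

  bool safeToMoveLoad(const MemOp &Load, const std::vector<MemOp> &Stores) {
    for (const MemOp &S : Stores)
      if (mayAlias(S, Load))  // a prior store may write the loaded location
        return false;
    return true;
  }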
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index b9673ed655e0..7fe87f9319b6 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -10,6 +10,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SCCIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
#include "llvm/Analysis/RegionPass.h"
@@ -166,6 +167,7 @@ class StructurizeCFG : public RegionPass {
Region *ParentRegion;
DominatorTree *DT;
+ LoopInfo *LI;
RNVector Order;
BBSet Visited;
@@ -247,6 +249,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequiredID(LowerSwitchID);
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfo>();
AU.addPreserved<DominatorTreeWrapperPass>();
RegionPass::getAnalysisUsage(AU);
}
@@ -301,8 +304,9 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
BasicBlock *Succ = Term->getSuccessor(i);
- if (Visited.count(Succ))
+ if (Visited.count(Succ) && LI->isLoopHeader(Succ) ) {
Loops[Succ] = BB;
+ }
}
}
}
@@ -862,6 +866,7 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
ParentRegion = R;
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfo>();
orderNodes();
collectInfos();
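The tightened condition above only records a back edge when the already-visited successor really is a loop header; a toy standalone model (types and names are illustrative only):

  #include <map>
  #include <set>
  #include <vector>

  struct ToyBlock { int Id; };

  void recordLoops(ToyBlock *BB, const std::vector<ToyBlock *> &Succs,
                   const std::set<ToyBlock *> &Visited,
                   const std::set<ToyBlock *> &LoopHeaders,
                   std::map<ToyBlock *, ToyBlock *> &Loops) {
    for (ToyBlock *Succ : Succs)
      if (Visited.count(Succ) && LoopHeaders.count(Succ))
        Loops[Succ] = BB;  // remember the latch branching back to this header
  }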
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index b7580255150c..f3c3e3054b60 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -63,6 +63,7 @@
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
@@ -86,6 +87,7 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced");
namespace {
struct TailCallElim : public FunctionPass {
const TargetTransformInfo *TTI;
+ const DataLayout *DL;
static char ID; // Pass identification, replacement for typeid
TailCallElim() : FunctionPass(ID) {
@@ -157,6 +159,8 @@ bool TailCallElim::runOnFunction(Function &F) {
if (skipOptnoneFunction(F))
return false;
+ DL = F.getParent()->getDataLayout();
+
bool AllCallsAreTailCalls = false;
bool Modified = markTails(F, AllCallsAreTailCalls);
if (AllCallsAreTailCalls)
@@ -175,7 +179,7 @@ struct AllocaDerivedValueTracker {
auto AddUsesToWorklist = [&](Value *V) {
for (auto &U : V->uses()) {
- if (!Visited.insert(&U))
+ if (!Visited.insert(&U).second)
continue;
Worklist.push_back(&U);
}
@@ -400,18 +404,28 @@ bool TailCallElim::runTRE(Function &F) {
// alloca' is changed from being a static alloca to being a dynamic alloca.
// Until this is resolved, disable this transformation if that would ever
// happen. This bug is PR962.
+ SmallVector<BasicBlock*, 8> BBToErase;
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
ArgumentPHIs, !CanTRETailMarkedCall);
- if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
+ if (!Change && BB->getFirstNonPHIOrDbg() == Ret) {
Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
TailCallsAreMarkedTail, ArgumentPHIs,
!CanTRETailMarkedCall);
+        // FoldReturnAndProcessPred may have emptied some BBs. Remember to
+        // erase them.
+ if (Change && BB->empty())
+ BBToErase.push_back(BB);
+
+ }
MadeChange |= Change;
}
}
+ for (auto BB: BBToErase)
+ BB->eraseFromParent();
+
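The deferral above is the standard pattern for containers being iterated elsewhere: elements that become dead mid-iteration are only remembered and erased afterwards, so the caller's iterator stays valid. A standalone C++ sketch of the same idea:

  #include <list>
  #include <vector>

  void process(std::list<int> &Blocks) {
    std::vector<std::list<int>::iterator> ToErase;
    for (auto It = Blocks.begin(); It != Blocks.end(); ++It)
      if (*It == 0)             // stand-in for "block became empty"
        ToErase.push_back(It);  // don't erase mid-iteration
    for (auto It : ToErase)
      Blocks.erase(It);         // safe now: nothing is iterating
  }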
// If we eliminated any tail recursions, it's possible that we inserted some
// silly PHI nodes which just merge an initial value (the incoming operand)
// with themselves. Check to see if we did and clean up our mess if so. This
@@ -450,7 +464,7 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
// being loaded from.
if (CI->mayWriteToMemory() ||
!isSafeToLoadUnconditionally(L->getPointerOperand(), L,
- L->getAlignment()))
+ L->getAlignment(), DL))
return false;
}
}
@@ -819,8 +833,20 @@ bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB,
if (CallInst *CI = FindTRECandidate(BI, CannotTailCallElimCallsMarkedTail)){
DEBUG(dbgs() << "FOLDING: " << *BB
<< "INTO UNCOND BRANCH PRED: " << *Pred);
- EliminateRecursiveTailCall(CI, FoldReturnIntoUncondBranch(Ret, BB, Pred),
- OldEntry, TailCallsAreMarkedTail, ArgumentPHIs,
+ ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
+
+ // Cleanup: if all predecessors of BB have been eliminated by
+ // FoldReturnIntoUncondBranch, we would like to delete it, but we
+    // cannot just nuke it, as it is being used as an iterator by our caller.
+ // Just empty it, and the caller will erase it when it is safe to do so.
+ // It is important to empty it, because the ret instruction in there is
+ // still using a value which EliminateRecursiveTailCall will attempt
+ // to remove.
+ if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
+ BB->getInstList().clear();
+
+ EliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
+ ArgumentPHIs,
CannotTailCallElimCallsMarkedTail);
++NumRetDuped;
Change = true;