summaryrefslogtreecommitdiff
path: root/lib/Transforms
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Transforms')
-rw-r--r--lib/Transforms/Coroutines/CoroFrame.cpp100
-rw-r--r--lib/Transforms/IPO/FunctionImport.cpp15
-rw-r--r--lib/Transforms/IPO/Inliner.cpp4
-rw-r--r--lib/Transforms/IPO/PartialInlining.cpp426
-rw-r--r--lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp6
-rw-r--r--lib/Transforms/InstCombine/InstCombineAddSub.cpp199
-rw-r--r--lib/Transforms/InstCombine/InstCombineAndOrXor.cpp109
-rw-r--r--lib/Transforms/InstCombine/InstCombineCalls.cpp8
-rw-r--r--lib/Transforms/InstCombine/InstCombineCasts.cpp25
-rw-r--r--lib/Transforms/InstCombine/InstCombineCompares.cpp39
-rw-r--r--lib/Transforms/InstCombine/InstCombineInternal.h30
-rw-r--r--lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp6
-rw-r--r--lib/Transforms/InstCombine/InstCombineMulDivRem.cpp20
-rw-r--r--lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp2
-rw-r--r--lib/Transforms/InstCombine/InstructionCombining.cpp26
-rw-r--r--lib/Transforms/Instrumentation/AddressSanitizer.cpp34
-rw-r--r--lib/Transforms/Instrumentation/DataFlowSanitizer.cpp8
-rw-r--r--lib/Transforms/Instrumentation/EfficiencySanitizer.cpp49
-rw-r--r--lib/Transforms/Instrumentation/MemorySanitizer.cpp7
-rw-r--r--lib/Transforms/Scalar/CorrelatedValuePropagation.cpp10
-rw-r--r--lib/Transforms/Scalar/LoopIdiomRecognize.cpp295
-rw-r--r--lib/Transforms/Scalar/NewGVN.cpp210
-rw-r--r--lib/Transforms/Scalar/SimpleLoopUnswitch.cpp161
-rw-r--r--lib/Transforms/Scalar/SpeculativeExecution.cpp43
-rw-r--r--lib/Transforms/Utils/BypassSlowDivision.cpp4
-rw-r--r--lib/Transforms/Utils/CloneFunction.cpp32
-rw-r--r--lib/Transforms/Utils/CloneModule.cpp2
-rw-r--r--lib/Transforms/Utils/EscapeEnumerator.cpp3
-rw-r--r--lib/Transforms/Utils/InlineFunction.cpp61
-rw-r--r--lib/Transforms/Utils/InstructionNamer.cpp13
-rw-r--r--lib/Transforms/Utils/Local.cpp106
-rw-r--r--lib/Transforms/Utils/LoopUtils.cpp201
-rw-r--r--lib/Transforms/Utils/ModuleUtils.cpp12
-rw-r--r--lib/Transforms/Utils/SimplifyLibCalls.cpp6
-rw-r--r--lib/Transforms/Utils/VNCoercion.cpp9
-rw-r--r--lib/Transforms/Utils/ValueMapper.cpp9
-rw-r--r--lib/Transforms/Vectorize/LoadStoreVectorizer.cpp2
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp241
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp112
39 files changed, 1845 insertions, 800 deletions
diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp
index 19e6789dfa74..4480220f2cd4 100644
--- a/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -177,7 +177,7 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
// consume. Note, that crossing coro.save also requires a spill, as any code
// between coro.save and coro.suspend may resume the coroutine and all of the
// state needs to be saved by that time.
- auto markSuspendBlock = [&](IntrinsicInst* BarrierInst) {
+ auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) {
BasicBlock *SuspendBlock = BarrierInst->getParent();
auto &B = getBlockData(SuspendBlock);
B.Suspend = true;
@@ -495,6 +495,78 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
return FramePtr;
}
+// Sets the unwind edge of an instruction to a particular successor.
+static void setUnwindEdgeTo(TerminatorInst *TI, BasicBlock *Succ) {
+ if (auto *II = dyn_cast<InvokeInst>(TI))
+ II->setUnwindDest(Succ);
+ else if (auto *CS = dyn_cast<CatchSwitchInst>(TI))
+ CS->setUnwindDest(Succ);
+ else if (auto *CR = dyn_cast<CleanupReturnInst>(TI))
+ CR->setUnwindDest(Succ);
+ else
+ llvm_unreachable("unexpected terminator instruction");
+}
+
+// Replaces all uses of OldPred with the NewPred block in all PHINodes in a
+// block.
+static void updatePhiNodes(BasicBlock *DestBB, BasicBlock *OldPred,
+ BasicBlock *NewPred,
+ PHINode *LandingPadReplacement) {
+ unsigned BBIdx = 0;
+ for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ // We manually update the LandingPadReplacement PHINode and it is the last
+ // PHI Node. So, if we find it, we are done.
+ if (LandingPadReplacement == PN)
+ break;
+
+ // Reuse the previous value of BBIdx if it lines up. In cases where we
+ // have multiple phi nodes with *lots* of predecessors, this is a speed
+ // win because we don't have to scan the PHI looking for TIBB. This
+ // happens because the BB list of PHI nodes are usually in the same
+ // order.
+ if (PN->getIncomingBlock(BBIdx) != OldPred)
+ BBIdx = PN->getBasicBlockIndex(OldPred);
+
+ assert(BBIdx != (unsigned)-1 && "Invalid PHI Index!");
+ PN->setIncomingBlock(BBIdx, NewPred);
+ }
+}
+
+// Uses SplitEdge unless the successor block is an EHPad, in which case do EH
+// specific handling.
+static BasicBlock *ehAwareSplitEdge(BasicBlock *BB, BasicBlock *Succ,
+ LandingPadInst *OriginalPad,
+ PHINode *LandingPadReplacement) {
+ auto *PadInst = Succ->getFirstNonPHI();
+ if (!LandingPadReplacement && !PadInst->isEHPad())
+ return SplitEdge(BB, Succ);
+
+ auto *NewBB = BasicBlock::Create(BB->getContext(), "", BB->getParent(), Succ);
+ setUnwindEdgeTo(BB->getTerminator(), NewBB);
+ updatePhiNodes(Succ, BB, NewBB, LandingPadReplacement);
+
+ if (LandingPadReplacement) {
+ auto *NewLP = OriginalPad->clone();
+ auto *Terminator = BranchInst::Create(Succ, NewBB);
+ NewLP->insertBefore(Terminator);
+ LandingPadReplacement->addIncoming(NewLP, NewBB);
+ return NewBB;
+ }
+ Value *ParentPad = nullptr;
+ if (auto *FuncletPad = dyn_cast<FuncletPadInst>(PadInst))
+ ParentPad = FuncletPad->getParentPad();
+ else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(PadInst))
+ ParentPad = CatchSwitch->getParentPad();
+ else
+ llvm_unreachable("handling for other EHPads not implemented yet");
+
+ auto *NewCleanupPad = CleanupPadInst::Create(ParentPad, {}, "", NewBB);
+ CleanupReturnInst::Create(NewCleanupPad, Succ, NewBB);
+ return NewBB;
+}
+
static void rewritePHIs(BasicBlock &BB) {
// For every incoming edge we will create a block holding all
// incoming values in a single PHI nodes.
@@ -502,7 +574,7 @@ static void rewritePHIs(BasicBlock &BB) {
// loop:
// %n.val = phi i32[%n, %entry], [%inc, %loop]
//
- // It will create:
+ // It will create:
//
// loop.from.entry:
// %n.loop.pre = phi i32 [%n, %entry]
@@ -517,9 +589,22 @@ static void rewritePHIs(BasicBlock &BB) {
// TODO: Simplify PHINodes in the basic block to remove duplicate
// predecessors.
+ LandingPadInst *LandingPad = nullptr;
+ PHINode *ReplPHI = nullptr;
+ if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) {
+ // ehAwareSplitEdge will clone the LandingPad in all the edge blocks.
+ // We replace the original landing pad with a PHINode that will collect the
+ // results from all of them.
+ ReplPHI = PHINode::Create(LandingPad->getType(), 1, "", LandingPad);
+ ReplPHI->takeName(LandingPad);
+ LandingPad->replaceAllUsesWith(ReplPHI);
+ // We will erase the original landing pad at the end of this function after
+ // ehAwareSplitEdge cloned it in the transition blocks.
+ }
+
SmallVector<BasicBlock *, 8> Preds(pred_begin(&BB), pred_end(&BB));
for (BasicBlock *Pred : Preds) {
- auto *IncomingBB = SplitEdge(Pred, &BB);
+ auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI);
IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName());
auto *PN = cast<PHINode>(&BB.front());
do {
@@ -531,7 +616,14 @@ static void rewritePHIs(BasicBlock &BB) {
InputV->addIncoming(V, Pred);
PN->setIncomingValue(Index, InputV);
PN = dyn_cast<PHINode>(PN->getNextNode());
- } while (PN);
+ } while (PN != ReplPHI); // ReplPHI is either null or the PHI that replaced
+ // the landing pad.
+ }
+
+ if (LandingPad) {
+ // Calls to ehAwareSplitEdge function cloned the original lading pad.
+ // No longer need it.
+ LandingPad->eraseFromParent();
}
}
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 7ed07d63c627..231487923fad 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -610,8 +610,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
return true;
// Lookup the linkage recorded in the summaries during global analysis.
- const auto &GS = DefinedGlobals.find(GV.getGUID());
- GlobalValue::LinkageTypes Linkage;
+ auto GS = DefinedGlobals.find(GV.getGUID());
if (GS == DefinedGlobals.end()) {
// Must have been promoted (possibly conservatively). Find original
// name so that we can access the correct summary and see if it can
@@ -623,7 +622,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
std::string OrigId = GlobalValue::getGlobalIdentifier(
OrigName, GlobalValue::InternalLinkage,
TheModule.getSourceFileName());
- const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
if (GS == DefinedGlobals.end()) {
// Also check the original non-promoted non-globalized name. In some
// cases a preempted weak value is linked in as a local copy because
@@ -631,15 +630,11 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
// In that case, since it was originally not a local value, it was
// recorded in the index using the original name.
// FIXME: This may not be needed once PR27866 is fixed.
- const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
assert(GS != DefinedGlobals.end());
- Linkage = GS->second->linkage();
- } else {
- Linkage = GS->second->linkage();
}
- } else
- Linkage = GS->second->linkage();
- return !GlobalValue::isLocalLinkage(Linkage);
+ }
+ return !GlobalValue::isLocalLinkage(GS->second->linkage());
};
// FIXME: See if we can just internalize directly here via linkage changes
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 6c83c99ae3be..673d3af0ab52 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -502,7 +502,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);
InlinedArrayAllocasTy InlinedArrayAllocas;
- InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache);
+ InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache, PSI);
// Now that we have all of the call sites, loop over them and inline them if
// it looks profitable to do so.
@@ -872,7 +872,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// Setup the data structure used to plumb customization into the
// `InlineFunction` routine.
InlineFunctionInfo IFI(
- /*cg=*/nullptr, &GetAssumptionCache,
+ /*cg=*/nullptr, &GetAssumptionCache, PSI,
&FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())),
&FAM.getResult<BlockFrequencyAnalysis>(Callee));
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 2db47b3b5622..8dff2fb3be8a 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
@@ -42,6 +43,11 @@ STATISTIC(NumPartialInlined,
static cl::opt<bool>
DisablePartialInlining("disable-partial-inlining", cl::init(false),
cl::Hidden, cl::desc("Disable partial ininling"));
+// This is an option used by testing:
+static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
+ cl::init(false), cl::ZeroOrMore,
+ cl::ReallyHidden,
+ cl::desc("Skip Cost Analysis"));
static cl::opt<unsigned> MaxNumInlineBlocks(
"max-num-inline-blocks", cl::init(5), cl::Hidden,
@@ -53,6 +59,15 @@ static cl::opt<int> MaxNumPartialInlining(
"max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
cl::desc("Max number of partial inlining. The default is unlimited"));
+// Used only when PGO or user annotated branch data is absent. It is
+// the least value that is used to weigh the outline region. If BFI
+// produces larger value, the BFI value will be used.
+static cl::opt<int>
+ OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Relative frequency of outline region to "
+ "the entry block"));
+
namespace {
struct FunctionOutliningInfo {
@@ -84,8 +99,6 @@ struct PartialInlinerImpl {
bool run(Module &M);
Function *unswitchFunction(Function *F);
- std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
-
private:
int NumPartialInlining = 0;
std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
@@ -93,11 +106,84 @@ private:
Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
ProfileSummaryInfo *PSI;
- bool shouldPartialInline(CallSite CS, OptimizationRemarkEmitter &ORE);
+ // Return the frequency of the OutlininingBB relative to F's entry point.
+ // The result is no larger than 1 and is represented using BP.
+ // (Note that the outlined region's 'head' block can only have incoming
+ // edges from the guarding entry blocks).
+ BranchProbability getOutliningCallBBRelativeFreq(Function *F,
+ FunctionOutliningInfo *OI,
+ Function *DuplicateFunction,
+ BlockFrequencyInfo *BFI,
+ BasicBlock *OutliningCallBB);
+
+ // Return true if the callee of CS should be partially inlined with
+ // profit.
+ bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI,
+ BlockFrequencyInfo *CalleeBFI,
+ BasicBlock *OutliningCallBB,
+ int OutliningCallOverhead,
+ OptimizationRemarkEmitter &ORE);
+
+ // Try to inline DuplicateFunction (cloned from F with call to
+ // the OutlinedFunction into its callers. Return true
+ // if there is any successful inlining.
+ bool tryPartialInline(Function *DuplicateFunction,
+ Function *F, /*orignal function */
+ FunctionOutliningInfo *OI, Function *OutlinedFunction,
+ BlockFrequencyInfo *CalleeBFI);
+
+ // Compute the mapping from use site of DuplicationFunction to the enclosing
+ // BB's profile count.
+ void computeCallsiteToProfCountMap(Function *DuplicateFunction,
+ DenseMap<User *, uint64_t> &SiteCountMap);
+
bool IsLimitReached() {
return (MaxNumPartialInlining != -1 &&
NumPartialInlining >= MaxNumPartialInlining);
}
+
+ CallSite getCallSite(User *U) {
+ CallSite CS;
+ if (CallInst *CI = dyn_cast<CallInst>(U))
+ CS = CallSite(CI);
+ else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
+ CS = CallSite(II);
+ else
+ llvm_unreachable("All uses must be calls");
+ return CS;
+ }
+
+ CallSite getOneCallSiteTo(Function *F) {
+ User *User = *F->user_begin();
+ return getCallSite(User);
+ }
+
+ std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
+ CallSite CS = getOneCallSiteTo(F);
+ DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+ BasicBlock *Block = CS.getParent();
+ return std::make_tuple(DLoc, Block);
+ }
+
+ // Returns the costs associated with function outlining:
+ // - The first value is the non-weighted runtime cost for making the call
+ // to the outlined function 'OutlinedFunction', including the addtional
+ // setup cost in the outlined function itself;
+ // - The second value is the estimated size of the new call sequence in
+ // basic block 'OutliningCallBB';
+ // - The third value is the estimated size of the original code from
+ // function 'F' that is extracted into the outlined function.
+ std::tuple<int, int, int>
+ computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
+ Function *OutlinedFunction,
+ BasicBlock *OutliningCallBB);
+ // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
+ // approximate both the size and runtime cost (Note that in the current
+ // inline cost analysis, there is no clear distinction there either).
+ int computeBBInlineCost(BasicBlock *BB);
+
+ std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
+
};
struct PartialInlinerLegacyPass : public ModulePass {
@@ -157,7 +243,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
return isa<ReturnInst>(TI);
};
- auto GetReturnBlock = [=](BasicBlock *Succ1, BasicBlock *Succ2) {
+ auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
if (IsReturnBlock(Succ1))
return std::make_tuple(Succ1, Succ2);
if (IsReturnBlock(Succ2))
@@ -167,7 +253,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
};
// Detect a triangular shape:
- auto GetCommonSucc = [=](BasicBlock *Succ1, BasicBlock *Succ2) {
+ auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
if (IsSuccessor(Succ1, Succ2))
return std::make_tuple(Succ1, Succ2);
if (IsSuccessor(Succ2, Succ1))
@@ -223,7 +309,8 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
// Do sanity check of the entries: threre should not
// be any successors (not in the entry set) other than
// {ReturnBlock, NonReturnBlock}
- assert(OutliningInfo->Entries[0] == &F->front());
+ assert(OutliningInfo->Entries[0] == &F->front() &&
+ "Function Entry must be the first in Entries vector");
DenseSet<BasicBlock *> Entries;
for (BasicBlock *E : OutliningInfo->Entries)
Entries.insert(E);
@@ -289,10 +376,54 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
return OutliningInfo;
}
-bool PartialInlinerImpl::shouldPartialInline(CallSite CS,
- OptimizationRemarkEmitter &ORE) {
- // TODO : more sharing with shouldInline in Inliner.cpp
+// Check if there is PGO data or user annoated branch data:
+static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
+ if (F->getEntryCount())
+ return true;
+ // Now check if any of the entry block has MD_prof data:
+ for (auto *E : OI->Entries) {
+ BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
+ if (!BR || BR->isUnconditional())
+ continue;
+ uint64_t T, F;
+ if (BR->extractProfMetadata(T, F))
+ return true;
+ }
+ return false;
+}
+
+BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
+ Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction,
+ BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) {
+
+ auto EntryFreq =
+ BFI->getBlockFreq(&DuplicateFunction->getEntryBlock());
+ auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB);
+
+ auto OutlineRegionRelFreq =
+ BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
+ EntryFreq.getFrequency());
+
+ if (hasProfileData(F, OI))
+ return OutlineRegionRelFreq;
+
+ // When profile data is not available, we need to be very
+ // conservative in estimating the overall savings. We need to make sure
+ // the outline region relative frequency is not below the threshold
+ // specified by the option.
+ OutlineRegionRelFreq = std::max(OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
+
+ return OutlineRegionRelFreq;
+}
+
+bool PartialInlinerImpl::shouldPartialInline(
+ CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI,
+ BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB,
+ int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {
using namespace ore;
+ if (SkipCostAnalysis)
+ return true;
+
Instruction *Call = CS.getInstruction();
Function *Callee = CS.getCalledFunction();
Function *Caller = CS.getCaller();
@@ -302,36 +433,170 @@ bool PartialInlinerImpl::shouldPartialInline(CallSite CS,
if (IC.isAlways()) {
ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
- << NV("Callee", Callee)
+ << NV("Callee", F)
<< " should always be fully inlined, not partially");
return false;
}
if (IC.isNever()) {
ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
- << NV("Callee", Callee) << " not partially inlined into "
+ << NV("Callee", F) << " not partially inlined into "
<< NV("Caller", Caller)
<< " because it should never be inlined (cost=never)");
return false;
}
if (!IC) {
- ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
- << NV("Callee", Callee) << " not partially inlined into "
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
+ << NV("Callee", F) << " not partially inlined into "
<< NV("Caller", Caller) << " because too costly to inline (cost="
<< NV("Cost", IC.getCost()) << ", threshold="
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
return false;
}
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+ // The savings of eliminating the call:
+ int NonWeightedSavings = getCallsiteCost(CS, DL);
+ BlockFrequency NormWeightedSavings(NonWeightedSavings);
+
+ auto RelativeFreq =
+ getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB);
+ auto NormWeightedRcost =
+ BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;
+
+ // Weighted saving is smaller than weighted cost, return false
+ if (NormWeightedSavings < NormWeightedRcost) {
+ ORE.emit(
+ OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
+ << NV("Callee", F) << " not partially inlined into "
+ << NV("Caller", Caller) << " runtime overhead (overhead="
+ << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
+ << ", savings="
+ << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
+ << " of making the outlined call is too high");
+
+ return false;
+ }
ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
- << NV("Callee", Callee) << " can be partially inlined into "
+ << NV("Callee", F) << " can be partially inlined into "
<< NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
<< " (threshold="
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
return true;
}
+// TODO: Ideally we should share Inliner's InlineCost Analysis code.
+// For now use a simplified version. The returned 'InlineCost' will be used
+// to esimate the size cost as well as runtime cost of the BB.
+int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
+ int InlineCost = 0;
+ const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ InlineCost += getCallsiteCost(CallSite(CI), DL);
+ continue;
+ }
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+ InlineCost += getCallsiteCost(CallSite(II), DL);
+ continue;
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
+ continue;
+ }
+ InlineCost += InlineConstants::InstrCost;
+ }
+ return InlineCost;
+}
+
+std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
+ Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
+ BasicBlock *OutliningCallBB) {
+ // First compute the cost of the outlined region 'OI' in the original
+ // function 'F':
+ int OutlinedRegionCost = 0;
+ for (BasicBlock &BB : *F) {
+ if (&BB != OI->ReturnBlock &&
+ // Assuming Entry set is small -- do a linear search here:
+ std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
+ OI->Entries.end()) {
+ OutlinedRegionCost += computeBBInlineCost(&BB);
+ }
+ }
+
+ // Now compute the cost of the call sequence to the outlined function
+ // 'OutlinedFunction' in BB 'OutliningCallBB':
+ int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);
+
+ // Now compute the cost of the extracted/outlined function itself:
+ int OutlinedFunctionCost = 0;
+ for (BasicBlock &BB : *OutlinedFunction) {
+ OutlinedFunctionCost += computeBBInlineCost(&BB);
+ }
+
+ assert(OutlinedFunctionCost >= OutlinedRegionCost &&
+ "Outlined function cost should be no less than the outlined region");
+ int OutliningRuntimeOverhead =
+ OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);
+
+ return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
+ OutlinedRegionCost);
+}
+
+// Create the callsite to profile count map which is
+// used to update the original function's entry count,
+// after the function is partially inlined into the callsite.
+void PartialInlinerImpl::computeCallsiteToProfCountMap(
+ Function *DuplicateFunction,
+ DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
+ std::vector<User *> Users(DuplicateFunction->user_begin(),
+ DuplicateFunction->user_end());
+ Function *CurrentCaller = nullptr;
+ BlockFrequencyInfo *CurrentCallerBFI = nullptr;
+
+ auto ComputeCurrBFI = [&,this](Function *Caller) {
+ // For the old pass manager:
+ if (!GetBFI) {
+ if (CurrentCallerBFI)
+ delete CurrentCallerBFI;
+ DominatorTree DT(*Caller);
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*Caller, LI);
+ CurrentCallerBFI = new BlockFrequencyInfo(*Caller, BPI, LI);
+ } else {
+ // New pass manager:
+ CurrentCallerBFI = &(*GetBFI)(*Caller);
+ }
+ };
+
+ for (User *User : Users) {
+ CallSite CS = getCallSite(User);
+ Function *Caller = CS.getCaller();
+ if (CurrentCaller != Caller) {
+ CurrentCaller = Caller;
+ ComputeCurrBFI(Caller);
+ } else {
+ assert(CurrentCallerBFI && "CallerBFI is not set");
+ }
+ BasicBlock *CallBB = CS.getInstruction()->getParent();
+ auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
+ if (Count)
+ CallSiteToProfCountMap[User] = *Count;
+ else
+ CallSiteToProfCountMap[User] = 0;
+ }
+ if (!GetBFI) {
+ if (CurrentCallerBFI)
+ delete CurrentCallerBFI;
+ }
+}
+
Function *PartialInlinerImpl::unswitchFunction(Function *F) {
if (F->hasAddressTaken())
@@ -347,21 +612,21 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
if (PSI->isFunctionEntryCold(F))
return nullptr;
- std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
- computeOutliningInfo(F);
+ if (F->user_begin() == F->user_end())
+ return nullptr;
+
+ std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
- if (!OutliningInfo)
+ if (!OI)
return nullptr;
// Clone the function, so that we can hack away on it.
ValueToValueMapTy VMap;
Function *DuplicateFunction = CloneFunction(F, VMap);
- BasicBlock *NewReturnBlock =
- cast<BasicBlock>(VMap[OutliningInfo->ReturnBlock]);
- BasicBlock *NewNonReturnBlock =
- cast<BasicBlock>(VMap[OutliningInfo->NonReturnBlock]);
+ BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+ BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
DenseSet<BasicBlock *> NewEntries;
- for (BasicBlock *BB : OutliningInfo->Entries) {
+ for (BasicBlock *BB : OI->Entries) {
NewEntries.insert(cast<BasicBlock>(VMap[BB]));
}
@@ -390,7 +655,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
BasicBlock *PreReturn = NewReturnBlock;
// only split block when necessary:
PHINode *FirstPhi = getFirstPHI(PreReturn);
- unsigned NumPredsFromEntries = OutliningInfo->ReturnBlockPreds.size();
+ unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();
if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {
NewReturnBlock = NewReturnBlock->splitBasicBlock(
@@ -408,14 +673,14 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
Ins = NewReturnBlock->getFirstNonPHI();
RetPhi->addIncoming(&*I, PreReturn);
- for (BasicBlock *E : OutliningInfo->ReturnBlockPreds) {
+ for (BasicBlock *E : OI->ReturnBlockPreds) {
BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
OldPhi->removeIncomingValue(NewE);
}
++I;
}
- for (auto E : OutliningInfo->ReturnBlockPreds) {
+ for (auto E : OI->ReturnBlockPreds) {
BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
}
@@ -423,7 +688,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
// Returns true if the block is to be partial inlined into the caller
// (i.e. not to be extracted to the out of line function)
- auto ToBeInlined = [=](BasicBlock *BB) {
+ auto ToBeInlined = [&](BasicBlock *BB) {
return BB == NewReturnBlock || NewEntries.count(BB);
};
// Gather up the blocks that we're going to extract.
@@ -443,50 +708,113 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);
// Extract the body of the if.
- Function *ExtractedFunction =
+ Function *OutlinedFunction =
CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
.extractCodeRegion();
- // Inline the top-level if test into all callers.
+ bool AnyInline =
+ tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI);
+
+ // Ditch the duplicate, since we're done with it, and rewrite all remaining
+ // users (function pointers, etc.) back to the original function.
+ DuplicateFunction->replaceAllUsesWith(F);
+ DuplicateFunction->eraseFromParent();
+
+ if (AnyInline)
+ return OutlinedFunction;
+
+ // Remove the function that is speculatively created:
+ if (OutlinedFunction)
+ OutlinedFunction->eraseFromParent();
+
+ return nullptr;
+}
+
+bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
+ Function *F,
+ FunctionOutliningInfo *OI,
+ Function *OutlinedFunction,
+ BlockFrequencyInfo *CalleeBFI) {
+ if (OutlinedFunction == nullptr)
+ return false;
+
+ int NonWeightedRcost;
+ int SizeCost;
+ int OutlinedRegionSizeCost;
+
+ auto OutliningCallBB =
+ getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent();
+
+ std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
+ computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB);
+
+ // The call sequence to the outlined function is larger than the original
+ // outlined region size, it does not increase the chances of inlining
+ // 'F' with outlining (The inliner usies the size increase to model the
+ // the cost of inlining a callee).
+ if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
+ OptimizationRemarkEmitter ORE(F);
+ DebugLoc DLoc;
+ BasicBlock *Block;
+ std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+ DLoc, Block)
+ << ore::NV("Function", F)
+ << " not partially inlined into callers (Original Size = "
+ << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
+ << ", Size of call sequence to outlined function = "
+ << ore::NV("NewSize", SizeCost) << ")");
+ return false;
+ }
+
+ assert(F->user_begin() == F->user_end() &&
+ "F's users should all be replaced!");
std::vector<User *> Users(DuplicateFunction->user_begin(),
DuplicateFunction->user_end());
+ DenseMap<User *, uint64_t> CallSiteToProfCountMap;
+ if (F->getEntryCount())
+ computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap);
+
+ auto CalleeEntryCount = F->getEntryCount();
+ uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+ bool AnyInline = false;
for (User *User : Users) {
- CallSite CS;
- if (CallInst *CI = dyn_cast<CallInst>(User))
- CS = CallSite(CI);
- else if (InvokeInst *II = dyn_cast<InvokeInst>(User))
- CS = CallSite(II);
- else
- llvm_unreachable("All uses must be calls");
+ CallSite CS = getCallSite(User);
if (IsLimitReached())
continue;
OptimizationRemarkEmitter ORE(CS.getCaller());
- if (!shouldPartialInline(CS, ORE))
+
+ if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB,
+ NonWeightedRcost, ORE))
continue;
- DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
- BasicBlock *Block = CS.getParent();
- ORE.emit(OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", DLoc, Block)
- << ore::NV("Callee", F) << " partially inlined into "
- << ore::NV("Caller", CS.getCaller()));
+ ORE.emit(
+ OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
+ << ore::NV("Callee", F) << " partially inlined into "
+ << ore::NV("Caller", CS.getCaller()));
- InlineFunctionInfo IFI(nullptr, GetAssumptionCache);
+ InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
InlineFunction(CS, IFI);
+
+ // Now update the entry count:
+ if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
+ uint64_t CallSiteCount = CallSiteToProfCountMap[User];
+ CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
+ }
+
+ AnyInline = true;
NumPartialInlining++;
- // update stats
+ // Update the stats
NumPartialInlined++;
}
- // Ditch the duplicate, since we're done with it, and rewrite all remaining
- // users (function pointers, etc.) back to the original function.
- DuplicateFunction->replaceAllUsesWith(F);
- DuplicateFunction->eraseFromParent();
-
+ if (AnyInline && CalleeEntryCount)
+ F->setEntryCount(CalleeEntryCountV);
- return ExtractedFunction;
+ return AnyInline;
}
bool PartialInlinerImpl::run(Module &M) {
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index d3a3c24ce7b4..659cb9df00a2 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -16,6 +16,7 @@
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Constants.h"
@@ -178,7 +179,7 @@ void filterModule(
else
GO = new GlobalVariable(
*M, GA->getValueType(), false, GlobalValue::ExternalLinkage,
- (Constant *)nullptr, "", (GlobalVariable *)nullptr,
+ nullptr, "", nullptr,
GA->getThreadLocalMode(), GA->getType()->getAddressSpace());
GO->takeName(GA);
GA->replaceAllUsesWith(GO);
@@ -320,7 +321,8 @@ void splitAndWriteThinLTOBitcode(
// FIXME: Try to re-use BSI and PFI from the original module here.
- ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, nullptr);
+ ProfileSummaryInfo PSI(M);
+ ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
SmallVector<char, 0> Buffer;
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 153a186d5ed4..0ca62b7ae40c 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -847,92 +847,6 @@ Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
}
-/// \brief Return true if we can prove that adding the two values of the
-/// knownbits will not overflow.
-/// Otherwise return false.
-static bool checkRippleForAdd(const KnownBits &LHSKnown,
- const KnownBits &RHSKnown) {
- // Addition of two 2's complement numbers having opposite signs will never
- // overflow.
- if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) ||
- (LHSKnown.isNonNegative() && RHSKnown.isNegative()))
- return true;
-
- // If either of the values is known to be non-negative, adding them can only
- // overflow if the second is also non-negative, so we can assume that.
- // Two non-negative numbers will only overflow if there is a carry to the
- // sign bit, so we can check if even when the values are as big as possible
- // there is no overflow to the sign bit.
- if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) {
- APInt MaxLHS = ~LHSKnown.Zero;
- MaxLHS.clearSignBit();
- APInt MaxRHS = ~RHSKnown.Zero;
- MaxRHS.clearSignBit();
- APInt Result = std::move(MaxLHS) + std::move(MaxRHS);
- return Result.isSignBitClear();
- }
-
- // If either of the values is known to be negative, adding them can only
- // overflow if the second is also negative, so we can assume that.
- // Two negative number will only overflow if there is no carry to the sign
- // bit, so we can check if even when the values are as small as possible
- // there is overflow to the sign bit.
- if (LHSKnown.isNegative() || RHSKnown.isNegative()) {
- APInt MinLHS = LHSKnown.One;
- MinLHS.clearSignBit();
- APInt MinRHS = RHSKnown.One;
- MinRHS.clearSignBit();
- APInt Result = std::move(MinLHS) + std::move(MinRHS);
- return Result.isSignBitSet();
- }
-
- // If we reached here it means that we know nothing about the sign bits.
- // In this case we can't know if there will be an overflow, since by
- // changing the sign bits any two values can be made to overflow.
- return false;
-}
-
-/// Return true if we can prove that:
-/// (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS))
-/// This basically requires proving that the add in the original type would not
-/// overflow to change the sign bit or have a carry out.
-bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS,
- Instruction &CxtI) {
- // There are different heuristics we can use for this. Here are some simple
- // ones.
-
- // If LHS and RHS each have at least two sign bits, the addition will look
- // like
- //
- // XX..... +
- // YY.....
- //
- // If the carry into the most significant position is 0, X and Y can't both
- // be 1 and therefore the carry out of the addition is also 0.
- //
- // If the carry into the most significant position is 1, X and Y can't both
- // be 0 and therefore the carry out of the addition is also 1.
- //
- // Since the carry into the most significant position is always equal to
- // the carry out of the addition, there is no signed overflow.
- if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 &&
- ComputeNumSignBits(RHS, 0, &CxtI) > 1)
- return true;
-
- unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
- KnownBits LHSKnown(BitWidth);
- computeKnownBits(LHS, LHSKnown, 0, &CxtI);
-
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(RHS, RHSKnown, 0, &CxtI);
-
- // Check if carry bit of addition will not cause overflow.
- if (checkRippleForAdd(LHSKnown, RHSKnown))
- return true;
-
- return false;
-}
-
/// \brief Return true if we can prove that:
/// (sub LHS, RHS) === (sub nsw LHS, RHS)
/// This basically requires proving that the add in the original type would not
@@ -968,13 +882,9 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS,
Instruction &CxtI) {
// If the LHS is negative and the RHS is non-negative, no unsigned wrap.
- bool LHSKnownNonNegative, LHSKnownNegative;
- bool RHSKnownNonNegative, RHSKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, /*Depth=*/0,
- &CxtI);
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, /*Depth=*/0,
- &CxtI);
- if (LHSKnownNegative && RHSKnownNonNegative)
+ KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
+ KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
+ if (LHSKnown.isNegative() && RHSKnown.isNonNegative())
return true;
return false;
@@ -1041,6 +951,57 @@ static Value *checkForNegativeOperand(BinaryOperator &I,
return nullptr;
}
+static Instruction *foldAddWithConstant(BinaryOperator &Add,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
+ const APInt *C;
+ if (!match(Op1, m_APInt(C)))
+ return nullptr;
+
+ if (C->isSignMask()) {
+ // If wrapping is not allowed, then the addition must set the sign bit:
+ // X + (signmask) --> X | signmask
+ if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap())
+ return BinaryOperator::CreateOr(Op0, Op1);
+
+ // If wrapping is allowed, then the addition flips the sign bit of LHS:
+ // X + (signmask) --> X ^ signmask
+ return BinaryOperator::CreateXor(Op0, Op1);
+ }
+
+ Value *X;
+ const APInt *C2;
+ Type *Ty = Add.getType();
+
+ // Is this add the last step in a convoluted sext?
+ // add(zext(xor i16 X, -32768), -32768) --> sext X
+ if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) &&
+ C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C)
+ return CastInst::Create(Instruction::SExt, X, Ty);
+
+ // (add (zext (add nuw X, C2)), C) --> (zext (add nuw X, C2 + C))
+ // FIXME: This should check hasOneUse to not increase the instruction count?
+ if (C->isNegative() &&
+ match(Op0, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2)))) &&
+ C->sge(-C2->sext(C->getBitWidth()))) {
+ Constant *NewC =
+ ConstantInt::get(X->getType(), *C2 + C->trunc(C2->getBitWidth()));
+ return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty);
+ }
+
+ // Shifts and add used to flip and mask off the low bit:
+ // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1
+ const APInt *C3;
+ if (*C == 1 && match(Op0, m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C2)),
+ m_APInt(C3)))) &&
+ C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) {
+ Value *NotX = Builder.CreateNot(X);
+ return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1));
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
bool Changed = SimplifyAssociativeOrCommutative(I);
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
@@ -1056,41 +1017,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
- const APInt *RHSC;
- if (match(RHS, m_APInt(RHSC))) {
- if (RHSC->isSignMask()) {
- // If wrapping is not allowed, then the addition must set the sign bit:
- // X + (signmask) --> X | signmask
- if (I.hasNoSignedWrap() || I.hasNoUnsignedWrap())
- return BinaryOperator::CreateOr(LHS, RHS);
-
- // If wrapping is allowed, then the addition flips the sign bit of LHS:
- // X + (signmask) --> X ^ signmask
- return BinaryOperator::CreateXor(LHS, RHS);
- }
-
- // Is this add the last step in a convoluted sext?
- Value *X;
- const APInt *C;
- if (match(LHS, m_ZExt(m_Xor(m_Value(X), m_APInt(C)))) &&
- C->isMinSignedValue() &&
- C->sext(LHS->getType()->getScalarSizeInBits()) == *RHSC) {
- // add(zext(xor i16 X, -32768), -32768) --> sext X
- return CastInst::Create(Instruction::SExt, X, LHS->getType());
- }
-
- if (RHSC->isNegative() &&
- match(LHS, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C)))) &&
- RHSC->sge(-C->sext(RHSC->getBitWidth()))) {
- // (add (zext (add nuw X, C)), Val) -> (zext (add nuw X, C+Val))
- Constant *NewC =
- ConstantInt::get(X->getType(), *C + RHSC->trunc(C->getBitWidth()));
- return new ZExtInst(Builder->CreateNUWAdd(X, NewC), I.getType());
- }
- }
+ if (Instruction *X = foldAddWithConstant(I, *Builder))
+ return X;
- // FIXME: Use the match above instead of dyn_cast to allow these transforms
- // for splat vectors.
+ // FIXME: This should be moved into the above helper function to allow these
+ // transforms for splat vectors.
if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
// zext(bool) + C -> bool ? C + 1 : C
if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS))
@@ -1285,8 +1216,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
Constant *CI =
ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
if (ConstantExpr::getZExt(CI, I.getType()) == RHSC &&
- computeOverflowForUnsignedAdd(LHSConv->getOperand(0), CI, &I) ==
- OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedAdd(LHSConv->getOperand(0), CI, I)) {
// Insert the new, smaller add.
Value *NewAdd =
Builder->CreateNUWAdd(LHSConv->getOperand(0), CI, "addconv");
@@ -1303,9 +1233,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (LHSConv->getOperand(0)->getType() ==
RHSConv->getOperand(0)->getType() &&
(LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
- computeOverflowForUnsignedAdd(LHSConv->getOperand(0),
- RHSConv->getOperand(0),
- &I) == OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedAdd(LHSConv->getOperand(0),
+ RHSConv->getOperand(0), I)) {
// Insert the new integer add.
Value *NewAdd = Builder->CreateNUWAdd(
LHSConv->getOperand(0), RHSConv->getOperand(0), "addconv");
@@ -1347,15 +1276,13 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
}
// TODO(jingyue): Consider WillNotOverflowSignedAdd and
- // WillNotOverflowUnsignedAdd to reduce the number of invocations of
+ // willNotOverflowUnsignedAdd to reduce the number of invocations of
// computeKnownBits.
if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
}
- if (!I.hasNoUnsignedWrap() &&
- computeOverflowForUnsignedAdd(LHS, RHS, &I) ==
- OverflowResult::NeverOverflows) {
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index b114801cc1c0..82dc88f1b3ad 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -23,21 +23,6 @@ using namespace PatternMatch;
#define DEBUG_TYPE "instcombine"
-static inline Value *dyn_castNotVal(Value *V) {
- // If this is not(not(x)) don't return that this is a not: we want the two
- // not's to be folded first.
- if (BinaryOperator::isNot(V)) {
- Value *Operand = BinaryOperator::getNotArgument(V);
- if (!IsFreeToInvert(Operand, Operand->hasOneUse()))
- return Operand;
- }
-
- // Constants can be considered to be not'ed values...
- if (ConstantInt *C = dyn_cast<ConstantInt>(V))
- return ConstantInt::get(C->getType(), ~C->getValue());
- return nullptr;
-}
-
/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into
/// a four bit mask.
static unsigned getFCmpCode(FCmpInst::Predicate CC) {
@@ -713,9 +698,8 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
}
// This simplification is only valid if the upper range is not negative.
- bool IsNegative, IsNotNegative;
- ComputeSignBit(RangeEnd, IsNotNegative, IsNegative, /*Depth=*/0, Cmp1);
- if (!IsNotNegative)
+ KnownBits Known = computeKnownBits(RangeEnd, /*Depth=*/0, Cmp1);
+ if (!Known.isNonNegative())
return nullptr;
if (Inverted)
@@ -1013,26 +997,22 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
/// (~A & ~B) == (~(A | B))
/// (~A | ~B) == (~(A & B))
static Instruction *matchDeMorgansLaws(BinaryOperator &I,
- InstCombiner::BuilderTy *Builder) {
+ InstCombiner::BuilderTy &Builder) {
auto Opcode = I.getOpcode();
assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
"Trying to match De Morgan's Laws with something other than and/or");
+
// Flip the logic operation.
- if (Opcode == Instruction::And)
- Opcode = Instruction::Or;
- else
- Opcode = Instruction::And;
+ Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
- // TODO: Use pattern matchers instead of dyn_cast.
- if (Value *Op0NotVal = dyn_castNotVal(Op0))
- if (Value *Op1NotVal = dyn_castNotVal(Op1))
- if (Op0->hasOneUse() && Op1->hasOneUse()) {
- Value *LogicOp = Builder->CreateBinOp(Opcode, Op0NotVal, Op1NotVal,
- I.getName() + ".demorgan");
- return BinaryOperator::CreateNot(LogicOp);
- }
+ Value *A, *B;
+ if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) &&
+ match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) &&
+ !IsFreeToInvert(A, A->hasOneUse()) &&
+ !IsFreeToInvert(B, B->hasOneUse())) {
+ Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan");
+ return BinaryOperator::CreateNot(AndOr);
+ }
return nullptr;
}
@@ -1376,7 +1356,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
return FoldedLogic;
- if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+ if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder))
return DeMorgan;
{
@@ -2005,18 +1985,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (Value *V = SimplifyBSwap(I))
return replaceInstUsesWith(I, V);
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
- ConstantInt *C1 = nullptr; Value *X = nullptr;
- // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
- if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) &&
- Op0->hasOneUse()) {
- Value *Or = Builder->CreateOr(X, RHS);
- Or->takeName(Op0);
- return BinaryOperator::CreateXor(Or,
- Builder->getInt(C1->getValue() & ~RHS->getValue()));
- }
- }
-
if (isa<Constant>(Op1))
if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
return FoldedLogic;
@@ -2167,7 +2135,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C));
- if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+ if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder))
return DeMorgan;
// Canonicalize xor to the RHS.
@@ -2399,27 +2367,44 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
// Is this a 'not' (~) fed by a binary operator?
- BinaryOperator *NotOp;
- if (match(&I, m_Not(m_BinOp(NotOp)))) {
- if (NotOp->getOpcode() == Instruction::And ||
- NotOp->getOpcode() == Instruction::Or) {
+ BinaryOperator *NotVal;
+ if (match(&I, m_Not(m_BinOp(NotVal)))) {
+ if (NotVal->getOpcode() == Instruction::And ||
+ NotVal->getOpcode() == Instruction::Or) {
// Apply DeMorgan's Law when inverts are free:
// ~(X & Y) --> (~X | ~Y)
// ~(X | Y) --> (~X & ~Y)
- if (IsFreeToInvert(NotOp->getOperand(0),
- NotOp->getOperand(0)->hasOneUse()) &&
- IsFreeToInvert(NotOp->getOperand(1),
- NotOp->getOperand(1)->hasOneUse())) {
- Value *NotX = Builder->CreateNot(NotOp->getOperand(0), "notlhs");
- Value *NotY = Builder->CreateNot(NotOp->getOperand(1), "notrhs");
- if (NotOp->getOpcode() == Instruction::And)
+ if (IsFreeToInvert(NotVal->getOperand(0),
+ NotVal->getOperand(0)->hasOneUse()) &&
+ IsFreeToInvert(NotVal->getOperand(1),
+ NotVal->getOperand(1)->hasOneUse())) {
+ Value *NotX = Builder->CreateNot(NotVal->getOperand(0), "notlhs");
+ Value *NotY = Builder->CreateNot(NotVal->getOperand(1), "notrhs");
+ if (NotVal->getOpcode() == Instruction::And)
return BinaryOperator::CreateOr(NotX, NotY);
return BinaryOperator::CreateAnd(NotX, NotY);
}
- } else if (NotOp->getOpcode() == Instruction::AShr) {
- // ~(~X >>s Y) --> (X >>s Y)
- if (Value *Op0NotVal = dyn_castNotVal(NotOp->getOperand(0)))
- return BinaryOperator::CreateAShr(Op0NotVal, NotOp->getOperand(1));
+ }
+
+ // ~(~X >>s Y) --> (X >>s Y)
+ if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y))))
+ return BinaryOperator::CreateAShr(X, Y);
+
+ // If we are inverting a right-shifted constant, we may be able to eliminate
+ // the 'not' by inverting the constant and using the opposite shift type.
+ // Canonicalization rules ensure that only a negative constant uses 'ashr',
+ // but we must check that in case that transform has not fired yet.
+ const APInt *C;
+ if (match(NotVal, m_AShr(m_APInt(C), m_Value(Y))) && C->isNegative()) {
+ // ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits)
+ Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ return BinaryOperator::CreateLShr(NotC, Y);
+ }
+
+ if (match(NotVal, m_LShr(m_APInt(C), m_Value(Y))) && C->isNonNegative()) {
+ // ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits)
+ Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ return BinaryOperator::CreateAShr(NotC, Y);
}
}
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 6989d67f0060..face7abcc95f 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1384,10 +1384,10 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
// Create a mask for bits above (ctlz) or below (cttz) the first known one.
bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
- unsigned PossibleZeros = IsTZ ? Known.One.countTrailingZeros()
- : Known.One.countLeadingZeros();
- unsigned DefiniteZeros = IsTZ ? Known.Zero.countTrailingOnes()
- : Known.Zero.countLeadingOnes();
+ unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
+ : Known.countMaxLeadingZeros();
+ unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
+ : Known.countMinLeadingZeros();
// If all bits above (ctlz) or below (cttz) the first known one are known
// zero, this value is constant.
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 312d9baae43a..001a4bcf16f3 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -559,6 +559,9 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
}
+ // FIXME: Maybe combine the next two transforms to handle the no cast case
+ // more efficiently. Support vector types. Cleanup code by using m_OneUse.
+
// Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.
Value *A = nullptr; ConstantInt *Cst = nullptr;
if (Src->hasOneUse() &&
@@ -588,15 +591,20 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// the sign bit of the original value; performing ashr instead of lshr
// generates bits of the same value as the sign bit.
if (Src->hasOneUse() &&
- match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst))) &&
- cast<Instruction>(Src)->getOperand(0)->hasOneUse()) {
+ match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst)))) {
+ Value *SExt = cast<Instruction>(Src)->getOperand(0);
+ const unsigned SExtSize = SExt->getType()->getPrimitiveSizeInBits();
const unsigned ASize = A->getType()->getPrimitiveSizeInBits();
+ unsigned ShiftAmt = Cst->getZExtValue();
// This optimization can be only performed when zero bits generated by
// the original lshr aren't pulled into the value after truncation, so we
- // can only shift by values smaller than the size of destination type (in
- // bits).
- if (Cst->getValue().ult(ASize)) {
- Value *Shift = Builder->CreateAShr(A, Cst->getZExtValue());
+ // can only shift by values no larger than the number of extension bits.
+ // FIXME: Instead of bailing when the shift is too large, use and to clear
+ // the extra bits.
+ if (SExt->hasOneUse() && ShiftAmt <= SExtSize - ASize) {
+ // If shifting by the size of the original value in bits or more, it is
+ // being filled with the sign bit, so shift by ASize-1 to avoid ub.
+ Value *Shift = Builder->CreateAShr(A, std::min(ShiftAmt, ASize-1));
Shift->takeName(Src);
return CastInst::CreateIntegerCast(Shift, CI.getType(), true);
}
@@ -1180,9 +1188,8 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
// If we know that the value being extended is positive, we can use a zext
// instead.
- bool KnownZero, KnownOne;
- ComputeSignBit(Src, KnownZero, KnownOne, 0, &CI);
- if (KnownZero) {
+ KnownBits Known = computeKnownBits(Src, 0, &CI);
+ if (Known.isNonNegative()) {
Value *ZExt = Builder->CreateZExt(Src, DestTy);
return replaceInstUsesWith(CI, ZExt);
}
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 34ce235b3fe2..60ed4057cedd 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -2785,6 +2785,9 @@ Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
}
/// Try to fold icmp (binop), X or icmp X, (binop).
+/// TODO: A large part of this logic is duplicated in InstSimplify's
+/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
+/// duplication.
Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -2794,7 +2797,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
if (!BO0 && !BO1)
return nullptr;
- CmpInst::Predicate Pred = I.getPredicate();
+ const CmpInst::Predicate Pred = I.getPredicate();
bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
if (BO0 && isa<OverflowingBinaryOperator>(BO0))
NoOp0WrapProblem =
@@ -3029,21 +3032,20 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
case Instruction::Sub:
case Instruction::Xor:
if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
// icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {
if (CI->getValue().isSignMask()) {
- ICmpInst::Predicate Pred =
+ ICmpInst::Predicate NewPred =
I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
}
if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) {
- ICmpInst::Predicate Pred =
+ ICmpInst::Predicate NewPred =
I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
- Pred = I.getSwappedPredicate(Pred);
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+ NewPred = I.getSwappedPredicate(NewPred);
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
}
}
break;
@@ -3062,21 +3064,27 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
AP.getBitWidth() - AP.countTrailingZeros()));
Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask);
Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask);
- return new ICmpInst(I.getPredicate(), And1, And2);
+ return new ICmpInst(Pred, And1, And2);
}
}
break;
+
case Instruction::UDiv:
case Instruction::LShr:
- if (I.isSigned())
+ if (I.isSigned() || !BO0->isExact() || !BO1->isExact())
break;
- LLVM_FALLTHROUGH;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
case Instruction::SDiv:
+ if (!I.isEquality() || !BO0->isExact() || !BO1->isExact())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
case Instruction::AShr:
if (!BO0->isExact() || !BO1->isExact())
break;
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
case Instruction::Shl: {
bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
@@ -3084,8 +3092,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
break;
if (!NSW && I.isSigned())
break;
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
}
}
}
@@ -3096,7 +3103,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
auto BitwiseAnd =
m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value()));
- if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) {
+ if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) {
auto *Zero = Constant::getNullValue(BO0->getType());
return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);
}
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 3be6419a129a..1424f61fe701 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -30,6 +30,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -388,10 +389,21 @@ private:
bool DoTransform = true);
Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
- bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI);
+ bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) {
+ return computeOverflowForSignedAdd(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ };
+ bool willNotOverflowUnsignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) {
+ return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ };
bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction &CxtI);
+ bool willNotOverflowUnsignedMul(Value *LHS, Value *RHS, Instruction &CxtI) {
+ return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ };
Value *EmitGEPOffset(User *GEP);
Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
@@ -492,7 +504,11 @@ public:
void computeKnownBits(Value *V, KnownBits &Known,
unsigned Depth, Instruction *CxtI) const {
- return llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
+ llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
+ }
+ KnownBits computeKnownBits(Value *V, unsigned Depth,
+ Instruction *CxtI) const {
+ return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT);
}
bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0,
@@ -503,11 +519,6 @@ public:
Instruction *CxtI = nullptr) const {
return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);
}
- void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
- unsigned Depth = 0, Instruction *CxtI = nullptr) const {
- return llvm::ComputeSignBit(V, KnownZero, KnownOne, DL, Depth, &AC, CxtI,
- &DT);
- }
OverflowResult computeOverflowForUnsignedMul(Value *LHS, Value *RHS,
const Instruction *CxtI) {
return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
@@ -516,6 +527,11 @@ public:
const Instruction *CxtI) {
return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
}
+ OverflowResult computeOverflowForSignedAdd(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
/// Maximum size of array considered when transforming.
uint64_t MaxArraySizeForCombine;
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 675553017838..a4d84ae81aa0 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -885,10 +885,8 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI,
// first non-zero index.
auto IsAllNonNegative = [&]() {
for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
- bool KnownNonNegative, KnownNegative;
- IC.ComputeSignBit(GEPI->getOperand(i), KnownNonNegative,
- KnownNegative, 0, MemI);
- if (KnownNonNegative)
+ KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
+ if (Known.isNonNegative())
continue;
return false;
}
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index face9d9237ae..2a35259f2103 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -162,11 +162,9 @@ bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS,
// product is exactly the minimum negative number.
// E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000
// For simplicity we just check if at least one side is not negative.
- bool LHSNonNegative, LHSNegative;
- bool RHSNonNegative, RHSNegative;
- ComputeSignBit(LHS, LHSNonNegative, LHSNegative, /*Depth=*/0, &CxtI);
- ComputeSignBit(RHS, RHSNonNegative, RHSNegative, /*Depth=*/0, &CxtI);
- if (LHSNonNegative || RHSNonNegative)
+ KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
+ KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
+ if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative())
return true;
}
return false;
@@ -422,8 +420,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
Constant *CI =
ConstantExpr::getTrunc(Op1C, Op0Conv->getOperand(0)->getType());
if (ConstantExpr::getZExt(CI, I.getType()) == Op1C &&
- computeOverflowForUnsignedMul(Op0Conv->getOperand(0), CI, &I) ==
- OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedMul(Op0Conv->getOperand(0), CI, I)) {
// Insert the new, smaller mul.
Value *NewMul =
Builder->CreateNUWMul(Op0Conv->getOperand(0), CI, "mulconv");
@@ -440,9 +437,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
if (Op0Conv->getOperand(0)->getType() ==
Op1Conv->getOperand(0)->getType() &&
(Op0Conv->hasOneUse() || Op1Conv->hasOneUse()) &&
- computeOverflowForUnsignedMul(Op0Conv->getOperand(0),
- Op1Conv->getOperand(0),
- &I) == OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedMul(Op0Conv->getOperand(0),
+ Op1Conv->getOperand(0), I)) {
// Insert the new integer mul.
Value *NewMul = Builder->CreateNUWMul(
Op0Conv->getOperand(0), Op1Conv->getOperand(0), "mulconv");
@@ -456,9 +452,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
I.setHasNoSignedWrap(true);
}
- if (!I.hasNoUnsignedWrap() &&
- computeOverflowForUnsignedMul(Op0, Op1, &I) ==
- OverflowResult::NeverOverflows) {
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 05b01774cd5e..4028a92771a4 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -611,7 +611,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1))
return I;
- unsigned Leaders = Known2.Zero.countLeadingOnes();
+ unsigned Leaders = Known2.countMinLeadingZeros();
Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
break;
}
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 1792cb585f87..65b1148cb03b 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2212,9 +2212,9 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
// Canonicalize fcmp_one -> fcmp_oeq
FCmpInst::Predicate FPred; Value *Y;
- if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)),
- TrueDest, FalseDest)) &&
- BI.getCondition()->hasOneUse())
+ if (match(&BI, m_Br(m_OneUse(m_FCmp(FPred, m_Value(X), m_Value(Y))),
+ TrueDest, FalseDest))) {
+ // TODO: Why are we only transforming these 3 predicates?
if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE ||
FPred == FCmpInst::FCMP_OGE) {
FCmpInst *Cond = cast<FCmpInst>(BI.getCondition());
@@ -2225,12 +2225,12 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
Worklist.Add(Cond);
return &BI;
}
+ }
// Canonicalize icmp_ne -> icmp_eq
ICmpInst::Predicate IPred;
- if (match(&BI, m_Br(m_ICmp(IPred, m_Value(X), m_Value(Y)),
- TrueDest, FalseDest)) &&
- BI.getCondition()->hasOneUse())
+ if (match(&BI, m_Br(m_OneUse(m_ICmp(IPred, m_Value(X), m_Value(Y))),
+ TrueDest, FalseDest))) {
if (IPred == ICmpInst::ICMP_NE || IPred == ICmpInst::ICMP_ULE ||
IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE ||
IPred == ICmpInst::ICMP_SGE) {
@@ -2241,6 +2241,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
Worklist.Add(Cond);
return &BI;
}
+ }
return nullptr;
}
@@ -2264,8 +2265,8 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
unsigned BitWidth = cast<IntegerType>(Cond->getType())->getBitWidth();
KnownBits Known(BitWidth);
computeKnownBits(Cond, Known, 0, &SI);
- unsigned LeadingKnownZeros = Known.Zero.countLeadingOnes();
- unsigned LeadingKnownOnes = Known.One.countLeadingOnes();
+ unsigned LeadingKnownZeros = Known.countMinLeadingZeros();
+ unsigned LeadingKnownOnes = Known.countMinLeadingOnes();
// Compute the number of leading bits we can ignore.
// TODO: A better way to determine this would use ComputeNumSignBits().
@@ -3141,7 +3142,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
// Lower dbg.declare intrinsics otherwise their value may be clobbered
// by instcombiner.
- bool DbgDeclaresChanged = LowerDbgDeclare(F);
+ bool MadeIRChange = LowerDbgDeclare(F);
// Iterate while there is work to do.
int Iteration = 0;
@@ -3150,18 +3151,17 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
<< F.getName() << "\n");
- bool Changed = prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
+ MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
InstCombiner IC(Worklist, &Builder, F.optForMinSize(), ExpensiveCombines,
AA, AC, TLI, DT, DL, LI);
IC.MaxArraySizeForCombine = MaxArraySize;
- Changed |= IC.run();
- if (!Changed)
+ if (!IC.run())
break;
}
- return DbgDeclaresChanged || Iteration > 1;
+ return MadeIRChange || Iteration > 1;
}
PreservedAnalyses InstCombinePass::run(Function &F,
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index b034ccc46933..7eea44d6aca0 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -613,7 +613,15 @@ public:
bool UseGlobalsGC = true)
: ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan),
Recover(Recover || ClRecover),
- UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC) {}
+ UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
+ // Not a typo: ClWithComdat is almost completely pointless without
+ // ClUseGlobalsGC (because then it only works on modules without
+ // globals, which are rare); it is a prerequisite for ClUseGlobalsGC;
+ // and both suffer from gold PR19002 for which UseGlobalsGC constructor
+ // argument is designed as workaround. Therefore, disable both
+ // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
+ // do globals-gc.
+ UseCtorComdat(UseGlobalsGC && ClWithComdat) {}
bool runOnModule(Module &M) override;
static char ID; // Pass identification, replacement for typeid
StringRef getPassName() const override { return "AddressSanitizerModule"; }
@@ -656,6 +664,7 @@ private:
bool CompileKernel;
bool Recover;
bool UseGlobalsGC;
+ bool UseCtorComdat;
Type *IntptrTy;
LLVMContext *C;
Triple TargetTriple;
@@ -1677,7 +1686,7 @@ AddressSanitizerModule::CreateMetadataGlobal(Module &M, Constant *Initializer,
: GlobalVariable::PrivateLinkage;
GlobalVariable *Metadata = new GlobalVariable(
M, Initializer->getType(), false, Linkage, Initializer,
- Twine("__asan_global_") + GlobalValue::getRealLinkageName(OriginalName));
+ Twine("__asan_global_") + GlobalValue::dropLLVMManglingEscape(OriginalName));
Metadata->setSection(getGlobalMetadataSection());
return Metadata;
}
@@ -1782,7 +1791,7 @@ void AddressSanitizerModule::InstrumentGlobalsMachO(
// On recent Mach-O platforms, use a structure which binds the liveness of
// the global variable to the metadata struct. Keep the list of "Liveness" GV
// created to be added to llvm.compiler.used
- StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy, nullptr);
+ StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy);
SmallVector<GlobalValue *, 16> LivenessGlobals(ExtendedGlobals.size());
for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
@@ -1793,9 +1802,9 @@ void AddressSanitizerModule::InstrumentGlobalsMachO(
// On recent Mach-O platforms, we emit the global metadata in a way that
// allows the linker to properly strip dead globals.
- auto LivenessBinder = ConstantStruct::get(
- LivenessTy, Initializer->getAggregateElement(0u),
- ConstantExpr::getPointerCast(Metadata, IntptrTy), nullptr);
+ auto LivenessBinder =
+ ConstantStruct::get(LivenessTy, Initializer->getAggregateElement(0u),
+ ConstantExpr::getPointerCast(Metadata, IntptrTy));
GlobalVariable *Liveness = new GlobalVariable(
M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder,
Twine("__asan_binder_") + G->getName());
@@ -1893,7 +1902,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
// We initialize an array of such structures and pass it to a run-time call.
StructType *GlobalStructTy =
StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
- IntptrTy, IntptrTy, IntptrTy, nullptr);
+ IntptrTy, IntptrTy, IntptrTy);
SmallVector<GlobalVariable *, 16> NewGlobals(n);
SmallVector<Constant *, 16> Initializers(n);
@@ -1929,10 +1938,9 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);
Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
- StructType *NewTy = StructType::get(Ty, RightRedZoneTy, nullptr);
- Constant *NewInitializer =
- ConstantStruct::get(NewTy, G->getInitializer(),
- Constant::getNullValue(RightRedZoneTy), nullptr);
+ StructType *NewTy = StructType::get(Ty, RightRedZoneTy);
+ Constant *NewInitializer = ConstantStruct::get(
+ NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy));
// Create a new global variable with enough space for a redzone.
GlobalValue::LinkageTypes Linkage = G->getLinkage();
@@ -2013,7 +2021,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
ConstantExpr::getPointerCast(Name, IntptrTy),
ConstantExpr::getPointerCast(ModuleName, IntptrTy),
ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc,
- ConstantExpr::getPointerCast(ODRIndicator, IntptrTy), nullptr);
+ ConstantExpr::getPointerCast(ODRIndicator, IntptrTy));
if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
@@ -2073,7 +2081,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
// Put the constructor and destructor in comdat if both
// (1) global instrumentation is not TU-specific
// (2) target is ELF.
- if (ClWithComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
+ if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName));
appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority,
AsanCtorFunction);
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 8786781933ea..e2e3cbdbc295 100644
--- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -388,7 +388,7 @@ FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
ArgTypes.push_back(ShadowPtrTy);
Type *RetType = T->getReturnType();
if (!RetType->isVoidTy())
- RetType = StructType::get(RetType, ShadowTy, (Type *)nullptr);
+ RetType = StructType::get(RetType, ShadowTy);
return FunctionType::get(RetType, ArgTypes, T->isVarArg());
}
@@ -476,16 +476,14 @@ bool DataFlowSanitizer::doInitialization(Module &M) {
GetArgTLS = ConstantExpr::getIntToPtr(
ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)),
PointerType::getUnqual(
- FunctionType::get(PointerType::getUnqual(ArgTLSTy),
- (Type *)nullptr)));
+ FunctionType::get(PointerType::getUnqual(ArgTLSTy), false)));
}
if (GetRetvalTLSPtr) {
RetvalTLS = nullptr;
GetRetvalTLS = ConstantExpr::getIntToPtr(
ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)),
PointerType::getUnqual(
- FunctionType::get(PointerType::getUnqual(ShadowTy),
- (Type *)nullptr)));
+ FunctionType::get(PointerType::getUnqual(ShadowTy), false)));
}
ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
index 7dea1dee756a..e89384c559fe 100644
--- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -398,8 +398,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
// u64 *ArrayCounter;
// };
auto *StructInfoTy =
- StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy,
- Int8PtrPtrTy, Int64PtrTy, Int64PtrTy, nullptr);
+ StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy,
+ Int8PtrPtrTy, Int64PtrTy, Int64PtrTy);
auto *StructInfoPtrTy = StructInfoTy->getPointerTo();
// This structure should be kept consistent with the CacheFragInfo struct
// in the runtime library.
@@ -408,8 +408,7 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
// u32 NumStructs;
// StructInfo *Structs;
// };
- auto *CacheFragInfoTy =
- StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy, nullptr);
+ auto *CacheFragInfoTy = StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy);
std::vector<StructType *> Vec = M.getIdentifiedStructTypes();
unsigned NumStructs = 0;
@@ -457,24 +456,23 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
ArrayCounterIdx[0] = ConstantInt::get(Int32Ty, 0);
ArrayCounterIdx[1] = ConstantInt::get(Int32Ty,
getArrayCounterIdx(StructTy));
- Initializers.push_back(
- ConstantStruct::get(
- StructInfoTy,
- ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy),
- ConstantInt::get(Int32Ty,
- DL.getStructLayout(StructTy)->getSizeInBytes()),
- ConstantInt::get(Int32Ty, StructTy->getNumElements()),
- Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy) :
- ConstantExpr::getPointerCast(Offset, Int32PtrTy),
- Size == nullptr ? ConstantPointerNull::get(Int32PtrTy) :
- ConstantExpr::getPointerCast(Size, Int32PtrTy),
- TypeName == nullptr ? ConstantPointerNull::get(Int8PtrPtrTy) :
- ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy),
- ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
- FieldCounterIdx),
- ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
- ArrayCounterIdx),
- nullptr));
+ Initializers.push_back(ConstantStruct::get(
+ StructInfoTy,
+ ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy),
+ ConstantInt::get(Int32Ty,
+ DL.getStructLayout(StructTy)->getSizeInBytes()),
+ ConstantInt::get(Int32Ty, StructTy->getNumElements()),
+ Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy)
+ : ConstantExpr::getPointerCast(Offset, Int32PtrTy),
+ Size == nullptr ? ConstantPointerNull::get(Int32PtrTy)
+ : ConstantExpr::getPointerCast(Size, Int32PtrTy),
+ TypeName == nullptr
+ ? ConstantPointerNull::get(Int8PtrPtrTy)
+ : ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy),
+ ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
+ FieldCounterIdx),
+ ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
+ ArrayCounterIdx)));
}
// Structs.
Constant *StructInfo;
@@ -491,11 +489,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
auto *CacheFragInfoGV = new GlobalVariable(
M, CacheFragInfoTy, true, GlobalVariable::InternalLinkage,
- ConstantStruct::get(CacheFragInfoTy,
- UnitName,
- ConstantInt::get(Int32Ty, NumStructs),
- StructInfo,
- nullptr));
+ ConstantStruct::get(CacheFragInfoTy, UnitName,
+ ConstantInt::get(Int32Ty, NumStructs), StructInfo));
return CacheFragInfoGV;
}
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 15333a5317dd..ff753c20a94a 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -1576,13 +1576,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
bool Signed = false) {
Type *srcTy = V->getType();
+ size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
+ size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
+ if (srcSizeInBits > 1 && dstSizeInBits == 1)
+ return IRB.CreateICmpNE(V, getCleanShadow(V));
+
if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
return IRB.CreateIntCast(V, dstTy, Signed);
if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
dstTy->getVectorNumElements() == srcTy->getVectorNumElements())
return IRB.CreateIntCast(V, dstTy, Signed);
- size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
- size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
Value *V2 =
IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 3f1a77b49a44..ee493a8ec7e1 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -442,9 +442,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
bool Changed = false;
if (!NUW) {
- ConstantRange NUWRange =
- LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
- OBO::NoUnsignedWrap);
+ ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoUnsignedWrap);
if (!NUWRange.isEmptySet()) {
bool NewNUW = NUWRange.contains(LazyRRange());
AddOp->setHasNoUnsignedWrap(NewNUW);
@@ -452,9 +451,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
}
}
if (!NSW) {
- ConstantRange NSWRange =
- LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
- OBO::NoSignedWrap);
+ ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoSignedWrap);
if (!NSWRange.isEmptySet()) {
bool NewNSW = NSWRange.contains(LazyRRange());
AddOp->setHasNoSignedWrap(NewNSW);
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 48d5ae88cda9..6693a26e8890 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -144,6 +144,10 @@ private:
bool recognizePopcount();
void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
PHINode *CntPhi, Value *Var);
+ bool recognizeAndInsertCTLZ();
+ void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
+ PHINode *CntPhi, Value *Var, const DebugLoc DL,
+ bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
/// @}
};
@@ -994,7 +998,7 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
}
bool LoopIdiomRecognize::runOnNoncountableLoop() {
- return recognizePopcount();
+ return recognizePopcount() || recognizeAndInsertCTLZ();
}
/// Check if the given conditional branch is based on the comparison between
@@ -1159,6 +1163,167 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
return true;
}
+/// Return true if the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
+/// or nullptr if there is no such.
+/// 2) \p CntPhi is set to the corresponding phi node
+/// or nullptr if there is no such.
+/// 3) \p Var is set to the value whose CTLZ could be used.
+/// 4) \p DefX is set to the instruction calculating Loop exit condition.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+/// if (x0 == 0)
+/// goto loop-exit // the precondition of the loop
+/// cnt0 = init-val;
+/// do {
+/// x = phi (x0, x.next); //PhiX
+/// cnt = phi(cnt0, cnt.next);
+///
+/// cnt.next = cnt + 1;
+/// ...
+/// x.next = x >> 1; // DefX
+/// ...
+/// } while(x.next != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
+ Instruction *&CntInst, PHINode *&CntPhi,
+ Instruction *&DefX) {
+ BasicBlock *LoopEntry;
+ Value *VarX = nullptr;
+
+ DefX = nullptr;
+ PhiX = nullptr;
+ CntInst = nullptr;
+ CntPhi = nullptr;
+ LoopEntry = *(CurLoop->block_begin());
+
+ // step 1: Check if the loop-back branch is in desirable form.
+ if (Value *T = matchCondition(
+ dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+ DefX = dyn_cast<Instruction>(T);
+ else
+ return false;
+
+ // step 2: detect instructions corresponding to "x.next = x >> 1"
+ if (!DefX || DefX->getOpcode() != Instruction::AShr)
+ return false;
+ if (ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)))
+ if (!Shft || !Shft->isOne())
+ return false;
+ VarX = DefX->getOperand(0);
+
+ // step 3: Check the recurrence of variable X
+ PhiX = dyn_cast<PHINode>(VarX);
+ if (!PhiX || (PhiX->getOperand(0) != DefX && PhiX->getOperand(1) != DefX))
+ return false;
+
+ // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
+ // TODO: We can skip the step. If loop trip count is known (CTLZ),
+ // then all uses of "cnt.next" could be optimized to the trip count
+ // plus "cnt0". Currently it is not optimized.
+ // This step could be used to detect POPCNT instruction:
+ // cnt.next = cnt + (x.next & 1)
+ for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+ IterE = LoopEntry->end();
+ Iter != IterE; Iter++) {
+ Instruction *Inst = &*Iter;
+ if (Inst->getOpcode() != Instruction::Add)
+ continue;
+
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ if (!Inc || !Inc->isOne())
+ continue;
+
+ PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+ if (!Phi || Phi->getParent() != LoopEntry)
+ continue;
+
+ CntInst = Inst;
+ CntPhi = Phi;
+ break;
+ }
+ if (!CntInst)
+ return false;
+
+ return true;
+}
+
+/// Recognize CTLZ idiom in a non-countable loop and convert the loop
+/// to countable (with CTLZ trip count).
+/// If CTLZ inserted as a new trip count returns true; otherwise, returns false.
+bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
+ // Give up if the loop has multiple blocks or multiple backedges.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+ return false;
+
+ Instruction *CntInst, *DefX;
+ PHINode *CntPhi, *PhiX;
+ if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
+ return false;
+
+ bool IsCntPhiUsedOutsideLoop = false;
+ for (User *U : CntPhi->users())
+ if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ IsCntPhiUsedOutsideLoop = true;
+ break;
+ }
+ bool IsCntInstUsedOutsideLoop = false;
+ for (User *U : CntInst->users())
+ if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ IsCntInstUsedOutsideLoop = true;
+ break;
+ }
+ // If both CntInst and CntPhi are used outside the loop the profitability
+ // is questionable.
+ if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
+ return false;
+
+ // For some CPUs result of CTLZ(X) intrinsic is undefined
+ // when X is 0. If we can not guarantee X != 0, we need to check this
+ // when expand.
+ bool ZeroCheck = false;
+ // It is safe to assume Preheader exist as it was checked in
+ // parent function RunOnLoop.
+ BasicBlock *PH = CurLoop->getLoopPreheader();
+ Value *InitX = PhiX->getIncomingValueForBlock(PH);
+ // If we check X != 0 before entering the loop we don't need a zero
+ // check in CTLZ intrinsic.
+ if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
+ if (BranchInst *PreCondBr =
+ dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
+ if (matchCondition(PreCondBr, PH) == InitX)
+ ZeroCheck = true;
+ }
+
+ // Check if CTLZ intrinsic is profitable. Assume it is always profitable
+ // if we delete the loop (the loop has only 6 instructions):
+ // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+ // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+ // %shr = ashr %n.addr.0, 1
+ // %tobool = icmp eq %shr, 0
+ // %inc = add nsw %i.0, 1
+ // br i1 %tobool
+
+ IRBuilder<> Builder(PH->getTerminator());
+ SmallVector<const Value *, 2> Ops =
+ {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
+ ArrayRef<const Value *> Args(Ops);
+ if (CurLoop->getHeader()->size() != 6 &&
+ TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
+ TargetTransformInfo::TCC_Basic)
+ return false;
+
+ const DebugLoc DL = DefX->getDebugLoc();
+ transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
+ IsCntPhiUsedOutsideLoop);
+ return true;
+}
+
/// Recognizes a population count idiom in a non-countable loop.
///
/// If detected, transforms the relevant code to issue the popcount intrinsic
@@ -1222,6 +1387,134 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
return CI;
}
+static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+ const DebugLoc &DL, bool ZeroCheck) {
+ Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
+ Type *Tys[] = {Val->getType()};
+
+ Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+ Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
+ CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+ CI->setDebugLoc(DL);
+
+ return CI;
+}
+
+/// Transform the following loop:
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+// LOOP_BODY
+/// Br: loop if (DefX != 0)
+/// Use(CntPhi) or Use(CntInst)
+///
+/// Into:
+/// If CntPhi used outside the loop:
+/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
+/// Count = CountPrev + 1
+/// else
+/// Count = BitWidth(InitX) - CTLZ(InitX)
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// PhiCount = PHI [Count, Dec]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+/// Dec = PhiCount - 1
+/// LOOP_BODY
+/// Br: loop if (Dec != 0)
+/// Use(CountPrev + Cnt0) // Use(CntPhi)
+/// or
+/// Use(Count + Cnt0) // Use(CntInst)
+///
+/// If LOOP_BODY is empty the loop will be deleted.
+/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
+void LoopIdiomRecognize::transformLoopToCountable(
+ BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
+ const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
+ BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator());
+
+ // Step 1: Insert the CTLZ instruction at the end of the preheader block
+ // Count = BitWidth - CTLZ(InitX);
+ // If there are uses of CntPhi create:
+ // CountPrev = BitWidth - CTLZ(InitX >> 1);
+ IRBuilder<> Builder(PreheaderBr);
+ Builder.SetCurrentDebugLocation(DL);
+ Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
+
+ if (IsCntPhiUsedOutsideLoop)
+ InitXNext = Builder.CreateAShr(InitX,
+ ConstantInt::get(InitX->getType(), 1));
+ else
+ InitXNext = InitX;
+ CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
+ Count = Builder.CreateSub(
+ ConstantInt::get(CTLZ->getType(),
+ CTLZ->getType()->getIntegerBitWidth()),
+ CTLZ);
+ if (IsCntPhiUsedOutsideLoop) {
+ CountPrev = Count;
+ Count = Builder.CreateAdd(
+ CountPrev,
+ ConstantInt::get(CountPrev->getType(), 1));
+ }
+ if (IsCntPhiUsedOutsideLoop)
+ NewCount = Builder.CreateZExtOrTrunc(CountPrev,
+ cast<IntegerType>(CntInst->getType()));
+ else
+ NewCount = Builder.CreateZExtOrTrunc(Count,
+ cast<IntegerType>(CntInst->getType()));
+
+ // If the CTLZ counter's initial value is not zero, insert Add Inst.
+ Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
+ ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+ if (!InitConst || !InitConst->isZero())
+ NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+
+ // Step 2: Insert new IV and loop condition:
+ // loop:
+ // ...
+ // PhiCount = PHI [Count, Dec]
+ // ...
+ // Dec = PhiCount - 1
+ // ...
+ // Br: loop if (Dec != 0)
+ BasicBlock *Body = *(CurLoop->block_begin());
+ auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+ Type *Ty = Count->getType();
+
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+ Builder.SetInsertPoint(LbCond);
+ Instruction *TcDec = cast<Instruction>(
+ Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+ "tcdec", false, true));
+
+ TcPhi->addIncoming(Count, Preheader);
+ TcPhi->addIncoming(TcDec, Body);
+
+ CmpInst::Predicate Pred =
+ (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
+ LbCond->setPredicate(Pred);
+ LbCond->setOperand(0, TcDec);
+ LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+
+ // Step 3: All the references to the original counter outside
+ // the loop are replaced with the NewCount -- the value returned from
+ // __builtin_ctlz(x).
+ if (IsCntPhiUsedOutsideLoop)
+ CntPhi->replaceUsesOutsideBlock(NewCount, Body);
+ else
+ CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+ // step 4: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+ SE->forgetLoop(CurLoop);
+}
+
void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
Instruction *CntInst,
PHINode *CntPhi, Value *Var) {
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 3c9850b156ac..5e0a705782ea 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -283,7 +283,6 @@ public:
// Forward propagation info
const Expression *getDefiningExpr() const { return DefiningExpr; }
- void setDefiningExpr(const Expression *E) { DefiningExpr = E; }
// Value member set
bool empty() const { return Members.empty(); }
@@ -317,6 +316,9 @@ public:
--StoreCount;
}
+ // True if this class has no memory members.
+ bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); }
+
// Return true if two congruence classes are equivalent to each other. This
// means
// that every field but the ID number and the dead field are equivalent.
@@ -401,9 +403,12 @@ class NewGVN {
MemorySSAWalker *MSSAWalker;
const DataLayout &DL;
std::unique_ptr<PredicateInfo> PredInfo;
- BumpPtrAllocator ExpressionAllocator;
- ArrayRecycler<Value *> ArgRecycler;
- TarjanSCC SCCFinder;
+
+ // These are the only two things the create* functions should have
+ // side-effects on due to allocating memory.
+ mutable BumpPtrAllocator ExpressionAllocator;
+ mutable ArrayRecycler<Value *> ArgRecycler;
+ mutable TarjanSCC SCCFinder;
const SimplifyQuery SQ;
// Number of function arguments, used by ranking
@@ -430,11 +435,12 @@ class NewGVN {
// In order to correctly ensure propagation, we must keep track of what
// comparisons we used, so that when the values of the comparisons change, we
// propagate the information to the places we used the comparison.
- DenseMap<const Value *, SmallPtrSet<Instruction *, 2>> PredicateToUsers;
- // Mapping from MemoryAccess we used to the MemoryAccess we used it with. Has
+ mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>>
+ PredicateToUsers;
// the same reasoning as PredicateToUsers. When we skip MemoryAccesses for
// stores, we no longer can rely solely on the def-use chains of MemorySSA.
- DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>> MemoryToUsers;
+ mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>>
+ MemoryToUsers;
// A table storing which memorydefs/phis represent a memory state provably
// equivalent to another memory state.
@@ -457,7 +463,7 @@ class NewGVN {
DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
enum PhiCycleState { PCS_Unknown, PCS_CycleFree, PCS_Cycle };
- DenseMap<const PHINode *, PhiCycleState> PhiCycleState;
+ mutable DenseMap<const PHINode *, PhiCycleState> PhiCycleState;
// Expression to class mapping.
using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
ExpressionClassMap ExpressionToClass;
@@ -511,21 +517,24 @@ public:
private:
// Expression handling.
- const Expression *createExpression(Instruction *);
- const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *);
- PHIExpression *createPHIExpression(Instruction *, bool &HasBackedge,
- bool &AllConstant);
- const VariableExpression *createVariableExpression(Value *);
- const ConstantExpression *createConstantExpression(Constant *);
- const Expression *createVariableOrConstant(Value *V);
- const UnknownExpression *createUnknownExpression(Instruction *);
+ const Expression *createExpression(Instruction *) const;
+ const Expression *createBinaryExpression(unsigned, Type *, Value *,
+ Value *) const;
+ PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge,
+ bool &AllConstant) const;
+ const VariableExpression *createVariableExpression(Value *) const;
+ const ConstantExpression *createConstantExpression(Constant *) const;
+ const Expression *createVariableOrConstant(Value *V) const;
+ const UnknownExpression *createUnknownExpression(Instruction *) const;
const StoreExpression *createStoreExpression(StoreInst *,
- const MemoryAccess *);
+ const MemoryAccess *) const;
LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
- const MemoryAccess *);
- const CallExpression *createCallExpression(CallInst *, const MemoryAccess *);
- const AggregateValueExpression *createAggregateValueExpression(Instruction *);
- bool setBasicExpressionInfo(Instruction *, BasicExpression *);
+ const MemoryAccess *) const;
+ const CallExpression *createCallExpression(CallInst *,
+ const MemoryAccess *) const;
+ const AggregateValueExpression *
+ createAggregateValueExpression(Instruction *) const;
+ bool setBasicExpressionInfo(Instruction *, BasicExpression *) const;
// Congruence class handling.
CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
@@ -560,17 +569,18 @@ private:
// Symbolic evaluation.
const Expression *checkSimplificationResults(Expression *, Instruction *,
- Value *);
- const Expression *performSymbolicEvaluation(Value *);
+ Value *) const;
+ const Expression *performSymbolicEvaluation(Value *) const;
const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
- Instruction *, MemoryAccess *);
- const Expression *performSymbolicLoadEvaluation(Instruction *);
- const Expression *performSymbolicStoreEvaluation(Instruction *);
- const Expression *performSymbolicCallEvaluation(Instruction *);
- const Expression *performSymbolicPHIEvaluation(Instruction *);
- const Expression *performSymbolicAggrValueEvaluation(Instruction *);
- const Expression *performSymbolicCmpEvaluation(Instruction *);
- const Expression *performSymbolicPredicateInfoEvaluation(Instruction *);
+ Instruction *,
+ MemoryAccess *) const;
+ const Expression *performSymbolicLoadEvaluation(Instruction *) const;
+ const Expression *performSymbolicStoreEvaluation(Instruction *) const;
+ const Expression *performSymbolicCallEvaluation(Instruction *) const;
+ const Expression *performSymbolicPHIEvaluation(Instruction *) const;
+ const Expression *performSymbolicAggrValueEvaluation(Instruction *) const;
+ const Expression *performSymbolicCmpEvaluation(Instruction *) const;
+ const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const;
// Congruence finding.
bool someEquivalentDominates(const Instruction *, const Instruction *) const;
@@ -620,8 +630,8 @@ private:
void markPredicateUsersTouched(Instruction *);
void markValueLeaderChangeTouched(CongruenceClass *CC);
void markMemoryLeaderChangeTouched(CongruenceClass *CC);
- void addPredicateUsers(const PredicateBase *, Instruction *);
- void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U);
+ void addPredicateUsers(const PredicateBase *, Instruction *) const;
+ void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const;
// Main loop of value numbering
void iterateTouchedInstructions();
@@ -634,7 +644,7 @@ private:
void verifyIterationSettled(Function &F);
bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const;
BasicBlock *getBlockForValue(Value *V) const;
- void deleteExpression(const Expression *E);
+ void deleteExpression(const Expression *E) const;
unsigned InstrToDFSNum(const Value *V) const {
assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
return InstrDFS.lookup(V);
@@ -654,7 +664,7 @@ private:
? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
: InstrDFS.lookup(MA);
}
- bool isCycleFree(const PHINode *PN);
+ bool isCycleFree(const PHINode *PN) const;
template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
// Debug counter info. When verifying, we have to reset the value numbering
// debug counter to the same state it started in to get the same results.
@@ -702,7 +712,7 @@ BasicBlock *NewGVN::getBlockForValue(Value *V) const {
// Delete a definitely dead expression, so it can be reused by the expression
// allocator. Some of these are not in creation functions, so we have to accept
// const versions.
-void NewGVN::deleteExpression(const Expression *E) {
+void NewGVN::deleteExpression(const Expression *E) const {
assert(isa<BasicExpression>(E));
auto *BE = cast<BasicExpression>(E);
const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
@@ -710,7 +720,7 @@ void NewGVN::deleteExpression(const Expression *E) {
}
PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
- bool &AllConstant) {
+ bool &AllConstant) const {
BasicBlock *PHIBlock = I->getParent();
auto *PN = cast<PHINode>(I);
auto *E =
@@ -722,30 +732,46 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
unsigned PHIRPO = RPOOrdering.lookup(DT->getNode(PHIBlock));
+ // NewGVN assumes the operands of a PHI node are in a consistent order across
+ // PHIs. LLVM doesn't seem to always guarantee this. While we need to fix
+ // this in LLVM at some point we don't want GVN to find wrong congruences.
+ // Therefore, here we sort uses in predecessor order.
+ // We're sorting the values by pointer. In theory this might be cause of
+ // non-determinism, but here we don't rely on the ordering for anything
+ // significant, e.g. we don't create new instructions based on it so we're
+ // fine.
+ SmallVector<const Use *, 4> PHIOperands;
+ for (const Use &U : PN->operands())
+ PHIOperands.push_back(&U);
+ std::sort(PHIOperands.begin(), PHIOperands.end(),
+ [&](const Use *U1, const Use *U2) {
+ return PN->getIncomingBlock(*U1) < PN->getIncomingBlock(*U2);
+ });
+
// Filter out unreachable phi operands.
- auto Filtered = make_filter_range(PN->operands(), [&](const Use &U) {
- return ReachableEdges.count({PN->getIncomingBlock(U), PHIBlock});
+ auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) {
+ return ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock});
});
std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
- [&](const Use &U) -> Value * {
- auto *BB = PN->getIncomingBlock(U);
+ [&](const Use *U) -> Value * {
+ auto *BB = PN->getIncomingBlock(*U);
auto *DTN = DT->getNode(BB);
if (RPOOrdering.lookup(DTN) >= PHIRPO)
HasBackedge = true;
- AllConstant &= isa<UndefValue>(U) || isa<Constant>(U);
+ AllConstant &= isa<UndefValue>(*U) || isa<Constant>(*U);
// Don't try to transform self-defined phis.
- if (U == PN)
+ if (*U == PN)
return PN;
- return lookupOperandLeader(U);
+ return lookupOperandLeader(*U);
});
return E;
}
// Set basic expression info (Arguments, type, opcode) for Expression
// E from Instruction I in block B.
-bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) {
+bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const {
bool AllConstant = true;
if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
E->setType(GEP->getSourceElementType());
@@ -766,7 +792,8 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) {
}
const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
- Value *Arg1, Value *Arg2) {
+ Value *Arg1,
+ Value *Arg2) const {
auto *E = new (ExpressionAllocator) BasicExpression(2);
E->setType(T);
@@ -795,7 +822,8 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
// TODO: Once finished, this should not take an Instruction, we only
// use it for printing.
const Expression *NewGVN::checkSimplificationResults(Expression *E,
- Instruction *I, Value *V) {
+ Instruction *I,
+ Value *V) const {
if (!V)
return nullptr;
if (auto *C = dyn_cast<Constant>(V)) {
@@ -827,7 +855,7 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
return nullptr;
}
-const Expression *NewGVN::createExpression(Instruction *I) {
+const Expression *NewGVN::createExpression(Instruction *I) const {
auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
bool AllConstant = setBasicExpressionInfo(I, E);
@@ -913,7 +941,7 @@ const Expression *NewGVN::createExpression(Instruction *I) {
}
const AggregateValueExpression *
-NewGVN::createAggregateValueExpression(Instruction *I) {
+NewGVN::createAggregateValueExpression(Instruction *I) const {
if (auto *II = dyn_cast<InsertValueInst>(I)) {
auto *E = new (ExpressionAllocator)
AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
@@ -932,32 +960,32 @@ NewGVN::createAggregateValueExpression(Instruction *I) {
llvm_unreachable("Unhandled type of aggregate value operation");
}
-const VariableExpression *NewGVN::createVariableExpression(Value *V) {
+const VariableExpression *NewGVN::createVariableExpression(Value *V) const {
auto *E = new (ExpressionAllocator) VariableExpression(V);
E->setOpcode(V->getValueID());
return E;
}
-const Expression *NewGVN::createVariableOrConstant(Value *V) {
+const Expression *NewGVN::createVariableOrConstant(Value *V) const {
if (auto *C = dyn_cast<Constant>(V))
return createConstantExpression(C);
return createVariableExpression(V);
}
-const ConstantExpression *NewGVN::createConstantExpression(Constant *C) {
+const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const {
auto *E = new (ExpressionAllocator) ConstantExpression(C);
E->setOpcode(C->getValueID());
return E;
}
-const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) {
+const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const {
auto *E = new (ExpressionAllocator) UnknownExpression(I);
E->setOpcode(I->getOpcode());
return E;
}
-const CallExpression *NewGVN::createCallExpression(CallInst *CI,
- const MemoryAccess *MA) {
+const CallExpression *
+NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const {
// FIXME: Add operand bundles for calls.
auto *E =
new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
@@ -1017,9 +1045,8 @@ Value *NewGVN::lookupOperandLeader(Value *V) const {
const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
auto *CC = getMemoryClass(MA);
assert(CC->getMemoryLeader() &&
- "Every MemoryAccess should be mapped to a "
- "congruence class with a represenative memory "
- "access");
+ "Every MemoryAccess should be mapped to a congruence class with a "
+ "representative memory access");
return CC->getMemoryLeader();
}
@@ -1032,7 +1059,7 @@ bool NewGVN::isMemoryAccessTop(const MemoryAccess *MA) const {
LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
LoadInst *LI,
- const MemoryAccess *MA) {
+ const MemoryAccess *MA) const {
auto *E =
new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));
E->allocateOperands(ArgRecycler, ExpressionAllocator);
@@ -1050,8 +1077,8 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
return E;
}
-const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
- const MemoryAccess *MA) {
+const StoreExpression *
+NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const {
auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());
auto *E = new (ExpressionAllocator)
StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA);
@@ -1068,7 +1095,7 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
return E;
}
-const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
// Unlike loads, we never try to eliminate stores, so we do not check if they
// are simple and avoid value numbering them.
auto *SI = cast<StoreInst>(I);
@@ -1126,7 +1153,7 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) {
const Expression *
NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
LoadInst *LI, Instruction *DepInst,
- MemoryAccess *DefiningAccess) {
+ MemoryAccess *DefiningAccess) const {
assert((!LI || LI->isSimple()) && "Not a simple load");
if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
// Can't forward from non-atomic to atomic without violating memory model.
@@ -1201,7 +1228,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
return nullptr;
}
-const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
auto *LI = cast<LoadInst>(I);
// We can eliminate in favor of non-simple loads, but we won't be able to
@@ -1239,7 +1266,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) {
}
const Expression *
-NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {
+NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
auto *PI = PredInfo->getPredicateInfoFor(I);
if (!PI)
return nullptr;
@@ -1284,7 +1311,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {
return nullptr;
if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) {
- DEBUG(dbgs() << "Copy is not of any condition operands!");
+ DEBUG(dbgs() << "Copy is not of any condition operands!\n");
return nullptr;
}
Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
@@ -1329,7 +1356,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {
}
// Evaluate read only and pure calls, and create an expression result.
-const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
auto *CI = cast<CallInst>(I);
if (auto *II = dyn_cast<IntrinsicInst>(I)) {
// Instrinsics with the returned attribute are copies of arguments.
@@ -1366,8 +1393,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
DEBUG(dbgs() << "Setting " << *From);
DEBUG(dbgs() << " equivalent to congruence class ");
DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader ");
- DEBUG(dbgs() << *NewClass->getMemoryLeader());
- DEBUG(dbgs() << "\n");
+ DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
auto LookupResult = MemoryAccessToClass.find(From);
bool Changed = false;
@@ -1381,7 +1407,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
NewClass->memory_insert(MP);
// This may have killed the class if it had no non-memory members
if (OldClass->getMemoryLeader() == From) {
- if (OldClass->memory_empty()) {
+ if (OldClass->definesNoMemory()) {
OldClass->setMemoryLeader(nullptr);
} else {
OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
@@ -1406,7 +1432,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
// Determine if a phi is cycle-free. That means the values in the phi don't
// depend on any expressions that can change value as a result of the phi.
// For example, a non-cycle free phi would be v = phi(0, v+1).
-bool NewGVN::isCycleFree(const PHINode *PN) {
+bool NewGVN::isCycleFree(const PHINode *PN) const {
// In order to compute cycle-freeness, we do SCC finding on the phi, and see
// what kind of SCC it ends up in. If it is a singleton, it is cycle-free.
// If it is not in a singleton, it is only cycle free if the other members are
@@ -1436,7 +1462,7 @@ bool NewGVN::isCycleFree(const PHINode *PN) {
}
// Evaluate PHI nodes symbolically, and create an expression result.
-const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
// True if one of the incoming phi edges is a backedge.
bool HasBackedge = false;
// All constant tracks the state of whether all the *original* phi operands
@@ -1510,7 +1536,8 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) {
return E;
}
-const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) {
+const Expression *
+NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const {
if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
auto *II = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
@@ -1548,7 +1575,7 @@ const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) {
return createAggregateValueExpression(I);
}
-const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
auto *CI = dyn_cast<CmpInst>(I);
// See if our operands are equal to those of a previous predicate, and if so,
// if it implies true or false.
@@ -1663,7 +1690,7 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) {
}
// Substitute and symbolize the value before value numbering.
-const Expression *NewGVN::performSymbolicEvaluation(Value *V) {
+const Expression *NewGVN::performSymbolicEvaluation(Value *V) const {
const Expression *E = nullptr;
if (auto *C = dyn_cast<Constant>(V))
E = createConstantExpression(C);
@@ -1749,7 +1776,7 @@ void NewGVN::markUsersTouched(Value *V) {
}
}
-void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) {
+void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
MemoryToUsers[To].insert(U);
}
@@ -1772,7 +1799,7 @@ void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
}
// Add I to the set of users of a given predicate.
-void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) {
+void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const {
if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
PredicateToUsers[PBranch->Condition].insert(I);
else if (auto *PAssume = dyn_cast<PredicateBranch>(PB))
@@ -1825,8 +1852,7 @@ const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
// TODO: If this ends up to slow, we can maintain a next memory leader like we
// do for regular leaders.
// Make sure there will be a leader to find
- assert((CC->getStoreCount() > 0 || !CC->memory_empty()) &&
- "Can't get next leader if there is none");
+ assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
if (CC->getStoreCount() > 0) {
if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
return MSSA->getMemoryAccess(NL);
@@ -1898,7 +1924,7 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
setMemoryClass(InstMA, NewClass);
// Now, fixup the old class if necessary
if (OldClass->getMemoryLeader() == InstMA) {
- if (OldClass->getStoreCount() != 0 || !OldClass->memory_empty()) {
+ if (!OldClass->definesNoMemory()) {
OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
DEBUG(dbgs() << "Memory class leader change for class "
<< OldClass->getID() << " to "
@@ -1956,10 +1982,9 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {
// If it's a store expression we are using, it means we are not equivalent
// to something earlier.
- if (isa<StoreExpression>(E)) {
- assert(lookupOperandLeader(SI->getValueOperand()) !=
- NewClass->getLeader());
- NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand()));
+ if (auto *SE = dyn_cast<StoreExpression>(E)) {
+ assert(SE->getStoredValue() != NewClass->getLeader());
+ NewClass->setStoredValue(SE->getStoredValue());
markValueLeaderChangeTouched(NewClass);
// Shift the new class leader to be the store
DEBUG(dbgs() << "Changing leader of congruence class "
@@ -1985,7 +2010,7 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// See if we destroyed the class or need to swap leaders.
if (OldClass->empty() && OldClass != TOPClass) {
if (OldClass->getDefiningExpr()) {
- DEBUG(dbgs() << "Erasing expression " << OldClass->getDefiningExpr()
+ DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
<< " from table\n");
ExpressionToClass.erase(OldClass->getDefiningExpr());
}
@@ -2064,7 +2089,7 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
} else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
StoreInst *SI = SE->getStoreInst();
NewClass->setLeader(SI);
- NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand()));
+ NewClass->setStoredValue(SE->getStoredValue());
// The RepMemoryAccess field will be filled in properly by the
// moveValueToNewCongruenceClass call.
} else {
@@ -2523,6 +2548,19 @@ void NewGVN::verifyMemoryCongruency() const {
return false;
if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
return !isInstructionTriviallyDead(MemDef->getMemoryInst());
+
+ // We could have phi nodes which operands are all trivially dead,
+ // so we don't process them.
+ if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) {
+ for (auto &U : MemPHI->incoming_values()) {
+ if (Instruction *I = dyn_cast<Instruction>(U.get())) {
+ if (!isInstructionTriviallyDead(I))
+ return true;
+ }
+ }
+ return false;
+ }
+
return true;
};
diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index fb1b47c48276..4f608c97147d 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -55,7 +55,7 @@ static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,
/// Update the dominator tree after removing one exiting predecessor of a loop
/// exit block.
static void updateLoopExitIDom(BasicBlock *LoopExitBB, Loop &L,
- DominatorTree &DT) {
+ DominatorTree &DT) {
assert(pred_begin(LoopExitBB) != pred_end(LoopExitBB) &&
"Cannot have empty predecessors of the loop exit block if we split "
"off a block to unswitch!");
@@ -137,6 +137,98 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH,
}
}
+/// Check that all the LCSSA PHI nodes in the loop exit block have trivial
+/// incoming values along this edge.
+static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
+ BasicBlock &ExitBB) {
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ return true;
+
+ // If the incoming value for this edge isn't loop invariant the unswitch
+ // won't be trivial.
+ if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB)))
+ return false;
+ }
+ llvm_unreachable("Basic blocks should never be empty!");
+}
+
+/// Rewrite the PHI nodes in an unswitched loop exit basic block.
+///
+/// Requires that the loop exit and unswitched basic block are the same, and
+/// that the exiting block was a unique predecessor of that block. Rewrites the
+/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial
+/// PHI nodes from the old preheader that now contains the unswitched
+/// terminator.
+static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ for (Instruction &I : UnswitchedBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ break;
+
+ // When the loop exit is directly unswitched we just need to update the
+ // incoming basic block. We loop to handle weird cases with repeated
+ // incoming blocks, but expect to typically only have one operand here.
+ for (auto i : llvm::seq<int>(0, PN->getNumOperands())) {
+ assert(PN->getIncomingBlock(i) == &OldExitingBB &&
+ "Found incoming block different from unique predecessor!");
+ PN->setIncomingBlock(i, &OldPH);
+ }
+ }
+}
+
+/// Rewrite the PHI nodes in the loop exit basic block and the split off
+/// unswitched block.
+///
+/// Because the exit block remains an exit from the loop, this rewrites the
+/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI
+/// nodes into the unswitched basic block to select between the value in the
+/// old preheader and the loop exit.
+static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
+ BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ assert(&ExitBB != &UnswitchedBB &&
+ "Must have different loop exit and unswitched blocks!");
+ Instruction *InsertPt = &*UnswitchedBB.begin();
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ break;
+
+ auto *NewPN = PHINode::Create(PN->getType(), /*NumReservedValues*/ 2,
+ PN->getName() + ".split", InsertPt);
+
+ // Walk backwards over the old PHI node's inputs to minimize the cost of
+ // removing each one. We have to do this weird loop manually so that we
+ // create the same number of new incoming edges in the new PHI as we expect
+ // each case-based edge to be included in the unswitched switch in some
+ // cases.
+ // FIXME: This is really, really gross. It would be much cleaner if LLVM
+ // allowed us to create a single entry for a predecessor block without
+ // having separate entries for each "edge" even though these edges are
+ // required to produce identical results.
+ for (int i = PN->getNumIncomingValues() - 1; i >= 0; --i) {
+ if (PN->getIncomingBlock(i) != &OldExitingBB)
+ continue;
+
+ Value *Incoming = PN->removeIncomingValue(i);
+ NewPN->addIncoming(Incoming, &OldPH);
+ }
+
+ // Now replace the old PHI with the new one and wire the old one in as an
+ // input to the new one.
+ PN->replaceAllUsesWith(NewPN);
+ NewPN->addIncoming(PN, &ExitBB);
+ }
+}
+
/// Unswitch a trivial branch if the condition is loop invariant.
///
/// This routine should only be called when loop code leading to the branch has
@@ -187,10 +279,8 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
assert(L.contains(ContinueBB) &&
"Cannot have both successors exit and still be in the loop!");
- // If the loop exit block contains phi nodes, this isn't trivial.
- // FIXME: We should examine the PHI to determine whether or not we can handle
- // it trivially.
- if (isa<PHINode>(LoopExitBB->begin()))
+ auto *ParentBB = BI.getParent();
+ if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
return false;
DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal
@@ -209,14 +299,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
BasicBlock *UnswitchedBB;
if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) {
(void)PredBB;
- assert(PredBB == BI.getParent() && "A branch's parent is't a predecessor!");
+ assert(PredBB == BI.getParent() &&
+ "A branch's parent isn't a predecessor!");
UnswitchedBB = LoopExitBB;
} else {
UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI);
}
- BasicBlock *ParentBB = BI.getParent();
-
// Now splice the branch to gate reaching the new preheader and re-point its
// successors.
OldPH->getInstList().splice(std::prev(OldPH->end()),
@@ -229,6 +318,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
// terminator.
BranchInst::Create(ContinueBB, ParentBB);
+ // Rewrite the relevant PHI nodes.
+ if (UnswitchedBB == LoopExitBB)
+ rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
+ else
+ rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
+ *ParentBB, *OldPH);
+
// Now we need to update the dominator tree.
updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
// But if we split something off of the loop exit block then we also removed
@@ -278,6 +374,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
if (!L.isLoopInvariant(LoopCond))
return false;
+ auto *ParentBB = SI.getParent();
+
// FIXME: We should compute this once at the start and update it!
SmallVector<BasicBlock *, 16> ExitBlocks;
L.getExitBlocks(ExitBlocks);
@@ -287,12 +385,13 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
SmallVector<int, 4> ExitCaseIndices;
for (auto Case : SI.cases()) {
auto *SuccBB = Case.getCaseSuccessor();
- if (ExitBlockSet.count(SuccBB) && !isa<PHINode>(SuccBB->begin()))
+ if (ExitBlockSet.count(SuccBB) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB))
ExitCaseIndices.push_back(Case.getCaseIndex());
}
BasicBlock *DefaultExitBB = nullptr;
if (ExitBlockSet.count(SI.getDefaultDest()) &&
- !isa<PHINode>(SI.getDefaultDest()->begin()) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) &&
!isa<UnreachableInst>(SI.getDefaultDest()->getTerminator()))
DefaultExitBB = SI.getDefaultDest();
else if (ExitCaseIndices.empty())
@@ -330,7 +429,6 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
if (CommonSuccBB) {
SI.setDefaultDest(CommonSuccBB);
} else {
- BasicBlock *ParentBB = SI.getParent();
BasicBlock *UnreachableBB = BasicBlock::Create(
ParentBB->getContext(),
Twine(ParentBB->getName()) + ".unreachable_default",
@@ -358,30 +456,44 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
// Now add the unswitched switch.
auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
- // Split any exit blocks with remaining in-loop predecessors. We walk in
- // reverse so that we split in the same order as the cases appeared. This is
- // purely for convenience of reading the resulting IR, but it doesn't cost
- // anything really.
+ // Rewrite the IR for the unswitched basic blocks. This requires two steps.
+ // First, we split any exit blocks with remaining in-loop predecessors. Then
+ // we update the PHIs in one of two ways depending on if there was a split.
+ // We walk in reverse so that we split in the same order as the cases
+ // appeared. This is purely for convenience of reading the resulting IR, but
+ // it doesn't cost anything really.
+ SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
// Handle the default exit if necessary.
// FIXME: It'd be great if we could merge this with the loop below but LLVM's
// ranges aren't quite powerful enough yet.
- if (DefaultExitBB && !pred_empty(DefaultExitBB)) {
- auto *SplitBB =
- SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
- updateLoopExitIDom(DefaultExitBB, L, DT);
- DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+ if (DefaultExitBB) {
+ if (pred_empty(DefaultExitBB)) {
+ UnswitchedExitBBs.insert(DefaultExitBB);
+ rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
+ } else {
+ auto *SplitBB =
+ SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
+ *ParentBB, *OldPH);
+ updateLoopExitIDom(DefaultExitBB, L, DT);
+ DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+ }
}
// Note that we must use a reference in the for loop so that we update the
// container.
for (auto &CasePair : reverse(ExitCases)) {
// Grab a reference to the exit block in the pair so that we can update it.
- BasicBlock *&ExitBB = CasePair.second;
+ BasicBlock *ExitBB = CasePair.second;
// If this case is the last edge into the exit block, we can simply reuse it
// as it will no longer be a loop exit. No mapping necessary.
- if (pred_empty(ExitBB))
+ if (pred_empty(ExitBB)) {
+ // Only rewrite once.
+ if (UnswitchedExitBBs.insert(ExitBB).second)
+ rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
continue;
+ }
// Otherwise we need to split the exit block so that we retain an exit
// block from the loop and a target for the unswitched condition.
@@ -389,9 +501,12 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
if (!SplitExitBB) {
// If this is the first time we see this, do the split and remember it.
SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
+ *ParentBB, *OldPH);
updateLoopExitIDom(ExitBB, L, DT);
}
- ExitBB = SplitExitBB;
+ // Update the case pair to point to the split block.
+ CasePair.second = SplitExitBB;
}
// Now add the unswitched cases. We do this in reverse order as we built them
diff --git a/lib/Transforms/Scalar/SpeculativeExecution.cpp b/lib/Transforms/Scalar/SpeculativeExecution.cpp
index a0fc966cee2c..a7c308b59877 100644
--- a/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -208,6 +208,47 @@ bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) {
return false;
}
+static unsigned ComputeSpeculationCost(const Instruction *I,
+ const TargetTransformInfo &TTI) {
+ switch (Operator::getOpcode(I)) {
+ case Instruction::GetElementPtr:
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Select:
+ case Instruction::Shl:
+ case Instruction::Sub:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Xor:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::Call:
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::AddrSpaceCast:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPExt:
+ case Instruction::FPTrunc:
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return TTI.getUserCost(I);
+
+ default:
+ return UINT_MAX; // Disallow anything not whitelisted.
+ }
+}
+
bool SpeculativeExecutionPass::considerHoistingFromTo(
BasicBlock &FromBlock, BasicBlock &ToBlock) {
SmallSet<const Instruction *, 8> NotHoisted;
@@ -223,7 +264,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
unsigned TotalSpeculationCost = 0;
for (auto& I : FromBlock) {
- const unsigned Cost = TTI->getUserCost(&I);
+ const unsigned Cost = ComputeSpeculationCost(&I, *TTI);
if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) &&
AllPrecedingUsesFromBlockHoisted(&I)) {
TotalSpeculationCost += Cost;
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
index 7ffdad597a9b..83ec7f55d1af 100644
--- a/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -261,10 +261,10 @@ ValueRange FastDivInsertionTask::getValueRange(Value *V,
computeKnownBits(V, Known, DL);
- if (Known.Zero.countLeadingOnes() >= HiBits)
+ if (Known.countMinLeadingZeros() >= HiBits)
return VALRNG_KNOWN_SHORT;
- if (Known.One.countLeadingZeros() < HiBits)
+ if (Known.countMaxLeadingZeros() < HiBits)
return VALRNG_LIKELY_LONG;
// Long integer divisions are often used in hashtable implementations. It's
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index d5124ac89016..4aa26fd14fee 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -41,6 +41,7 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,
ValueToValueMapTy &VMap,
const Twine &NameSuffix, Function *F,
ClonedCodeInfo *CodeInfo) {
+ DenseMap<const MDNode *, MDNode *> Cache;
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
@@ -50,6 +51,9 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,
for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
II != IE; ++II) {
Instruction *NewInst = II->clone();
+ if (F && F->getSubprogram())
+ DebugLoc::reparentDebugInfo(*NewInst, BB->getParent()->getSubprogram(),
+ F->getSubprogram(), Cache);
if (II->hasName())
NewInst->setName(II->getName()+NameSuffix);
NewBB->getInstList().push_back(NewInst);
@@ -120,12 +124,28 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
OldFunc->getAllMetadata(MDs);
- for (auto MD : MDs)
- NewFunc->addMetadata(
- MD.first,
- *MapMetadata(MD.second, VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
- TypeMapper, Materializer));
+ for (auto MD : MDs) {
+ MDNode *NewMD;
+ bool MustCloneSP =
+ (MD.first == LLVMContext::MD_dbg && OldFunc->getParent() &&
+ OldFunc->getParent() == NewFunc->getParent());
+ if (MustCloneSP) {
+ auto *SP = cast<DISubprogram>(MD.second);
+ NewMD = DISubprogram::getDistinct(
+ NewFunc->getContext(), SP->getScope(), SP->getName(),
+ NewFunc->getName(), SP->getFile(), SP->getLine(), SP->getType(),
+ SP->isLocalToUnit(), SP->isDefinition(), SP->getScopeLine(),
+ SP->getContainingType(), SP->getVirtuality(), SP->getVirtualIndex(),
+ SP->getThisAdjustment(), SP->getFlags(), SP->isOptimized(),
+ SP->getUnit(), SP->getTemplateParams(), SP->getDeclaration(),
+ SP->getVariables(), SP->getThrownTypes());
+ } else
+ NewMD =
+ MapMetadata(MD.second, VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer);
+ NewFunc->addMetadata(MD.first, *NewMD);
+ }
// Loop over all of the basic blocks in the function, cloning them as
// appropriate. Note that we save BE this way in order to handle cloning of
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index 4e9d67252d6c..5444b752de82 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -96,7 +96,7 @@ std::unique_ptr<Module> llvm::CloneModule(
else
GV = new GlobalVariable(
*New, I->getValueType(), false, GlobalValue::ExternalLinkage,
- (Constant *)nullptr, I->getName(), (GlobalVariable *)nullptr,
+ nullptr, I->getName(), nullptr,
I->getThreadLocalMode(), I->getType()->getAddressSpace());
VMap[&*I] = GV;
// We do not copy attributes (mainly because copying between different
diff --git a/lib/Transforms/Utils/EscapeEnumerator.cpp b/lib/Transforms/Utils/EscapeEnumerator.cpp
index 8c2386554da5..78d7474e5b95 100644
--- a/lib/Transforms/Utils/EscapeEnumerator.cpp
+++ b/lib/Transforms/Utils/EscapeEnumerator.cpp
@@ -67,8 +67,7 @@ IRBuilder<> *EscapeEnumerator::Next() {
// Create a cleanup block.
LLVMContext &C = F.getContext();
BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
- Type *ExnTy =
- StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C), nullptr);
+ Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C));
if (!F.hasPersonalityFn()) {
Constant *PersFn = getDefaultPersonalityFn(F.getParent());
F.setPersonalityFn(PersFn);
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 6d56e08af99f..9cb4762b683c 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -1302,41 +1302,6 @@ static bool hasLifetimeMarkers(AllocaInst *AI) {
return false;
}
-/// Rebuild the entire inlined-at chain for this instruction so that the top of
-/// the chain now is inlined-at the new call site.
-static DebugLoc
-updateInlinedAtInfo(const DebugLoc &DL, DILocation *InlinedAtNode,
- LLVMContext &Ctx,
- DenseMap<const DILocation *, DILocation *> &IANodes) {
- SmallVector<DILocation *, 3> InlinedAtLocations;
- DILocation *Last = InlinedAtNode;
- DILocation *CurInlinedAt = DL;
-
- // Gather all the inlined-at nodes
- while (DILocation *IA = CurInlinedAt->getInlinedAt()) {
- // Skip any we've already built nodes for
- if (DILocation *Found = IANodes[IA]) {
- Last = Found;
- break;
- }
-
- InlinedAtLocations.push_back(IA);
- CurInlinedAt = IA;
- }
-
- // Starting from the top, rebuild the nodes to point to the new inlined-at
- // location (then rebuilding the rest of the chain behind it) and update the
- // map of already-constructed inlined-at nodes.
- for (const DILocation *MD : reverse(InlinedAtLocations)) {
- Last = IANodes[MD] = DILocation::getDistinct(
- Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last);
- }
-
- // And finally create the normal location for this instruction, referring to
- // the new inlined-at chain.
- return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), Last);
-}
-
/// Return the result of AI->isStaticAlloca() if AI were moved to the entry
/// block. Allocas used in inalloca calls and allocas of dynamic array size
/// cannot be static.
@@ -1364,14 +1329,16 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
// Cache the inlined-at nodes as they're built so they are reused, without
// this every instruction's inlined-at chain would become distinct from each
// other.
- DenseMap<const DILocation *, DILocation *> IANodes;
+ DenseMap<const MDNode *, MDNode *> IANodes;
for (; FI != Fn->end(); ++FI) {
for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
BI != BE; ++BI) {
if (DebugLoc DL = BI->getDebugLoc()) {
- BI->setDebugLoc(
- updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes));
+ auto IA = DebugLoc::appendInlinedAt(DL, InlinedAtNode, BI->getContext(),
+ IANodes);
+ auto IDL = DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), IA);
+ BI->setDebugLoc(IDL);
continue;
}
@@ -1429,11 +1396,12 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,
/// Update the branch metadata for cloned call instructions.
static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
const Optional<uint64_t> &CalleeEntryCount,
- const Instruction *TheCall) {
+ const Instruction *TheCall,
+ ProfileSummaryInfo *PSI) {
if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1)
return;
Optional<uint64_t> CallSiteCount =
- ProfileSummaryInfo::getProfileCount(TheCall, nullptr);
+ PSI ? PSI->getProfileCount(TheCall, nullptr) : None;
uint64_t CallCount =
std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0,
CalleeEntryCount.getValue());
@@ -1456,16 +1424,16 @@ static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
/// The callsite's block count is subtracted from the callee's function entry
/// count.
static void updateCalleeCount(BlockFrequencyInfo *CallerBFI, BasicBlock *CallBB,
- Instruction *CallInst, Function *Callee) {
+ Instruction *CallInst, Function *Callee,
+ ProfileSummaryInfo *PSI) {
// If the callee has a original count of N, and the estimated count of
// callsite is M, the new callee count is set to N - M. M is estimated from
// the caller's entry count, its entry block frequency and the block frequency
// of the callsite.
Optional<uint64_t> CalleeCount = Callee->getEntryCount();
- if (!CalleeCount.hasValue())
+ if (!CalleeCount.hasValue() || !PSI)
return;
- Optional<uint64_t> CallCount =
- ProfileSummaryInfo::getProfileCount(CallInst, CallerBFI);
+ Optional<uint64_t> CallCount = PSI->getProfileCount(CallInst, CallerBFI);
if (!CallCount.hasValue())
return;
// Since CallSiteCount is an estimate, it could exceed the original callee
@@ -1668,9 +1636,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
CalledFunc->front());
- updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall);
+ updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall,
+ IFI.PSI);
// Update the profile count of callee.
- updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc);
+ updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc, IFI.PSI);
// Inject byval arguments initialization.
for (std::pair<Value*, Value*> &Init : ByValInit)
diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp
index 8a1973d1db05..53b432fcafd4 100644
--- a/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/lib/Transforms/Utils/InstructionNamer.cpp
@@ -26,16 +26,15 @@ namespace {
InstNamer() : FunctionPass(ID) {
initializeInstNamerPass(*PassRegistry::getPassRegistry());
}
-
+
void getAnalysisUsage(AnalysisUsage &Info) const override {
Info.setPreservesAll();
}
bool runOnFunction(Function &F) override {
- for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end();
- AI != AE; ++AI)
- if (!AI->hasName() && !AI->getType()->isVoidTy())
- AI->setName("arg");
+ for (auto &Arg : F.args())
+ if (!Arg.hasName())
+ Arg.setName("arg");
for (BasicBlock &BB : F) {
if (!BB.hasName())
@@ -48,11 +47,11 @@ namespace {
return true;
}
};
-
+
char InstNamer::ID = 0;
}
-INITIALIZE_PASS(InstNamer, "instnamer",
+INITIALIZE_PASS(InstNamer, "instnamer",
"Assign names to anonymous instructions", false, false)
char &llvm::InstructionNamerID = InstNamer::ID;
//===----------------------------------------------------------------------===//
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index ce6b703f3528..1ca509472b5f 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -1041,7 +1041,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
KnownBits Known(BitWidth);
computeKnownBits(V, Known, DL, 0, AC, CxtI, DT);
- unsigned TrailZ = Known.Zero.countTrailingOnes();
+ unsigned TrailZ = Known.countMinTrailingZeros();
// Avoid trouble with ridiculously large TrailZ values, such as
// those computed from a null pointer.
@@ -1105,8 +1105,9 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,
void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
StoreInst *SI, DIBuilder &Builder) {
auto *DIVar = DDI->getVariable();
- auto *DIExpr = DDI->getExpression();
assert(DIVar && "Missing variable");
+ auto *DIExpr = DDI->getExpression();
+ Value *DV = SI->getOperand(0);
// If an argument is zero extended then use argument directly. The ZExt
// may be zapped by an optimization pass in future.
@@ -1116,34 +1117,28 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));
if (ExtendedArg) {
- // We're now only describing a subset of the variable. The fragment we're
- // describing will always be smaller than the variable size, because
- // VariableSize == Size of Alloca described by DDI. Since SI stores
- // to the alloca described by DDI, if it's first operand is an extend,
- // we're guaranteed that before extension, the value was narrower than
- // the size of the alloca, hence the size of the described variable.
- SmallVector<uint64_t, 3> Ops;
- unsigned FragmentOffset = 0;
- // If this already is a bit fragment, we drop the bit fragment from the
- // expression and record the offset.
- auto Fragment = DIExpr->getFragmentInfo();
- if (Fragment) {
- Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()-3);
- FragmentOffset = Fragment->OffsetInBits;
- } else {
- Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
+ // If this DDI was already describing only a fragment of a variable, ensure
+ // that fragment is appropriately narrowed here.
+ // But if a fragment wasn't used, describe the value as the original
+ // argument (rather than the zext or sext) so that it remains described even
+ // if the sext/zext is optimized away. This widens the variable description,
+ // leaving it up to the consumer to know how the smaller value may be
+ // represented in a larger register.
+ if (auto Fragment = DIExpr->getFragmentInfo()) {
+ unsigned FragmentOffset = Fragment->OffsetInBits;
+ SmallVector<uint64_t, 3> Ops(DIExpr->elements_begin(),
+ DIExpr->elements_end() - 3);
+ Ops.push_back(dwarf::DW_OP_LLVM_fragment);
+ Ops.push_back(FragmentOffset);
+ const DataLayout &DL = DDI->getModule()->getDataLayout();
+ Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
+ DIExpr = Builder.createExpression(Ops);
}
- Ops.push_back(dwarf::DW_OP_LLVM_fragment);
- Ops.push_back(FragmentOffset);
- const DataLayout &DL = DDI->getModule()->getDataLayout();
- Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
- auto NewDIExpr = Builder.createExpression(Ops);
- if (!LdStHasDebugValue(DIVar, NewDIExpr, SI))
- Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, NewDIExpr,
- DDI->getDebugLoc(), SI);
- } else if (!LdStHasDebugValue(DIVar, DIExpr, SI))
- Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr,
- DDI->getDebugLoc(), SI);
+ DV = ExtendedArg;
+ }
+ if (!LdStHasDebugValue(DIVar, DIExpr, SI))
+ Builder.insertDbgValueIntrinsic(DV, 0, DIVar, DIExpr, DDI->getDebugLoc(),
+ SI);
}
/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
@@ -1781,44 +1776,43 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J) {
combineMetadata(K, J, KnownIDs);
}
-unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
- DominatorTree &DT,
- const BasicBlockEdge &Root) {
+template <typename RootType, typename DominatesFn>
+static unsigned replaceDominatedUsesWith(Value *From, Value *To,
+ const RootType &Root,
+ const DominatesFn &Dominates) {
assert(From->getType() == To->getType());
-
+
unsigned Count = 0;
for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE; ) {
+ UI != UE;) {
Use &U = *UI++;
- if (DT.dominates(Root, U)) {
- U.set(To);
- DEBUG(dbgs() << "Replace dominated use of '"
- << From->getName() << "' as "
- << *To << " in " << *U << "\n");
- ++Count;
- }
+ if (!Dominates(Root, U))
+ continue;
+ U.set(To);
+ DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
+ << *To << " in " << *U << "\n");
+ ++Count;
}
return Count;
}
unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
DominatorTree &DT,
- const BasicBlock *BB) {
- assert(From->getType() == To->getType());
+ const BasicBlockEdge &Root) {
+ auto Dominates = [&DT](const BasicBlockEdge &Root, const Use &U) {
+ return DT.dominates(Root, U);
+ };
+ return ::replaceDominatedUsesWith(From, To, Root, Dominates);
+}
- unsigned Count = 0;
- for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE;) {
- Use &U = *UI++;
- auto *I = cast<Instruction>(U.getUser());
- if (DT.properlyDominates(BB, I->getParent())) {
- U.set(To);
- DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
- << *To << " in " << *U << "\n");
- ++Count;
- }
- }
- return Count;
+unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
+ DominatorTree &DT,
+ const BasicBlock *BB) {
+ auto ProperlyDominates = [&DT](const BasicBlock *BB, const Use &U) {
+ auto *I = cast<Instruction>(U.getUser())->getParent();
+ return DT.properlyDominates(BB, I);
+ };
+ return ::replaceDominatedUsesWith(From, To, BB, ProperlyDominates);
}
bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index 175d013a011d..81f033e7d51a 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
@@ -1112,3 +1113,203 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
else
return (FalseVal + (TrueVal / 2)) / TrueVal;
}
+
+/// \brief Adds a 'fast' flag to floating point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V)) {
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+ cast<Instruction>(V)->setFastMathFlags(Flags);
+ }
+ return V;
+}
+
+// Helper to generate a log2 shuffle reduction.
+Value *
+llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+ ArrayRef<Value *> RedOps) {
+ unsigned VF = Src->getType()->getVectorNumElements();
+ // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+ // and vector ops, reducing the set of values being computed by half each
+ // round.
+ assert(isPowerOf2_32(VF) &&
+ "Reduction emission only supported for pow2 vectors!");
+ Value *TmpVec = Src;
+ SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
+ for (unsigned i = VF; i != 1; i >>= 1) {
+ // Move the upper half of the vector to the lower half.
+ for (unsigned j = 0; j != i / 2; ++j)
+ ShuffleMask[j] = Builder.getInt32(i / 2 + j);
+
+ // Fill the rest of the mask with undef.
+ std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
+ UndefValue::get(Builder.getInt32Ty()));
+
+ Value *Shuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()),
+ ConstantVector::get(ShuffleMask), "rdx.shuf");
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ // Floating point operations had to be 'fast' to enable the reduction.
+ TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op,
+ TmpVec, Shuf, "bin.rdx"));
+ } else {
+ assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+ "Invalid min/max");
+ TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec,
+ Shuf);
+ }
+ if (!RedOps.empty())
+ propagateIRFlags(TmpVec, RedOps);
+ }
+ // The result is in the first element of the vector.
+ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+}
+
+/// Create a simple vector reduction specified by an opcode and some
+/// flags (if generating min/max reductions).
+Value *llvm::createSimpleTargetReduction(
+ IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode,
+ Value *Src, TargetTransformInfo::ReductionFlags Flags,
+ ArrayRef<Value *> RedOps) {
+ assert(isa<VectorType>(Src->getType()) && "Type must be a vector");
+
+ Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType());
+ std::function<Value*()> BuildFunc;
+ using RD = RecurrenceDescriptor;
+ RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid;
+ // TODO: Support creating ordered reductions.
+ FastMathFlags FMFUnsafe;
+ FMFUnsafe.setUnsafeAlgebra();
+
+ switch (Opcode) {
+ case Instruction::Add:
+ BuildFunc = [&]() { return Builder.CreateAddReduce(Src); };
+ break;
+ case Instruction::Mul:
+ BuildFunc = [&]() { return Builder.CreateMulReduce(Src); };
+ break;
+ case Instruction::And:
+ BuildFunc = [&]() { return Builder.CreateAndReduce(Src); };
+ break;
+ case Instruction::Or:
+ BuildFunc = [&]() { return Builder.CreateOrReduce(Src); };
+ break;
+ case Instruction::Xor:
+ BuildFunc = [&]() { return Builder.CreateXorReduce(Src); };
+ break;
+ case Instruction::FAdd:
+ BuildFunc = [&]() {
+ auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src);
+ cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+ return Rdx;
+ };
+ break;
+ case Instruction::FMul:
+ BuildFunc = [&]() {
+ auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src);
+ cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+ return Rdx;
+ };
+ break;
+ case Instruction::ICmp:
+ if (Flags.IsMaxOp) {
+ MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMax : RD::MRK_UIntMax;
+ BuildFunc = [&]() {
+ return Builder.CreateIntMaxReduce(Src, Flags.IsSigned);
+ };
+ } else {
+ MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMin : RD::MRK_UIntMin;
+ BuildFunc = [&]() {
+ return Builder.CreateIntMinReduce(Src, Flags.IsSigned);
+ };
+ }
+ break;
+ case Instruction::FCmp:
+ if (Flags.IsMaxOp) {
+ MinMaxKind = RD::MRK_FloatMax;
+ BuildFunc = [&]() { return Builder.CreateFPMaxReduce(Src, Flags.NoNaN); };
+ } else {
+ MinMaxKind = RD::MRK_FloatMin;
+ BuildFunc = [&]() { return Builder.CreateFPMinReduce(Src, Flags.NoNaN); };
+ }
+ break;
+ default:
+ llvm_unreachable("Unhandled opcode");
+ break;
+ }
+ if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags))
+ return BuildFunc();
+ return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps);
+}
+
+/// Create a vector reduction using a given recurrence descriptor.
+Value *llvm::createTargetReduction(IRBuilder<> &Builder,
+ const TargetTransformInfo *TTI,
+ RecurrenceDescriptor &Desc, Value *Src,
+ bool NoNaN) {
+ // TODO: Support in-order reductions based on the recurrence descriptor.
+ RecurrenceDescriptor::RecurrenceKind RecKind = Desc.getRecurrenceKind();
+ TargetTransformInfo::ReductionFlags Flags;
+ Flags.NoNaN = NoNaN;
+ auto getSimpleRdx = [&](unsigned Opc) {
+ return createSimpleTargetReduction(Builder, TTI, Opc, Src, Flags);
+ };
+ switch (RecKind) {
+ case RecurrenceDescriptor::RK_FloatAdd:
+ return getSimpleRdx(Instruction::FAdd);
+ case RecurrenceDescriptor::RK_FloatMult:
+ return getSimpleRdx(Instruction::FMul);
+ case RecurrenceDescriptor::RK_IntegerAdd:
+ return getSimpleRdx(Instruction::Add);
+ case RecurrenceDescriptor::RK_IntegerMult:
+ return getSimpleRdx(Instruction::Mul);
+ case RecurrenceDescriptor::RK_IntegerAnd:
+ return getSimpleRdx(Instruction::And);
+ case RecurrenceDescriptor::RK_IntegerOr:
+ return getSimpleRdx(Instruction::Or);
+ case RecurrenceDescriptor::RK_IntegerXor:
+ return getSimpleRdx(Instruction::Xor);
+ case RecurrenceDescriptor::RK_IntegerMinMax: {
+ switch (Desc.getMinMaxRecurrenceKind()) {
+ case RecurrenceDescriptor::MRK_SIntMax:
+ Flags.IsSigned = true;
+ Flags.IsMaxOp = true;
+ break;
+ case RecurrenceDescriptor::MRK_UIntMax:
+ Flags.IsMaxOp = true;
+ break;
+ case RecurrenceDescriptor::MRK_SIntMin:
+ Flags.IsSigned = true;
+ break;
+ case RecurrenceDescriptor::MRK_UIntMin:
+ break;
+ default:
+ llvm_unreachable("Unhandled MRK");
+ }
+ return getSimpleRdx(Instruction::ICmp);
+ }
+ case RecurrenceDescriptor::RK_FloatMinMax: {
+ Flags.IsMaxOp =
+ Desc.getMinMaxRecurrenceKind() == RecurrenceDescriptor::MRK_FloatMax;
+ return getSimpleRdx(Instruction::FCmp);
+ }
+ default:
+ llvm_unreachable("Unhandled RecKind");
+ }
+}
+
+void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+ if (auto *VecOp = dyn_cast<Instruction>(I)) {
+ if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
+ // VecOVp is initialized to the 0th scalar, so start counting from index
+ // '1'.
+ VecOp->copyIRFlags(I0);
+ for (int i = 1, e = VL.size(); i < e; ++i) {
+ if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
+ VecOp->andIRFlags(Scalar);
+ }
+ }
+ }
+}
diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp
index 29d334f2968f..2ef3d6336ae2 100644
--- a/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/lib/Transforms/Utils/ModuleUtils.cpp
@@ -35,7 +35,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
// Upgrade a 2-field global array type to the new 3-field format if needed.
if (Data && OldEltTy->getNumElements() < 3)
EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
- IRB.getInt8PtrTy(), nullptr);
+ IRB.getInt8PtrTy());
else
EltTy = OldEltTy;
if (Constant *Init = GVCtor->getInitializer()) {
@@ -44,10 +44,10 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
for (unsigned i = 0; i != n; ++i) {
auto Ctor = cast<Constant>(Init->getOperand(i));
if (EltTy != OldEltTy)
- Ctor = ConstantStruct::get(
- EltTy, Ctor->getAggregateElement((unsigned)0),
- Ctor->getAggregateElement(1),
- Constant::getNullValue(IRB.getInt8PtrTy()), nullptr);
+ Ctor =
+ ConstantStruct::get(EltTy, Ctor->getAggregateElement((unsigned)0),
+ Ctor->getAggregateElement(1),
+ Constant::getNullValue(IRB.getInt8PtrTy()));
CurrentCtors.push_back(Ctor);
}
}
@@ -55,7 +55,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
} else {
// Use the new three-field struct if there isn't one already.
EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
- IRB.getInt8PtrTy(), nullptr);
+ IRB.getInt8PtrTy());
}
// Build a 2 or 3 field global_ctor entry. We don't take a comdat key.
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 9e71d746de34..1de579ed41b0 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1450,11 +1450,11 @@ static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
// x86_64 can't use {float, float} since that would be returned in both
// xmm0 and xmm1, which isn't what a real struct would do.
ResTy = T.getArch() == Triple::x86_64
- ? static_cast<Type *>(VectorType::get(ArgTy, 2))
- : static_cast<Type *>(StructType::get(ArgTy, ArgTy, nullptr));
+ ? static_cast<Type *>(VectorType::get(ArgTy, 2))
+ : static_cast<Type *>(StructType::get(ArgTy, ArgTy));
} else {
Name = "__sincospi_stret";
- ResTy = StructType::get(ArgTy, ArgTy, nullptr);
+ ResTy = StructType::get(ArgTy, ArgTy);
}
Module *M = OrigCallee->getParent();
diff --git a/lib/Transforms/Utils/VNCoercion.cpp b/lib/Transforms/Utils/VNCoercion.cpp
index 83bd29dbca65..60d9ede2c487 100644
--- a/lib/Transforms/Utils/VNCoercion.cpp
+++ b/lib/Transforms/Utils/VNCoercion.cpp
@@ -303,6 +303,15 @@ static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
const DataLayout &DL) {
LLVMContext &Ctx = SrcVal->getType()->getContext();
+ // If two pointers are in the same address space, they have the same size,
+ // so we don't need to do any truncation, etc. This avoids introducing
+ // ptrtoint instructions for pointers that may be non-integral.
+ if (SrcVal->getType()->isPointerTy() && LoadTy->isPointerTy() &&
+ cast<PointerType>(SrcVal->getType())->getAddressSpace() ==
+ cast<PointerType>(LoadTy)->getAddressSpace()) {
+ return SrcVal;
+ }
+
uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
// Compute which bits of the stored value are being used by the load. Convert
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 84d89f103a2f..930972924c3c 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -949,11 +949,10 @@ void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
Constant *NewV;
if (IsOldCtorDtor) {
auto *S = cast<ConstantStruct>(V);
- auto *E1 = mapValue(S->getOperand(0));
- auto *E2 = mapValue(S->getOperand(1));
- Value *Null = Constant::getNullValue(VoidPtrTy);
- NewV =
- ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null, nullptr);
+ auto *E1 = cast<Constant>(mapValue(S->getOperand(0)));
+ auto *E2 = cast<Constant>(mapValue(S->getOperand(1)));
+ Constant *Null = Constant::getNullValue(VoidPtrTy);
+ NewV = ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null);
} else {
NewV = cast_or_null<Constant>(mapValue(V));
}
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 97dcb40a1d72..9cf66382b581 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -346,7 +346,7 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
if (!Safe) {
KnownBits Known(BitWidth);
computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
- if (Known.Zero.countTrailingZeros() < (BitWidth - 1))
+ if (Known.countMaxTrailingOnes() < (BitWidth - 1))
Safe = true;
}
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3fde0a453962..516ab7d03a88 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -391,13 +391,14 @@ public:
TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
AddedSafetyChecks(false) {}
- // Perform the actual loop widening (vectorization).
- void vectorize() {
- // Create a new empty loop. Unlink the old loop and connect the new one.
- createEmptyLoop();
- // Widen each instruction in the old loop to a new one in the new loop.
- vectorizeLoop();
- }
+ /// Create a new empty loop. Unlink the old loop and connect the new one.
+ void createVectorizedLoopSkeleton();
+
+ /// Vectorize a single instruction within the innermost loop.
+ void vectorizeInstruction(Instruction &I);
+
+ /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
+ void fixVectorizedLoop();
// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
@@ -425,9 +426,6 @@ protected:
EdgeMaskCacheTy;
typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy;
- /// Create an empty loop, based on the loop ranges of the old loop.
- void createEmptyLoop();
-
/// Set up the values of the IVs correctly when exiting the vector loop.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
Value *CountRoundDown, Value *EndValue,
@@ -436,8 +434,6 @@ protected:
/// Create a new induction variable inside L.
PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
Value *Step, Instruction *DL);
- /// Copy and widen the instructions from the old loop.
- virtual void vectorizeLoop();
/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs();
@@ -450,10 +446,10 @@ protected:
/// vectorizing this phi node.
void fixReduction(PHINode *Phi);
- /// \brief The Loop exit block may have single value PHI nodes where the
- /// incoming value is 'Undef'. While vectorizing we only handled real values
- /// that were defined inside the loop. Here we fix the 'undef case'.
- /// See PR14725.
+ /// \brief The Loop exit block may have single value PHI nodes with some
+ /// incoming value. While vectorizing we only handled real values
+ /// that were defined inside the loop and we should have one value for
+ /// each predecessor of its parent basic block. See PR14725.
void fixLCSSAPHIs();
/// Iteratively sink the scalarized operands of a predicated instruction into
@@ -464,11 +460,6 @@ protected:
/// respective conditions.
void predicateInstructions();
- /// Collect the instructions from the original loop that would be trivially
- /// dead in the vectorized loop if generated.
- void collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions);
-
/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
void truncateToMinimalBitwidths();
@@ -481,10 +472,6 @@ protected:
/// and DST.
VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
- /// A helper function to vectorize a single instruction within the innermost
- /// loop.
- void vectorizeInstruction(Instruction &I);
-
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
@@ -1700,6 +1687,9 @@ public:
/// access that can be widened.
bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+ // Returns true if the NoNaN attribute is set on the function.
+ bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
+
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
@@ -2185,7 +2175,10 @@ public:
/// passed Legality checks.
class LoopVectorizationPlanner {
public:
- LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {}
+ LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM)
+ : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}
~LoopVectorizationPlanner() {}
@@ -2193,7 +2186,25 @@ public:
LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
unsigned UserVF);
+ /// Generate the IR code for the vectorized loop.
+ void executePlan(InnerLoopVectorizer &ILV);
+
+protected:
+ /// Collect the instructions from the original loop that would be trivially
+ /// dead in the vectorized loop if generated.
+ void collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
private:
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
/// The profitablity analysis.
LoopVectorizationCostModel &CM;
};
@@ -3361,7 +3372,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
LVer->prepareNoAliasMetadata();
}
-void InnerLoopVectorizer::createEmptyLoop() {
+void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
@@ -3883,36 +3894,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
}
}
-void InnerLoopVectorizer::vectorizeLoop() {
- //===------------------------------------------------===//
- //
- // Notice: any optimization or new instruction that go
- // into the code below should be also be implemented in
- // the cost-model.
- //
- //===------------------------------------------------===//
-
- // Collect instructions from the original loop that will become trivially dead
- // in the vectorized loop. We don't need to vectorize these instructions. For
- // example, original induction update instructions can become dead because we
- // separately emit induction "steps" when generating code for the new loop.
- // Similarly, we create a new latch condition when setting up the structure
- // of the new loop, so the old one can become dead.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
- collectTriviallyDeadInstructions(DeadInstructions);
-
- // Scan the loop in a topological order to ensure that defs are vectorized
- // before users.
- LoopBlocksDFS DFS(OrigLoop);
- DFS.perform(LI);
-
- // Vectorize all instructions in the original loop that will not become
- // trivially dead when vectorized.
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
- for (Instruction &I : *BB)
- if (!DeadInstructions.count(&I))
- vectorizeInstruction(I);
-
+void InnerLoopVectorizer::fixVectorizedLoop() {
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
if (VF > 1)
@@ -4049,8 +4031,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Set the insertion point after the previous value if it is an instruction.
// Note that the previous value may have been constant-folded so it is not
- // guaranteed to be an instruction in the vector loop.
- if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]))
+ // guaranteed to be an instruction in the vector loop. Also, if the previous
+ // value is a phi node, we should insert after all the phi nodes to avoid
+ // breaking basic block verification.
+ if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]) ||
+ isa<PHINode>(PreviousParts[UF - 1]))
Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
else
Builder.SetInsertPoint(
@@ -4258,39 +4243,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
}
if (VF > 1) {
- // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
- // and vector ops, reducing the set of values being computed by half each
- // round.
- assert(isPowerOf2_32(VF) &&
- "Reduction emission only supported for pow2 vectors!");
- Value *TmpVec = ReducedPartRdx;
- SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
- for (unsigned i = VF; i != 1; i >>= 1) {
- // Move the upper half of the vector to the lower half.
- for (unsigned j = 0; j != i / 2; ++j)
- ShuffleMask[j] = Builder.getInt32(i / 2 + j);
-
- // Fill the rest of the mask with undef.
- std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
- UndefValue::get(Builder.getInt32Ty()));
-
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()),
- ConstantVector::get(ShuffleMask), "rdx.shuf");
-
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- // Floating point operations had to be 'fast' to enable the reduction.
- TmpVec = addFastMathFlag(Builder.CreateBinOp(
- (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
- else
- TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
- TmpVec, Shuf);
- }
-
- // The result is in the first element of the vector.
+ bool NoNaN = Legal->hasFunNoNaNAttr();
ReducedPartRdx =
- Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
-
+ createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
// If the reduction can be performed in a smaller type, we need to extend
// the reduction to the wider type before we branch to the original loop.
if (Phi->getType() != RdxDesc.getRecurrenceType())
@@ -4345,33 +4300,11 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
if (!LCSSAPhi)
break;
- if (LCSSAPhi->getNumIncomingValues() == 1)
- LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
- LoopMiddleBlock);
- }
-}
-
-void InnerLoopVectorizer::collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions) {
- BasicBlock *Latch = OrigLoop->getLoopLatch();
-
- // We create new control-flow for the vectorized loop, so the original
- // condition will be dead after vectorization if it's only used by the
- // branch.
- auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
- if (Cmp && Cmp->hasOneUse())
- DeadInstructions.insert(Cmp);
-
- // We create new "steps" for induction variable updates to which the original
- // induction variables map. An original update instruction will be dead if
- // all its users except the induction variable are dead.
- for (auto &Induction : *Legal->getInductionVars()) {
- PHINode *Ind = Induction.first;
- auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
- if (all_of(IndUpdate->users(), [&](User *U) -> bool {
- return U == Ind || DeadInstructions.count(cast<Instruction>(U));
- }))
- DeadInstructions.insert(IndUpdate);
+ if (LCSSAPhi->getNumIncomingValues() == 1) {
+ assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) &&
+ "Incoming value isn't loop invariant");
+ LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock);
+ }
}
}
@@ -7577,6 +7510,72 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
return CM.selectVectorizationFactor(MaxVF);
}
+void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
+ // Perform the actual loop transformation.
+
+ // 1. Create a new empty loop. Unlink the old loop and connect the new one.
+ ILV.createVectorizedLoopSkeleton();
+
+ //===------------------------------------------------===//
+ //
+ // Notice: any optimization or new instruction that go
+ // into the code below should also be implemented in
+ // the cost-model.
+ //
+ //===------------------------------------------------===//
+
+ // 2. Copy and widen instructions from the old loop into the new loop.
+
+ // Collect instructions from the original loop that will become trivially dead
+ // in the vectorized loop. We don't need to vectorize these instructions. For
+ // example, original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+ collectTriviallyDeadInstructions(DeadInstructions);
+
+ // Scan the loop in a topological order to ensure that defs are vectorized
+ // before users.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
+
+ // Vectorize all instructions in the original loop that will not become
+ // trivially dead when vectorized.
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
+ for (Instruction &I : *BB)
+ if (!DeadInstructions.count(&I))
+ ILV.vectorizeInstruction(I);
+
+ // 3. Fix the vectorized code: take care of header phi's, live-outs,
+ // predication, updating analyses.
+ ILV.fixVectorizedLoop();
+}
+
+void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+
+ // We create new control-flow for the vectorized loop, so the original
+ // condition will be dead after vectorization if it's only used by the
+ // branch.
+ auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ if (Cmp && Cmp->hasOneUse())
+ DeadInstructions.insert(Cmp);
+
+ // We create new "steps" for induction variable updates to which the original
+ // induction variables map. An original update instruction will be dead if
+ // all its users except the induction variable are dead.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ PHINode *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ if (all_of(IndUpdate->users(), [&](User *U) -> bool {
+ return U == Ind || DeadInstructions.count(cast<Instruction>(U));
+ }))
+ DeadInstructions.insert(IndUpdate);
+ }
+}
+
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
auto *SI = dyn_cast<StoreInst>(Instr);
bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
@@ -7759,7 +7758,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
CM.collectValuesToIgnore();
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(CM);
+ LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
// Get user vectorization factor.
unsigned UserVF = Hints.getWidth();
@@ -7853,7 +7852,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM);
- Unroller.vectorize();
+ LVP.executePlan(Unroller);
ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
L->getHeader())
@@ -7863,7 +7862,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// If we decided that it is *legal* to vectorize the loop, then do it.
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
&LVL, &CM);
- LB.vectorize();
+ LVP.executePlan(LB);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there are
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f112c555205c..64013d6d687d 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -40,7 +40,9 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <memory>
@@ -212,23 +214,6 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
return Opcode;
}
-/// Get the intersection (logical and) of all of the potential IR flags
-/// of each scalar operation (VL) that will be converted into a vector (I).
-/// Flag set: NSW, NUW, exact, and all of fast-math.
-static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
- if (auto *VecOp = dyn_cast<Instruction>(I)) {
- if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
- // VecOVp is initialized to the 0th scalar, so start counting from index
- // '1'.
- VecOp->copyIRFlags(I0);
- for (int i = 1, e = VL.size(); i < e; ++i) {
- if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
- VecOp->andIRFlags(Scalar);
- }
- }
- }
-}
-
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
@@ -315,10 +300,10 @@ public:
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
- const DataLayout *DL)
+ const DataLayout *DL, OptimizationRemarkEmitter *ORE)
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
- DL(DL), Builder(Se->getContext()) {
+ DL(DL), ORE(ORE), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
@@ -331,7 +316,10 @@ public:
else
MaxVecRegSize = TTI->getRegisterBitWidth(true);
- MinVecRegSize = MinVectorRegSizeOption;
+ if (MinVectorRegSizeOption.getNumOccurrences())
+ MinVecRegSize = MinVectorRegSizeOption;
+ else
+ MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
}
/// \brief Vectorize the tree that starts with the elements in \p VL.
@@ -377,6 +365,8 @@ public:
MinBWs.clear();
}
+ unsigned getTreeSize() const { return VectorizableTree.size(); }
+
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
@@ -415,6 +405,8 @@ public:
/// vectorizable. We do not vectorize such trees.
bool isTreeTinyAndNotFullyVectorizable();
+ OptimizationRemarkEmitter *getORE() { return ORE; }
+
private:
struct TreeEntry;
@@ -944,6 +936,8 @@ private:
AssumptionCache *AC;
DemandedBits *DB;
const DataLayout *DL;
+ OptimizationRemarkEmitter *ORE;
+
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
/// Instruction builder to construct the vectorized tree.
@@ -1835,11 +1829,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_PowerOf2;
- int ScalarCost = VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
- Op2VK, Op1VP, Op2VP);
+ SmallVector<const Value *, 4> Operands(VL0->operand_values());
+ int ScalarCost =
+ VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
+ Op2VP, Operands);
int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
- Op1VP, Op2VP);
+ Op1VP, Op2VP, Operands);
return VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
@@ -3703,10 +3699,8 @@ void BoUpSLP::computeMinimumValueSizes() {
// Determine if the sign bit of all the roots is known to be zero. If not,
// IsKnownPositive is set to False.
IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
- bool KnownZero = false;
- bool KnownOne = false;
- ComputeSignBit(R, KnownZero, KnownOne, *DL);
- return KnownZero;
+ KnownBits Known = computeKnownBits(R, *DL);
+ return Known.isNonNegative();
});
// Determine the maximum number of bits required to store the scalar
@@ -3786,8 +3780,9 @@ struct SLPVectorizer : public FunctionPass {
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+ return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -3799,6 +3794,7 @@ struct SLPVectorizer : public FunctionPass {
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
@@ -3817,8 +3813,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
+ auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+ bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
if (!Changed)
return PreservedAnalyses::all();
@@ -3833,7 +3830,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
TargetTransformInfo *TTI_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
LoopInfo *LI_, DominatorTree *DT_,
- AssumptionCache *AC_, DemandedBits *DB_) {
+ AssumptionCache *AC_, DemandedBits *DB_,
+ OptimizationRemarkEmitter *ORE_) {
SE = SE_;
TTI = TTI_;
TLI = TLI_;
@@ -3861,7 +3859,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
- BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.
@@ -3950,6 +3948,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ using namespace ore;
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
+ cast<StoreInst>(Chain[i]))
+ << "Stores SLP vectorized with cost " << NV("Cost", Cost)
+ << " and with tree size "
+ << NV("TreeSize", R.getTreeSize()));
+
R.vectorizeTree();
// Move to the next bundle.
@@ -4163,6 +4168,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
+ cast<Instruction>(Ops[0]))
+ << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+ << " and with tree size "
+ << ore::NV("TreeSize", R.getTreeSize()));
+
Value *VectorizedRoot = R.vectorizeTree();
// Reconstruct the build vector by extracting the vectorized root. This
@@ -4506,6 +4517,12 @@ public:
DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
<< ". (HorRdx)\n");
+ auto *I0 = cast<Instruction>(VL[0]);
+ V.getORE()->emit(
+ OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
+ << "Vectorized horizontal reduction with cost "
+ << ore::NV("Cost", Cost) << " and with tree size "
+ << ore::NV("TreeSize", V.getTreeSize()));
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
@@ -4513,7 +4530,7 @@ public:
// Emit a reduction.
Value *ReducedSubTree =
- emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps);
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
@@ -4583,33 +4600,31 @@ private:
/// \brief Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
- unsigned ReduxWidth, ArrayRef<Value *> RedOps) {
+ unsigned ReduxWidth, ArrayRef<Value *> RedOps,
+ const TargetTransformInfo *TTI) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
+ if (!IsPairwiseReduction)
+ return createSimpleTargetReduction(
+ Builder, TTI, ReductionOpcode, VectorizedValue,
+ TargetTransformInfo::ReductionFlags(), RedOps);
+
Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
- if (IsPairwiseReduction) {
- Value *LeftMask =
+ Value *LeftMask =
createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
- Value *RightMask =
+ Value *RightMask =
createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
- Value *LeftShuf = Builder.CreateShuffleVector(
+ Value *LeftShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
- Value *RightShuf = Builder.CreateShuffleVector(
+ Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");
- TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf,
- "bin.rdx");
- } else {
- Value *UpperHalf =
- createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
- TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx");
- }
+ TmpVec =
+ Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");
propagateIRFlags(TmpVec, RedOps);
}
@@ -5162,6 +5177,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
namespace llvm {