path: root/contrib/llvm/lib/Transforms/Scalar
Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar')
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ADCE.cpp  730
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp  450
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/BDCE.cpp  170
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp  418
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp  822
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp  104
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp  653
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/DCE.cpp  163
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp  1360
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/DivRemPairs.cpp  206
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp  1185
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp  80
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp  525
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVN.cpp  2667
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp  1207
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp  922
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp  696
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp  22
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp  2597
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp  1856
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp  1012
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp  2545
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LICM.cpp  1576
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp  25
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp  336
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp  267
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp  1025
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp  1728
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp  223
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp  1427
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp  679
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp  92
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp  750
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp  1801
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp  711
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp  109
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp  373
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp  5518
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp  1353
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp  1613
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp  598
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp  172
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp  383
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp  137
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp  1492
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp  650
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp  429
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp  538
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp  4252
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp  180
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp  691
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp  2401
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp  128
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp  2830
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SCCP.cpp  2068
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SROA.cpp  4447
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Scalar.cpp  282
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp  802
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp  1310
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp  2171
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp  296
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Sink.cpp  306
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp  811
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp  319
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp  737
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp  953
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp  853
67 files changed, 70232 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
new file mode 100644
index 000000000000..1e683db50206
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -0,0 +1,730 @@
+//===- ADCE.cpp - Code to perform dead code elimination -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Aggressive Dead Code Elimination pass. This pass
+// optimistically assumes that all instructions are dead until proven otherwise,
+// allowing it to eliminate dead computations that other DCE passes do not
+// catch, particularly involving loop computations.
+//
+//===----------------------------------------------------------------------===//
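+//
+// Hypothetical illustration (not part of the original source, assuming the
+// default pass options): in the loop below, %sum and %next only feed each
+// other and are never otherwise used.  A use-count based DCE cannot delete
+// either one because each still has a use, but ADCE assumes both are dead,
+// never finds a live user, and removes the whole cycle:
+//
+//   loop:
+//     %sum  = phi i32 [ 0, %entry ], [ %next, %loop ]
+//     %i    = phi i32 [ 0, %entry ], [ %inc, %loop ]
+//     %next = add i32 %sum, %i        ; only used by the %sum phi
+//     %inc  = add i32 %i, 1
+//     %cmp  = icmp slt i32 %inc, 100
+//     br i1 %cmp, label %loop, label %exit
+//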
+
+#include "llvm/Transforms/Scalar/ADCE.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <cassert>
+#include <cstddef>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "adce"
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+STATISTIC(NumBranchesRemoved, "Number of branch instructions removed");
+
+// This is a temporary option until we change the interface to this pass based
+// on optimization level.
+static cl::opt<bool> RemoveControlFlowFlag("adce-remove-control-flow",
+ cl::init(true), cl::Hidden);
+
+// This option enables removal of may-be-infinite loops which have no other
+// effect.
+static cl::opt<bool> RemoveLoops("adce-remove-loops", cl::init(false),
+ cl::Hidden);
+
+namespace {
+
+/// Information about Instructions
+struct InstInfoType {
+ /// True if the associated instruction is live.
+ bool Live = false;
+
+ /// Quick access to information for block containing associated Instruction.
+ struct BlockInfoType *Block = nullptr;
+};
+
+/// Information about basic blocks relevant to dead code elimination.
+struct BlockInfoType {
+  /// True when this block contains a live instruction.
+ bool Live = false;
+
+ /// True when this block ends in an unconditional branch.
+ bool UnconditionalBranch = false;
+
+ /// True when this block is known to have live PHI nodes.
+ bool HasLivePhiNodes = false;
+
+ /// Control dependence sources need to be live for this block.
+ bool CFLive = false;
+
+ /// Quick access to the LiveInfo for the terminator,
+ /// holds the value &InstInfo[Terminator]
+ InstInfoType *TerminatorLiveInfo = nullptr;
+
+ /// Corresponding BasicBlock.
+ BasicBlock *BB = nullptr;
+
+ /// Cache of BB->getTerminator().
+ TerminatorInst *Terminator = nullptr;
+
+ /// Post-order numbering of reverse control flow graph.
+ unsigned PostOrder;
+
+ bool terminatorIsLive() const { return TerminatorLiveInfo->Live; }
+};
+
+class AggressiveDeadCodeElimination {
+ Function &F;
+
+ // ADCE does not use DominatorTree per se, but it updates it to preserve the
+ // analysis.
+ DominatorTree &DT;
+ PostDominatorTree &PDT;
+
+  /// Mapping of basic blocks to their associated liveness information.
+ /// Use MapVector to get deterministic iteration order.
+ MapVector<BasicBlock *, BlockInfoType> BlockInfo;
+ bool isLive(BasicBlock *BB) { return BlockInfo[BB].Live; }
+
+ /// Mapping of instructions to associated information.
+ DenseMap<Instruction *, InstInfoType> InstInfo;
+ bool isLive(Instruction *I) { return InstInfo[I].Live; }
+
+ /// Instructions known to be live where we need to mark
+ /// reaching definitions as live.
+ SmallVector<Instruction *, 128> Worklist;
+
+ /// Debug info scopes around a live instruction.
+ SmallPtrSet<const Metadata *, 32> AliveScopes;
+
+  /// Set of blocks not known to have live terminators.
+ SmallPtrSet<BasicBlock *, 16> BlocksWithDeadTerminators;
+
+  /// The set of blocks which we have determined need their control
+  /// dependence sources to be live, and which have not yet had
+  /// those dependences analyzed.
+ SmallPtrSet<BasicBlock *, 16> NewLiveBlocks;
+
+ /// Set up auxiliary data structures for Instructions and BasicBlocks and
+  /// initialize the Worklist to the set of must-be-live Instructions.
+ void initialize();
+
+ /// Return true for operations which are always treated as live.
+ bool isAlwaysLive(Instruction &I);
+
+ /// Return true for instrumentation instructions for value profiling.
+ bool isInstrumentsConstant(Instruction &I);
+
+ /// Propagate liveness to reaching definitions.
+ void markLiveInstructions();
+
+ /// Mark an instruction as live.
+ void markLive(Instruction *I);
+
+ /// Mark a block as live.
+ void markLive(BlockInfoType &BB);
+ void markLive(BasicBlock *BB) { markLive(BlockInfo[BB]); }
+
+ /// Mark terminators of control predecessors of a PHI node live.
+ void markPhiLive(PHINode *PN);
+
+ /// Record the Debug Scopes which surround live debug information.
+ void collectLiveScopes(const DILocalScope &LS);
+ void collectLiveScopes(const DILocation &DL);
+
+ /// Analyze dead branches to find those whose branches are the sources
+ /// of control dependences impacting a live block. Those branches are
+ /// marked live.
+ void markLiveBranchesFromControlDependences();
+
+  /// Remove instructions not marked live; return true if any instruction
+  /// was removed.
+ bool removeDeadInstructions();
+
+ /// Identify connected sections of the control flow graph which have
+ /// dead terminators and rewrite the control flow graph to remove them.
+ void updateDeadRegions();
+
+ /// Set the BlockInfo::PostOrder field based on a post-order
+ /// numbering of the reverse control flow graph.
+ void computeReversePostOrder();
+
+ /// Make the terminator of this block an unconditional branch to \p Target.
+ void makeUnconditional(BasicBlock *BB, BasicBlock *Target);
+
+public:
+ AggressiveDeadCodeElimination(Function &F, DominatorTree &DT,
+ PostDominatorTree &PDT)
+ : F(F), DT(DT), PDT(PDT) {}
+
+ bool performDeadCodeElimination();
+};
+
+} // end anonymous namespace
+
+bool AggressiveDeadCodeElimination::performDeadCodeElimination() {
+ initialize();
+ markLiveInstructions();
+ return removeDeadInstructions();
+}
+
+static bool isUnconditionalBranch(TerminatorInst *Term) {
+ auto *BR = dyn_cast<BranchInst>(Term);
+ return BR && BR->isUnconditional();
+}
+
+void AggressiveDeadCodeElimination::initialize() {
+ auto NumBlocks = F.size();
+
+  // We will have an entry in the map for each block, so reserve space up
+  // front to keep the load factor low in the hash table.
+ BlockInfo.reserve(NumBlocks);
+ size_t NumInsts = 0;
+
+  // Iterate over blocks and initialize BlockInfo entries; count
+  // instructions to size the InstInfo hash table.
+ for (auto &BB : F) {
+ NumInsts += BB.size();
+ auto &Info = BlockInfo[&BB];
+ Info.BB = &BB;
+ Info.Terminator = BB.getTerminator();
+ Info.UnconditionalBranch = isUnconditionalBranch(Info.Terminator);
+ }
+
+ // Initialize instruction map and set pointers to block info.
+ InstInfo.reserve(NumInsts);
+ for (auto &BBInfo : BlockInfo)
+ for (Instruction &I : *BBInfo.second.BB)
+ InstInfo[&I].Block = &BBInfo.second;
+
+  // Since BlockInfo holds pointers into InstInfo and vice versa, we may not
+ // add any more elements to either after this point.
+ for (auto &BBInfo : BlockInfo)
+ BBInfo.second.TerminatorLiveInfo = &InstInfo[BBInfo.second.Terminator];
+
+ // Collect the set of "root" instructions that are known live.
+ for (Instruction &I : instructions(F))
+ if (isAlwaysLive(I))
+ markLive(&I);
+
+ if (!RemoveControlFlowFlag)
+ return;
+
+ if (!RemoveLoops) {
+ // This stores state for the depth-first iterator. In addition
+ // to recording which nodes have been visited we also record whether
+ // a node is currently on the "stack" of active ancestors of the current
+ // node.
+ using StatusMap = DenseMap<BasicBlock *, bool>;
+
+ class DFState : public StatusMap {
+ public:
+ std::pair<StatusMap::iterator, bool> insert(BasicBlock *BB) {
+ return StatusMap::insert(std::make_pair(BB, true));
+ }
+
+ // Invoked after we have visited all children of a node.
+ void completed(BasicBlock *BB) { (*this)[BB] = false; }
+
+ // Return true if \p BB is currently on the active stack
+ // of ancestors.
+ bool onStack(BasicBlock *BB) {
+ auto Iter = find(BB);
+ return Iter != end() && Iter->second;
+ }
+ } State;
+
+ State.reserve(F.size());
+ // Iterate over blocks in depth-first pre-order and
+ // treat all edges to a block already seen as loop back edges
+    // and mark the branch live if there is a back edge.
+ for (auto *BB: depth_first_ext(&F.getEntryBlock(), State)) {
+ TerminatorInst *Term = BB->getTerminator();
+ if (isLive(Term))
+ continue;
+
+ for (auto *Succ : successors(BB))
+ if (State.onStack(Succ)) {
+ // back edge....
+ markLive(Term);
+ break;
+ }
+ }
+ }
+
+ // Mark blocks live if there is no path from the block to a
+ // return of the function.
+ // We do this by seeing which of the postdomtree root children exit the
+ // program, and for all others, mark the subtree live.
+ for (auto &PDTChild : children<DomTreeNode *>(PDT.getRootNode())) {
+ auto *BB = PDTChild->getBlock();
+ auto &Info = BlockInfo[BB];
+ // Real function return
+ if (isa<ReturnInst>(Info.Terminator)) {
+ DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName()
+ << '\n';);
+ continue;
+ }
+
+ // This child is something else, like an infinite loop.
+ for (auto DFNode : depth_first(PDTChild))
+ markLive(BlockInfo[DFNode->getBlock()].Terminator);
+ }
+
+ // Treat the entry block as always live
+ auto *BB = &F.getEntryBlock();
+ auto &EntryInfo = BlockInfo[BB];
+ EntryInfo.Live = true;
+ if (EntryInfo.UnconditionalBranch)
+ markLive(EntryInfo.Terminator);
+
+ // Build initial collection of blocks with dead terminators
+ for (auto &BBInfo : BlockInfo)
+ if (!BBInfo.second.terminatorIsLive())
+ BlocksWithDeadTerminators.insert(BBInfo.second.BB);
+}
+
+bool AggressiveDeadCodeElimination::isAlwaysLive(Instruction &I) {
+ // TODO -- use llvm::isInstructionTriviallyDead
+ if (I.isEHPad() || I.mayHaveSideEffects()) {
+ // Skip any value profile instrumentation calls if they are
+ // instrumenting constants.
+ if (isInstrumentsConstant(I))
+ return false;
+ return true;
+ }
+ if (!isa<TerminatorInst>(I))
+ return false;
+ if (RemoveControlFlowFlag && (isa<BranchInst>(I) || isa<SwitchInst>(I)))
+ return false;
+ return true;
+}
+
+// Check if this instruction is a runtime call for value profiling and
+// if it's instrumenting a constant.
+bool AggressiveDeadCodeElimination::isInstrumentsConstant(Instruction &I) {
+ // TODO -- move this test into llvm::isInstructionTriviallyDead
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ if (Function *Callee = CI->getCalledFunction())
+ if (Callee->getName().equals(getInstrProfValueProfFuncName()))
+ if (isa<Constant>(CI->getArgOperand(0)))
+ return true;
+ return false;
+}
+
+void AggressiveDeadCodeElimination::markLiveInstructions() {
+ // Propagate liveness backwards to operands.
+ do {
+ // Worklist holds newly discovered live instructions
+ // where we need to mark the inputs as live.
+ while (!Worklist.empty()) {
+ Instruction *LiveInst = Worklist.pop_back_val();
+ DEBUG(dbgs() << "work live: "; LiveInst->dump(););
+
+ for (Use &OI : LiveInst->operands())
+ if (Instruction *Inst = dyn_cast<Instruction>(OI))
+ markLive(Inst);
+
+ if (auto *PN = dyn_cast<PHINode>(LiveInst))
+ markPhiLive(PN);
+ }
+
+ // After data flow liveness has been identified, examine which branch
+    // decisions are required to ensure that live instructions are executed.
+ markLiveBranchesFromControlDependences();
+
+ } while (!Worklist.empty());
+}
+
+void AggressiveDeadCodeElimination::markLive(Instruction *I) {
+ auto &Info = InstInfo[I];
+ if (Info.Live)
+ return;
+
+ DEBUG(dbgs() << "mark live: "; I->dump());
+ Info.Live = true;
+ Worklist.push_back(I);
+
+ // Collect the live debug info scopes attached to this instruction.
+ if (const DILocation *DL = I->getDebugLoc())
+ collectLiveScopes(*DL);
+
+ // Mark the containing block live
+ auto &BBInfo = *Info.Block;
+ if (BBInfo.Terminator == I) {
+ BlocksWithDeadTerminators.erase(BBInfo.BB);
+ // For live terminators, mark destination blocks
+    // live to preserve these control flow edges.
+ if (!BBInfo.UnconditionalBranch)
+ for (auto *BB : successors(I->getParent()))
+ markLive(BB);
+ }
+ markLive(BBInfo);
+}
+
+void AggressiveDeadCodeElimination::markLive(BlockInfoType &BBInfo) {
+ if (BBInfo.Live)
+ return;
+ DEBUG(dbgs() << "mark block live: " << BBInfo.BB->getName() << '\n');
+ BBInfo.Live = true;
+ if (!BBInfo.CFLive) {
+ BBInfo.CFLive = true;
+ NewLiveBlocks.insert(BBInfo.BB);
+ }
+
+ // Mark unconditional branches at the end of live
+ // blocks as live since there is no work to do for them later
+ if (BBInfo.UnconditionalBranch)
+ markLive(BBInfo.Terminator);
+}
+
+void AggressiveDeadCodeElimination::collectLiveScopes(const DILocalScope &LS) {
+ if (!AliveScopes.insert(&LS).second)
+ return;
+
+ if (isa<DISubprogram>(LS))
+ return;
+
+ // Tail-recurse through the scope chain.
+ collectLiveScopes(cast<DILocalScope>(*LS.getScope()));
+}
+
+void AggressiveDeadCodeElimination::collectLiveScopes(const DILocation &DL) {
+ // Even though DILocations are not scopes, shove them into AliveScopes so we
+ // don't revisit them.
+ if (!AliveScopes.insert(&DL).second)
+ return;
+
+ // Collect live scopes from the scope chain.
+ collectLiveScopes(*DL.getScope());
+
+ // Tail-recurse through the inlined-at chain.
+ if (const DILocation *IA = DL.getInlinedAt())
+ collectLiveScopes(*IA);
+}
+
+void AggressiveDeadCodeElimination::markPhiLive(PHINode *PN) {
+ auto &Info = BlockInfo[PN->getParent()];
+ // Only need to check this once per block.
+ if (Info.HasLivePhiNodes)
+ return;
+ Info.HasLivePhiNodes = true;
+
+ // If a predecessor block is not live, mark it as control-flow live
+ // which will trigger marking live branches upon which
+ // that block is control dependent.
+ for (auto *PredBB : predecessors(Info.BB)) {
+ auto &Info = BlockInfo[PredBB];
+ if (!Info.CFLive) {
+ Info.CFLive = true;
+ NewLiveBlocks.insert(PredBB);
+ }
+ }
+}
+
+void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
+ if (BlocksWithDeadTerminators.empty())
+ return;
+
+ DEBUG({
+ dbgs() << "new live blocks:\n";
+ for (auto *BB : NewLiveBlocks)
+ dbgs() << "\t" << BB->getName() << '\n';
+ dbgs() << "dead terminator blocks:\n";
+ for (auto *BB : BlocksWithDeadTerminators)
+ dbgs() << "\t" << BB->getName() << '\n';
+ });
+
+ // The dominance frontier of a live block X in the reverse
+ // control graph is the set of blocks upon which X is control
+ // dependent. The following sequence computes the set of blocks
+ // which currently have dead terminators that are control
+ // dependence sources of a block which is in NewLiveBlocks.
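+  // As a hypothetical illustration: if block B ends in
+  //   br i1 %c, label %L, label %D
+  // and L has just become live while B's branch is still dead, then B lies in
+  // the reverse dominance frontier of L (L is control dependent on B), so the
+  // calculation below reports B and B's branch gets marked live.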
+
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ ReverseIDFCalculator IDFs(PDT);
+ IDFs.setDefiningBlocks(NewLiveBlocks);
+ IDFs.setLiveInBlocks(BlocksWithDeadTerminators);
+ IDFs.calculate(IDFBlocks);
+ NewLiveBlocks.clear();
+
+ // Dead terminators which control live blocks are now marked live.
+ for (auto *BB : IDFBlocks) {
+ DEBUG(dbgs() << "live control in: " << BB->getName() << '\n');
+ markLive(BB->getTerminator());
+ }
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Routines to update the CFG and SSA information before removing dead code.
+//
+//===----------------------------------------------------------------------===//
+bool AggressiveDeadCodeElimination::removeDeadInstructions() {
+ // Updates control and dataflow around dead blocks
+ updateDeadRegions();
+
+ DEBUG({
+ for (Instruction &I : instructions(F)) {
+ // Check if the instruction is alive.
+ if (isLive(&I))
+ continue;
+
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ // Check if the scope of this variable location is alive.
+ if (AliveScopes.count(DII->getDebugLoc()->getScope()))
+ continue;
+
+ // If intrinsic is pointing at a live SSA value, there may be an
+ // earlier optimization bug: if we know the location of the variable,
+ // why isn't the scope of the location alive?
+ if (Value *V = DII->getVariableLocation())
+ if (Instruction *II = dyn_cast<Instruction>(V))
+ if (isLive(II))
+ dbgs() << "Dropping debug info for " << *DII << "\n";
+ }
+ }
+ });
+
+ // The inverse of the live set is the dead set. These are those instructions
+ // that have no side effects and do not influence the control flow or return
+ // value of the function, and may therefore be deleted safely.
+ // NOTE: We reuse the Worklist vector here for memory efficiency.
+ for (Instruction &I : instructions(F)) {
+ // Check if the instruction is alive.
+ if (isLive(&I))
+ continue;
+
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ // Check if the scope of this variable location is alive.
+ if (AliveScopes.count(DII->getDebugLoc()->getScope()))
+ continue;
+
+ // Fallthrough and drop the intrinsic.
+ }
+
+ // Prepare to delete.
+ Worklist.push_back(&I);
+ I.dropAllReferences();
+ }
+
+ for (Instruction *&I : Worklist) {
+ ++NumRemoved;
+ I->eraseFromParent();
+ }
+
+ return !Worklist.empty();
+}
+
+// A dead region is the set of dead blocks with a common live post-dominator.
+void AggressiveDeadCodeElimination::updateDeadRegions() {
+ DEBUG({
+ dbgs() << "final dead terminator blocks: " << '\n';
+ for (auto *BB : BlocksWithDeadTerminators)
+ dbgs() << '\t' << BB->getName()
+ << (BlockInfo[BB].Live ? " LIVE\n" : "\n");
+ });
+
+  // Don't compute the post ordering unless we need it.
+ bool HavePostOrder = false;
+
+ for (auto *BB : BlocksWithDeadTerminators) {
+ auto &Info = BlockInfo[BB];
+ if (Info.UnconditionalBranch) {
+ InstInfo[Info.Terminator].Live = true;
+ continue;
+ }
+
+ if (!HavePostOrder) {
+ computeReversePostOrder();
+ HavePostOrder = true;
+ }
+
+ // Add an unconditional branch to the successor closest to the
+    // end of the function which ensures a path to the exit for each
+ // live edge.
+ BlockInfoType *PreferredSucc = nullptr;
+ for (auto *Succ : successors(BB)) {
+ auto *Info = &BlockInfo[Succ];
+ if (!PreferredSucc || PreferredSucc->PostOrder < Info->PostOrder)
+ PreferredSucc = Info;
+ }
+ assert((PreferredSucc && PreferredSucc->PostOrder > 0) &&
+ "Failed to find safe successor for dead branch");
+
+ // Collect removed successors to update the (Post)DominatorTrees.
+ SmallPtrSet<BasicBlock *, 4> RemovedSuccessors;
+ bool First = true;
+ for (auto *Succ : successors(BB)) {
+ if (!First || Succ != PreferredSucc->BB) {
+ Succ->removePredecessor(BB);
+ RemovedSuccessors.insert(Succ);
+ } else
+ First = false;
+ }
+ makeUnconditional(BB, PreferredSucc->BB);
+
+ // Inform the dominators about the deleted CFG edges.
+ SmallVector<DominatorTree::UpdateType, 4> DeletedEdges;
+ for (auto *Succ : RemovedSuccessors) {
+ // It might have happened that the same successor appeared multiple times
+ // and the CFG edge wasn't really removed.
+ if (Succ != PreferredSucc->BB) {
+        DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion "
+ << BB->getName() << " -> " << Succ->getName() << "\n");
+ DeletedEdges.push_back({DominatorTree::Delete, BB, Succ});
+ }
+ }
+
+ DT.applyUpdates(DeletedEdges);
+ PDT.applyUpdates(DeletedEdges);
+
+ NumBranchesRemoved += 1;
+ }
+}
+
+// reverse top-sort order
+void AggressiveDeadCodeElimination::computeReversePostOrder() {
+ // This provides a post-order numbering of the reverse control flow graph
+ // Note that it is incomplete in the presence of infinite loops but we don't
+  // need to number blocks which don't reach the end of the function since
+ // all branches in those blocks are forced live.
+
+ // For each block without successors, extend the DFS from the block
+ // backward through the graph
+ SmallPtrSet<BasicBlock*, 16> Visited;
+ unsigned PostOrder = 0;
+ for (auto &BB : F) {
+ if (succ_begin(&BB) != succ_end(&BB))
+ continue;
+ for (BasicBlock *Block : inverse_post_order_ext(&BB,Visited))
+ BlockInfo[Block].PostOrder = PostOrder++;
+ }
+}
+
+void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
+ BasicBlock *Target) {
+ TerminatorInst *PredTerm = BB->getTerminator();
+ // Collect the live debug info scopes attached to this instruction.
+ if (const DILocation *DL = PredTerm->getDebugLoc())
+ collectLiveScopes(*DL);
+
+  // For an existing unconditional branch, just retarget it and mark it live.
+ if (isUnconditionalBranch(PredTerm)) {
+ PredTerm->setSuccessor(0, Target);
+ InstInfo[PredTerm].Live = true;
+ return;
+ }
+ DEBUG(dbgs() << "making unconditional " << BB->getName() << '\n');
+ NumBranchesRemoved += 1;
+ IRBuilder<> Builder(PredTerm);
+ auto *NewTerm = Builder.CreateBr(Target);
+ InstInfo[NewTerm].Live = true;
+ if (const DILocation *DL = PredTerm->getDebugLoc())
+ NewTerm->setDebugLoc(DL);
+
+ InstInfo.erase(PredTerm);
+ PredTerm->eraseFromParent();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Pass Manager integration code
+//
+//===----------------------------------------------------------------------===//
+PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) {
+ auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ auto &PDT = FAM.getResult<PostDominatorTreeAnalysis>(F);
+ if (!AggressiveDeadCodeElimination(F, DT, PDT).performDeadCodeElimination())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<PostDominatorTreeAnalysis>();
+ return PA;
+}
+
+namespace {
+
+struct ADCELegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ ADCELegacyPass() : FunctionPass(ID) {
+ initializeADCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ return AggressiveDeadCodeElimination(F, DT, PDT)
+ .performDeadCodeElimination();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // We require DominatorTree here only to update and thus preserve it.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ if (!RemoveControlFlowFlag)
+ AU.setPreservesCFG();
+ else {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ }
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char ADCELegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ADCELegacyPass, "adce",
+ "Aggressive Dead Code Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination",
+ false, false)
+
+FunctionPass *llvm::createAggressiveDCEPass() { return new ADCELegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
new file mode 100644
index 000000000000..99480f12da9e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -0,0 +1,450 @@
+//===----------------------- AlignmentFromAssumptions.cpp -----------------===//
+// Set Load/Store Alignments From Assumptions
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a ScalarEvolution-based transformation to set
+// the alignments of loads, stores, and memory intrinsics based on the truth
+// expressions of assume intrinsics. The primary motivation is to handle
+// complex alignment assumptions that apply to vector loads and stores that
+// appear after vectorization and unrolling.
+//
+//===----------------------------------------------------------------------===//
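+//
+// Hypothetical sketch of the kind of pattern this pass matches (names and the
+// exact mask are illustrative): an assumption that %ptr is 32-byte aligned
+// typically looks like
+//
+//   %ptrint    = ptrtoint float* %ptr to i64
+//   %maskedptr = and i64 %ptrint, 31
+//   %maskcond  = icmp eq i64 %maskedptr, 0
+//   call void @llvm.assume(i1 %maskcond)
+//   ...
+//   %v = load float, float* %ptr, align 4   ; alignment can be raised to 32
+//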
+
+#define AA_NAME "alignment-from-assumptions"
+#define DEBUG_TYPE AA_NAME
+#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+STATISTIC(NumLoadAlignChanged,
+ "Number of loads changed by alignment assumptions");
+STATISTIC(NumStoreAlignChanged,
+ "Number of stores changed by alignment assumptions");
+STATISTIC(NumMemIntAlignChanged,
+ "Number of memory intrinsics changed by alignment assumptions");
+
+namespace {
+struct AlignmentFromAssumptions : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ AlignmentFromAssumptions() : FunctionPass(ID) {
+ initializeAlignmentFromAssumptionsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+
+ AU.setPreservesCFG();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ }
+
+ AlignmentFromAssumptionsPass Impl;
+};
+}
+
+char AlignmentFromAssumptions::ID = 0;
+static const char aip_name[] = "Alignment from assumptions";
+INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME,
+ aip_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME,
+ aip_name, false, false)
+
+FunctionPass *llvm::createAlignmentFromAssumptionsPass() {
+ return new AlignmentFromAssumptions();
+}
+
+// Given an expression for the (constant) alignment, AlignSCEV, and an
+// expression for the displacement between a pointer and the aligned address,
+// DiffSCEV, compute the alignment of the displaced pointer if it can be reduced
+// to a constant. Using SCEV to compute alignment handles the case where
+// DiffSCEV is a recurrence with constant start such that the aligned offset
+// is constant. e.g. {16,+,32} % 32 -> 16.
+static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
+ const SCEV *AlignSCEV,
+ ScalarEvolution *SE) {
+ // DiffUnits = Diff % int64_t(Alignment)
+ const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV);
+ const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV);
+ const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV);
+
+ DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " <<
+ *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
+
+ if (const SCEVConstant *ConstDUSCEV =
+ dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {
+ int64_t DiffUnits = ConstDUSCEV->getValue()->getSExtValue();
+
+ // If the displacement is an exact multiple of the alignment, then the
+ // displaced pointer has the same alignment as the aligned pointer, so
+ // return the alignment value.
+ if (!DiffUnits)
+ return (unsigned)
+ cast<SCEVConstant>(AlignSCEV)->getValue()->getSExtValue();
+
+ // If the displacement is not an exact multiple, but the remainder is a
+ // constant, then return this remainder (but only if it is a power of 2).
+ uint64_t DiffUnitsAbs = std::abs(DiffUnits);
+ if (isPowerOf2_64(DiffUnitsAbs))
+ return (unsigned) DiffUnitsAbs;
+ }
+
+ return 0;
+}
+
+// There is an address given by an offset OffSCEV from AASCEV which has an
+// alignment AlignSCEV. Use that information, if possible, to compute a new
+// alignment for Ptr.
+static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
+ const SCEV *OffSCEV, Value *Ptr,
+ ScalarEvolution *SE) {
+ const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+ const SCEV *DiffSCEV = SE->getMinusSCEV(PtrSCEV, AASCEV);
+
+ // On 32-bit platforms, DiffSCEV might now have type i32 -- we've always
+ // sign-extended OffSCEV to i64, so make sure they agree again.
+ DiffSCEV = SE->getNoopOrSignExtend(DiffSCEV, OffSCEV->getType());
+
+ // What we really want to know is the overall offset to the aligned
+ // address. This address is displaced by the provided offset.
+ DiffSCEV = SE->getMinusSCEV(DiffSCEV, OffSCEV);
+
+ DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to " <<
+ *AlignSCEV << " and offset " << *OffSCEV <<
+ " using diff " << *DiffSCEV << "\n");
+
+ unsigned NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE);
+ DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n");
+
+ if (NewAlignment) {
+ return NewAlignment;
+ } else if (const SCEVAddRecExpr *DiffARSCEV =
+ dyn_cast<SCEVAddRecExpr>(DiffSCEV)) {
+ // The relative offset to the alignment assumption did not yield a constant,
+    // but we should try harder: if we assume that a is 32-byte aligned, then in
+    // the loop "for (i = 0; i < 1024; i += 4) r += a[i];" not all of the loads
+    // from a are 32-byte aligned; instead they alternate between 32-byte and
+    // 16-byte alignment.
+ // As a result, the new alignment will not be a constant, but can still
+ // be improved over the default (of 4) to 16.
+
+ const SCEV *DiffStartSCEV = DiffARSCEV->getStart();
+ const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE);
+
+ DEBUG(dbgs() << "\ttrying start/inc alignment using start " <<
+ *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
+
+ // Now compute the new alignment using the displacement to the value in the
+ // first iteration, and also the alignment using the per-iteration delta.
+ // If these are the same, then use that answer. Otherwise, use the smaller
+ // one, but only if it divides the larger one.
+ NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
+ unsigned NewIncAlignment = getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);
+
+ DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n");
+ DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n");
+
+ if (!NewAlignment || !NewIncAlignment) {
+ return 0;
+ } else if (NewAlignment > NewIncAlignment) {
+ if (NewAlignment % NewIncAlignment == 0) {
+ DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+ NewIncAlignment << "\n");
+ return NewIncAlignment;
+ }
+ } else if (NewIncAlignment > NewAlignment) {
+ if (NewIncAlignment % NewAlignment == 0) {
+ DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+ NewAlignment << "\n");
+ return NewAlignment;
+ }
+ } else if (NewIncAlignment == NewAlignment) {
+ DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+ NewAlignment << "\n");
+ return NewAlignment;
+ }
+ }
+
+ return 0;
+}
+
+bool AlignmentFromAssumptionsPass::extractAlignmentInfo(CallInst *I,
+ Value *&AAPtr,
+ const SCEV *&AlignSCEV,
+ const SCEV *&OffSCEV) {
+ // An alignment assume must be a statement about the least-significant
+ // bits of the pointer being zero, possibly with some offset.
+ ICmpInst *ICI = dyn_cast<ICmpInst>(I->getArgOperand(0));
+ if (!ICI)
+ return false;
+
+ // This must be an expression of the form: x & m == 0.
+ if (ICI->getPredicate() != ICmpInst::ICMP_EQ)
+ return false;
+
+ // Swap things around so that the RHS is 0.
+ Value *CmpLHS = ICI->getOperand(0);
+ Value *CmpRHS = ICI->getOperand(1);
+ const SCEV *CmpLHSSCEV = SE->getSCEV(CmpLHS);
+ const SCEV *CmpRHSSCEV = SE->getSCEV(CmpRHS);
+ if (CmpLHSSCEV->isZero())
+ std::swap(CmpLHS, CmpRHS);
+ else if (!CmpRHSSCEV->isZero())
+ return false;
+
+ BinaryOperator *CmpBO = dyn_cast<BinaryOperator>(CmpLHS);
+ if (!CmpBO || CmpBO->getOpcode() != Instruction::And)
+ return false;
+
+ // Swap things around so that the right operand of the and is a constant
+ // (the mask); we cannot deal with variable masks.
+ Value *AndLHS = CmpBO->getOperand(0);
+ Value *AndRHS = CmpBO->getOperand(1);
+ const SCEV *AndLHSSCEV = SE->getSCEV(AndLHS);
+ const SCEV *AndRHSSCEV = SE->getSCEV(AndRHS);
+ if (isa<SCEVConstant>(AndLHSSCEV)) {
+ std::swap(AndLHS, AndRHS);
+ std::swap(AndLHSSCEV, AndRHSSCEV);
+ }
+
+ const SCEVConstant *MaskSCEV = dyn_cast<SCEVConstant>(AndRHSSCEV);
+ if (!MaskSCEV)
+ return false;
+
+ // The mask must have some trailing ones (otherwise the condition is
+ // trivial and tells us nothing about the alignment of the left operand).
+ unsigned TrailingOnes = MaskSCEV->getAPInt().countTrailingOnes();
+ if (!TrailingOnes)
+ return false;
+
+ // Cap the alignment at the maximum with which LLVM can deal (and make sure
+ // we don't overflow the shift).
+ uint64_t Alignment;
+ TrailingOnes = std::min(TrailingOnes,
+ unsigned(sizeof(unsigned) * CHAR_BIT - 1));
+ Alignment = std::min(1u << TrailingOnes, +Value::MaximumAlignment);
+
+ Type *Int64Ty = Type::getInt64Ty(I->getParent()->getParent()->getContext());
+ AlignSCEV = SE->getConstant(Int64Ty, Alignment);
+
+ // The LHS might be a ptrtoint instruction, or it might be the pointer
+ // with an offset.
+ AAPtr = nullptr;
+ OffSCEV = nullptr;
+ if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(AndLHS)) {
+ AAPtr = PToI->getPointerOperand();
+ OffSCEV = SE->getZero(Int64Ty);
+ } else if (const SCEVAddExpr* AndLHSAddSCEV =
+ dyn_cast<SCEVAddExpr>(AndLHSSCEV)) {
+ // Try to find the ptrtoint; subtract it and the rest is the offset.
+ for (SCEVAddExpr::op_iterator J = AndLHSAddSCEV->op_begin(),
+ JE = AndLHSAddSCEV->op_end(); J != JE; ++J)
+ if (const SCEVUnknown *OpUnk = dyn_cast<SCEVUnknown>(*J))
+ if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(OpUnk->getValue())) {
+ AAPtr = PToI->getPointerOperand();
+ OffSCEV = SE->getMinusSCEV(AndLHSAddSCEV, *J);
+ break;
+ }
+ }
+
+ if (!AAPtr)
+ return false;
+
+ // Sign extend the offset to 64 bits (so that it is like all of the other
+ // expressions).
+ unsigned OffSCEVBits = OffSCEV->getType()->getPrimitiveSizeInBits();
+ if (OffSCEVBits < 64)
+ OffSCEV = SE->getSignExtendExpr(OffSCEV, Int64Ty);
+ else if (OffSCEVBits > 64)
+ return false;
+
+ AAPtr = AAPtr->stripPointerCasts();
+ return true;
+}
+
+bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) {
+ Value *AAPtr;
+ const SCEV *AlignSCEV, *OffSCEV;
+ if (!extractAlignmentInfo(ACall, AAPtr, AlignSCEV, OffSCEV))
+ return false;
+
+ // Skip ConstantPointerNull and UndefValue. Assumptions on these shouldn't
+ // affect other users.
+ if (isa<ConstantData>(AAPtr))
+ return false;
+
+ const SCEV *AASCEV = SE->getSCEV(AAPtr);
+
+ // Apply the assumption to all other users of the specified pointer.
+ SmallPtrSet<Instruction *, 32> Visited;
+ SmallVector<Instruction*, 16> WorkList;
+ for (User *J : AAPtr->users()) {
+ if (J == ACall)
+ continue;
+
+ if (Instruction *K = dyn_cast<Instruction>(J))
+ if (isValidAssumeForContext(ACall, K, DT))
+ WorkList.push_back(K);
+ }
+
+ while (!WorkList.empty()) {
+ Instruction *J = WorkList.pop_back_val();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(J)) {
+ unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ LI->getPointerOperand(), SE);
+
+ if (NewAlignment > LI->getAlignment()) {
+ LI->setAlignment(NewAlignment);
+ ++NumLoadAlignChanged;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(J)) {
+ unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ SI->getPointerOperand(), SE);
+
+ if (NewAlignment > SI->getAlignment()) {
+ SI->setAlignment(NewAlignment);
+ ++NumStoreAlignChanged;
+ }
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) {
+ unsigned NewDestAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ MI->getDest(), SE);
+
+ // For memory transfers, we need a common alignment for both the
+ // source and destination. If we have a new alignment for this
+ // instruction, but only for one operand, save it. If we reach the
+ // other operand through another assumption later, then we may
+ // change the alignment at that point.
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+ unsigned NewSrcAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ MTI->getSource(), SE);
+
+ DenseMap<MemTransferInst *, unsigned>::iterator DI =
+ NewDestAlignments.find(MTI);
+ unsigned AltDestAlignment = (DI == NewDestAlignments.end()) ?
+ 0 : DI->second;
+
+ DenseMap<MemTransferInst *, unsigned>::iterator SI =
+ NewSrcAlignments.find(MTI);
+ unsigned AltSrcAlignment = (SI == NewSrcAlignments.end()) ?
+ 0 : SI->second;
+
+ DEBUG(dbgs() << "\tmem trans: " << NewDestAlignment << " " <<
+ AltDestAlignment << " " << NewSrcAlignment <<
+ " " << AltSrcAlignment << "\n");
+
+ // Of these four alignments, pick the largest possible...
+ unsigned NewAlignment = 0;
+ if (NewDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
+ NewAlignment = std::max(NewAlignment, NewDestAlignment);
+ if (AltDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
+ NewAlignment = std::max(NewAlignment, AltDestAlignment);
+ if (NewSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
+ NewAlignment = std::max(NewAlignment, NewSrcAlignment);
+ if (AltSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
+ NewAlignment = std::max(NewAlignment, AltSrcAlignment);
+
+ if (NewAlignment > MI->getAlignment()) {
+ MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
+ MI->getParent()->getContext()), NewAlignment));
+ ++NumMemIntAlignChanged;
+ }
+
+ NewDestAlignments.insert(std::make_pair(MTI, NewDestAlignment));
+ NewSrcAlignments.insert(std::make_pair(MTI, NewSrcAlignment));
+ } else if (NewDestAlignment > MI->getAlignment()) {
+ assert((!isa<MemIntrinsic>(MI) || isa<MemSetInst>(MI)) &&
+ "Unknown memory intrinsic");
+
+ MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
+ MI->getParent()->getContext()), NewDestAlignment));
+ ++NumMemIntAlignChanged;
+ }
+ }
+
+ // Now that we've updated that use of the pointer, look for other uses of
+ // the pointer to update.
+ Visited.insert(J);
+ for (User *UJ : J->users()) {
+ Instruction *K = cast<Instruction>(UJ);
+ if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DT))
+ WorkList.push_back(K);
+ }
+ }
+
+ return true;
+}
+
+bool AlignmentFromAssumptions::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ return Impl.runImpl(F, AC, SE, DT);
+}
+
+bool AlignmentFromAssumptionsPass::runImpl(Function &F, AssumptionCache &AC,
+ ScalarEvolution *SE_,
+ DominatorTree *DT_) {
+ SE = SE_;
+ DT = DT_;
+
+ NewDestAlignments.clear();
+ NewSrcAlignments.clear();
+
+ bool Changed = false;
+ for (auto &AssumeVH : AC.assumptions())
+ if (AssumeVH)
+ Changed |= processAssumption(cast<CallInst>(AssumeVH));
+
+ return Changed;
+}
+
+PreservedAnalyses
+AlignmentFromAssumptionsPass::run(Function &F, FunctionAnalysisManager &AM) {
+
+ AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
+ ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ if (!runImpl(F, AC, &SE, &DT))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
new file mode 100644
index 000000000000..851efa000f65
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
@@ -0,0 +1,170 @@
+//===---- BDCE.cpp - Bit-tracking dead code elimination -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Bit-Tracking Dead Code Elimination pass. Some
+// instructions (shifts, some ands, ors, etc.) kill some of their input bits.
+// We track these dead bits and remove instructions that compute only these
+// dead bits.
+//
+//===----------------------------------------------------------------------===//
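+//
+// Minimal hypothetical sketch: in
+//
+//   %t = mul i32 %a, %b
+//   %u = shl i32 %t, 8
+//   %r = and i32 %u, 255
+//
+// only bits 0-7 of %u are demanded, and the shift fills exactly those bits
+// with zeros, so no bit of %t is demanded at all.  BDCE replaces %t's uses
+// with zero and deletes the multiply, even though %t is not trivially dead in
+// the use-count sense.
+//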
+
+#include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bdce"
+
+STATISTIC(NumRemoved, "Number of instructions removed (unused)");
+STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
+
+/// If an instruction is trivialized (dead), then the chain of users of that
+/// instruction may need to be cleared of assumptions that can no longer be
+/// guaranteed correct.
+static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
+ assert(I->getType()->isIntegerTy() && "Trivializing a non-integer value?");
+
+ // Initialize the worklist with eligible direct users.
+ SmallVector<Instruction *, 16> WorkList;
+ for (User *JU : I->users()) {
+ // If all bits of a user are demanded, then we know that nothing below that
+ // in the def-use chain needs to be changed.
+ auto *J = dyn_cast<Instruction>(JU);
+ if (J && J->getType()->isSized() &&
+ !DB.getDemandedBits(J).isAllOnesValue())
+ WorkList.push_back(J);
+
+ // Note that we need to check for unsized types above before asking for
+ // demanded bits. Normally, the only way to reach an instruction with an
+ // unsized type is via an instruction that has side effects (or otherwise
+ // will demand its input bits). However, if we have a readnone function
+ // that returns an unsized type (e.g., void), we must avoid asking for the
+ // demanded bits of the function call's return value. A void-returning
+ // readnone function is always dead (and so we can stop walking the use/def
+ // chain here), but the check is necessary to avoid asserting.
+ }
+
+ // DFS through subsequent users while tracking visits to avoid cycles.
+ SmallPtrSet<Instruction *, 16> Visited;
+ while (!WorkList.empty()) {
+ Instruction *J = WorkList.pop_back_val();
+
+ // NSW, NUW, and exact are based on operands that might have changed.
+ J->dropPoisonGeneratingFlags();
+
+ // We do not have to worry about llvm.assume or range metadata:
+ // 1. llvm.assume demands its operand, so trivializing can't change it.
+ // 2. range metadata only applies to memory accesses which demand all bits.
+
+ Visited.insert(J);
+
+ for (User *KU : J->users()) {
+ // If all bits of a user are demanded, then we know that nothing below
+ // that in the def-use chain needs to be changed.
+ auto *K = dyn_cast<Instruction>(KU);
+ if (K && !Visited.count(K) && K->getType()->isSized() &&
+ !DB.getDemandedBits(K).isAllOnesValue())
+ WorkList.push_back(K);
+ }
+ }
+}
+
+static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
+ SmallVector<Instruction*, 128> Worklist;
+ bool Changed = false;
+ for (Instruction &I : instructions(F)) {
+ // If the instruction has side effects and no non-dbg uses,
+ // skip it. This way we avoid computing known bits on an instruction
+ // that will not help us.
+ if (I.mayHaveSideEffects() && I.use_empty())
+ continue;
+
+ if (I.getType()->isIntegerTy() &&
+ !DB.getDemandedBits(&I).getBoolValue()) {
+ // For live instructions that have all dead bits, first make them dead by
+ // replacing all uses with something else. Then, if they don't need to
+ // remain live (because they have side effects, etc.) we can remove them.
+ DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+
+ clearAssumptionsOfUsers(&I, DB);
+
+ // FIXME: In theory we could substitute undef here instead of zero.
+ // This should be reconsidered once we settle on the semantics of
+ // undef, poison, etc.
+ Value *Zero = ConstantInt::get(I.getType(), 0);
+ ++NumSimplified;
+ I.replaceNonMetadataUsesWith(Zero);
+ Changed = true;
+ }
+ if (!DB.isInstructionDead(&I))
+ continue;
+
+ Worklist.push_back(&I);
+ I.dropAllReferences();
+ Changed = true;
+ }
+
+ for (Instruction *&I : Worklist) {
+ ++NumRemoved;
+ I->eraseFromParent();
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses BDCEPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
+ if (!bitTrackingDCE(F, DB))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+namespace {
+struct BDCELegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ BDCELegacyPass() : FunctionPass(ID) {
+ initializeBDCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &DB = getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ return bitTrackingDCE(F, DB);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+}
+
+char BDCELegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(BDCELegacyPass, "bdce",
+ "Bit-Tracking Dead Code Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_END(BDCELegacyPass, "bdce",
+ "Bit-Tracking Dead Code Elimination", false, false)
+
+FunctionPass *llvm::createBitTrackingDCEPass() { return new BDCELegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
new file mode 100644
index 000000000000..4edea7cc3c82
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -0,0 +1,418 @@
+//===- CallSiteSplitting.cpp ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a transformation that tries to split a call-site to pass
+// more constrained arguments when any of its arguments are predicated in the
+// control flow, so that we can expose better context to later passes (e.g.,
+// the inliner, jump threading, or IPA-CP based function cloning).
+// As of now we support two cases:
+//
+// 1) Try to split a call-site with constrained arguments, if any constraints
+// on any argument can be found by following the single predecessors of all of
+// the call-site's predecessors. Currently this pass only handles call-sites with 2
+// predecessors. For example, in the code below, we try to split the call-site
+// since we can predicate the argument(ptr) based on the OR condition.
+//
+// Split from :
+// if (!ptr || c)
+// callee(ptr);
+// to :
+// if (!ptr)
+// callee(null) // set the known constant value
+// else if (c)
+// callee(nonnull ptr) // set non-null attribute in the argument
+//
+// 2) We can also split a call-site based on constant incoming values of a PHI
+// For example,
+// from :
+// Header:
+// %c = icmp eq i32 %i1, %i2
+// br i1 %c, label %Tail, label %TBB
+// TBB:
+// br label %Tail
+// Tail:
+// %p = phi i32 [ 0, %Header], [ 1, %TBB]
+// call void @bar(i32 %p)
+// to
+// Header:
+// %c = icmp eq i32 %i1, %i2
+// br i1 %c, label %Tail-split0, label %TBB
+// TBB:
+// br label %Tail-split1
+// Tail-split0:
+// call void @bar(i32 0)
+// br label %Tail
+// Tail-split1:
+// call void @bar(i32 1)
+// br label %Tail
+// Tail:
+// %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ]
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "callsite-splitting"
+
+STATISTIC(NumCallSiteSplit, "Number of call-sites split");
+
+static void addNonNullAttribute(Instruction *CallI, Instruction *NewCallI,
+ Value *Op) {
+ CallSite CS(NewCallI);
+ unsigned ArgNo = 0;
+ for (auto &I : CS.args()) {
+ if (&*I == Op)
+ CS.addParamAttr(ArgNo, Attribute::NonNull);
+ ++ArgNo;
+ }
+}
+
+static void setConstantInArgument(Instruction *CallI, Instruction *NewCallI,
+ Value *Op, Constant *ConstValue) {
+ CallSite CS(NewCallI);
+ unsigned ArgNo = 0;
+ for (auto &I : CS.args()) {
+ if (&*I == Op)
+ CS.setArgument(ArgNo, ConstValue);
+ ++ArgNo;
+ }
+}
+
+static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) {
+ assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand.");
+ Value *Op0 = Cmp->getOperand(0);
+ unsigned ArgNo = 0;
+ for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
+ ++I, ++ArgNo) {
+    // Don't consider constants or arguments that are already known non-null.
+ if (isa<Constant>(*I) || CS.paramHasAttr(ArgNo, Attribute::NonNull))
+ continue;
+
+ if (*I == Op0)
+ return true;
+ }
+ return false;
+}
+
+/// If From has a conditional jump to To, add the condition to Conditions,
+/// if it is relevant to any argument at CS.
+static void
+recordCondition(const CallSite &CS, BasicBlock *From, BasicBlock *To,
+ SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
+ auto *BI = dyn_cast<BranchInst>(From->getTerminator());
+ if (!BI || !BI->isConditional())
+ return;
+
+ CmpInst::Predicate Pred;
+ Value *Cond = BI->getCondition();
+ if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant())))
+ return;
+
+ ICmpInst *Cmp = cast<ICmpInst>(Cond);
+ if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
+ if (isCondRelevantToAnyCallArgument(Cmp, CS))
+ Conditions.push_back({Cmp, From->getTerminator()->getSuccessor(0) == To
+ ? Pred
+ : Cmp->getInversePredicate()});
+}
+
+/// Record ICmp conditions relevant to any argument in CS following Pred's
+/// single successors. If there are conflicting conditions along a path, like
+/// x == 1 and x == 0, the first condition will be used.
+static void
+recordConditions(const CallSite &CS, BasicBlock *Pred,
+ SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
+ recordCondition(CS, Pred, CS.getInstruction()->getParent(), Conditions);
+ BasicBlock *From = Pred;
+ BasicBlock *To = Pred;
+ SmallPtrSet<BasicBlock *, 4> Visited;
+ while (!Visited.count(From->getSinglePredecessor()) &&
+ (From = From->getSinglePredecessor())) {
+ recordCondition(CS, From, To, Conditions);
+ Visited.insert(From);
+ To = From;
+ }
+}
+
+static Instruction *
+addConditions(CallSite &CS,
+ SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
+ if (Conditions.empty())
+ return nullptr;
+
+ Instruction *NewCI = CS.getInstruction()->clone();
+ for (auto &Cond : Conditions) {
+ Value *Arg = Cond.first->getOperand(0);
+ Constant *ConstVal = cast<Constant>(Cond.first->getOperand(1));
+ if (Cond.second == ICmpInst::ICMP_EQ)
+ setConstantInArgument(CS.getInstruction(), NewCI, Arg, ConstVal);
+ else if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
+ assert(Cond.second == ICmpInst::ICMP_NE);
+ addNonNullAttribute(CS.getInstruction(), NewCI, Arg);
+ }
+ }
+ return NewCI;
+}
+
+static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
+ SmallVector<BasicBlock *, 2> Preds(predecessors((BB)));
+ assert(Preds.size() == 2 && "Expected exactly 2 predecessors!");
+ return Preds;
+}
+
+static bool canSplitCallSite(CallSite CS) {
+ // FIXME: As of now we handle only CallInst. InvokeInst could be handled
+ // without too much effort.
+ Instruction *Instr = CS.getInstruction();
+ if (!isa<CallInst>(Instr))
+ return false;
+
+ // Allow splitting a call-site only when there is no instruction before the
+ // call-site in the basic block. Based on this constraint, we only clone the
+ // call instruction, and we do not move a call-site across any other
+ // instruction.
+ BasicBlock *CallSiteBB = Instr->getParent();
+ if (Instr != CallSiteBB->getFirstNonPHIOrDbg())
+ return false;
+
+ // Need 2 predecessors and cannot split an edge from an IndirectBrInst.
+ SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB));
+ if (Preds.size() != 2 || isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
+ isa<IndirectBrInst>(Preds[1]->getTerminator()))
+ return false;
+
+ return CallSiteBB->canSplitPredecessors();
+}
+
+/// Split the call-site CS into new predecessor blocks that are directly hooked
+/// to each of its original predecessors, PredBB1 and PredBB2. CallInst1 and
+/// CallInst2, if non-null, become the new call-sites placed in the blocks
+/// split off PredBB1 and PredBB2, respectively.
+/// For example, in the IR below with an OR condition, the call-site can
+/// be split. Assuming PredBB1=Header and PredBB2=TBB, CallInst1 will be the
+/// call-site placed between Header and Tail, and CallInst2 will be the
+/// call-site between TBB and Tail.
+///
+/// From :
+///
+/// Header:
+/// %c = icmp eq i32* %a, null
+/// br i1 %c %Tail, %TBB
+/// TBB:
+/// %c2 = icmp eq i32* %b, null
+/// br i1 %c2 %Tail, %End
+/// Tail:
+/// %ca = call i1 @callee (i32* %a, i32* %b)
+///
+/// to :
+///
+/// Header: // PredBB1 is Header
+/// %c = icmp eq i32* %a, null
+/// br i1 %c %Tail-split1, %TBB
+/// TBB: // PredBB2 is TBB
+/// %c2 = icmp eq i32* %b, null
+/// br i1 %c2 %Tail-split2, %End
+/// Tail-split1:
+/// %ca1 = call @callee (i32* null, i32* %b) // CallInst1
+/// br %Tail
+/// Tail-split2:
+/// %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2
+/// br %Tail
+/// Tail:
+/// %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]
+///
+/// Note that in case any arguments at the call-site are constrained by its
+/// predecessors, new call-sites with more constrained arguments will be
+/// created by addConditions() and passed in as CallInst1/CallInst2.
+static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
+ Instruction *CallInst1, Instruction *CallInst2) {
+ Instruction *Instr = CS.getInstruction();
+ BasicBlock *TailBB = Instr->getParent();
+ assert(Instr == (TailBB->getFirstNonPHIOrDbg()) && "Unexpected call-site");
+
+ BasicBlock *SplitBlock1 =
+ SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split");
+ BasicBlock *SplitBlock2 =
+ SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split");
+
+ assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split.");
+
+ if (!CallInst1)
+ CallInst1 = Instr->clone();
+ if (!CallInst2)
+ CallInst2 = Instr->clone();
+
+ CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt());
+ CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt());
+
+ CallSite CS1(CallInst1);
+ CallSite CS2(CallInst2);
+
+ // Handle PHIs used as arguments in the call-site.
+ for (PHINode &PN : TailBB->phis()) {
+ unsigned ArgNo = 0;
+ for (auto &CI : CS.args()) {
+ if (&*CI == &PN) {
+ CS1.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock1));
+ CS2.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock2));
+ }
+ ++ArgNo;
+ }
+ }
+
+  // Replace users of the original call with a PHI merging the split call-sites.
+ if (Instr->getNumUses()) {
+ PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call",
+ TailBB->getFirstNonPHI());
+ PN->addIncoming(CallInst1, SplitBlock1);
+ PN->addIncoming(CallInst2, SplitBlock2);
+ Instr->replaceAllUsesWith(PN);
+ }
+ DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
+ DEBUG(dbgs() << " " << *CallInst1 << " in " << SplitBlock1->getName()
+ << "\n");
+ DEBUG(dbgs() << " " << *CallInst2 << " in " << SplitBlock2->getName()
+ << "\n");
+ Instr->eraseFromParent();
+ NumCallSiteSplit++;
+}
+
+// Return true if the call-site has an argument which is a PHI with only
+// constant incoming values.
+static bool isPredicatedOnPHI(CallSite CS) {
+ Instruction *Instr = CS.getInstruction();
+ BasicBlock *Parent = Instr->getParent();
+ if (Instr != Parent->getFirstNonPHIOrDbg())
+ return false;
+
+ for (auto &BI : *Parent) {
+ if (PHINode *PN = dyn_cast<PHINode>(&BI)) {
+ for (auto &I : CS.args())
+ if (&*I == PN) {
+ assert(PN->getNumIncomingValues() == 2 &&
+ "Unexpected number of incoming values");
+ if (PN->getIncomingBlock(0) == PN->getIncomingBlock(1))
+ return false;
+ if (PN->getIncomingValue(0) == PN->getIncomingValue(1))
+ continue;
+ if (isa<Constant>(PN->getIncomingValue(0)) &&
+ isa<Constant>(PN->getIncomingValue(1)))
+ return true;
+ }
+ }
+ break;
+ }
+ return false;
+}
+
+static bool tryToSplitOnPHIPredicatedArgument(CallSite CS) {
+ if (!isPredicatedOnPHI(CS))
+ return false;
+
+ auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
+ splitCallSite(CS, Preds[0], Preds[1], nullptr, nullptr);
+ return true;
+}
+
+static bool tryToSplitOnPredicatedArgument(CallSite CS) {
+ auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
+ if (Preds[0] == Preds[1])
+ return false;
+
+ SmallVector<std::pair<ICmpInst *, unsigned>, 2> C1, C2;
+ recordConditions(CS, Preds[0], C1);
+ recordConditions(CS, Preds[1], C2);
+
+ Instruction *CallInst1 = addConditions(CS, C1);
+ Instruction *CallInst2 = addConditions(CS, C2);
+ if (!CallInst1 && !CallInst2)
+ return false;
+
+ splitCallSite(CS, Preds[1], Preds[0], CallInst2, CallInst1);
+ return true;
+}
+
+static bool tryToSplitCallSite(CallSite CS) {
+ if (!CS.arg_size() || !canSplitCallSite(CS))
+ return false;
+ return tryToSplitOnPredicatedArgument(CS) ||
+ tryToSplitOnPHIPredicatedArgument(CS);
+}
+
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
+ bool Changed = false;
+ for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
+ BasicBlock &BB = *BI++;
+ for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+ Instruction *I = &*II++;
+ CallSite CS(cast<Value>(I));
+ if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI))
+ continue;
+
+ Function *Callee = CS.getCalledFunction();
+ if (!Callee || Callee->isDeclaration())
+ continue;
+ Changed |= tryToSplitCallSite(CS);
+ }
+ }
+ return Changed;
+}
+
+namespace {
+struct CallSiteSplittingLegacyPass : public FunctionPass {
+ static char ID;
+ CallSiteSplittingLegacyPass() : FunctionPass(ID) {
+ initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return doCallSiteSplitting(F, TLI);
+ }
+};
+} // namespace
+
+char CallSiteSplittingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
+ "Call-site splitting", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
+ "Call-site splitting", false, false)
+FunctionPass *llvm::createCallSiteSplittingPass() {
+ return new CallSiteSplittingLegacyPass();
+}
+
+PreservedAnalyses CallSiteSplittingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+ if (!doCallSiteSplitting(F, TLI))
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
new file mode 100644
index 000000000000..e4b08c5ed305
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -0,0 +1,822 @@
+//===- ConstantHoisting.cpp - Prepare code for expensive constants --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies expensive constants to hoist and coalesces them to
+// better prepare them for SelectionDAG-based code generation. This works
+// around the limitations of the basic-block-at-a-time approach.
+//
+// First it scans all instructions for integer constants and calculates their
+// cost. If the constant can be folded into the instruction (the cost is
+// TCC_Free) or the cost is just a simple operation (TCC_Basic), then we don't
+// consider it expensive and leave it alone. This is the default behavior and
+// the default implementation of getIntImmCost will always return TCC_Free.
+//
+// If the cost is more than TCC_BASIC, then the integer constant can't be folded
+// into the instruction and it might be beneficial to hoist the constant.
+// Similar constants are coalesced to reduce register pressure and
+// materialization code.
+//
+// When a constant is hoisted, it is also hidden behind a bitcast to force it to
+// be live-out of the basic block. Otherwise the constant would be just
+// duplicated and each basic block would have its own copy in the SelectionDAG.
+// The SelectionDAG recognizes such constants as opaque and doesn't perform
+// certain transformations on them, which would create a new expensive constant.
+//
+// This optimization is only applied to integer constants in instructions and
+// simple (this means not nested) constant cast expressions. For example:
+// %0 = load i64* inttoptr (i64 big_constant to i64*)
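+//
+// A rough before/after sketch (illustrative only; "big_const" stands for an
+// expensive-to-materialize integer, and the value names mirror the "const" /
+// "const_mat" names created by this pass):
+//
+//   %x = add i64 %a, big_const
+//   %y = add i64 %b, big_const_plus_1
+// becomes
+//   %base = bitcast i64 big_const to i64   ; hoisted base, kept opaque
+//   %x = add i64 %a, %base
+//   %off = add i64 %base, 1                ; rebased constant
+//   %y = add i64 %b, %off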
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/ConstantHoisting.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+using namespace consthoist;
+
+#define DEBUG_TYPE "consthoist"
+
+STATISTIC(NumConstantsHoisted, "Number of constants hoisted");
+STATISTIC(NumConstantsRebased, "Number of constants rebased");
+
+static cl::opt<bool> ConstHoistWithBlockFrequency(
+ "consthoist-with-block-frequency", cl::init(true), cl::Hidden,
+ cl::desc("Enable the use of the block frequency analysis to reduce the "
+ "chance to execute const materialization more frequently than "
+ "without hoisting."));
+
+namespace {
+
+/// \brief The constant hoisting pass.
+class ConstantHoistingLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ ConstantHoistingLegacyPass() : FunctionPass(ID) {
+ initializeConstantHoistingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &Fn) override;
+
+ StringRef getPassName() const override { return "Constant Hoisting"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ if (ConstHoistWithBlockFrequency)
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ void releaseMemory() override { Impl.releaseMemory(); }
+
+private:
+ ConstantHoistingPass Impl;
+};
+
+} // end anonymous namespace
+
+char ConstantHoistingLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist",
+ "Constant Hoisting", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist",
+ "Constant Hoisting", false, false)
+
+FunctionPass *llvm::createConstantHoistingPass() {
+ return new ConstantHoistingLegacyPass();
+}
+
+/// \brief Perform the constant hoisting optimization for the given function.
+bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
+ if (skipFunction(Fn))
+ return false;
+
+ DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
+ DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
+
+ bool MadeChange =
+ Impl.runImpl(Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ ConstHoistWithBlockFrequency
+ ? &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI()
+ : nullptr,
+ Fn.getEntryBlock());
+
+ if (MadeChange) {
+ DEBUG(dbgs() << "********** Function after Constant Hoisting: "
+ << Fn.getName() << '\n');
+ DEBUG(dbgs() << Fn);
+ }
+ DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
+
+ return MadeChange;
+}
+
+/// \brief Find the constant materialization insertion point.
+Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
+ unsigned Idx) const {
+ // If the operand is a cast instruction, then we have to materialize the
+ // constant before the cast instruction.
+ if (Idx != ~0U) {
+ Value *Opnd = Inst->getOperand(Idx);
+ if (auto CastInst = dyn_cast<Instruction>(Opnd))
+ if (CastInst->isCast())
+ return CastInst;
+ }
+
+ // The simple and common case. This also includes constant expressions.
+ if (!isa<PHINode>(Inst) && !Inst->isEHPad())
+ return Inst;
+
+ // We can't insert directly before a phi node or an eh pad. Insert before
+ // the terminator of the incoming or dominating block.
+ assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!");
+ if (Idx != ~0U && isa<PHINode>(Inst))
+ return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator();
+
+ // This must be an EH pad. Iterate over immediate dominators until we find a
+ // non-EH pad. We need to skip over catchswitch blocks, which are both EH pads
+ // and terminators.
+ auto IDom = DT->getNode(Inst->getParent())->getIDom();
+ while (IDom->getBlock()->isEHPad()) {
+ assert(Entry != IDom->getBlock() && "eh pad in entry block");
+ IDom = IDom->getIDom();
+ }
+
+ return IDom->getBlock()->getTerminator();
+}
+
+/// \brief Given \p BBs as input, find another set of BBs which collectively
+/// dominates \p BBs and has the minimal sum of frequencies. Return the BB
+/// set found in \p BBs.
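+///
+/// A small worked example (illustrative, ignoring EH pads): if BBs = {B1, B2}
+/// and both are dominated by a block A, the bottom-up walk compares
+/// freq(B1) + freq(B2) against freq(A); when the sum is larger (or equal,
+/// with more than one insertion point) the two points are folded into the
+/// single point A, otherwise B1 and B2 are kept as separate insertion points.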
+static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
+ BasicBlock *Entry,
+ SmallPtrSet<BasicBlock *, 8> &BBs) {
+ assert(!BBs.count(Entry) && "Assume Entry is not in BBs");
+ // Nodes on the current path to the root.
+ SmallPtrSet<BasicBlock *, 8> Path;
+ // Candidates includes any block 'BB' in set 'BBs' that is not strictly
+ // dominated by any other blocks in set 'BBs', and all nodes in the path
+ // in the dominator tree from Entry to 'BB'.
+ SmallPtrSet<BasicBlock *, 16> Candidates;
+ for (auto BB : BBs) {
+ Path.clear();
+    // Walk up the dominator tree until Entry or another BB in BBs
+    // is reached. Insert the nodes along the way into Path.
+ BasicBlock *Node = BB;
+ // The "Path" is a candidate path to be added into Candidates set.
+ bool isCandidate = false;
+ do {
+ Path.insert(Node);
+ if (Node == Entry || Candidates.count(Node)) {
+ isCandidate = true;
+ break;
+ }
+ assert(DT.getNode(Node)->getIDom() &&
+           "Entry doesn't dominate current Node");
+ Node = DT.getNode(Node)->getIDom()->getBlock();
+ } while (!BBs.count(Node));
+
+ // If isCandidate is false, Node is another Block in BBs dominating
+ // current 'BB'. Drop the nodes on the Path.
+ if (!isCandidate)
+ continue;
+
+ // Add nodes on the Path into Candidates.
+ Candidates.insert(Path.begin(), Path.end());
+ }
+
+ // Sort the nodes in Candidates in top-down order and save the nodes
+ // in Orders.
+ unsigned Idx = 0;
+ SmallVector<BasicBlock *, 16> Orders;
+ Orders.push_back(Entry);
+ while (Idx != Orders.size()) {
+ BasicBlock *Node = Orders[Idx++];
+ for (auto ChildDomNode : DT.getNode(Node)->getChildren()) {
+ if (Candidates.count(ChildDomNode->getBlock()))
+ Orders.push_back(ChildDomNode->getBlock());
+ }
+ }
+
+ // Visit Orders in bottom-up order.
+ using InsertPtsCostPair =
+ std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency>;
+
+ // InsertPtsMap is a map from a BB to the best insertion points for the
+ // subtree of BB (subtree not including the BB itself).
+ DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap;
+ InsertPtsMap.reserve(Orders.size() + 1);
+ for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) {
+ BasicBlock *Node = *RIt;
+ bool NodeInBBs = BBs.count(Node);
+ SmallPtrSet<BasicBlock *, 16> &InsertPts = InsertPtsMap[Node].first;
+ BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second;
+
+ // Return the optimal insert points in BBs.
+ if (Node == Entry) {
+ BBs.clear();
+ if (InsertPtsFreq > BFI.getBlockFreq(Node) ||
+ (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1))
+ BBs.insert(Entry);
+ else
+ BBs.insert(InsertPts.begin(), InsertPts.end());
+ break;
+ }
+
+ BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock();
+ // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child
+ // will update its parent's ParentInsertPts and ParentPtsFreq.
+ SmallPtrSet<BasicBlock *, 16> &ParentInsertPts = InsertPtsMap[Parent].first;
+ BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second;
+    // Choose to insert in Node or in the subtree of Node.
+    // Don't hoist into an EH pad because we may not find a proper place to
+    // insert in it.
+    // If the total frequency of InsertPts is the same as the frequency of the
+    // target Node, and InsertPts contains more than one node, choose hoisting
+    // to reduce code size.
+ if (NodeInBBs ||
+ (!Node->isEHPad() &&
+ (InsertPtsFreq > BFI.getBlockFreq(Node) ||
+ (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1)))) {
+ ParentInsertPts.insert(Node);
+ ParentPtsFreq += BFI.getBlockFreq(Node);
+ } else {
+ ParentInsertPts.insert(InsertPts.begin(), InsertPts.end());
+ ParentPtsFreq += InsertPtsFreq;
+ }
+ }
+}
+
+/// \brief Find an insertion point that dominates all uses.
+SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
+ const ConstantInfo &ConstInfo) const {
+ assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
+ // Collect all basic blocks.
+ SmallPtrSet<BasicBlock *, 8> BBs;
+ SmallPtrSet<Instruction *, 8> InsertPts;
+ for (auto const &RCI : ConstInfo.RebasedConstants)
+ for (auto const &U : RCI.Uses)
+ BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent());
+
+ if (BBs.count(Entry)) {
+ InsertPts.insert(&Entry->front());
+ return InsertPts;
+ }
+
+ if (BFI) {
+ findBestInsertionSet(*DT, *BFI, Entry, BBs);
+ for (auto BB : BBs) {
+ BasicBlock::iterator InsertPt = BB->begin();
+ for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
+ ;
+ InsertPts.insert(&*InsertPt);
+ }
+ return InsertPts;
+ }
+
+ while (BBs.size() >= 2) {
+ BasicBlock *BB, *BB1, *BB2;
+ BB1 = *BBs.begin();
+ BB2 = *std::next(BBs.begin());
+ BB = DT->findNearestCommonDominator(BB1, BB2);
+ if (BB == Entry) {
+ InsertPts.insert(&Entry->front());
+ return InsertPts;
+ }
+ BBs.erase(BB1);
+ BBs.erase(BB2);
+ BBs.insert(BB);
+ }
+ assert((BBs.size() == 1) && "Expected only one element.");
+ Instruction &FirstInst = (*BBs.begin())->front();
+ InsertPts.insert(findMatInsertPt(&FirstInst));
+ return InsertPts;
+}
+
+/// \brief Record constant integer ConstInt for instruction Inst at operand
+/// index Idx.
+///
+/// The operand at index Idx is not necessarily the constant integer itself. It
+/// could also be a cast instruction or a constant expression that uses the
+/// constant integer.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx,
+ ConstantInt *ConstInt) {
+ unsigned Cost;
+ // Ask the target about the cost of materializing the constant for the given
+ // instruction and operand index.
+ if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst))
+ Cost = TTI->getIntImmCost(IntrInst->getIntrinsicID(), Idx,
+ ConstInt->getValue(), ConstInt->getType());
+ else
+ Cost = TTI->getIntImmCost(Inst->getOpcode(), Idx, ConstInt->getValue(),
+ ConstInt->getType());
+
+ // Ignore cheap integer constants.
+ if (Cost > TargetTransformInfo::TCC_Basic) {
+ ConstCandMapType::iterator Itr;
+ bool Inserted;
+ std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(ConstInt, 0));
+ if (Inserted) {
+ ConstCandVec.push_back(ConstantCandidate(ConstInt));
+ Itr->second = ConstCandVec.size() - 1;
+ }
+ ConstCandVec[Itr->second].addUser(Inst, Idx, Cost);
+ DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx)))
+ dbgs() << "Collect constant " << *ConstInt << " from " << *Inst
+ << " with cost " << Cost << '\n';
+ else
+ dbgs() << "Collect constant " << *ConstInt << " indirectly from "
+ << *Inst << " via " << *Inst->getOperand(Idx) << " with cost "
+ << Cost << '\n';
+ );
+ }
+}
+
+/// \brief Check the operand for instruction Inst at index Idx.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) {
+ Value *Opnd = Inst->getOperand(Idx);
+
+ // Visit constant integers.
+ if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
+ }
+
+ // Visit cast instructions that have constant integers.
+ if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+ // Only visit cast instructions, which have been skipped. All other
+ // instructions should have already been visited.
+ if (!CastInst->isCast())
+ return;
+
+ if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the cast instruction.
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
+ }
+ }
+
+ // Visit constant expressions that have constant integers.
+ if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+ // Only visit constant cast expressions.
+ if (!ConstExpr->isCast())
+ return;
+
+ if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the constant expression.
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
+ }
+ }
+}
+
+/// \brief Scan the instruction for expensive integer constants and record them
+/// in the constant candidate vector.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst) {
+ // Skip all cast instructions. They are visited indirectly later on.
+ if (Inst->isCast())
+ return;
+
+ // Scan all operands.
+ for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
+ // The cost of materializing the constants (defined in
+ // `TargetTransformInfo::getIntImmCost`) for instructions which only take
+ // constant variables is lower than `TargetTransformInfo::TCC_Basic`. So
+ // it's safe for us to collect constant candidates from all IntrinsicInsts.
+ if (canReplaceOperandWithVariable(Inst, Idx) || isa<IntrinsicInst>(Inst)) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx);
+ }
+ } // end of for all operands
+}
+
+/// \brief Collect all integer constants in the function that cannot be folded
+/// into an instruction itself.
+void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
+ ConstCandMapType ConstCandMap;
+ for (BasicBlock &BB : Fn)
+ for (Instruction &Inst : BB)
+ collectConstantCandidates(ConstCandMap, &Inst);
+}
+
+// This helper function is necessary to deal with values that have different
+// bit widths (APInt's operator- does not like that). If the value cannot be
+// represented in uint64_t we return an "empty" APInt (None). This is then
+// interpreted as meaning the value is not in range.
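+// For example (illustrative): V1 = 42 and V2 = 2 yields APInt(BW, 40), while
+// a value that saturates getLimitedValue() to ~0ULL makes the function return
+// None, i.e. "out of range".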
+static Optional<APInt> calculateOffsetDiff(const APInt &V1, const APInt &V2) {
+ Optional<APInt> Res = None;
+ unsigned BW = V1.getBitWidth() > V2.getBitWidth() ?
+ V1.getBitWidth() : V2.getBitWidth();
+ uint64_t LimVal1 = V1.getLimitedValue();
+ uint64_t LimVal2 = V2.getLimitedValue();
+
+ if (LimVal1 == ~0ULL || LimVal2 == ~0ULL)
+ return Res;
+
+ uint64_t Diff = LimVal1 - LimVal2;
+ return APInt(BW, Diff, true);
+}
+
+// From a list of constants, one needs to be picked as the base and the other
+// constants will be transformed into an offset from that base constant. The
+// question is which one is best to pick. For example, consider these constants
+// and their number of uses:
+//
+// Constants| 2 | 4 | 12 | 42 |
+// NumUses | 3 | 2 | 8 | 7 |
+//
+// Selecting constant 12 because it has the most uses will generate negative
+// offsets for constants 2 and 4 (i.e. -10 and -8 respectively). If negative
+// offsets lead to less optimal code generation, then there might be better
+// solutions. Suppose immediates in the range of 0..35 are most optimally
+// supported by the architecture, then selecting constant 2 is most optimal
+// because this will generate offsets: 0, 2, 10, 40. Offsets 0, 2 and 10 are in
+// range 0..35, and thus 3 + 2 + 8 = 13 uses are in range. Selecting 12 would
+// have only 8 uses in range, so choosing 2 as a base is more optimal. Thus, in
+// selecting the base constant the range of the offsets is a very important
+// factor too that we take into account here. This algorithm calculates a
+// total cost for selecting a constant as the base and subtracts the costs if
+// immediates are out of range. It has quadratic complexity, so we call this
+// function only when we're optimising for size and there are fewer than 100
+// constants; otherwise we fall back to the straightforward algorithm, which
+// does not do all the offset calculations.
+unsigned
+ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
+ ConstCandVecType::iterator E,
+ ConstCandVecType::iterator &MaxCostItr) {
+ unsigned NumUses = 0;
+
+  if (!Entry->getParent()->optForSize() || std::distance(S, E) > 100) {
+ for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ NumUses += ConstCand->Uses.size();
+ if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost)
+ MaxCostItr = ConstCand;
+ }
+ return NumUses;
+ }
+
+ DEBUG(dbgs() << "== Maximize constants in range ==\n");
+ int MaxCost = -1;
+ for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ auto Value = ConstCand->ConstInt->getValue();
+ Type *Ty = ConstCand->ConstInt->getType();
+ int Cost = 0;
+ NumUses += ConstCand->Uses.size();
+ DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue() << "\n");
+
+ for (auto User : ConstCand->Uses) {
+ unsigned Opcode = User.Inst->getOpcode();
+ unsigned OpndIdx = User.OpndIdx;
+ Cost += TTI->getIntImmCost(Opcode, OpndIdx, Value, Ty);
+ DEBUG(dbgs() << "Cost: " << Cost << "\n");
+
+ for (auto C2 = S; C2 != E; ++C2) {
+ Optional<APInt> Diff = calculateOffsetDiff(
+ C2->ConstInt->getValue(),
+ ConstCand->ConstInt->getValue());
+ if (Diff) {
+ const int ImmCosts =
+ TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty);
+ Cost -= ImmCosts;
+ DEBUG(dbgs() << "Offset " << Diff.getValue() << " "
+ << "has penalty: " << ImmCosts << "\n"
+ << "Adjusted cost: " << Cost << "\n");
+ }
+ }
+ }
+ DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n");
+ if (Cost > MaxCost) {
+ MaxCost = Cost;
+ MaxCostItr = ConstCand;
+ DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue()
+ << "\n");
+ }
+ }
+ return NumUses;
+}
+
+/// \brief Find the base constant within the given range and rebase all other
+/// constants with respect to the base constant.
+void ConstantHoistingPass::findAndMakeBaseConstant(
+ ConstCandVecType::iterator S, ConstCandVecType::iterator E) {
+ auto MaxCostItr = S;
+ unsigned NumUses = maximizeConstantsInRange(S, E, MaxCostItr);
+
+ // Don't hoist constants that have only one use.
+ if (NumUses <= 1)
+ return;
+
+ ConstantInfo ConstInfo;
+ ConstInfo.BaseConstant = MaxCostItr->ConstInt;
+ Type *Ty = ConstInfo.BaseConstant->getType();
+
+ // Rebase the constants with respect to the base constant.
+ for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ APInt Diff = ConstCand->ConstInt->getValue() -
+ ConstInfo.BaseConstant->getValue();
+ Constant *Offset = Diff == 0 ? nullptr : ConstantInt::get(Ty, Diff);
+ ConstInfo.RebasedConstants.push_back(
+ RebasedConstantInfo(std::move(ConstCand->Uses), Offset));
+ }
+ ConstantVec.push_back(std::move(ConstInfo));
+}
+
+/// \brief Finds and combines constant candidates that can be easily
+/// rematerialized with an add from a common base constant.
+void ConstantHoistingPass::findBaseConstants() {
+ // Sort the constants by value and type. This invalidates the mapping!
+ std::sort(ConstCandVec.begin(), ConstCandVec.end(),
+ [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) {
+ if (LHS.ConstInt->getType() != RHS.ConstInt->getType())
+ return LHS.ConstInt->getType()->getBitWidth() <
+ RHS.ConstInt->getType()->getBitWidth();
+ return LHS.ConstInt->getValue().ult(RHS.ConstInt->getValue());
+ });
+
+ // Simple linear scan through the sorted constant candidate vector for viable
+ // merge candidates.
+ auto MinValItr = ConstCandVec.begin();
+ for (auto CC = std::next(ConstCandVec.begin()), E = ConstCandVec.end();
+ CC != E; ++CC) {
+ if (MinValItr->ConstInt->getType() == CC->ConstInt->getType()) {
+ // Check if the constant is in range of an add with immediate.
+ APInt Diff = CC->ConstInt->getValue() - MinValItr->ConstInt->getValue();
+ if ((Diff.getBitWidth() <= 64) &&
+ TTI->isLegalAddImmediate(Diff.getSExtValue()))
+ continue;
+ }
+ // We either have now a different constant type or the constant is not in
+ // range of an add with immediate anymore.
+ findAndMakeBaseConstant(MinValItr, CC);
+ // Start a new base constant search.
+ MinValItr = CC;
+ }
+ // Finalize the last base constant search.
+ findAndMakeBaseConstant(MinValItr, ConstCandVec.end());
+}
+
+/// \brief Updates the operand at Idx in instruction Inst with the result of
+/// instruction Mat. If the instruction is a PHI node then special
+/// handling for duplicate values from the same incoming basic block is
+/// required.
+/// \return The update will always succeed, but the return value indicates
+/// whether Mat was used for the update or not.
+static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) {
+ if (auto PHI = dyn_cast<PHINode>(Inst)) {
+ // Check if any previous operand of the PHI node has the same incoming basic
+ // block. This is a very odd case that happens when the incoming basic block
+ // has a switch statement. In this case use the same value as the previous
+ // operand(s), otherwise we will fail verification due to different values.
+ // The values are actually the same, but the variable names are different
+ // and the verifier doesn't like that.
+ BasicBlock *IncomingBB = PHI->getIncomingBlock(Idx);
+ for (unsigned i = 0; i < Idx; ++i) {
+ if (PHI->getIncomingBlock(i) == IncomingBB) {
+ Value *IncomingVal = PHI->getIncomingValue(i);
+ Inst->setOperand(Idx, IncomingVal);
+ return false;
+ }
+ }
+ }
+
+ Inst->setOperand(Idx, Mat);
+ return true;
+}
+
+/// \brief Emit materialization code for all rebased constants and update their
+/// users.
+void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
+ Constant *Offset,
+ const ConstantUser &ConstUser) {
+ Instruction *Mat = Base;
+ if (Offset) {
+ Instruction *InsertionPt = findMatInsertPt(ConstUser.Inst,
+ ConstUser.OpndIdx);
+ Mat = BinaryOperator::Create(Instruction::Add, Base, Offset,
+ "const_mat", InsertionPt);
+
+ DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
+ << " + " << *Offset << ") in BB "
+ << Mat->getParent()->getName() << '\n' << *Mat << '\n');
+ Mat->setDebugLoc(ConstUser.Inst->getDebugLoc());
+ }
+ Value *Opnd = ConstUser.Inst->getOperand(ConstUser.OpndIdx);
+
+ // Visit constant integer.
+ if (isa<ConstantInt>(Opnd)) {
+ DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat) && Offset)
+ Mat->eraseFromParent();
+ DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ return;
+ }
+
+ // Visit cast instruction.
+ if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+    assert(CastInst->isCast() && "Expected a cast instruction!");
+ // Check if we already have visited this cast instruction before to avoid
+ // unnecessary cloning.
+ Instruction *&ClonedCastInst = ClonedCastMap[CastInst];
+ if (!ClonedCastInst) {
+ ClonedCastInst = CastInst->clone();
+ ClonedCastInst->setOperand(0, Mat);
+ ClonedCastInst->insertAfter(CastInst);
+ // Use the same debug location as the original cast instruction.
+ ClonedCastInst->setDebugLoc(CastInst->getDebugLoc());
+ DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
+ << "To : " << *ClonedCastInst << '\n');
+ }
+
+ DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ClonedCastInst);
+ DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ return;
+ }
+
+ // Visit constant expression.
+ if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+ Instruction *ConstExprInst = ConstExpr->getAsInstruction();
+ ConstExprInst->setOperand(0, Mat);
+ ConstExprInst->insertBefore(findMatInsertPt(ConstUser.Inst,
+ ConstUser.OpndIdx));
+
+ // Use the same debug location as the instruction we are about to update.
+ ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc());
+
+ DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
+ << "From : " << *ConstExpr << '\n');
+ DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ConstExprInst)) {
+ ConstExprInst->eraseFromParent();
+ if (Offset)
+ Mat->eraseFromParent();
+ }
+ DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ return;
+ }
+}
+
+/// \brief Hoist and hide the base constant behind a bitcast and emit
+/// materialization code for derived constants.
+bool ConstantHoistingPass::emitBaseConstants() {
+ bool MadeChange = false;
+ for (auto const &ConstInfo : ConstantVec) {
+ // Hoist and hide the base constant behind a bitcast.
+ SmallPtrSet<Instruction *, 8> IPSet = findConstantInsertionPoint(ConstInfo);
+ assert(!IPSet.empty() && "IPSet is empty");
+
+ unsigned UsesNum = 0;
+ unsigned ReBasesNum = 0;
+ for (Instruction *IP : IPSet) {
+ IntegerType *Ty = ConstInfo.BaseConstant->getType();
+ Instruction *Base =
+ new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP);
+
+ Base->setDebugLoc(IP->getDebugLoc());
+
+ DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant
+ << ") to BB " << IP->getParent()->getName() << '\n'
+ << *Base << '\n');
+
+ // Emit materialization code for all rebased constants.
+ unsigned Uses = 0;
+ for (auto const &RCI : ConstInfo.RebasedConstants) {
+ for (auto const &U : RCI.Uses) {
+ Uses++;
+ BasicBlock *OrigMatInsertBB =
+ findMatInsertPt(U.Inst, U.OpndIdx)->getParent();
+          // If the Base constant is to be inserted in multiple places,
+          // generate the rebase for U using the Base that dominates U.
+ if (IPSet.size() == 1 ||
+ DT->dominates(Base->getParent(), OrigMatInsertBB)) {
+ emitBaseConstants(Base, RCI.Offset, U);
+ ReBasesNum++;
+ }
+
+ Base->setDebugLoc(DILocation::getMergedLocation(Base->getDebugLoc(), U.Inst->getDebugLoc()));
+ }
+ }
+ UsesNum = Uses;
+
+ // Use the same debug location as the last user of the constant.
+ assert(!Base->use_empty() && "The use list is empty!?");
+ assert(isa<Instruction>(Base->user_back()) &&
+ "All uses should be instructions.");
+ }
+ (void)UsesNum;
+ (void)ReBasesNum;
+ // Expect all uses are rebased after rebase is done.
+ assert(UsesNum == ReBasesNum && "Not all uses are rebased");
+
+ NumConstantsHoisted++;
+
+ // Base constant is also included in ConstInfo.RebasedConstants, so
+ // deduct 1 from ConstInfo.RebasedConstants.size().
+ NumConstantsRebased = ConstInfo.RebasedConstants.size() - 1;
+
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+/// \brief Check all cast instructions we made a copy of and remove them if they
+/// have no more users.
+void ConstantHoistingPass::deleteDeadCastInst() const {
+ for (auto const &I : ClonedCastMap)
+ if (I.first->use_empty())
+ I.first->eraseFromParent();
+}
+
+/// \brief Optimize expensive integer constants in the given function.
+bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
+ DominatorTree &DT, BlockFrequencyInfo *BFI,
+ BasicBlock &Entry) {
+ this->TTI = &TTI;
+ this->DT = &DT;
+ this->BFI = BFI;
+ this->Entry = &Entry;
+ // Collect all constant candidates.
+ collectConstantCandidates(Fn);
+
+ // There are no constant candidates to worry about.
+ if (ConstCandVec.empty())
+ return false;
+
+ // Combine constants that can be easily materialized with an add from a common
+ // base constant.
+ findBaseConstants();
+
+ // There are no constants to emit.
+ if (ConstantVec.empty())
+ return false;
+
+ // Finally hoist the base constant and emit materialization code for dependent
+ // constants.
+ bool MadeChange = emitBaseConstants();
+
+ // Cleanup dead instructions.
+ deleteDeadCastInst();
+
+ return MadeChange;
+}
+
+PreservedAnalyses ConstantHoistingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto BFI = ConstHoistWithBlockFrequency
+ ? &AM.getResult<BlockFrequencyAnalysis>(F)
+ : nullptr;
+ if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock()))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
new file mode 100644
index 000000000000..4fa27891a974
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
@@ -0,0 +1,104 @@
+//===- ConstantProp.cpp - Code to perform Simple Constant Propagation -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements constant propagation and merging:
+//
+// Specifically, this:
+// * Converts instructions like "add int 1, 2" into 3
+//
+// Notice that:
+// * This pass has a habit of making definitions dead. It is a good idea
+//   to run a DIE pass sometime after running this pass.
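+//
+// For example, in current IR syntax (illustrative only):
+//
+//   %r = add i32 1, 2
+//   %s = mul i32 %r, 4
+//   ret i32 %s
+//
+// folds %r to 3 and then %s to 12, rewrites the return to "ret i32 12", and
+// deletes the now trivially dead definitions.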
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <set>
+using namespace llvm;
+
+#define DEBUG_TYPE "constprop"
+
+STATISTIC(NumInstKilled, "Number of instructions killed");
+
+namespace {
+ struct ConstantPropagation : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ ConstantPropagation() : FunctionPass(ID) {
+ initializeConstantPropagationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+ };
+}
+
+char ConstantPropagation::ID = 0;
+INITIALIZE_PASS_BEGIN(ConstantPropagation, "constprop",
+ "Simple constant propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(ConstantPropagation, "constprop",
+ "Simple constant propagation", false, false)
+
+FunctionPass *llvm::createConstantPropagationPass() {
+ return new ConstantPropagation();
+}
+
+bool ConstantPropagation::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ // Initialize the worklist to all of the instructions ready to process...
+ std::set<Instruction*> WorkList;
+ for (Instruction &I: instructions(&F))
+ WorkList.insert(&I);
+
+ bool Changed = false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ while (!WorkList.empty()) {
+ Instruction *I = *WorkList.begin();
+ WorkList.erase(WorkList.begin()); // Get an element from the worklist...
+
+ if (!I->use_empty()) // Don't muck with dead instructions...
+ if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) {
+ // Add all of the users of this instruction to the worklist, they might
+ // be constant propagatable now...
+ for (User *U : I->users())
+ WorkList.insert(cast<Instruction>(U));
+
+ // Replace all of the uses of a variable with uses of the constant.
+ I->replaceAllUsesWith(C);
+
+ // Remove the dead instruction.
+ WorkList.erase(I);
+ if (isInstructionTriviallyDead(I, TLI)) {
+ I->eraseFromParent();
+ ++NumInstKilled;
+ }
+
+ // We made a change to the function...
+ Changed = true;
+ }
+ }
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
new file mode 100644
index 000000000000..8f468ebf8949
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -0,0 +1,653 @@
+//===- CorrelatedValuePropagation.cpp - Propagate CFG-derived info --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Correlated Value Propagation pass.
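+//
+// One illustrative example of what this enables: given
+//
+//   %c = icmp slt i32 %x, 0
+//   br i1 %c, label %neg, label %nonneg
+// nonneg:
+//   %d = sdiv i32 %x, %y
+//
+// LazyValueInfo proves %x >= 0 on the edge into %nonneg, and if %y is
+// likewise known to be non-negative the sdiv below is rewritten into a udiv.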
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "correlated-value-propagation"
+
+STATISTIC(NumPhis, "Number of phis propagated");
+STATISTIC(NumSelects, "Number of selects propagated");
+STATISTIC(NumMemAccess, "Number of memory access targets propagated");
+STATISTIC(NumCmps, "Number of comparisons propagated");
+STATISTIC(NumReturns, "Number of return values propagated");
+STATISTIC(NumDeadCases, "Number of switch cases removed");
+STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
+STATISTIC(NumAShrs, "Number of ashr converted to lshr");
+STATISTIC(NumSRems, "Number of srem converted to urem");
+STATISTIC(NumOverflows, "Number of overflow checks removed");
+
+static cl::opt<bool> DontProcessAdds("cvp-dont-process-adds", cl::init(true));
+
+namespace {
+
+ class CorrelatedValuePropagation : public FunctionPass {
+ public:
+ static char ID;
+
+ CorrelatedValuePropagation(): FunctionPass(ID) {
+ initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LazyValueInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+ };
+
+} // end anonymous namespace
+
+char CorrelatedValuePropagation::ID = 0;
+
+INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
+ "Value Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
+INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
+ "Value Propagation", false, false)
+
+// Public interface to the Value Propagation pass
+Pass *llvm::createCorrelatedValuePropagationPass() {
+ return new CorrelatedValuePropagation();
+}
+
+static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
+ if (S->getType()->isVectorTy()) return false;
+ if (isa<Constant>(S->getOperand(0))) return false;
+
+ Constant *C = LVI->getConstant(S->getOperand(0), S->getParent(), S);
+ if (!C) return false;
+
+ ConstantInt *CI = dyn_cast<ConstantInt>(C);
+ if (!CI) return false;
+
+ Value *ReplaceWith = S->getOperand(1);
+ Value *Other = S->getOperand(2);
+ if (!CI->isOne()) std::swap(ReplaceWith, Other);
+ if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType());
+
+ S->replaceAllUsesWith(ReplaceWith);
+ S->eraseFromParent();
+
+ ++NumSelects;
+
+ return true;
+}
+
+static bool processPHI(PHINode *P, LazyValueInfo *LVI,
+ const SimplifyQuery &SQ) {
+ bool Changed = false;
+
+ BasicBlock *BB = P->getParent();
+ for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
+ Value *Incoming = P->getIncomingValue(i);
+ if (isa<Constant>(Incoming)) continue;
+
+ Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P);
+
+ // Look if the incoming value is a select with a scalar condition for which
+    // LVI can tell us the value. In that case replace the incoming value with
+ // the appropriate value of the select. This often allows us to remove the
+ // select later.
+ if (!V) {
+ SelectInst *SI = dyn_cast<SelectInst>(Incoming);
+ if (!SI) continue;
+
+ Value *Condition = SI->getCondition();
+ if (!Condition->getType()->isVectorTy()) {
+ if (Constant *C = LVI->getConstantOnEdge(
+ Condition, P->getIncomingBlock(i), BB, P)) {
+ if (C->isOneValue()) {
+ V = SI->getTrueValue();
+ } else if (C->isZeroValue()) {
+ V = SI->getFalseValue();
+ }
+ // Once LVI learns to handle vector types, we could also add support
+ // for vector type constants that are not all zeroes or all ones.
+ }
+ }
+
+ // Look if the select has a constant but LVI tells us that the incoming
+ // value can never be that constant. In that case replace the incoming
+ // value with the other value of the select. This often allows us to
+ // remove the select later.
+ if (!V) {
+ Constant *C = dyn_cast<Constant>(SI->getFalseValue());
+ if (!C) continue;
+
+ if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C,
+ P->getIncomingBlock(i), BB, P) !=
+ LazyValueInfo::False)
+ continue;
+ V = SI->getTrueValue();
+ }
+
+ DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
+ }
+
+ P->setIncomingValue(i, V);
+ Changed = true;
+ }
+
+ if (Value *V = SimplifyInstruction(P, SQ)) {
+ P->replaceAllUsesWith(V);
+ P->eraseFromParent();
+ Changed = true;
+ }
+
+ if (Changed)
+ ++NumPhis;
+
+ return Changed;
+}
+
+static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) {
+ Value *Pointer = nullptr;
+ if (LoadInst *L = dyn_cast<LoadInst>(I))
+ Pointer = L->getPointerOperand();
+ else
+ Pointer = cast<StoreInst>(I)->getPointerOperand();
+
+ if (isa<Constant>(Pointer)) return false;
+
+ Constant *C = LVI->getConstant(Pointer, I->getParent(), I);
+ if (!C) return false;
+
+ ++NumMemAccess;
+ I->replaceUsesOfWith(Pointer, C);
+ return true;
+}
+
+/// See if LazyValueInfo's ability to exploit edge conditions or range
+/// information is sufficient to prove this comparison. Even for local
+/// conditions, this can sometimes prove conditions instcombine can't by
+/// exploiting range information.
+static bool processCmp(CmpInst *C, LazyValueInfo *LVI) {
+ Value *Op0 = C->getOperand(0);
+ Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
+ if (!Op1) return false;
+
+ // As a policy choice, we choose not to waste compile time on anything where
+ // the comparison is testing local values. While LVI can sometimes reason
+ // about such cases, it's not its primary purpose. We do make sure to do
+ // the block local query for uses from terminator instructions, but that's
+ // handled in the code for each terminator.
+ auto *I = dyn_cast<Instruction>(Op0);
+ if (I && I->getParent() == C->getParent())
+ return false;
+
+ LazyValueInfo::Tristate Result =
+ LVI->getPredicateAt(C->getPredicate(), Op0, Op1, C);
+ if (Result == LazyValueInfo::Unknown) return false;
+
+ ++NumCmps;
+ if (Result == LazyValueInfo::True)
+ C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext()));
+ else
+ C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext()));
+ C->eraseFromParent();
+
+ return true;
+}
+
+/// Simplify a switch instruction by removing cases which can never fire. If the
+/// uselessness of a case could be determined locally then constant propagation
+/// would already have figured it out. Instead, walk the predecessors and
+/// statically evaluate cases based on information available on that edge. Cases
+/// that cannot fire no matter what the incoming edge can safely be removed. If
+/// a case fires on every incoming edge then the entire switch can be removed
+/// and replaced with a branch to the case destination.
+static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
+ Value *Cond = SI->getCondition();
+ BasicBlock *BB = SI->getParent();
+
+  // If the condition was defined in the same block as the switch then
+  // LazyValueInfo currently won't say anything useful about it, though in
+  // theory it could.
+ if (isa<Instruction>(Cond) && cast<Instruction>(Cond)->getParent() == BB)
+ return false;
+
+ // If the switch is unreachable then trying to improve it is a waste of time.
+ pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
+ if (PB == PE) return false;
+
+ // Analyse each switch case in turn.
+ bool Changed = false;
+ for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
+ ConstantInt *Case = CI->getCaseValue();
+
+ // Check to see if the switch condition is equal to/not equal to the case
+ // value on every incoming edge, equal/not equal being the same each time.
+ LazyValueInfo::Tristate State = LazyValueInfo::Unknown;
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ // Is the switch condition equal to the case value?
+ LazyValueInfo::Tristate Value = LVI->getPredicateOnEdge(CmpInst::ICMP_EQ,
+ Cond, Case, *PI,
+ BB, SI);
+ // Give up on this case if nothing is known.
+ if (Value == LazyValueInfo::Unknown) {
+ State = LazyValueInfo::Unknown;
+ break;
+ }
+
+ // If this was the first edge to be visited, record that all other edges
+ // need to give the same result.
+ if (PI == PB) {
+ State = Value;
+ continue;
+ }
+
+ // If this case is known to fire for some edges and known not to fire for
+ // others then there is nothing we can do - give up.
+ if (Value != State) {
+ State = LazyValueInfo::Unknown;
+ break;
+ }
+ }
+
+ if (State == LazyValueInfo::False) {
+ // This case never fires - remove it.
+ CI->getCaseSuccessor()->removePredecessor(BB);
+ CI = SI->removeCase(CI);
+ CE = SI->case_end();
+
+ // The condition can be modified by removePredecessor's PHI simplification
+ // logic.
+ Cond = SI->getCondition();
+
+ ++NumDeadCases;
+ Changed = true;
+ continue;
+ }
+ if (State == LazyValueInfo::True) {
+ // This case always fires. Arrange for the switch to be turned into an
+ // unconditional branch by replacing the switch condition with the case
+ // value.
+ SI->setCondition(Case);
+ NumDeadCases += SI->getNumCases();
+ Changed = true;
+ break;
+ }
+
+ // Increment the case iterator since we didn't delete it.
+ ++CI;
+ }
+
+ if (Changed)
+ // If the switch has been simplified to the point where it can be replaced
+ // by a branch then do so now.
+ ConstantFoldTerminator(BB);
+
+ return Changed;
+}
+
+// See if we can prove that the given overflow intrinsic will not overflow.
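+// A concrete flavour of the check (illustrative numbers): for an i8
+// @llvm.uadd.with.overflow call where LVI knows the RHS lies in [0, 4) and
+// the LHS lies in [0, 100), every possible sum is at most 102 and fits in 8
+// bits, so the LHS range is contained in the guaranteed no-wrap region and
+// the intrinsic provably never overflows.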
+static bool willNotOverflow(IntrinsicInst *II, LazyValueInfo *LVI) {
+ using OBO = OverflowingBinaryOperator;
+ auto NoWrap = [&] (Instruction::BinaryOps BinOp, unsigned NoWrapKind) {
+ Value *RHS = II->getOperand(1);
+ ConstantRange RRange = LVI->getConstantRange(RHS, II->getParent(), II);
+ ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinOp, RRange, NoWrapKind);
+ // As an optimization, do not compute LRange if we do not need it.
+ if (NWRegion.isEmptySet())
+ return false;
+ Value *LHS = II->getOperand(0);
+ ConstantRange LRange = LVI->getConstantRange(LHS, II->getParent(), II);
+ return NWRegion.contains(LRange);
+ };
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::uadd_with_overflow:
+ return NoWrap(Instruction::Add, OBO::NoUnsignedWrap);
+ case Intrinsic::sadd_with_overflow:
+ return NoWrap(Instruction::Add, OBO::NoSignedWrap);
+ case Intrinsic::usub_with_overflow:
+ return NoWrap(Instruction::Sub, OBO::NoUnsignedWrap);
+ case Intrinsic::ssub_with_overflow:
+ return NoWrap(Instruction::Sub, OBO::NoSignedWrap);
+ }
+ return false;
+}
+
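+// Replace an overflow intrinsic that has been proven not to overflow (see
+// willNotOverflow above) with the plain binary operation and a constant-false
+// overflow bit. For example (illustrative value names):
+//
+//   %s = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+//
+// becomes
+//
+//   %s1 = add i32 %a, %b
+//   %t  = insertvalue { i32, i1 } undef, i32 %s1, 0
+//   %r  = insertvalue { i32, i1 } %t, i1 false, 1
+//
+// and all uses of %s are redirected to %r.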
+static void processOverflowIntrinsic(IntrinsicInst *II) {
+ Value *NewOp = nullptr;
+ switch (II->getIntrinsicID()) {
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::sadd_with_overflow:
+ NewOp = BinaryOperator::CreateAdd(II->getOperand(0), II->getOperand(1),
+ II->getName(), II);
+ break;
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ NewOp = BinaryOperator::CreateSub(II->getOperand(0), II->getOperand(1),
+ II->getName(), II);
+ break;
+ }
+ ++NumOverflows;
+ IRBuilder<> B(II);
+ Value *NewI = B.CreateInsertValue(UndefValue::get(II->getType()), NewOp, 0);
+ NewI = B.CreateInsertValue(NewI, ConstantInt::getFalse(II->getContext()), 1);
+ II->replaceAllUsesWith(NewI);
+ II->eraseFromParent();
+}
+
+/// Infer nonnull attributes for the arguments at the specified callsite.
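+///
+/// For example (illustrative), if LazyValueInfo can prove at the call that
+/// %p is never null, then
+///   call void @use(i8* %p)
+/// is annotated as
+///   call void @use(i8* nonnull %p)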
+static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
+ SmallVector<unsigned, 4> ArgNos;
+ unsigned ArgNo = 0;
+
+ if (auto *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
+ if (willNotOverflow(II, LVI)) {
+ processOverflowIntrinsic(II);
+ return true;
+ }
+ }
+
+ for (Value *V : CS.args()) {
+ PointerType *Type = dyn_cast<PointerType>(V->getType());
+ // Try to mark pointer typed parameters as non-null. We skip the
+ // relatively expensive analysis for constants which are obviously either
+ // null or non-null to start with.
+ if (Type && !CS.paramHasAttr(ArgNo, Attribute::NonNull) &&
+ !isa<Constant>(V) &&
+ LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
+ ConstantPointerNull::get(Type),
+ CS.getInstruction()) == LazyValueInfo::False)
+ ArgNos.push_back(ArgNo);
+ ArgNo++;
+ }
+
+ assert(ArgNo == CS.arg_size() && "sanity check");
+
+ if (ArgNos.empty())
+ return false;
+
+ AttributeList AS = CS.getAttributes();
+ LLVMContext &Ctx = CS.getInstruction()->getContext();
+ AS = AS.addParamAttribute(Ctx, ArgNos,
+ Attribute::get(Ctx, Attribute::NonNull));
+ CS.setAttributes(AS);
+
+ return true;
+}
+
+static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ Constant *Zero = ConstantInt::get(SDI->getType(), 0);
+ for (Value *O : SDI->operands()) {
+ auto Result = LVI->getPredicateAt(ICmpInst::ICMP_SGE, O, Zero, SDI);
+ if (Result != LazyValueInfo::True)
+ return false;
+ }
+ return true;
+}
+
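+/// Replace an srem whose operands are both provably non-negative with the
+/// equivalent urem; e.g. 'srem i32 %a, %b' becomes 'urem i32 %a, %b' when LVI
+/// proves both %a and %b are non-negative at this point.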
+static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ if (SDI->getType()->isVectorTy() ||
+ !hasPositiveOperands(SDI, LVI))
+ return false;
+
+ ++NumSRems;
+ auto *BO = BinaryOperator::CreateURem(SDI->getOperand(0), SDI->getOperand(1),
+ SDI->getName(), SDI);
+ SDI->replaceAllUsesWith(BO);
+ SDI->eraseFromParent();
+ return true;
+}
+
+/// See if LazyValueInfo's ability to exploit edge conditions or range
+/// information is sufficient to prove that both operands of this SDiv are
+/// positive. If this is the case, replace the SDiv with a UDiv. Even for local
+/// conditions, this can sometimes prove conditions instcombine can't by
+/// exploiting range information.
+static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ if (SDI->getType()->isVectorTy() ||
+ !hasPositiveOperands(SDI, LVI))
+ return false;
+
+ ++NumSDivs;
+ auto *BO = BinaryOperator::CreateUDiv(SDI->getOperand(0), SDI->getOperand(1),
+ SDI->getName(), SDI);
+ BO->setIsExact(SDI->isExact());
+ SDI->replaceAllUsesWith(BO);
+ SDI->eraseFromParent();
+
+ return true;
+}
+
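+/// Replace an ashr whose first operand is provably non-negative with the
+/// equivalent lshr, preserving the exact flag; no sign bits can be shifted in
+/// when the shifted value is known to be non-negative.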
+static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ if (SDI->getType()->isVectorTy())
+ return false;
+
+ Constant *Zero = ConstantInt::get(SDI->getType(), 0);
+ if (LVI->getPredicateAt(ICmpInst::ICMP_SGE, SDI->getOperand(0), Zero, SDI) !=
+ LazyValueInfo::True)
+ return false;
+
+ ++NumAShrs;
+ auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1),
+ SDI->getName(), SDI);
+ BO->setIsExact(SDI->isExact());
+ SDI->replaceAllUsesWith(BO);
+ SDI->eraseFromParent();
+
+ return true;
+}
+
+static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
+ using OBO = OverflowingBinaryOperator;
+
+ if (DontProcessAdds)
+ return false;
+
+ if (AddOp->getType()->isVectorTy())
+ return false;
+
+ bool NSW = AddOp->hasNoSignedWrap();
+ bool NUW = AddOp->hasNoUnsignedWrap();
+ if (NSW && NUW)
+ return false;
+
+ BasicBlock *BB = AddOp->getParent();
+
+ Value *LHS = AddOp->getOperand(0);
+ Value *RHS = AddOp->getOperand(1);
+
+ ConstantRange LRange = LVI->getConstantRange(LHS, BB, AddOp);
+
+ // Initialize RRange only if we need it. If we know that the guaranteed
+ // no-wrap range for the given LHS range is empty, don't spend time
+ // computing the range for the RHS.
+ Optional<ConstantRange> RRange;
+ auto LazyRRange = [&] () {
+ if (!RRange)
+ RRange = LVI->getConstantRange(RHS, BB, AddOp);
+ return RRange.getValue();
+ };
+
+ bool Changed = false;
+ if (!NUW) {
+ ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoUnsignedWrap);
+ if (!NUWRange.isEmptySet()) {
+ bool NewNUW = NUWRange.contains(LazyRRange());
+ AddOp->setHasNoUnsignedWrap(NewNUW);
+ Changed |= NewNUW;
+ }
+ }
+ if (!NSW) {
+ ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoSignedWrap);
+ if (!NSWRange.isEmptySet()) {
+ bool NewNSW = NSWRange.contains(LazyRRange());
+ AddOp->setHasNoSignedWrap(NewNSW);
+ Changed |= NewNSW;
+ }
+ }
+
+ return Changed;
+}
+
+static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
+ if (Constant *C = LVI->getConstant(V, At->getParent(), At))
+ return C;
+
+ // TODO: The following really should be sunk inside LVI's core algorithm, or
+ // at least the outer shims around such.
+ auto *C = dyn_cast<CmpInst>(V);
+ if (!C) return nullptr;
+
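+ // Even when LVI cannot fold V itself to a constant, a comparison against a
+ // constant RHS may still be decidable at this particular use.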
+ Value *Op0 = C->getOperand(0);
+ Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
+ if (!Op1) return nullptr;
+
+ LazyValueInfo::Tristate Result =
+ LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At);
+ if (Result == LazyValueInfo::Unknown)
+ return nullptr;
+
+ return (Result == LazyValueInfo::True) ?
+ ConstantInt::getTrue(C->getContext()) :
+ ConstantInt::getFalse(C->getContext());
+}
+
+static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
+ bool FnChanged = false;
+ // Visiting in a pre-order depth-first traversal causes us to simplify early
+ // blocks before querying later blocks (which require us to analyze early
+ // blocks). Eagerly simplifying shallow blocks means there is strictly less
+ // work to do for deep blocks. This also means we don't visit unreachable
+ // blocks.
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+ bool BBChanged = false;
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+ Instruction *II = &*BI++;
+ switch (II->getOpcode()) {
+ case Instruction::Select:
+ BBChanged |= processSelect(cast<SelectInst>(II), LVI);
+ break;
+ case Instruction::PHI:
+ BBChanged |= processPHI(cast<PHINode>(II), LVI, SQ);
+ break;
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ BBChanged |= processCmp(cast<CmpInst>(II), LVI);
+ break;
+ case Instruction::Load:
+ case Instruction::Store:
+ BBChanged |= processMemAccess(II, LVI);
+ break;
+ case Instruction::Call:
+ case Instruction::Invoke:
+ BBChanged |= processCallSite(CallSite(II), LVI);
+ break;
+ case Instruction::SRem:
+ BBChanged |= processSRem(cast<BinaryOperator>(II), LVI);
+ break;
+ case Instruction::SDiv:
+ BBChanged |= processSDiv(cast<BinaryOperator>(II), LVI);
+ break;
+ case Instruction::AShr:
+ BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
+ break;
+ case Instruction::Add:
+ BBChanged |= processAdd(cast<BinaryOperator>(II), LVI);
+ break;
+ }
+ }
+
+ Instruction *Term = BB->getTerminator();
+ switch (Term->getOpcode()) {
+ case Instruction::Switch:
+ BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI);
+ break;
+ case Instruction::Ret: {
+ auto *RI = cast<ReturnInst>(Term);
+ // Try to determine the return value if we can. This is mainly here to
+ // simplify the writing of unit tests, but also helps to enable IPO by
+ // constant folding the return values of callees.
+ auto *RetVal = RI->getReturnValue();
+ if (!RetVal) break; // handle "ret void"
+ if (isa<Constant>(RetVal)) break; // nothing to do
+ if (auto *C = getConstantAt(RetVal, RI, LVI)) {
+ ++NumReturns;
+ RI->replaceUsesOfWith(RetVal, C);
+ BBChanged = true;
+ }
+ }
+ }
+
+ FnChanged |= BBChanged;
+ }
+
+ return FnChanged;
+}
+
+bool CorrelatedValuePropagation::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ return runImpl(F, LVI, getBestSimplifyQuery(*this, F));
+}
+
+PreservedAnalyses
+CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
+
+ LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
+ bool Changed = runImpl(F, LVI, getBestSimplifyQuery(AM, F));
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
new file mode 100644
index 000000000000..fa4806e884c3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -0,0 +1,163 @@
+//===- DCE.cpp - Code to perform dead code elimination --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead inst elimination and dead code elimination.
+//
+// Dead Inst Elimination performs a single pass over the function removing
+// instructions that are obviously dead. Dead Code Elimination is similar, but
+// it rechecks instructions that were used by removed instructions to see if
+// they are newly dead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/DCE.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "dce"
+
+STATISTIC(DIEEliminated, "Number of insts removed by DIE pass");
+STATISTIC(DCEEliminated, "Number of insts removed");
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ // DeadInstElimination pass implementation
+ //
+ struct DeadInstElimination : public BasicBlockPass {
+ static char ID; // Pass identification, replacement for typeid
+ DeadInstElimination() : BasicBlockPass(ID) {
+ initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnBasicBlock(BasicBlock &BB) override {
+ if (skipBasicBlock(BB))
+ return false;
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ bool Changed = false;
+ for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
+ Instruction *Inst = &*DI++;
+ if (isInstructionTriviallyDead(Inst, TLI)) {
+ Inst->eraseFromParent();
+ Changed = true;
+ ++DIEEliminated;
+ }
+ }
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+ };
+}
+
+char DeadInstElimination::ID = 0;
+INITIALIZE_PASS(DeadInstElimination, "die",
+ "Dead Instruction Elimination", false, false)
+
+Pass *llvm::createDeadInstEliminationPass() {
+ return new DeadInstElimination();
+}
+
+static bool DCEInstruction(Instruction *I,
+ SmallSetVector<Instruction *, 16> &WorkList,
+ const TargetLibraryInfo *TLI) {
+ if (isInstructionTriviallyDead(I, TLI)) {
+ // Null out all of the instruction's operands to see if any operand becomes
+ // dead as we go.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Value *OpV = I->getOperand(i);
+ I->setOperand(i, nullptr);
+
+ if (!OpV->use_empty() || I == OpV)
+ continue;
+
+ // If the operand is an instruction that became dead as we nulled out the
+ // operand, and if it is 'trivially' dead, delete it in a future loop
+ // iteration.
+ if (Instruction *OpI = dyn_cast<Instruction>(OpV))
+ if (isInstructionTriviallyDead(OpI, TLI))
+ WorkList.insert(OpI);
+ }
+
+ I->eraseFromParent();
+ ++DCEEliminated;
+ return true;
+ }
+ return false;
+}
+
+static bool eliminateDeadCode(Function &F, TargetLibraryInfo *TLI) {
+ bool MadeChange = false;
+ SmallSetVector<Instruction *, 16> WorkList;
+ // Iterate over the original function, only adding insts to the worklist
+ // if they actually need to be revisited. This avoids having to pre-init
+ // the worklist with the entire function's worth of instructions.
+ for (inst_iterator FI = inst_begin(F), FE = inst_end(F); FI != FE;) {
+ Instruction *I = &*FI;
+ ++FI;
+
+ // We're visiting this instruction now, so make sure it's not in the
+ // worklist from an earlier visit.
+ if (!WorkList.count(I))
+ MadeChange |= DCEInstruction(I, WorkList, TLI);
+ }
+
+ while (!WorkList.empty()) {
+ Instruction *I = WorkList.pop_back_val();
+ MadeChange |= DCEInstruction(I, WorkList, TLI);
+ }
+ return MadeChange;
+}
+
+PreservedAnalyses DCEPass::run(Function &F, FunctionAnalysisManager &AM) {
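+ // The TargetLibraryInfo result may not be cached for this function, in
+ // which case a null TLI is passed and library calls are treated
+ // conservatively.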
+ if (!eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+struct DCELegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ DCELegacyPass() : FunctionPass(ID) {
+ initializeDCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+
+ return eliminateDeadCode(F, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+};
+}
+
+char DCELegacyPass::ID = 0;
+INITIALIZE_PASS(DCELegacyPass, "dce", "Dead Code Elimination", false, false)
+
+FunctionPass *llvm::createDeadCodeEliminationPass() {
+ return new DCELegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
new file mode 100644
index 000000000000..b665d94a70aa
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -0,0 +1,1360 @@
+//===- DeadStoreElimination.cpp - Fast Dead Store Elimination -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a trivial dead store elimination that only considers
+// basic-block local redundant stores.
+//
+// FIXME: This should eventually be extended to be a post-dominator tree
+// traversal. Doing so would be pretty trivial.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstddef>
+#include <iterator>
+#include <map>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dse"
+
+STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
+STATISTIC(NumFastStores, "Number of stores deleted");
+STATISTIC(NumFastOther , "Number of other instrs removed");
+STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
+STATISTIC(NumModifiedStores, "Number of stores modified");
+
+static cl::opt<bool>
+EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
+ cl::init(true), cl::Hidden,
+ cl::desc("Enable partial-overwrite tracking in DSE"));
+
+static cl::opt<bool>
+EnablePartialStoreMerging("enable-dse-partial-store-merging",
+ cl::init(true), cl::Hidden,
+ cl::desc("Enable partial store merging in DSE"));
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+//===----------------------------------------------------------------------===//
+using OverlapIntervalsTy = std::map<int64_t, int64_t>;
+using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>;
+
+/// Delete this instruction. Before we do, go through and zero out all the
+/// operands of this instruction. If any of them become dead, delete them and
+/// the computation tree that feeds them.
+/// If ValueSet is non-null, remove any deleted instructions from it as well.
+static void
+deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
+ MemoryDependenceResults &MD, const TargetLibraryInfo &TLI,
+ InstOverlapIntervalsTy &IOL,
+ DenseMap<Instruction*, size_t> *InstrOrdering,
+ SmallSetVector<Value *, 16> *ValueSet = nullptr) {
+ SmallVector<Instruction*, 32> NowDeadInsts;
+
+ NowDeadInsts.push_back(I);
+ --NumFastOther;
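+ // The decrement cancels the increment the loop below performs for I itself;
+ // callers bump their own statistic for I, so NumFastOther only counts the
+ // operand instructions that become dead as a side effect.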
+
+ // Keeping the iterator straight is a pain, so we let this routine tell the
+ // caller what the next instruction is after we're done mucking about.
+ BasicBlock::iterator NewIter = *BBI;
+
+ // Before we touch this instruction, remove it from memdep!
+ do {
+ Instruction *DeadInst = NowDeadInsts.pop_back_val();
+ ++NumFastOther;
+
+ // This instruction is dead, zap it, in stages. Start by removing it from
+ // MemDep, which needs to know the operands and needs it to be in the
+ // function.
+ MD.removeInstruction(DeadInst);
+
+ for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
+ Value *Op = DeadInst->getOperand(op);
+ DeadInst->setOperand(op, nullptr);
+
+ // If this operand just became dead, add it to the NowDeadInsts list.
+ if (!Op->use_empty()) continue;
+
+ if (Instruction *OpI = dyn_cast<Instruction>(Op))
+ if (isInstructionTriviallyDead(OpI, &TLI))
+ NowDeadInsts.push_back(OpI);
+ }
+
+ if (ValueSet) ValueSet->remove(DeadInst);
+ InstrOrdering->erase(DeadInst);
+ IOL.erase(DeadInst);
+
+ if (NewIter == DeadInst->getIterator())
+ NewIter = DeadInst->eraseFromParent();
+ else
+ DeadInst->eraseFromParent();
+ } while (!NowDeadInsts.empty());
+ *BBI = NewIter;
+}
+
+/// Does this instruction write some memory? This only returns true for things
+/// that we can analyze with other helpers below.
+static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
+ if (isa<StoreInst>(I))
+ return true;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ case Intrinsic::init_trampoline:
+ case Intrinsic::lifetime_end:
+ return true;
+ }
+ }
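+ // A handful of string libcalls also write to their first argument; treating
+ // them as memory writers lets DSE reason about their destinations.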
+ if (auto CS = CallSite(I)) {
+ if (Function *F = CS.getCalledFunction()) {
+ StringRef FnName = F->getName();
+ if (TLI.has(LibFunc_strcpy) && FnName == TLI.getName(LibFunc_strcpy))
+ return true;
+ if (TLI.has(LibFunc_strncpy) && FnName == TLI.getName(LibFunc_strncpy))
+ return true;
+ if (TLI.has(LibFunc_strcat) && FnName == TLI.getName(LibFunc_strcat))
+ return true;
+ if (TLI.has(LibFunc_strncat) && FnName == TLI.getName(LibFunc_strncat))
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Return a Location stored to by the specified instruction. If isRemovable
+/// returns true, this function and getLocForRead completely describe the memory
+/// operations for this instruction.
+static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ return MemoryLocation::get(SI);
+
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) {
+ // memcpy/memmove/memset.
+ MemoryLocation Loc = MemoryLocation::getForDest(MI);
+ return Loc;
+ }
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
+ if (!II)
+ return MemoryLocation();
+
+ switch (II->getIntrinsicID()) {
+ default:
+ return MemoryLocation(); // Unhandled intrinsic.
+ case Intrinsic::init_trampoline:
+ // FIXME: We don't know the size of the trampoline, so we can't really
+ // handle it here.
+ return MemoryLocation(II->getArgOperand(0));
+ case Intrinsic::lifetime_end: {
+ uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
+ return MemoryLocation(II->getArgOperand(1), Len);
+ }
+ }
+}
+
+/// Return the location read by the specified "hasMemoryWrite" instruction if
+/// any.
+static MemoryLocation getLocForRead(Instruction *Inst,
+ const TargetLibraryInfo &TLI) {
+ assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case");
+
+ // The only instructions that both read and write are the mem transfer
+ // instructions (memcpy/memmove).
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst))
+ return MemoryLocation::getForSource(MTI);
+ return MemoryLocation();
+}
+
+/// If the value of this instruction and the memory it writes to is unused, may
+/// we delete this instruction?
+static bool isRemovable(Instruction *I) {
+ // Don't remove volatile/atomic stores.
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isUnordered();
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate");
+ case Intrinsic::lifetime_end:
+ // Never remove a dead lifetime_end; it may, for example, be followed by
+ // a free of the underlying object.
+ return false;
+ case Intrinsic::init_trampoline:
+ // Always safe to remove init_trampoline.
+ return true;
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ // Don't remove volatile memory intrinsics.
+ return !cast<MemIntrinsic>(II)->isVolatile();
+ }
+ }
+
+ if (auto CS = CallSite(I))
+ return CS.getInstruction()->use_empty();
+
+ return false;
+}
+
+/// Returns true if the end of this instruction can be safely shortened in
+/// length.
+static bool isShortenableAtTheEnd(Instruction *I) {
+ // Don't shorten stores for now
+ if (isa<StoreInst>(I))
+ return false;
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::memset:
+ case Intrinsic::memcpy:
+ // Do shorten memory intrinsics.
+ // FIXME: Add memmove if it's also safe to transform.
+ return true;
+ }
+ }
+
+ // Don't shorten libcalls for now.
+
+ return false;
+}
+
+/// Returns true if the beginning of this instruction can be safely shortened
+/// in length.
+static bool isShortenableAtTheBeginning(Instruction *I) {
+ // FIXME: Handle only memset for now. Supporting memcpy/memmove should be
+ // easily done by offsetting the source address.
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+ return II && II->getIntrinsicID() == Intrinsic::memset;
+}
+
+/// Return the pointer that is being written to.
+static Value *getStoredPointerOperand(Instruction *I) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getPointerOperand();
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+ return MI->getDest();
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::init_trampoline:
+ return II->getArgOperand(0);
+ }
+ }
+
+ CallSite CS(I);
+ // All the supported functions so far happen to have dest as their first
+ // argument.
+ return CS.getArgument(0);
+}
+
+static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
+ uint64_t Size;
+ if (getObjectSize(V, Size, DL, &TLI))
+ return Size;
+ return MemoryLocation::UnknownSize;
+}
+
+namespace {
+
+enum OverwriteResult {
+ OW_Begin,
+ OW_Complete,
+ OW_End,
+ OW_PartialEarlierWithFullLater,
+ OW_Unknown
+};
+
+} // end anonymous namespace
+
+/// Return 'OW_Complete' if a store to the 'Later' location completely
+/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the
+/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the
+/// beginning of the 'Earlier' location is overwritten by 'Later'.
+/// 'OW_PartialEarlierWithFullLater' means that an earlier (big) store was
+/// overwritten by a latter (smaller) store which doesn't write outside the big
+/// store's memory locations. Returns 'OW_Unknown' if nothing can be determined.
+static OverwriteResult isOverwrite(const MemoryLocation &Later,
+ const MemoryLocation &Earlier,
+ const DataLayout &DL,
+ const TargetLibraryInfo &TLI,
+ int64_t &EarlierOff, int64_t &LaterOff,
+ Instruction *DepWrite,
+ InstOverlapIntervalsTy &IOL) {
+ // If we don't know the sizes of either access, then we can't do a comparison.
+ if (Later.Size == MemoryLocation::UnknownSize ||
+ Earlier.Size == MemoryLocation::UnknownSize)
+ return OW_Unknown;
+
+ const Value *P1 = Earlier.Ptr->stripPointerCasts();
+ const Value *P2 = Later.Ptr->stripPointerCasts();
+
+ // If the start pointers are the same, we just have to compare sizes to see if
+ // the later store was larger than the earlier store.
+ if (P1 == P2) {
+ // Make sure that the Later size is >= the Earlier size.
+ if (Later.Size >= Earlier.Size)
+ return OW_Complete;
+ }
+
+ // Check to see if the later store is to the entire object (either a global,
+ // an alloca, or a byval/inalloca argument). If so, then it clearly
+ // overwrites any other store to the same object.
+ const Value *UO1 = GetUnderlyingObject(P1, DL),
+ *UO2 = GetUnderlyingObject(P2, DL);
+
+ // If we can't resolve the same pointers to the same object, then we can't
+ // analyze them at all.
+ if (UO1 != UO2)
+ return OW_Unknown;
+
+ // If the "Later" store is to a recognizable object, get its size.
+ uint64_t ObjectSize = getPointerSize(UO2, DL, TLI);
+ if (ObjectSize != MemoryLocation::UnknownSize)
+ if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size)
+ return OW_Complete;
+
+ // Okay, we have stores to two completely different pointers. Try to
+ // decompose the pointer into a "base + constant_offset" form. If the base
+ // pointers are equal, then we can reason about the two stores.
+ EarlierOff = 0;
+ LaterOff = 0;
+ const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL);
+ const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL);
+
+ // If the base pointers still differ, we have two completely different stores.
+ if (BP1 != BP2)
+ return OW_Unknown;
+
+ // The later store completely overlaps the earlier store if:
+ //
+ // 1. Both start at the same offset and the later one's size is greater than
+ // or equal to the earlier one's, or
+ //
+ // |--earlier--|
+ // |-- later --|
+ //
+ // 2. The earlier store has an offset greater than the later offset, but which
+ // still lies completely within the later store.
+ //
+ // |--earlier--|
+ // |----- later ------|
+ //
+ // We have to be careful here as *Off is signed while *.Size is unsigned.
+ if (EarlierOff >= LaterOff &&
+ Later.Size >= Earlier.Size &&
+ uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size)
+ return OW_Complete;
+
+ // We may now overlap, although the overlap is not complete. There might also
+ // be other incomplete overlaps, and together, they might cover the complete
+ // earlier write.
+ // Note: The correctness of this logic depends on the fact that this function
+ // is not even called providing DepWrite when there are any intervening reads.
+ if (EnablePartialOverwriteTracking &&
+ LaterOff < int64_t(EarlierOff + Earlier.Size) &&
+ int64_t(LaterOff + Later.Size) >= EarlierOff) {
+
+ // Insert our part of the overlap into the map.
+ auto &IM = IOL[DepWrite];
+ DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff << ", " <<
+ int64_t(EarlierOff + Earlier.Size) << ") Later [" <<
+ LaterOff << ", " << int64_t(LaterOff + Later.Size) << ")\n");
+
+ // Make sure that we only insert non-overlapping intervals and combine
+ // adjacent intervals. The intervals are stored in the map with the ending
+ // offset as the key (in the half-open sense) and the starting offset as
+ // the value.
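+ // For example, a later store covering bytes [4, 12) is recorded as
+ // IM[12] = 4.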
+ int64_t LaterIntStart = LaterOff, LaterIntEnd = LaterOff + Later.Size;
+
+ // Find any intervals ending at, or after, LaterIntStart which start
+ // before LaterIntEnd.
+ auto ILI = IM.lower_bound(LaterIntStart);
+ if (ILI != IM.end() && ILI->second <= LaterIntEnd) {
+ // This existing interval is overlapped with the current store somewhere
+ // in [LaterIntStart, LaterIntEnd]. Merge them by erasing the existing
+ // intervals and adjusting our start and end.
+ LaterIntStart = std::min(LaterIntStart, ILI->second);
+ LaterIntEnd = std::max(LaterIntEnd, ILI->first);
+ ILI = IM.erase(ILI);
+
+ // Continue erasing and adjusting our end in case other previous
+ // intervals are also overlapped with the current store.
+ //
+ // |--- earlier 1 ---| |--- earlier 2 ---|
+ // |------- later---------|
+ //
+ while (ILI != IM.end() && ILI->second <= LaterIntEnd) {
+ assert(ILI->second > LaterIntStart && "Unexpected interval");
+ LaterIntEnd = std::max(LaterIntEnd, ILI->first);
+ ILI = IM.erase(ILI);
+ }
+ }
+
+ IM[LaterIntEnd] = LaterIntStart;
+
+ ILI = IM.begin();
+ if (ILI->second <= EarlierOff &&
+ ILI->first >= int64_t(EarlierOff + Earlier.Size)) {
+ DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier [" <<
+ EarlierOff << ", " <<
+ int64_t(EarlierOff + Earlier.Size) <<
+ ") Composite Later [" <<
+ ILI->second << ", " << ILI->first << ")\n");
+ ++NumCompletePartials;
+ return OW_Complete;
+ }
+ }
+
+ // Check for an earlier store which writes to all the memory locations that
+ // the later store writes to.
+ if (EnablePartialStoreMerging && LaterOff >= EarlierOff &&
+ int64_t(EarlierOff + Earlier.Size) > LaterOff &&
+ uint64_t(LaterOff - EarlierOff) + Later.Size <= Earlier.Size) {
+ DEBUG(dbgs() << "DSE: Partial overwrite of an earlier store [" << EarlierOff
+ << ", " << int64_t(EarlierOff + Earlier.Size)
+ << ") by a later store [" << LaterOff << ", "
+ << int64_t(LaterOff + Later.Size) << ")\n");
+ // TODO: Maybe come up with a better name?
+ return OW_PartialEarlierWithFullLater;
+ }
+
+ // Another interesting case is if the later store overwrites the end of the
+ // earlier store.
+ //
+ // |--earlier--|
+ // |-- later --|
+ //
+ // In this case we may want to trim the size of earlier to avoid generating
+ // writes to addresses which will definitely be overwritten later
+ if (!EnablePartialOverwriteTracking &&
+ (LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + Earlier.Size) &&
+ int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size)))
+ return OW_End;
+
+ // Finally, we also need to check if the later store overwrites the beginning
+ // of the earlier store.
+ //
+ // |--earlier--|
+ // |-- later --|
+ //
+ // In this case we may want to move the destination address and trim the size
+ // of earlier to avoid generating writes to addresses which will definitely
+ // be overwritten later.
+ if (!EnablePartialOverwriteTracking &&
+ (LaterOff <= EarlierOff && int64_t(LaterOff + Later.Size) > EarlierOff)) {
+ assert(int64_t(LaterOff + Later.Size) <
+ int64_t(EarlierOff + Earlier.Size) &&
+ "Expect to be handled as OW_Complete");
+ return OW_Begin;
+ }
+ // Otherwise, they don't completely overlap.
+ return OW_Unknown;
+}
+
+/// If 'Inst' might be a self read (i.e. a noop copy of a
+/// memory region into an identical pointer) then it doesn't actually make its
+/// input dead in the traditional sense. Consider this case:
+///
+/// memcpy(A <- B)
+/// memcpy(A <- A)
+///
+/// In this case, the second store to A does not make the first store to A dead.
+/// The usual situation isn't an explicit A<-A store like this (which can be
+/// trivially removed) but a case where two pointers may alias.
+///
+/// This function detects when it is unsafe to remove a dependent instruction
+/// because the DSE inducing instruction may be a self-read.
+static bool isPossibleSelfRead(Instruction *Inst,
+ const MemoryLocation &InstStoreLoc,
+ Instruction *DepWrite,
+ const TargetLibraryInfo &TLI,
+ AliasAnalysis &AA) {
+ // Self reads can only happen for instructions that read memory. Get the
+ // location read.
+ MemoryLocation InstReadLoc = getLocForRead(Inst, TLI);
+ if (!InstReadLoc.Ptr) return false; // Not a reading instruction.
+
+ // If the read and written loc obviously don't alias, it isn't a read.
+ if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false;
+
+ // Okay, 'Inst' may copy over itself. However, we can still remove the
+ // DepWrite instruction if we can prove that it reads from the same location
+ // as Inst. This handles useful cases like:
+ // memcpy(A <- B)
+ // memcpy(A <- B)
+ // Here we don't know if A/B may alias, but we do know that B/B are must
+ // aliases, so removing the first memcpy is safe (assuming it writes <= #
+ // bytes as the second one).
+ MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
+
+ if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
+ return false;
+
+ // If DepWrite doesn't read memory or if we can't prove it is a must alias,
+ // then it can't be considered dead.
+ return true;
+}
+
+/// Returns true if the memory which is accessed by the second instruction is not
+/// modified between the first and the second instruction.
+/// Precondition: Second instruction must be dominated by the first
+/// instruction.
+static bool memoryIsNotModifiedBetween(Instruction *FirstI,
+ Instruction *SecondI,
+ AliasAnalysis *AA) {
+ SmallVector<BasicBlock *, 16> WorkList;
+ SmallPtrSet<BasicBlock *, 8> Visited;
+ BasicBlock::iterator FirstBBI(FirstI);
+ ++FirstBBI;
+ BasicBlock::iterator SecondBBI(SecondI);
+ BasicBlock *FirstBB = FirstI->getParent();
+ BasicBlock *SecondBB = SecondI->getParent();
+ MemoryLocation MemLoc = MemoryLocation::get(SecondI);
+
+ // Start checking the store-block.
+ WorkList.push_back(SecondBB);
+ bool isFirstBlock = true;
+
+ // Check all blocks going backward until we reach the load-block.
+ while (!WorkList.empty()) {
+ BasicBlock *B = WorkList.pop_back_val();
+
+ // Ignore instructions before FirstI if this is FirstBB.
+ BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin());
+
+ BasicBlock::iterator EI;
+ if (isFirstBlock) {
+ // Ignore instructions after SecondI if this is the first visit of SecondBB.
+ assert(B == SecondBB && "first block is not the store block");
+ EI = SecondBBI;
+ isFirstBlock = false;
+ } else {
+ // It's either not SecondBB, or (in case of a loop) a later visit of
+ // SecondBB. In that case we also have to look at instructions after
+ // SecondI.
+ EI = B->end();
+ }
+ for (; BI != EI; ++BI) {
+ Instruction *I = &*BI;
+ if (I->mayWriteToMemory() && I != SecondI)
+ if (isModSet(AA->getModRefInfo(I, MemLoc)))
+ return false;
+ }
+ if (B != FirstBB) {
+ assert(B != &FirstBB->getParent()->getEntryBlock() &&
+ "Should not hit the entry block because SI must be dominated by LI");
+ for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) {
+ if (!Visited.insert(*PredI).second)
+ continue;
+ WorkList.push_back(*PredI);
+ }
+ }
+ }
+ return true;
+}
+
+/// Find all blocks that will unconditionally lead to the block BB and append
+/// them to F.
+static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
+ BasicBlock *BB, DominatorTree *DT) {
+ for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+ BasicBlock *Pred = *I;
+ if (Pred == BB) continue;
+ TerminatorInst *PredTI = Pred->getTerminator();
+ if (PredTI->getNumSuccessors() != 1)
+ continue;
+
+ if (DT->isReachableFromEntry(Pred))
+ Blocks.push_back(Pred);
+ }
+}
+
+/// Handle frees of entire structures whose dependency is a store
+/// to a field of that structure.
+static bool handleFree(CallInst *F, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ InstOverlapIntervalsTy &IOL,
+ DenseMap<Instruction*, size_t> *InstrOrdering) {
+ bool MadeChange = false;
+
+ MemoryLocation Loc = MemoryLocation(F->getOperand(0));
+ SmallVector<BasicBlock *, 16> Blocks;
+ Blocks.push_back(F->getParent());
+ const DataLayout &DL = F->getModule()->getDataLayout();
+
+ while (!Blocks.empty()) {
+ BasicBlock *BB = Blocks.pop_back_val();
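+ // In the free's own block scan backwards from the free itself; in
+ // predecessor blocks scan backwards from the terminator.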
+ Instruction *InstPt = BB->getTerminator();
+ if (BB == F->getParent()) InstPt = F;
+
+ MemDepResult Dep =
+ MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB);
+ while (Dep.isDef() || Dep.isClobber()) {
+ Instruction *Dependency = Dep.getInst();
+ if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency))
+ break;
+
+ Value *DepPointer =
+ GetUnderlyingObject(getStoredPointerOperand(Dependency), DL);
+
+ // Check for aliasing.
+ if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
+ break;
+
+ DEBUG(dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: "
+ << *Dependency << '\n');
+
+ // DCE instructions only used to calculate that store.
+ BasicBlock::iterator BBI(Dependency);
+ deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL, InstrOrdering);
+ ++NumFastStores;
+ MadeChange = true;
+
+ // Inst's old Dependency is now deleted. Compute the next dependency,
+ // which may also be dead, as in
+ // s[0] = 0;
+ // s[1] = 0; // This has just been deleted.
+ // free(s);
+ Dep = MD->getPointerDependencyFrom(Loc, false, BBI, BB);
+ }
+
+ if (Dep.isNonLocal())
+ findUnconditionalPreds(Blocks, BB, DT);
+ }
+
+ return MadeChange;
+}
+
+/// Check to see if the specified location may alias any of the stack objects in
+/// the DeadStackObjects set. If so, they become live because the location is
+/// being loaded.
+static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
+ SmallSetVector<Value *, 16> &DeadStackObjects,
+ const DataLayout &DL, AliasAnalysis *AA,
+ const TargetLibraryInfo *TLI) {
+ const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL);
+
+ // A constant can't be in the dead pointer set.
+ if (isa<Constant>(UnderlyingPointer))
+ return;
+
+ // If the kill pointer can be easily reduced to an alloca, don't bother doing
+ // extraneous AA queries.
+ if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) {
+ DeadStackObjects.remove(const_cast<Value*>(UnderlyingPointer));
+ return;
+ }
+
+ // Remove objects that could alias LoadedLoc.
+ DeadStackObjects.remove_if([&](Value *I) {
+ // See if the loaded location could alias the stack location.
+ MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI));
+ return !AA->isNoAlias(StackLoc, LoadedLoc);
+ });
+}
+
+/// Remove dead stores to stack-allocated locations in the function end block.
+/// Ex:
+/// %A = alloca i32
+/// ...
+/// store i32 1, i32* %A
+/// ret void
+static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
+ MemoryDependenceResults *MD,
+ const TargetLibraryInfo *TLI,
+ InstOverlapIntervalsTy &IOL,
+ DenseMap<Instruction*, size_t> *InstrOrdering) {
+ bool MadeChange = false;
+
+ // Keep track of all of the stack objects that are dead at the end of the
+ // function.
+ SmallSetVector<Value*, 16> DeadStackObjects;
+
+ // Find all of the alloca'd pointers in the entry block.
+ BasicBlock &Entry = BB.getParent()->front();
+ for (Instruction &I : Entry) {
+ if (isa<AllocaInst>(&I))
+ DeadStackObjects.insert(&I);
+
+ // These are heap objects rather than stack objects, but if the pointer
+ // never escapes then the allocation is leaked by this function anyway, so
+ // stores into it at the end of the function are equally dead.
+ else if (isAllocLikeFn(&I, TLI) && !PointerMayBeCaptured(&I, true, true))
+ DeadStackObjects.insert(&I);
+ }
+
+ // Treat byval or inalloca arguments the same, stores to them are dead at the
+ // end of the function.
+ for (Argument &AI : BB.getParent()->args())
+ if (AI.hasByValOrInAllocaAttr())
+ DeadStackObjects.insert(&AI);
+
+ const DataLayout &DL = BB.getModule()->getDataLayout();
+
+ // Scan the basic block backwards
+ for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
+ --BBI;
+
+ // If we find a store, check to see if it points into a dead stack value.
+ if (hasMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
+ // See through pointer-to-pointer bitcasts
+ SmallVector<Value *, 4> Pointers;
+ GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL);
+
+ // Stores to stack values are valid candidates for removal.
+ bool AllDead = true;
+ for (Value *Pointer : Pointers)
+ if (!DeadStackObjects.count(Pointer)) {
+ AllDead = false;
+ break;
+ }
+
+ if (AllDead) {
+ Instruction *Dead = &*BBI;
+
+ DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
+ << *Dead << "\n Objects: ";
+ for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
+ E = Pointers.end(); I != E; ++I) {
+ dbgs() << **I;
+ if (std::next(I) != E)
+ dbgs() << ", ";
+ }
+ dbgs() << '\n');
+
+ // DCE instructions only used to calculate that store.
+ deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, InstrOrdering, &DeadStackObjects);
+ ++NumFastStores;
+ MadeChange = true;
+ continue;
+ }
+ }
+
+ // Remove any dead non-memory-mutating instructions.
+ if (isInstructionTriviallyDead(&*BBI, TLI)) {
+ DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
+ << *&*BBI << '\n');
+ deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, InstrOrdering, &DeadStackObjects);
+ ++NumFastOther;
+ MadeChange = true;
+ continue;
+ }
+
+ if (isa<AllocaInst>(BBI)) {
+ // Remove allocas from the list of dead stack objects; there can't be
+ // any references before the definition.
+ DeadStackObjects.remove(&*BBI);
+ continue;
+ }
+
+ if (auto CS = CallSite(&*BBI)) {
+ // Remove allocation function calls from the list of dead stack objects;
+ // there can't be any references before the definition.
+ if (isAllocLikeFn(&*BBI, TLI))
+ DeadStackObjects.remove(&*BBI);
+
+ // If this call does not access memory, it can't be loading any of our
+ // pointers.
+ if (AA->doesNotAccessMemory(CS))
+ continue;
+
+ // If the call might load from any of our allocas, then any store above
+ // the call is live.
+ DeadStackObjects.remove_if([&](Value *I) {
+ // See if the call site touches the value.
+ return isRefSet(AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI)));
+ });
+
+ // If all of the allocas were clobbered by the call then we're not going
+ // to find anything else to process.
+ if (DeadStackObjects.empty())
+ break;
+
+ continue;
+ }
+
+ // We can remove the dead stores irrespective of the fence and its ordering
+ // (release/acquire/seq_cst). A fence only constrains the ordering of stores
+ // that are already visible; it does not make a store visible to other
+ // threads. So skipping over a fence does not change a store from being
+ // dead.
+ if (isa<FenceInst>(*BBI))
+ continue;
+
+ MemoryLocation LoadedLoc;
+
+ // If we encounter a use of the pointer, it is no longer considered dead
+ if (LoadInst *L = dyn_cast<LoadInst>(BBI)) {
+ if (!L->isUnordered()) // Be conservative with atomic/volatile load
+ break;
+ LoadedLoc = MemoryLocation::get(L);
+ } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) {
+ LoadedLoc = MemoryLocation::get(V);
+ } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) {
+ LoadedLoc = MemoryLocation::getForSource(MTI);
+ } else if (!BBI->mayReadFromMemory()) {
+ // Instruction doesn't read memory. Note that stores that weren't removed
+ // above will hit this case.
+ continue;
+ } else {
+ // Unknown inst; assume it clobbers everything.
+ break;
+ }
+
+ // Remove any allocas from the DeadPointer set that are loaded, as this
+ // makes any stores above the access live.
+ removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI);
+
+ // If all of the allocas were clobbered by the access then we're not going
+ // to find anything else to process.
+ if (DeadStackObjects.empty())
+ break;
+ }
+
+ return MadeChange;
+}
+
+static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
+ int64_t &EarlierSize, int64_t LaterOffset,
+ int64_t LaterSize, bool IsOverwriteEnd) {
+ // TODO: base this on the target vector size so that if the earlier store
+ // was too small to get vector writes anyway then it's likely a good idea to
+ // shorten it.
+ // Power-of-2 vector writes are probably always a bad idea to optimize, as
+ // any store/memset/memcpy is likely using vector instructions, so shortening
+ // it to a non-vector size is likely to be slower.
+ MemIntrinsic *EarlierIntrinsic = cast<MemIntrinsic>(EarlierWrite);
+ unsigned EarlierWriteAlign = EarlierIntrinsic->getAlignment();
+ if (!IsOverwriteEnd)
+ LaterOffset = int64_t(LaterOffset + LaterSize);
+
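+ // Only shorten when the trimmed boundary keeps a usable alignment: it must
+ // either be a power of two no smaller than the original write alignment, or
+ // a multiple of that alignment.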
+ if (!(isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) &&
+ !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
+ return false;
+
+ DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
+ << (IsOverwriteEnd ? "END" : "BEGIN") << ": " << *EarlierWrite
+ << "\n KILLER (offset " << LaterOffset << ", " << EarlierSize
+ << ")\n");
+
+ int64_t NewLength = IsOverwriteEnd
+ ? LaterOffset - EarlierOffset
+ : EarlierSize - (LaterOffset - EarlierOffset);
+
+ Value *EarlierWriteLength = EarlierIntrinsic->getLength();
+ Value *TrimmedLength =
+ ConstantInt::get(EarlierWriteLength->getType(), NewLength);
+ EarlierIntrinsic->setLength(TrimmedLength);
+
+ EarlierSize = NewLength;
+ if (!IsOverwriteEnd) {
+ int64_t OffsetMoved = (LaterOffset - EarlierOffset);
+ Value *Indices[1] = {
+ ConstantInt::get(EarlierWriteLength->getType(), OffsetMoved)};
+ GetElementPtrInst *NewDestGEP = GetElementPtrInst::CreateInBounds(
+ EarlierIntrinsic->getRawDest(), Indices, "", EarlierWrite);
+ EarlierIntrinsic->setDest(NewDestGEP);
+ EarlierOffset = EarlierOffset + OffsetMoved;
+ }
+ return true;
+}
+
+static bool tryToShortenEnd(Instruction *EarlierWrite,
+ OverlapIntervalsTy &IntervalMap,
+ int64_t &EarlierStart, int64_t &EarlierSize) {
+ if (IntervalMap.empty() || !isShortenableAtTheEnd(EarlierWrite))
+ return false;
+
+ OverlapIntervalsTy::iterator OII = --IntervalMap.end();
+ int64_t LaterStart = OII->second;
+ int64_t LaterSize = OII->first - LaterStart;
+
+ if (LaterStart > EarlierStart && LaterStart < EarlierStart + EarlierSize &&
+ LaterStart + LaterSize >= EarlierStart + EarlierSize) {
+ if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
+ LaterSize, true)) {
+ IntervalMap.erase(OII);
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool tryToShortenBegin(Instruction *EarlierWrite,
+ OverlapIntervalsTy &IntervalMap,
+ int64_t &EarlierStart, int64_t &EarlierSize) {
+ if (IntervalMap.empty() || !isShortenableAtTheBeginning(EarlierWrite))
+ return false;
+
+ OverlapIntervalsTy::iterator OII = IntervalMap.begin();
+ int64_t LaterStart = OII->second;
+ int64_t LaterSize = OII->first - LaterStart;
+
+ if (LaterStart <= EarlierStart && LaterStart + LaterSize > EarlierStart) {
+ assert(LaterStart + LaterSize < EarlierStart + EarlierSize &&
+ "Should have been handled as OW_Complete");
+ if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
+ LaterSize, false)) {
+ IntervalMap.erase(OII);
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool removePartiallyOverlappedStores(AliasAnalysis *AA,
+ const DataLayout &DL,
+ InstOverlapIntervalsTy &IOL) {
+ bool Changed = false;
+ for (auto OI : IOL) {
+ Instruction *EarlierWrite = OI.first;
+ MemoryLocation Loc = getLocForWrite(EarlierWrite, *AA);
+ assert(isRemovable(EarlierWrite) && "Expect only removable instruction");
+ assert(Loc.Size != MemoryLocation::UnknownSize && "Unexpected mem loc");
+
+ const Value *Ptr = Loc.Ptr->stripPointerCasts();
+ int64_t EarlierStart = 0;
+ int64_t EarlierSize = int64_t(Loc.Size);
+ GetPointerBaseWithConstantOffset(Ptr, EarlierStart, DL);
+ OverlapIntervalsTy &IntervalMap = OI.second;
+ Changed |=
+ tryToShortenEnd(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
+ if (IntervalMap.empty())
+ continue;
+ Changed |=
+ tryToShortenBegin(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
+ }
+ return Changed;
+}
+
+static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
+ AliasAnalysis *AA, MemoryDependenceResults *MD,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI,
+ InstOverlapIntervalsTy &IOL,
+ DenseMap<Instruction*, size_t> *InstrOrdering) {
+ // Must be a store instruction.
+ StoreInst *SI = dyn_cast<StoreInst>(Inst);
+ if (!SI)
+ return false;
+
+ // If we're storing the same value back to a pointer that we just loaded from,
+ // then the store can be removed.
+ if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) {
+ if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
+ isRemovable(SI) && memoryIsNotModifiedBetween(DepLoad, SI, AA)) {
+
+ DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
+ << *DepLoad << "\n STORE: " << *SI << '\n');
+
+ deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, InstrOrdering);
+ ++NumRedundantStores;
+ return true;
+ }
+ }
+
+ // Remove null stores into the calloc'ed objects
+ Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand());
+ if (StoredConstant && StoredConstant->isNullValue() && isRemovable(SI)) {
+ Instruction *UnderlyingPointer =
+ dyn_cast<Instruction>(GetUnderlyingObject(SI->getPointerOperand(), DL));
+
+ if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
+ memoryIsNotModifiedBetween(UnderlyingPointer, SI, AA)) {
+ DEBUG(
+ dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
+ << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
+
+ deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, InstrOrdering);
+ ++NumRedundantStores;
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, DominatorTree *DT,
+ const TargetLibraryInfo *TLI) {
+ const DataLayout &DL = BB.getModule()->getDataLayout();
+ bool MadeChange = false;
+
+ // FIXME: Maybe change this to use some abstraction like OrderedBasicBlock?
+ // The current OrderedBasicBlock can't deal with mutation at the moment.
+ size_t LastThrowingInstIndex = 0;
+ DenseMap<Instruction*, size_t> InstrOrdering;
+ size_t InstrIndex = 1;
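+ // InstrOrdering assigns each visited instruction a monotonically increasing
+ // index so we can later cheaply check whether a dependent write precedes the
+ // most recent may-throw instruction.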
+
+ // A map of interval maps representing partially-overwritten value parts.
+ InstOverlapIntervalsTy IOL;
+
+ // Do a top-down walk on the BB.
+ for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
+ // Handle 'free' calls specially.
+ if (CallInst *F = isFreeCall(&*BBI, TLI)) {
+ MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, &InstrOrdering);
+ // Increment BBI after handleFree has potentially deleted instructions.
+ // This ensures we maintain a valid iterator.
+ ++BBI;
+ continue;
+ }
+
+ Instruction *Inst = &*BBI++;
+
+ size_t CurInstNumber = InstrIndex++;
+ InstrOrdering.insert(std::make_pair(Inst, CurInstNumber));
+ if (Inst->mayThrow()) {
+ LastThrowingInstIndex = CurInstNumber;
+ continue;
+ }
+
+ // Check to see if Inst writes to memory. If not, continue.
+ if (!hasMemoryWrite(Inst, *TLI))
+ continue;
+
+ // eliminateNoopStore will update the iterator, if necessary.
+ if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL, &InstrOrdering)) {
+ MadeChange = true;
+ continue;
+ }
+
+ // If we find something that writes memory, get its memory dependence.
+ MemDepResult InstDep = MD->getDependency(Inst);
+
+ // Ignore any store where we can't find a local dependence.
+ // FIXME: cross-block DSE would be fun. :)
+ if (!InstDep.isDef() && !InstDep.isClobber())
+ continue;
+
+ // Figure out what location is being stored to.
+ MemoryLocation Loc = getLocForWrite(Inst, *AA);
+
+ // If we didn't get a useful location, fail.
+ if (!Loc.Ptr)
+ continue;
+
+ // Loop until we find a store we can eliminate or a load that
+ // invalidates the analysis. Without an upper bound on the number of
+ // instructions examined, this analysis can become very time-consuming.
+ // However, the potential gain diminishes as we process more instructions
+ // without eliminating any of them. Therefore, we limit the number of
+ // instructions we look at.
+ auto Limit = MD->getDefaultBlockScanLimit();
+ while (InstDep.isDef() || InstDep.isClobber()) {
+ // Get the memory clobbered by the instruction we depend on. MemDep will
+ // skip any instructions that 'Loc' clearly doesn't interact with. If we
+ // end up depending on a may- or must-aliased load, then we can't optimize
+ // away the store and we bail out. However, if we depend on something
+ // that overwrites the memory location we *can* potentially optimize it.
+ //
+ // Find out what memory location the dependent instruction stores.
+ Instruction *DepWrite = InstDep.getInst();
+ MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA);
+ // If we didn't get a useful location, or if it isn't a size, bail out.
+ if (!DepLoc.Ptr)
+ break;
+
+ // Make sure we don't look past a call which might throw. This is an
+ // issue because MemoryDependenceAnalysis works in the wrong direction:
+ // it finds instructions which dominate the current instruction, rather than
+ // instructions which are post-dominated by the current instruction.
+ //
+ // If the underlying object is a non-escaping memory allocation, any store
+ // to it is dead along the unwind edge. Otherwise, we need to preserve
+ // the store.
+ size_t DepIndex = InstrOrdering.lookup(DepWrite);
+ assert(DepIndex && "Unexpected instruction");
+ if (DepIndex <= LastThrowingInstIndex) {
+ const Value* Underlying = GetUnderlyingObject(DepLoc.Ptr, DL);
+ bool IsStoreDeadOnUnwind = isa<AllocaInst>(Underlying);
+ if (!IsStoreDeadOnUnwind) {
+ // We're looking for a call to an allocation function where the allocation
+ // doesn't escape before the last throwing instruction; PointerMayBeCaptured
+ // is a reasonably fast approximation of that.
+ IsStoreDeadOnUnwind = isAllocLikeFn(Underlying, TLI) &&
+ !PointerMayBeCaptured(Underlying, false, true);
+ }
+ if (!IsStoreDeadOnUnwind)
+ break;
+ }
+
+ // If we find a write that is a) removable (i.e., non-volatile), b) is
+ // completely obliterated by the store to 'Loc', and c) not loaded from by
+ // 'Inst', then we can remove it.
+ // Also try to merge two stores if a later one only touches memory written
+ // to by the earlier one.
+ if (isRemovable(DepWrite) &&
+ !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
+ int64_t InstWriteOffset, DepWriteOffset;
+ OverwriteResult OR =
+ isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset,
+ DepWrite, IOL);
+ if (OR == OW_Complete) {
+ DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
+ << *DepWrite << "\n KILLER: " << *Inst << '\n');
+
+ // Delete the store and now-dead instructions that feed it.
+ deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, &InstrOrdering);
+ ++NumFastStores;
+ MadeChange = true;
+
+ // We erased DepWrite; start over.
+ InstDep = MD->getDependency(Inst);
+ continue;
+ } else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) ||
+ ((OR == OW_Begin &&
+ isShortenableAtTheBeginning(DepWrite)))) {
+ assert(!EnablePartialOverwriteTracking && "Do not expect to perform "
+ "when partial-overwrite "
+ "tracking is enabled");
+ int64_t EarlierSize = DepLoc.Size;
+ int64_t LaterSize = Loc.Size;
+ bool IsOverwriteEnd = (OR == OW_End);
+ MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
+ InstWriteOffset, LaterSize, IsOverwriteEnd);
+ } else if (EnablePartialStoreMerging &&
+ OR == OW_PartialEarlierWithFullLater) {
+ auto *Earlier = dyn_cast<StoreInst>(DepWrite);
+ auto *Later = dyn_cast<StoreInst>(Inst);
+ if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) &&
+ Later && isa<ConstantInt>(Later->getValueOperand()) &&
+ memoryIsNotModifiedBetween(Earlier, Later, AA)) {
+ // If the store we find is:
+ //   a) partially overwritten by the store to 'Loc',
+ //   b) fully contains the later store, and
+ //   c) both stores have a constant value,
+ // then merge the two stores, replacing the earlier store's value with a
+ // merge of both values.
+ // TODO: Deal with other constant types (vectors, etc), and probably
+ // some mem intrinsics (if needed)
+
+ APInt EarlierValue =
+ cast<ConstantInt>(Earlier->getValueOperand())->getValue();
+ APInt LaterValue =
+ cast<ConstantInt>(Later->getValueOperand())->getValue();
+ unsigned LaterBits = LaterValue.getBitWidth();
+ assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth());
+ LaterValue = LaterValue.zext(EarlierValue.getBitWidth());
+
+ // Offset of the smaller store inside the larger store
+ unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8;
+ unsigned LShiftAmount =
+ DL.isBigEndian()
+ ? EarlierValue.getBitWidth() - BitOffsetDiff - LaterBits
+ : BitOffsetDiff;
+ APInt Mask =
+ APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount,
+ LShiftAmount + LaterBits);
+ // Clear the bits we'll be replacing, then OR with the smaller
+ // store, shifted appropriately.
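+ // As an illustrative example (not from the original source): merging a
+ // 4-byte earlier store of 0x00000000 with a 1-byte later store of 0xAB at
+ // byte offset 2 on a little-endian target gives BitOffsetDiff = 16,
+ // Mask = 0x00FF0000, and Merged = 0x00AB0000.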
+ APInt Merged =
+ (EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
+ DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *DepWrite
+ << "\n Later: " << *Inst
+ << "\n Merged Value: " << Merged << '\n');
+
+ auto *SI = new StoreInst(
+ ConstantInt::get(Earlier->getValueOperand()->getType(), Merged),
+ Earlier->getPointerOperand(), false, Earlier->getAlignment(),
+ Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite);
+
+ unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_nontemporal};
+ SI->copyMetadata(*DepWrite, MDToKeep);
+ ++NumModifiedStores;
+
+ // Give the new merged store the ordering slot of the earlier, wider store.
+ size_t Idx = InstrOrdering.lookup(DepWrite);
+ InstrOrdering.erase(DepWrite);
+ InstrOrdering.insert(std::make_pair(SI, Idx));
+
+ // Delete the old stores and now-dead instructions that feed them.
+ deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, &InstrOrdering);
+ deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL,
+ &InstrOrdering);
+ MadeChange = true;
+
+ // We erased DepWrite and Inst (Loc); start over.
+ break;
+ }
+ }
+ }
+
+ // If this is a may-aliased store that is clobbering the store value, we
+ // can keep searching past it for another must-aliased pointer that stores
+ // to the same location. For example, in:
+ // store -> P
+ // store -> Q
+ // store -> P
+ // we can remove the first store to P even though we don't know if P and Q
+ // alias.
+ if (DepWrite == &BB.front()) break;
+
+ // Can't look past this instruction if it might read 'Loc'.
+ if (isRefSet(AA->getModRefInfo(DepWrite, Loc)))
+ break;
+
+ InstDep = MD->getPointerDependencyFrom(Loc, /*isLoad=*/ false,
+ DepWrite->getIterator(), &BB,
+ /*QueryInst=*/ nullptr, &Limit);
+ }
+ }
+
+ if (EnablePartialOverwriteTracking)
+ MadeChange |= removePartiallyOverlappedStores(AA, DL, IOL);
+
+ // If this block ends in a return, unwind, or unreachable, all allocas are
+ // dead at its end, which means stores to them are also dead.
+ if (BB.getTerminator()->getNumSuccessors() == 0)
+ MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, &InstrOrdering);
+
+ return MadeChange;
+}
+
+static bool eliminateDeadStores(Function &F, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, DominatorTree *DT,
+ const TargetLibraryInfo *TLI) {
+ bool MadeChange = false;
+ for (BasicBlock &BB : F)
+ // Only check non-dead blocks. Dead blocks may have strange pointer
+ // cycles that will confuse alias analysis.
+ if (DT->isReachableFromEntry(&BB))
+ MadeChange |= eliminateDeadStores(BB, AA, MD, DT, TLI);
+
+ return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+// DSE Pass
+//===----------------------------------------------------------------------===//
+PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
+ AliasAnalysis *AA = &AM.getResult<AAManager>(F);
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ MemoryDependenceResults *MD = &AM.getResult<MemoryDependenceAnalysis>(F);
+ const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+
+ if (!eliminateDeadStores(F, AA, MD, DT, TLI))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<MemoryDependenceAnalysis>();
+ return PA;
+}
+
+namespace {
+
+/// A legacy pass for the legacy pass manager that wraps \c DSEPass.
+class DSELegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ DSELegacyPass() : FunctionPass(ID) {
+ initializeDSELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ MemoryDependenceResults *MD =
+ &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ return eliminateDeadStores(F, AA, MD, DT, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char DSELegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
+ false)
+
+FunctionPass *llvm::createDeadStoreEliminationPass() {
+ return new DSELegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/DivRemPairs.cpp b/contrib/llvm/lib/Transforms/Scalar/DivRemPairs.cpp
new file mode 100644
index 000000000000..e383af89a384
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/DivRemPairs.cpp
@@ -0,0 +1,206 @@
+//===- DivRemPairs.cpp - Hoist/decompose division and remainder -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists and/or decomposes integer division and remainder
+// instructions to enable CFG improvements and better codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/DivRemPairs.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "div-rem-pairs"
+STATISTIC(NumPairs, "Number of div/rem pairs");
+STATISTIC(NumHoisted, "Number of instructions hoisted");
+STATISTIC(NumDecomposed, "Number of instructions decomposed");
+
+/// Find matching pairs of integer div/rem ops (they have the same numerator,
+/// denominator, and signedness). If they exist in different basic blocks, bring
+/// them together by hoisting one, or replace the remainder with the
+/// equivalent expression built from its implicit division:
+/// X % Y <--> X - ((X / Y) * Y).
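+/// For example (a worked instance, not from the original source), with
+/// X = 7 and Y = 3: 7 % 3 == 7 - ((7 / 3) * 3) == 7 - 6 == 1.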
+///
+/// We can largely ignore the normal safety and cost constraints on speculation
+/// of these ops when we find a matching pair. This is because we are already
+/// guaranteed that any exceptions and most cost are already incurred by the
+/// first member of the pair.
+///
+/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or
+/// SimplifyCFG, but it's split off on its own because it's different enough
+/// that it doesn't quite match the stated objectives of those passes.
+static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
+ const DominatorTree &DT) {
+ bool Changed = false;
+
+ // Insert all divide and remainder instructions into maps keyed by their
+ // operands and opcode (signed or unsigned).
+ DenseMap<DivRemMapKey, Instruction *> DivMap, RemMap;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ if (I.getOpcode() == Instruction::SDiv)
+ DivMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
+ else if (I.getOpcode() == Instruction::UDiv)
+ DivMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
+ else if (I.getOpcode() == Instruction::SRem)
+ RemMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
+ else if (I.getOpcode() == Instruction::URem)
+ RemMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
+ }
+ }
+
+ // We can iterate over either map because we are only looking for matched
+ // pairs. Choose remainders for efficiency because they are usually even
+ // rarer than divisions.
+ for (auto &RemPair : RemMap) {
+ // Find the matching division instruction from the division map.
+ Instruction *DivInst = DivMap[RemPair.getFirst()];
+ if (!DivInst)
+ continue;
+
+ // We have a matching pair of div/rem instructions. If one dominates the
+ // other, hoist and/or replace one.
+ NumPairs++;
+ Instruction *RemInst = RemPair.getSecond();
+ bool IsSigned = DivInst->getOpcode() == Instruction::SDiv;
+ bool HasDivRemOp = TTI.hasDivRemOp(DivInst->getType(), IsSigned);
+
+ // If the target supports div+rem and the instructions are in the same block
+ // already, there's nothing to do. The backend should handle this. If the
+ // target does not support div+rem, then we will decompose the rem.
+ if (HasDivRemOp && RemInst->getParent() == DivInst->getParent())
+ continue;
+
+ bool DivDominates = DT.dominates(DivInst, RemInst);
+ if (!DivDominates && !DT.dominates(RemInst, DivInst))
+ continue;
+
+ if (HasDivRemOp) {
+ // The target has a single div/rem operation. Hoist the lower instruction
+ // to make the matched pair visible to the backend.
+ if (DivDominates)
+ RemInst->moveAfter(DivInst);
+ else
+ DivInst->moveAfter(RemInst);
+ NumHoisted++;
+ } else {
+ // The target does not have a single div/rem operation. Decompose the
+ // remainder calculation as:
+ // X % Y --> X - ((X / Y) * Y).
+ Value *X = RemInst->getOperand(0);
+ Value *Y = RemInst->getOperand(1);
+ Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y);
+ Instruction *Sub = BinaryOperator::CreateSub(X, Mul);
+
+ // If the remainder dominates, then hoist the division up to that block:
+ //
+ // bb1:
+ // %rem = srem %x, %y
+ // bb2:
+ // %div = sdiv %x, %y
+ // -->
+ // bb1:
+ // %div = sdiv %x, %y
+ // %mul = mul %div, %y
+ // %rem = sub %x, %mul
+ //
+ // If the division dominates, it's already in the right place. The mul+sub
+ // will be in a different block because we don't assume that they are
+ // cheap to speculatively execute:
+ //
+ // bb1:
+ // %div = sdiv %x, %y
+ // bb2:
+ // %rem = srem %x, %y
+ // -->
+ // bb1:
+ // %div = sdiv %x, %y
+ // bb2:
+ // %mul = mul %div, %y
+ // %rem = sub %x, %mul
+ //
+ // If the div and rem are in the same block, we do the same transform,
+ // but any code movement would be within the same block.
+
+ if (!DivDominates)
+ DivInst->moveBefore(RemInst);
+ Mul->insertAfter(RemInst);
+ Sub->insertAfter(Mul);
+
+ // Now kill the explicit remainder. We have replaced it with:
+ // (sub X, (mul (div X, Y), Y))
+ RemInst->replaceAllUsesWith(Sub);
+ RemInst->eraseFromParent();
+ NumDecomposed++;
+ }
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+// Pass manager boilerplate below here.
+
+namespace {
+struct DivRemPairsLegacyPass : public FunctionPass {
+ static char ID;
+ DivRemPairsLegacyPass() : FunctionPass(ID) {
+ initializeDivRemPairsLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return optimizeDivRem(F, TTI, DT);
+ }
+};
+} // end anonymous namespace
+
+char DivRemPairsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(DivRemPairsLegacyPass, "div-rem-pairs",
+ "Hoist/decompose integer division and remainder", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(DivRemPairsLegacyPass, "div-rem-pairs",
+ "Hoist/decompose integer division and remainder", false,
+ false)
+FunctionPass *llvm::createDivRemPairsPass() {
+ return new DivRemPairsLegacyPass();
+}
+
+PreservedAnalyses DivRemPairsPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ if (!optimizeDivRem(F, TTI, DT))
+ return PreservedAnalyses::all();
+ // TODO: This pass just hoists/replaces math ops - all analyses are preserved?
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
new file mode 100644
index 000000000000..5798e1c4ee99
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -0,0 +1,1185 @@
+//===- EarlyCSE.cpp - Simple and fast CSE pass ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs a simple dominator tree walk that eliminates trivially
+// redundant instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/RecyclingAllocator.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <deque>
+#include <memory>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "early-cse"
+
+STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
+STATISTIC(NumCSE, "Number of instructions CSE'd");
+STATISTIC(NumCSECVP, "Number of compare instructions CVP'd");
+STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
+STATISTIC(NumCSECall, "Number of call instructions CSE'd");
+STATISTIC(NumDSE, "Number of trivial dead stores removed");
+
+//===----------------------------------------------------------------------===//
+// SimpleValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// \brief Struct representing the available values in the scoped hash table.
+struct SimpleValue {
+ Instruction *Inst;
+
+ SimpleValue(Instruction *I) : Inst(I) {
+ assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+ }
+
+ bool isSentinel() const {
+ return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+ Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static bool canHandle(Instruction *Inst) {
+ // This can only handle non-void readnone functions.
+ if (CallInst *CI = dyn_cast<CallInst>(Inst))
+ return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
+ return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) ||
+ isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) ||
+ isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
+ isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
+ isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst);
+ }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <> struct DenseMapInfo<SimpleValue> {
+ static inline SimpleValue getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+
+ static inline SimpleValue getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static unsigned getHashValue(SimpleValue Val);
+ static bool isEqual(SimpleValue LHS, SimpleValue RHS);
+};
+
+} // end namespace llvm
+
+unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
+ Instruction *Inst = Val.Inst;
+ // Hash in all of the operands as pointers.
+ if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst)) {
+ Value *LHS = BinOp->getOperand(0);
+ Value *RHS = BinOp->getOperand(1);
+ if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1))
+ std::swap(LHS, RHS);
+
+ return hash_combine(BinOp->getOpcode(), LHS, RHS);
+ }
+
+ if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) {
+ Value *LHS = CI->getOperand(0);
+ Value *RHS = CI->getOperand(1);
+ CmpInst::Predicate Pred = CI->getPredicate();
+ if (Inst->getOperand(0) > Inst->getOperand(1)) {
+ std::swap(LHS, RHS);
+ Pred = CI->getSwappedPredicate();
+ }
+ return hash_combine(Inst->getOpcode(), Pred, LHS, RHS);
+ }
+
+ // Hash min/max/abs (cmp + select) to allow for commuted operands.
+ // Min/max may also have non-canonical compare predicate (eg, the compare for
+ // smin may use 'sgt' rather than 'slt'), and non-canonical operands in the
+ // compare.
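+ // For example (illustrative IR, not from the original source),
+ // 'select (icmp sgt %a, %b), %b, %a' and 'select (icmp slt %a, %b), %a, %b'
+ // both match as smin(%a, %b) and are intended to hash identically.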
+ Value *A, *B;
+ SelectPatternFlavor SPF = matchSelectPattern(Inst, A, B).Flavor;
+ // TODO: We should also detect FP min/max.
+ if (SPF == SPF_SMIN || SPF == SPF_SMAX ||
+ SPF == SPF_UMIN || SPF == SPF_UMAX ||
+ SPF == SPF_ABS || SPF == SPF_NABS) {
+ if (A > B)
+ std::swap(A, B);
+ return hash_combine(Inst->getOpcode(), SPF, A, B);
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(Inst))
+ return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));
+
+ if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst))
+ return hash_combine(EVI->getOpcode(), EVI->getOperand(0),
+ hash_combine_range(EVI->idx_begin(), EVI->idx_end()));
+
+ if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst))
+ return hash_combine(IVI->getOpcode(), IVI->getOperand(0),
+ IVI->getOperand(1),
+ hash_combine_range(IVI->idx_begin(), IVI->idx_end()));
+
+ assert((isa<CallInst>(Inst) || isa<BinaryOperator>(Inst) ||
+ isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
+ isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
+ isa<ShuffleVectorInst>(Inst)) &&
+ "Invalid/unknown instruction");
+
+ // Mix in the opcode.
+ return hash_combine(
+ Inst->getOpcode(),
+ hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
+}
+
+bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
+ Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
+
+ if (LHS.isSentinel() || RHS.isSentinel())
+ return LHSI == RHSI;
+
+ if (LHSI->getOpcode() != RHSI->getOpcode())
+ return false;
+ if (LHSI->isIdenticalToWhenDefined(RHSI))
+ return true;
+
+ // If we're not strictly identical, we still might be a commutable instruction
+ if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) {
+ if (!LHSBinOp->isCommutative())
+ return false;
+
+ assert(isa<BinaryOperator>(RHSI) &&
+ "same opcode, but different instruction type?");
+ BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI);
+
+ // Commuted equality
+ return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) &&
+ LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
+ }
+ if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) {
+ assert(isa<CmpInst>(RHSI) &&
+ "same opcode, but different instruction type?");
+ CmpInst *RHSCmp = cast<CmpInst>(RHSI);
+ // Commuted equality
+ return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) &&
+ LHSCmp->getOperand(1) == RHSCmp->getOperand(0) &&
+ LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
+ }
+
+ // Min/max/abs can occur with commuted operands, non-canonical predicates,
+ // and/or non-canonical operands.
+ Value *LHSA, *LHSB;
+ SelectPatternFlavor LSPF = matchSelectPattern(LHSI, LHSA, LHSB).Flavor;
+ // TODO: We should also detect FP min/max.
+ if (LSPF == SPF_SMIN || LSPF == SPF_SMAX ||
+ LSPF == SPF_UMIN || LSPF == SPF_UMAX ||
+ LSPF == SPF_ABS || LSPF == SPF_NABS) {
+ Value *RHSA, *RHSB;
+ SelectPatternFlavor RSPF = matchSelectPattern(RHSI, RHSA, RHSB).Flavor;
+ return (LSPF == RSPF && ((LHSA == RHSA && LHSB == RHSB) ||
+ (LHSA == RHSB && LHSB == RHSA)));
+ }
+
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// CallValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// \brief Struct representing the available call values in the scoped hash
+/// table.
+struct CallValue {
+ Instruction *Inst;
+
+ CallValue(Instruction *I) : Inst(I) {
+ assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+ }
+
+ bool isSentinel() const {
+ return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+ Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static bool canHandle(Instruction *Inst) {
+ // Don't value number anything that returns void.
+ if (Inst->getType()->isVoidTy())
+ return false;
+
+ CallInst *CI = dyn_cast<CallInst>(Inst);
+ if (!CI || !CI->onlyReadsMemory())
+ return false;
+ return true;
+ }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <> struct DenseMapInfo<CallValue> {
+ static inline CallValue getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+
+ static inline CallValue getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static unsigned getHashValue(CallValue Val);
+ static bool isEqual(CallValue LHS, CallValue RHS);
+};
+
+} // end namespace llvm
+
+unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
+ Instruction *Inst = Val.Inst;
+ // Hash all of the operands as pointers and mix in the opcode.
+ return hash_combine(
+ Inst->getOpcode(),
+ hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
+}
+
+bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
+ Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
+ if (LHS.isSentinel() || RHS.isSentinel())
+ return LHSI == RHSI;
+ return LHSI->isIdenticalTo(RHSI);
+}
+
+//===----------------------------------------------------------------------===//
+// EarlyCSE implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// \brief A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+class EarlyCSE {
+public:
+ const TargetLibraryInfo &TLI;
+ const TargetTransformInfo &TTI;
+ DominatorTree &DT;
+ AssumptionCache &AC;
+ const SimplifyQuery SQ;
+ MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
+
+ using AllocatorTy =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<SimpleValue, Value *>>;
+ using ScopedHTType =
+ ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
+ AllocatorTy>;
+
+ /// \brief A scoped hash table of the current values of all of our simple
+ /// scalar expressions.
+ ///
+ /// As we walk down the domtree, we look to see if instructions are in this:
+ /// if so, we replace them with what we find, otherwise we insert them so
+ /// that dominated values can succeed in their lookup.
+ ScopedHTType AvailableValues;
+
+ /// A scoped hash table of the current values of previously encountered memory
+ /// locations.
+ ///
+ /// This allows us to get efficient access to dominating loads or stores when
+ /// we have a fully redundant load. In addition to the most recent load, we
+ /// keep track of a generation count of the read, which is compared against
+ /// the current generation count. The current generation count is incremented
+ /// after every possibly writing memory operation, which ensures that we only
+ /// CSE loads with other loads that have no intervening store. Ordering
+ /// events (such as fences or atomic instructions) increment the generation
+ /// count as well; essentially, we model these as writes to all possible
+ /// locations. Note that atomic and/or volatile loads and stores can be
+ /// present in the table; it is the responsibility of the consumer to inspect
+ /// the atomicity/volatility if needed.
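+ ///
+ /// For example (illustrative IR, not from the original source):
+ /// %v = load i32, i32* %p ; recorded at generation N
+ /// call void @f() ; may write memory, generation becomes N+1
+ /// %w = load i32, i32* %p ; generation N+1 != N, so the simple generation
+ /// ; check alone does not allow CSE (MemorySSA,
+ /// ; if available, may still prove it safe)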
+ struct LoadValue {
+ Instruction *DefInst = nullptr;
+ unsigned Generation = 0;
+ int MatchingId = -1;
+ bool IsAtomic = false;
+ bool IsInvariant = false;
+
+ LoadValue() = default;
+ LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId,
+ bool IsAtomic, bool IsInvariant)
+ : DefInst(Inst), Generation(Generation), MatchingId(MatchingId),
+ IsAtomic(IsAtomic), IsInvariant(IsInvariant) {}
+ };
+
+ using LoadMapAllocator =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<Value *, LoadValue>>;
+ using LoadHTType =
+ ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>,
+ LoadMapAllocator>;
+
+ LoadHTType AvailableLoads;
+
+ /// \brief A scoped hash table of the current values of read-only call
+ /// values.
+ ///
+ /// It uses the same generation count as loads.
+ using CallHTType =
+ ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
+ CallHTType AvailableCalls;
+
+ /// \brief This is the current generation of the memory value.
+ unsigned CurrentGeneration = 0;
+
+ /// \brief Set up the EarlyCSE runner for a particular function.
+ EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI,
+ const TargetTransformInfo &TTI, DominatorTree &DT,
+ AssumptionCache &AC, MemorySSA *MSSA)
+ : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA),
+ MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {}
+
+ bool run();
+
+private:
+ // Almost a POD, but needs to call the constructors for the scoped hash
+ // tables so that a new scope gets pushed on. These are RAII so that the
+ // scope gets popped when the NodeScope is destroyed.
+ class NodeScope {
+ public:
+ NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+ CallHTType &AvailableCalls)
+ : Scope(AvailableValues), LoadScope(AvailableLoads),
+ CallScope(AvailableCalls) {}
+ NodeScope(const NodeScope &) = delete;
+ NodeScope &operator=(const NodeScope &) = delete;
+
+ private:
+ ScopedHTType::ScopeTy Scope;
+ LoadHTType::ScopeTy LoadScope;
+ CallHTType::ScopeTy CallScope;
+ };
+
+ // Contains all the needed information to create a stack for doing a depth
+ // first traversal of the tree. This includes scopes for values, loads, and
+ // calls as well as the generation. There is a child iterator so that the
+ // children do not need to be stored separately.
+ class StackNode {
+ public:
+ StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+ CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
+ DomTreeNode::iterator child, DomTreeNode::iterator end)
+ : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
+ EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls)
+ {}
+ StackNode(const StackNode &) = delete;
+ StackNode &operator=(const StackNode &) = delete;
+
+ // Accessors.
+ unsigned currentGeneration() { return CurrentGeneration; }
+ unsigned childGeneration() { return ChildGeneration; }
+ void childGeneration(unsigned generation) { ChildGeneration = generation; }
+ DomTreeNode *node() { return Node; }
+ DomTreeNode::iterator childIter() { return ChildIter; }
+
+ DomTreeNode *nextChild() {
+ DomTreeNode *child = *ChildIter;
+ ++ChildIter;
+ return child;
+ }
+
+ DomTreeNode::iterator end() { return EndIter; }
+ bool isProcessed() { return Processed; }
+ void process() { Processed = true; }
+
+ private:
+ unsigned CurrentGeneration;
+ unsigned ChildGeneration;
+ DomTreeNode *Node;
+ DomTreeNode::iterator ChildIter;
+ DomTreeNode::iterator EndIter;
+ NodeScope Scopes;
+ bool Processed = false;
+ };
+
+ /// \brief Wrapper class to handle memory instructions, including loads,
+ /// stores and intrinsic loads and stores defined by the target.
+ class ParseMemoryInst {
+ public:
+ ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
+ : Inst(Inst) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
+ if (TTI.getTgtMemIntrinsic(II, Info))
+ IsTargetMemInst = true;
+ }
+
+ bool isLoad() const {
+ if (IsTargetMemInst) return Info.ReadMem;
+ return isa<LoadInst>(Inst);
+ }
+
+ bool isStore() const {
+ if (IsTargetMemInst) return Info.WriteMem;
+ return isa<StoreInst>(Inst);
+ }
+
+ bool isAtomic() const {
+ if (IsTargetMemInst)
+ return Info.Ordering != AtomicOrdering::NotAtomic;
+ return Inst->isAtomic();
+ }
+
+ bool isUnordered() const {
+ if (IsTargetMemInst)
+ return Info.isUnordered();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->isUnordered();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->isUnordered();
+ }
+ // Conservative answer
+ return !Inst->isAtomic();
+ }
+
+ bool isVolatile() const {
+ if (IsTargetMemInst)
+ return Info.IsVolatile;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->isVolatile();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->isVolatile();
+ }
+ // Conservative answer
+ return true;
+ }
+
+ bool isInvariantLoad() const {
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
+ return false;
+ }
+
+ bool isMatchingMemLoc(const ParseMemoryInst &Inst) const {
+ return (getPointerOperand() == Inst.getPointerOperand() &&
+ getMatchingId() == Inst.getMatchingId());
+ }
+
+ bool isValid() const { return getPointerOperand() != nullptr; }
+
+ // For regular (non-intrinsic) loads/stores, this is set to -1. For
+ // intrinsic loads/stores, the id is retrieved from the corresponding
+ // field in the MemIntrinsicInfo structure. That field contains
+ // non-negative values only.
+ int getMatchingId() const {
+ if (IsTargetMemInst) return Info.MatchingId;
+ return -1;
+ }
+
+ Value *getPointerOperand() const {
+ if (IsTargetMemInst) return Info.PtrVal;
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->getPointerOperand();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->getPointerOperand();
+ }
+ return nullptr;
+ }
+
+ bool mayReadFromMemory() const {
+ if (IsTargetMemInst) return Info.ReadMem;
+ return Inst->mayReadFromMemory();
+ }
+
+ bool mayWriteToMemory() const {
+ if (IsTargetMemInst) return Info.WriteMem;
+ return Inst->mayWriteToMemory();
+ }
+
+ private:
+ bool IsTargetMemInst = false;
+ MemIntrinsicInfo Info;
+ Instruction *Inst;
+ };
+
+ bool processNode(DomTreeNode *Node);
+
+ Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return LI;
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
+ return SI->getValueOperand();
+ assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
+ return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
+ ExpectedType);
+ }
+
+ bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
+ Instruction *EarlierInst, Instruction *LaterInst);
+
+ void removeMSSA(Instruction *Inst) {
+ if (!MSSA)
+ return;
+ // Removing a store here can leave MemorySSA in an unoptimized state by
+ // creating MemoryPhis that have identical arguments and by creating
+ // MemoryUses whose defining access is not an actual clobber. We handle the
+ // phi case eagerly here. The non-optimized MemoryUse case is lazily
+ // updated by MemorySSA getClobberingMemoryAccess.
+ if (MemoryAccess *MA = MSSA->getMemoryAccess(Inst)) {
+ // Optimize MemoryPhi nodes that may become redundant by having all the
+ // same input values once MA is removed.
+ SmallSetVector<MemoryPhi *, 4> PhisToCheck;
+ SmallVector<MemoryAccess *, 8> WorkQueue;
+ WorkQueue.push_back(MA);
+ // Process MemoryPhi nodes in FIFO order using an ever-growing vector since
+ // we shouldn't be processing that many phis and this will avoid an
+ // allocation in almost all cases.
+ for (unsigned I = 0; I < WorkQueue.size(); ++I) {
+ MemoryAccess *WI = WorkQueue[I];
+
+ for (auto *U : WI->users())
+ if (MemoryPhi *MP = dyn_cast<MemoryPhi>(U))
+ PhisToCheck.insert(MP);
+
+ MSSAUpdater->removeMemoryAccess(WI);
+
+ for (MemoryPhi *MP : PhisToCheck) {
+ MemoryAccess *FirstIn = MP->getIncomingValue(0);
+ if (llvm::all_of(MP->incoming_values(),
+ [=](Use &In) { return In == FirstIn; }))
+ WorkQueue.push_back(MP);
+ }
+ PhisToCheck.clear();
+ }
+ }
+ }
+};
+
+} // end anonymous namespace
+
+/// Determine if the memory referenced by LaterInst is from the same heap
+/// version as EarlierInst.
+/// This is currently called in two scenarios:
+///
+/// load p
+/// ...
+/// load p
+///
+/// and
+///
+/// x = load p
+/// ...
+/// store x, p
+///
+/// in both cases we want to verify that there are no possible writes to the
+/// memory referenced by p between the earlier and later instruction.
+bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
+ unsigned LaterGeneration,
+ Instruction *EarlierInst,
+ Instruction *LaterInst) {
+ // Check the simple memory generation tracking first.
+ if (EarlierGeneration == LaterGeneration)
+ return true;
+
+ if (!MSSA)
+ return false;
+
+ // If MemorySSA has determined that one of EarlierInst or LaterInst does not
+ // read/write memory, then we can safely return true here.
+ // FIXME: We could be more aggressive when checking doesNotAccessMemory(),
+ // onlyReadsMemory(), mayReadFromMemory(), and mayWriteToMemory() in this pass
+ // by also checking the MemorySSA MemoryAccess on the instruction. Initial
+ // experiments suggest this isn't worthwhile, at least for C/C++ code compiled
+ // with the default optimization pipeline.
+ auto *EarlierMA = MSSA->getMemoryAccess(EarlierInst);
+ if (!EarlierMA)
+ return true;
+ auto *LaterMA = MSSA->getMemoryAccess(LaterInst);
+ if (!LaterMA)
+ return true;
+
+ // Since we know LaterDef dominates LaterInst and EarlierInst dominates
+ // LaterInst, if LaterDef dominates EarlierInst then it can't occur between
+ // EarlierInst and LaterInst and neither can any other write that potentially
+ // clobbers LaterInst.
+ MemoryAccess *LaterDef =
+ MSSA->getWalker()->getClobberingMemoryAccess(LaterInst);
+ return MSSA->dominates(LaterDef, EarlierMA);
+}
+
+bool EarlyCSE::processNode(DomTreeNode *Node) {
+ bool Changed = false;
+ BasicBlock *BB = Node->getBlock();
+
+ // If this block has a single predecessor, then the predecessor is the parent
+ // of the domtree node and all of the live out memory values are still current
+ // in this block. If this block has multiple predecessors, then they could
+ // have invalidated the live-out memory values of our parent value. For now,
+ // just be conservative and invalidate memory if this block has multiple
+ // predecessors.
+ if (!BB->getSinglePredecessor())
+ ++CurrentGeneration;
+
+ // If this node has a single predecessor which ends in a conditional branch,
+ // we can infer the value of the branch condition given that we took this
+ // path. We need the single predecessor to ensure there's not another path
+ // which reaches this block where the condition might hold a different
+ // value. Since we're adding this to the scoped hash table (like any other
+ // def), it will have been popped if we encounter a future merge block.
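+ //
+ // For example (illustrative IR, not from the original source), if the
+ // predecessor ends in
+ // br i1 %cond, label %bb, label %other
+ // then %cond is known to be true on entry to %bb (and false on entry to
+ // %other), provided each successor has no other predecessors.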
+ if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+ auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
+ if (BI && BI->isConditional()) {
+ auto *CondInst = dyn_cast<Instruction>(BI->getCondition());
+ if (CondInst && SimpleValue::canHandle(CondInst)) {
+ assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
+ auto *TorF = (BI->getSuccessor(0) == BB)
+ ? ConstantInt::getTrue(BB->getContext())
+ : ConstantInt::getFalse(BB->getContext());
+ AvailableValues.insert(CondInst, TorF);
+ DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
+ << CondInst->getName() << "' as " << *TorF << " in "
+ << BB->getName() << "\n");
+ // Replace all dominated uses with the known value.
+ if (unsigned Count = replaceDominatedUsesWith(
+ CondInst, TorF, DT, BasicBlockEdge(Pred, BB))) {
+ Changed = true;
+ NumCSECVP += Count;
+ }
+ }
+ }
+ }
+
+ /// LastStore - Keep track of the last non-volatile store that we saw... for
+ /// as long as there is no instruction that reads memory. If we see a store
+ /// to the same location, we delete the dead store. This zaps trivial dead
+ /// stores which can occur in bitfield code among other things.
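+ /// For example (illustrative IR, not from the original source):
+ /// store i32 1, i32* %p
+ /// store i32 2, i32* %p ; the first store is trivially dead and is zapped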
+ Instruction *LastStore = nullptr;
+
+ // See if any instructions in the block can be eliminated. If so, do it. If
+ // not, add them to AvailableValues.
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *Inst = &*I++;
+
+ // Dead instructions should just be removed.
+ if (isInstructionTriviallyDead(Inst, &TLI)) {
+ DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumSimplify;
+ continue;
+ }
+
+ // Skip assume intrinsics, they don't really have side effects (although
+ // they're marked as such to ensure preservation of control dependencies),
+ // and this pass will not bother with their removal. However, we should mark
+ // its condition as true for all dominated blocks.
+ if (match(Inst, m_Intrinsic<Intrinsic::assume>())) {
+ auto *CondI =
+ dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0));
+ if (CondI && SimpleValue::canHandle(CondI)) {
+ DEBUG(dbgs() << "EarlyCSE considering assumption: " << *Inst << '\n');
+ AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
+ } else
+ DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
+ continue;
+ }
+
+ // Skip sideeffect intrinsics, for the same reason as assume intrinsics.
+ if (match(Inst, m_Intrinsic<Intrinsic::sideeffect>())) {
+ DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << *Inst << '\n');
+ continue;
+ }
+
+ // Skip invariant.start intrinsics since they only read memory, and we can
+ // forward values across it. Also, we don't need to consume the last store
+ // since the semantics of invariant.start allow us to perform DSE of the
+ // last store, if there was a store following invariant.start. Consider:
+ //
+ // store 30, i8* p
+ // invariant.start(p)
+ // store 40, i8* p
+ // We can DSE the store to 30, since the store 40 to invariant location p
+ // causes undefined behaviour.
+ if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>()))
+ continue;
+
+ if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) {
+ if (auto *CondI =
+ dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0))) {
+ if (SimpleValue::canHandle(CondI)) {
+ // Do we already know the actual value of this condition?
+ if (auto *KnownCond = AvailableValues.lookup(CondI)) {
+ // Is the condition known to be true?
+ if (isa<ConstantInt>(KnownCond) &&
+ cast<ConstantInt>(KnownCond)->isOne()) {
+ DEBUG(dbgs() << "EarlyCSE removing guard: " << *Inst << '\n');
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ continue;
+ } else
+ // Use the known value if it wasn't true.
+ cast<CallInst>(Inst)->setArgOperand(0, KnownCond);
+ }
+ // The condition we're guarding on here is true for all dominated
+ // locations.
+ AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
+ }
+ }
+
+ // Guard intrinsics read all memory, but don't write any memory.
+ // Accordingly, don't update the generation but consume the last store (to
+ // avoid an incorrect DSE).
+ LastStore = nullptr;
+ continue;
+ }
+
+ // If the instruction can be simplified (e.g. X+0 = X) then replace it with
+ // its simpler value.
+ if (Value *V = SimplifyInstruction(Inst, SQ)) {
+ DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n');
+ bool Killed = false;
+ if (!Inst->use_empty()) {
+ Inst->replaceAllUsesWith(V);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(Inst, &TLI)) {
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ Killed = true;
+ }
+ if (Changed)
+ ++NumSimplify;
+ if (Killed)
+ continue;
+ }
+
+ // If this is a simple instruction that we can value number, process it.
+ if (SimpleValue::canHandle(Inst)) {
+ // See if the instruction has an available value. If so, use it.
+ if (Value *V = AvailableValues.lookup(Inst)) {
+ DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n');
+ if (auto *I = dyn_cast<Instruction>(V))
+ I->andIRFlags(Inst);
+ Inst->replaceAllUsesWith(V);
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumCSE;
+ continue;
+ }
+
+ // Otherwise, just remember that this value is available.
+ AvailableValues.insert(Inst, Inst);
+ continue;
+ }
+
+ ParseMemoryInst MemInst(Inst, TTI);
+ // If this is a non-volatile load, process it.
+ if (MemInst.isValid() && MemInst.isLoad()) {
+ // (conservatively) we can't peek past the ordering implied by this
+ // operation, but we can add this load to our set of available values
+ if (MemInst.isVolatile() || !MemInst.isUnordered()) {
+ LastStore = nullptr;
+ ++CurrentGeneration;
+ }
+
+ // If we have an available version of this load, and if it is the right
+ // generation or the load is known to be from an invariant location,
+ // replace this instruction.
+ //
+ // If either the dominating load or the current load are invariant, then
+ // we can assume the current load loads the same value as the dominating
+ // load.
+ LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
+ if (InVal.DefInst != nullptr &&
+ InVal.MatchingId == MemInst.getMatchingId() &&
+ // We don't yet handle removing loads with ordering of any kind.
+ !MemInst.isVolatile() && MemInst.isUnordered() &&
+ // We can't replace an atomic load with one which isn't also atomic.
+ InVal.IsAtomic >= MemInst.isAtomic() &&
+ (InVal.IsInvariant || MemInst.isInvariantLoad() ||
+ isSameMemGeneration(InVal.Generation, CurrentGeneration,
+ InVal.DefInst, Inst))) {
+ Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType());
+ if (Op != nullptr) {
+ DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
+ << " to: " << *InVal.DefInst << '\n');
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(Op);
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumCSELoad;
+ continue;
+ }
+ }
+
+ // Otherwise, remember that we have this instruction.
+ AvailableLoads.insert(
+ MemInst.getPointerOperand(),
+ LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
+ MemInst.isAtomic(), MemInst.isInvariantLoad()));
+ LastStore = nullptr;
+ continue;
+ }
+
+ // If this instruction may read from memory or throw (and potentially read
+ // from memory in the exception handler), forget LastStore. Load/store
+ // intrinsics will indicate both a read and a write to memory. The target
+ // may override this (e.g. so that a store intrinsic does not read from
+ // memory, and thus will be treated the same as a regular store for
+ // commoning purposes).
+ if ((Inst->mayReadFromMemory() || Inst->mayThrow()) &&
+ !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
+ LastStore = nullptr;
+
+ // If this is a read-only call, process it.
+ if (CallValue::canHandle(Inst)) {
+ // If we have an available version of this call, and if it is the right
+ // generation, replace this instruction.
+ std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(Inst);
+ if (InVal.first != nullptr &&
+ isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first,
+ Inst)) {
+ DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
+ << " to: " << *InVal.first << '\n');
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(InVal.first);
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumCSECall;
+ continue;
+ }
+
+ // Otherwise, remember that we have this instruction.
+ AvailableCalls.insert(
+ Inst, std::pair<Instruction *, unsigned>(Inst, CurrentGeneration));
+ continue;
+ }
+
+ // A release fence requires that all stores complete before it, but does
+ // not prevent the reordering of following loads 'before' the fence. As a
+ // result, we don't need to consider it as writing to memory and don't need
+ // to advance the generation. We do need to prevent DSE across the fence,
+ // but that's handled above.
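+ //
+ // For example (illustrative IR, not from the original source):
+ // %v = load i32, i32* %p
+ // fence release
+ // %w = load i32, i32* %p ; still CSE-able with %v, since a release
+ // ; fence does not order later loads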
+ if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+ if (FI->getOrdering() == AtomicOrdering::Release) {
+ assert(Inst->mayReadFromMemory() && "relied on to prevent DSE above");
+ continue;
+ }
+
+ // write back DSE - If we write back the same value we just loaded from
+ // the same location and haven't passed any intervening writes or ordering
+ // operations, we can remove the write. The primary benefit is in allowing
+ // the available load table to remain valid and value forward past where
+ // the store originally was.
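+ //
+ // For example (illustrative IR, not from the original source):
+ // %x = load i32, i32* %p
+ // store i32 %x, i32* %p ; writes back the value just loaded and can be
+ // ; removed when no write intervenes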
+ if (MemInst.isValid() && MemInst.isStore()) {
+ LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
+ if (InVal.DefInst &&
+ InVal.DefInst == getOrCreateResult(Inst, InVal.DefInst->getType()) &&
+ InVal.MatchingId == MemInst.getMatchingId() &&
+ // We don't yet handle removing stores with ordering of any kind.
+ !MemInst.isVolatile() && MemInst.isUnordered() &&
+ isSameMemGeneration(InVal.Generation, CurrentGeneration,
+ InVal.DefInst, Inst)) {
+ // It is okay to have a LastStore to a different pointer here if MemorySSA
+ // tells us that the load and store are from the same memory generation.
+ // In that case, LastStore should keep its present value since we're
+ // removing the current store.
+ assert((!LastStore ||
+ ParseMemoryInst(LastStore, TTI).getPointerOperand() ==
+ MemInst.getPointerOperand() ||
+ MSSA) &&
+ "can't have an intervening store if not using MemorySSA!");
+ DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ // We can avoid incrementing the generation count since we were able
+ // to eliminate this store.
+ continue;
+ }
+ }
+
+ // Okay, this isn't something we can CSE at all. Check to see if it is
+ // something that could modify memory. If so, our available memory values
+ // cannot be used so bump the generation count.
+ if (Inst->mayWriteToMemory()) {
+ ++CurrentGeneration;
+
+ if (MemInst.isValid() && MemInst.isStore()) {
+ // We do a trivial form of DSE if there are two stores to the same
+ // location with no intervening loads. Delete the earlier store.
+ // At the moment, we don't remove ordered stores, but do remove
+ // unordered atomic stores. There's no special requirement (for
+ // unordered atomics) about removing atomic stores only in favor of
+ // other atomic stores since we're going to execute the non-atomic
+ // one anyway and the atomic one might never have become visible.
+ if (LastStore) {
+ ParseMemoryInst LastStoreMemInst(LastStore, TTI);
+ assert(LastStoreMemInst.isUnordered() &&
+ !LastStoreMemInst.isVolatile() &&
+ "Violated invariant");
+ if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
+ DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
+ << " due to: " << *Inst << '\n');
+ removeMSSA(LastStore);
+ LastStore->eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ LastStore = nullptr;
+ }
+ // fallthrough - we can exploit information about this store
+ }
+
+ // Okay, we just invalidated anything we knew about loaded values. Try
+ // to salvage *something* by remembering that the stored value is a live
+ // version of the pointer. It is safe to forward from volatile stores
+ // to non-volatile loads, so we don't have to check for volatility of
+ // the store.
+ AvailableLoads.insert(
+ MemInst.getPointerOperand(),
+ LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
+ MemInst.isAtomic(), /*IsInvariant=*/false));
+
+ // Remember that this was the last unordered store we saw for DSE. We
+ // don't yet handle DSE on ordered or volatile stores since we don't
+ // have a good way to model the ordering requirement for following
+ // passes once the store is removed. We could insert a fence, but
+ // since fences are slightly stronger than stores in their ordering,
+ // it's not clear this is a profitable transform. Another option would
+ // be to merge the ordering with that of the post dominating store.
+ if (MemInst.isUnordered() && !MemInst.isVolatile())
+ LastStore = Inst;
+ else
+ LastStore = nullptr;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+bool EarlyCSE::run() {
+ // Note, deque is being used here because there are significant performance
+ // gains over vector when the container becomes very large due to the
+ // specific access patterns. For more information see the mailing list
+ // discussion on this:
+ // http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
+ std::deque<StackNode *> nodesToProcess;
+
+ bool Changed = false;
+
+ // Process the root node.
+ nodesToProcess.push_back(new StackNode(
+ AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
+ DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));
+
+ // Save the current generation.
+ unsigned LiveOutGeneration = CurrentGeneration;
+
+ // Process the stack.
+ while (!nodesToProcess.empty()) {
+ // Grab the first item off the stack. Set the current generation, remove
+ // the node from the stack, and process it.
+ StackNode *NodeToProcess = nodesToProcess.back();
+
+ // Initialize class members.
+ CurrentGeneration = NodeToProcess->currentGeneration();
+
+ // Check if the node needs to be processed.
+ if (!NodeToProcess->isProcessed()) {
+ // Process the node.
+ Changed |= processNode(NodeToProcess->node());
+ NodeToProcess->childGeneration(CurrentGeneration);
+ NodeToProcess->process();
+ } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
+ // Push the next child onto the stack.
+ DomTreeNode *child = NodeToProcess->nextChild();
+ nodesToProcess.push_back(
+ new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
+ NodeToProcess->childGeneration(), child, child->begin(),
+ child->end()));
+ } else {
+ // It has been processed, and there are no more children to process,
+ // so delete it and pop it off the stack.
+ delete NodeToProcess;
+ nodesToProcess.pop_back();
+ }
+ } // while (!nodes...)
+
+ // Reset the current generation.
+ CurrentGeneration = LiveOutGeneration;
+
+ return Changed;
+}
+
+PreservedAnalyses EarlyCSEPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto *MSSA =
+ UseMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() : nullptr;
+
+ EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
+
+ if (!CSE.run())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ if (UseMemorySSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+namespace {
+
+/// \brief A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+template<bool UseMemorySSA>
+class EarlyCSELegacyCommonPass : public FunctionPass {
+public:
+ static char ID;
+
+ EarlyCSELegacyCommonPass() : FunctionPass(ID) {
+ if (UseMemorySSA)
+ initializeEarlyCSEMemSSALegacyPassPass(*PassRegistry::getPassRegistry());
+ else
+ initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *MSSA =
+ UseMemorySSA ? &getAnalysis<MemorySSAWrapperPass>().getMSSA() : nullptr;
+
+ EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
+
+ return CSE.run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (UseMemorySSA) {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+
+} // end anonymous namespace
+
+using EarlyCSELegacyPass = EarlyCSELegacyCommonPass</*UseMemorySSA=*/false>;
+
+template<>
+char EarlyCSELegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false)
+
+using EarlyCSEMemSSALegacyPass =
+ EarlyCSELegacyCommonPass</*UseMemorySSA=*/true>;
+
+template<>
+char EarlyCSEMemSSALegacyPass::ID = 0;
+
+FunctionPass *llvm::createEarlyCSEPass(bool UseMemorySSA) {
+ if (UseMemorySSA)
+ return new EarlyCSEMemSSALegacyPass();
+ else
+ return new EarlyCSELegacyPass();
+}
+
+INITIALIZE_PASS_BEGIN(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
+ "Early CSE w/ MemorySSA", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
+ "Early CSE w/ MemorySSA", false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
new file mode 100644
index 000000000000..063df779a30b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -0,0 +1,80 @@
+//===- FlattenCFGPass.cpp - CFG Flatten Pass ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements flattening of CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "flattencfg"
+
+namespace {
+struct FlattenCFGPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+public:
+ FlattenCFGPass() : FunctionPass(ID) {
+ initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ }
+
+private:
+ AliasAnalysis *AA;
+};
+}
+
+char FlattenCFGPass::ID = 0;
+INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
+ false)
+
+// Public interface to the FlattenCFG pass
+FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); }
+
+/// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
+ bool Changed = false;
+ bool LocalChange = true;
+ while (LocalChange) {
+ LocalChange = false;
+
+ // Loop over all of the basic blocks and try to flatten each of them.
+ //
+ for (Function::iterator BBIt = F.begin(); BBIt != F.end();) {
+ if (FlattenCFG(&*BBIt++, AA)) {
+ LocalChange = true;
+ }
+ }
+ Changed |= LocalChange;
+ }
+ return Changed;
+}
+
+bool FlattenCFGPass::runOnFunction(Function &F) {
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ bool EverChanged = false;
+ // iterativelyFlattenCFG can make some blocks dead.
+ while (iterativelyFlattenCFG(F, AA)) {
+ removeUnreachableBlocks(F);
+ EverChanged = true;
+ }
+ return EverChanged;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
new file mode 100644
index 000000000000..b105ece8dc7c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -0,0 +1,525 @@
+//===- Float2Int.cpp - Demote floating point ops to work on integers ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Float2Int pass, which aims to demote floating
+// point operations to work on integers, where that is losslessly possible.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "float2int"
+
+#include "llvm/Transforms/Scalar/Float2Int.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <deque>
+#include <functional> // For std::function
+using namespace llvm;
+
+// The algorithm is simple. Start at instructions that convert from the
+// float to the int domain: fptoui, fptosi and fcmp. Walk up the def-use
+// graph, using an equivalence data structure to unify graphs that interfere.
+//
+// Mappable instructions are those with an integer corollary that, given
+// integer domain inputs, produce an integer output; fadd, for example.
+//
+// If a non-mappable instruction is seen, this entire def-use graph is marked
+// as non-transformable. If we see an instruction that converts from the
+// integer domain to FP domain (uitofp,sitofp), we terminate our walk.
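+//
+// As an illustrative example (not taken from this file), a chain such as
+//
+//   %x = sitofp i8 %a to float
+//   %y = sitofp i8 %b to float
+//   %s = fadd float %x, %y
+//   %r = fptosi float %s to i32
+//
+// is a candidate: the walk starts at the root %r, the sitofp instructions
+// terminate it cleanly, and the only intermediate op (fadd) is mappable.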
+
+/// The largest integer type worth dealing with.
+static cl::opt<unsigned>
+MaxIntegerBW("float2int-max-integer-bw", cl::init(64), cl::Hidden,
+ cl::desc("Max integer bitwidth to consider in float2int"
+ "(default=64)"));
+
+namespace {
+ struct Float2IntLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ Float2IntLegacyPass() : FunctionPass(ID) {
+ initializeFloat2IntLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ return Impl.runImpl(F);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ private:
+ Float2IntPass Impl;
+ };
+} // end anonymous namespace
+
+char Float2IntLegacyPass::ID = 0;
+INITIALIZE_PASS(Float2IntLegacyPass, "float2int", "Float to int", false, false)
+
+// Given a FCmp predicate, return a matching ICmp predicate if one
+// exists, otherwise return BAD_ICMP_PREDICATE.
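+// Both the ordered and unordered forms map to the same signed predicate: the
+// values this pass reasons about originate in the integer domain and so can
+// never be NaN, which is the only case where the two forms differ.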
+static CmpInst::Predicate mapFCmpPred(CmpInst::Predicate P) {
+ switch (P) {
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::FCMP_UEQ:
+ return CmpInst::ICMP_EQ;
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_UGT:
+ return CmpInst::ICMP_SGT;
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGE:
+ return CmpInst::ICMP_SGE;
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_ULT:
+ return CmpInst::ICMP_SLT;
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULE:
+ return CmpInst::ICMP_SLE;
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_UNE:
+ return CmpInst::ICMP_NE;
+ default:
+ return CmpInst::BAD_ICMP_PREDICATE;
+ }
+}
+
+// Given a floating point binary operator, return the matching
+// integer version.
+static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unhandled opcode!");
+ case Instruction::FAdd: return Instruction::Add;
+ case Instruction::FSub: return Instruction::Sub;
+ case Instruction::FMul: return Instruction::Mul;
+ }
+}
+
+// Find the roots - instructions that convert from the FP domain to
+// integer domain.
+void Float2IntPass::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) {
+ for (auto &I : instructions(F)) {
+ if (isa<VectorType>(I.getType()))
+ continue;
+ switch (I.getOpcode()) {
+ default: break;
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ Roots.insert(&I);
+ break;
+ case Instruction::FCmp:
+ if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) !=
+ CmpInst::BAD_ICMP_PREDICATE)
+ Roots.insert(&I);
+ break;
+ }
+ }
+}
+
+// Helper - mark I as having been traversed, having range R.
+void Float2IntPass::seen(Instruction *I, ConstantRange R) {
+ DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
+ auto IT = SeenInsts.find(I);
+ if (IT != SeenInsts.end())
+ IT->second = std::move(R);
+ else
+ SeenInsts.insert(std::make_pair(I, std::move(R)));
+}
+
+// Helper - get a range representing a poison value.
+ConstantRange Float2IntPass::badRange() {
+ return ConstantRange(MaxIntegerBW + 1, true);
+}
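+// Helper - get a range representing a value whose real range has not been
+// computed yet.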
+ConstantRange Float2IntPass::unknownRange() {
+ return ConstantRange(MaxIntegerBW + 1, false);
+}
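+// Helper - collapse ranges whose bitwidth exceeds what we are prepared to
+// handle down to the poison range.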
+ConstantRange Float2IntPass::validateRange(ConstantRange R) {
+ if (R.getBitWidth() > MaxIntegerBW + 1)
+ return badRange();
+ return R;
+}
+
+// The most obvious way to structure the search is a depth-first, eager
+// search from each root. However, that requires direct recursion and so
+// can only handle small instruction sequences. Instead, we split the search
+// up into two phases:
+// - walkBackwards: A breadth-first walk of the use-def graph starting from
+// the roots. Populate "SeenInsts" with interesting
+// instructions and poison values if they're obvious and
+// cheap to compute. Calculate the equivalence set structure
+// while we're here too.
+// - walkForwards: Iterate over SeenInsts in reverse order, so we visit
+// defs before their uses. Calculate the real range info.
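+//
+// For the fadd example sketched earlier, walkBackwards seeds each sitofp
+// with (roughly) the i8 range [-128, 127]; walkForwards then derives
+// [-256, 254] for the fadd, well within the MaxIntegerBW+1 bit working range.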
+
+// Breadth-first walk of the use-def graph; determine the set of nodes
+// we care about and eagerly determine if some of them are poisonous.
+void Float2IntPass::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) {
+ std::deque<Instruction*> Worklist(Roots.begin(), Roots.end());
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.back();
+ Worklist.pop_back();
+
+ if (SeenInsts.find(I) != SeenInsts.end())
+ // Seen already.
+ continue;
+
+ switch (I->getOpcode()) {
+ // FIXME: Handle select and phi nodes.
+ default:
+ // Path terminated uncleanly.
+ seen(I, badRange());
+ break;
+
+ case Instruction::UIToFP:
+ case Instruction::SIToFP: {
+ // Path terminated cleanly - use the type of the integer input to seed
+ // the analysis.
+ unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ auto Input = ConstantRange(BW, true);
+ auto CastOp = (Instruction::CastOps)I->getOpcode();
+ seen(I, validateRange(Input.castOp(CastOp, MaxIntegerBW+1)));
+ continue;
+ }
+
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FCmp:
+ seen(I, unknownRange());
+ break;
+ }
+
+ for (Value *O : I->operands()) {
+ if (Instruction *OI = dyn_cast<Instruction>(O)) {
+ // Unify def-use chains if they interfere.
+ ECs.unionSets(I, OI);
+ if (SeenInsts.find(I)->second != badRange())
+ Worklist.push_back(OI);
+ } else if (!isa<ConstantFP>(O)) {
+ // Not an instruction or ConstantFP? We can't do anything.
+ seen(I, badRange());
+ }
+ }
+ }
+}
+
+// Walk forwards down the list of seen instructions, so we visit defs before
+// uses.
+void Float2IntPass::walkForwards() {
+ for (auto &It : reverse(SeenInsts)) {
+ if (It.second != unknownRange())
+ continue;
+
+ Instruction *I = It.first;
+ std::function<ConstantRange(ArrayRef<ConstantRange>)> Op;
+ switch (I->getOpcode()) {
+ // FIXME: Handle select and phi nodes.
+ default:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ llvm_unreachable("Should have been handled in walkForwards!");
+
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ Op = [I](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 2 && "it's a binary operator!");
+ auto BinOp = (Instruction::BinaryOps) I->getOpcode();
+ return Ops[0].binaryOp(BinOp, Ops[1]);
+ };
+ break;
+
+ //
+ // Root-only instructions - we'll only see these if they're the
+ // first node in a walk.
+ //
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ Op = [I](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 1 && "FPTo[US]I is a unary operator!");
+ // Note: We're ignoring the cast's output size here as that's what the
+ // caller expects.
+ auto CastOp = (Instruction::CastOps)I->getOpcode();
+ return Ops[0].castOp(CastOp, MaxIntegerBW+1);
+ };
+ break;
+
+ case Instruction::FCmp:
+ Op = [](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 2 && "FCmp is a binary operator!");
+ return Ops[0].unionWith(Ops[1]);
+ };
+ break;
+ }
+
+ bool Abort = false;
+ SmallVector<ConstantRange,4> OpRanges;
+ for (Value *O : I->operands()) {
+ if (Instruction *OI = dyn_cast<Instruction>(O)) {
+ assert(SeenInsts.find(OI) != SeenInsts.end() &&
+ "def not seen before use!");
+ OpRanges.push_back(SeenInsts.find(OI)->second);
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) {
+ // Work out if the floating point number can be losslessly represented
+ // as an integer.
+ // APFloat::convertToInteger(&Exact) purports to do what we want, but
+ // the exactness can be too precise. For example, negative zero can
+ // never be exactly converted to an integer.
+ //
+ // Instead, we ask APFloat to round itself to an integral value - this
+ // preserves sign-of-zero - then compare the result with the original.
+ //
+ const APFloat &F = CF->getValueAPF();
+
+ // First, weed out obviously incorrect values. Non-finite numbers
+ // can't be represented and neither can negative zero, unless
+ // we're in fast math mode.
+ if (!F.isFinite() ||
+ (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) &&
+ !I->hasNoSignedZeros())) {
+ seen(I, badRange());
+ Abort = true;
+ break;
+ }
+
+ APFloat NewF = F;
+ auto Res = NewF.roundToIntegral(APFloat::rmNearestTiesToEven);
+ if (Res != APFloat::opOK || NewF.compare(F) != APFloat::cmpEqual) {
+ seen(I, badRange());
+ Abort = true;
+ break;
+ }
+ // OK, it's representable. Now get it.
+ APSInt Int(MaxIntegerBW+1, false);
+ bool Exact;
+ CF->getValueAPF().convertToInteger(Int,
+ APFloat::rmNearestTiesToEven,
+ &Exact);
+ OpRanges.push_back(ConstantRange(Int));
+ } else {
+ llvm_unreachable("Should have already marked this as badRange!");
+ }
+ }
+
+ // Reduce the operands' ranges to a single range and return.
+ if (!Abort)
+ seen(I, Op(OpRanges));
+ }
+}
+
+// If there is a valid transform to be done, do it.
+bool Float2IntPass::validateAndTransform() {
+ bool MadeChange = false;
+
+ // Iterate over every disjoint partition of the def-use graph.
+ for (auto It = ECs.begin(), E = ECs.end(); It != E; ++It) {
+ ConstantRange R(MaxIntegerBW + 1, false);
+ bool Fail = false;
+ Type *ConvertedToTy = nullptr;
+
+ // For every member of the partition, union all the ranges together.
+ for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
+ MI != ME; ++MI) {
+ Instruction *I = *MI;
+ auto SeenI = SeenInsts.find(I);
+ if (SeenI == SeenInsts.end())
+ continue;
+
+ R = R.unionWith(SeenI->second);
+ // We need to ensure I has no users that have not been seen.
+ // If it does, the transformation would be illegal.
+ //
+ // Don't count the roots, as they terminate the graphs.
+ if (Roots.count(I) == 0) {
+ // Set the type of the conversion while we're here.
+ if (!ConvertedToTy)
+ ConvertedToTy = I->getType();
+ for (User *U : I->users()) {
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || SeenInsts.find(UI) == SeenInsts.end()) {
+ DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n");
+ Fail = true;
+ break;
+ }
+ }
+ }
+ if (Fail)
+ break;
+ }
+
+ // If the set was empty, or we failed, or the range is poisonous,
+ // bail out.
+ if (ECs.member_begin(It) == ECs.member_end() || Fail ||
+ R.isFullSet() || R.isSignWrappedSet())
+ continue;
+ assert(ConvertedToTy && "Must have set the convertedtoty by this point!");
+
+ // The number of bits required is the maximum of the upper and
+ // lower limits, plus one so it can be signed.
+ unsigned MinBW = std::max(R.getLower().getMinSignedBits(),
+ R.getUpper().getMinSignedBits()) + 1;
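+ // For example (illustrative numbers only), a unioned range covering
+ // [-100, 100] needs max(8, 8) + 1 = 9 bits here, comfortably below the
+ // 23-bit budget computed below for a float destination.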
+ DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n");
+
+ // If we've run off the realms of the exactly representable integers,
+ // the floating point result will differ from an integer approximation.
+
+ // Do we need more bits than are in the mantissa of the type we converted
+ // to? semanticsPrecision returns the number of mantissa bits plus one
+ // for the sign bit.
+ unsigned MaxRepresentableBits
+ = APFloat::semanticsPrecision(ConvertedToTy->getFltSemantics()) - 1;
+ if (MinBW > MaxRepresentableBits) {
+ DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
+ continue;
+ }
+ if (MinBW > 64) {
+ DEBUG(dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
+ continue;
+ }
+
+ // OK, R is known to be representable. Now pick a type for it.
+ // FIXME: Pick the smallest legal type that will fit.
+ Type *Ty = (MinBW > 32) ? Type::getInt64Ty(*Ctx) : Type::getInt32Ty(*Ctx);
+
+ for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
+ MI != ME; ++MI)
+ convert(*MI, Ty);
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+Value *Float2IntPass::convert(Instruction *I, Type *ToTy) {
+ if (ConvertedInsts.find(I) != ConvertedInsts.end())
+ // Already converted this instruction.
+ return ConvertedInsts[I];
+
+ SmallVector<Value*,4> NewOperands;
+ for (Value *V : I->operands()) {
+ // Don't recurse if we're an instruction that terminates the path.
+ if (I->getOpcode() == Instruction::UIToFP ||
+ I->getOpcode() == Instruction::SIToFP) {
+ NewOperands.push_back(V);
+ } else if (Instruction *VI = dyn_cast<Instruction>(V)) {
+ NewOperands.push_back(convert(VI, ToTy));
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
+ APSInt Val(ToTy->getPrimitiveSizeInBits(), /*IsUnsigned=*/false);
+ bool Exact;
+ CF->getValueAPF().convertToInteger(Val,
+ APFloat::rmNearestTiesToEven,
+ &Exact);
+ NewOperands.push_back(ConstantInt::get(ToTy, Val));
+ } else {
+ llvm_unreachable("Unhandled operand type?");
+ }
+ }
+
+ // Now create a new instruction.
+ IRBuilder<> IRB(I);
+ Value *NewV = nullptr;
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unhandled instruction!");
+
+ case Instruction::FPToUI:
+ NewV = IRB.CreateZExtOrTrunc(NewOperands[0], I->getType());
+ break;
+
+ case Instruction::FPToSI:
+ NewV = IRB.CreateSExtOrTrunc(NewOperands[0], I->getType());
+ break;
+
+ case Instruction::FCmp: {
+ CmpInst::Predicate P = mapFCmpPred(cast<CmpInst>(I)->getPredicate());
+ assert(P != CmpInst::BAD_ICMP_PREDICATE && "Unhandled predicate!");
+ NewV = IRB.CreateICmp(P, NewOperands[0], NewOperands[1], I->getName());
+ break;
+ }
+
+ case Instruction::UIToFP:
+ NewV = IRB.CreateZExtOrTrunc(NewOperands[0], ToTy);
+ break;
+
+ case Instruction::SIToFP:
+ NewV = IRB.CreateSExtOrTrunc(NewOperands[0], ToTy);
+ break;
+
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ NewV = IRB.CreateBinOp(mapBinOpcode(I->getOpcode()),
+ NewOperands[0], NewOperands[1],
+ I->getName());
+ break;
+ }
+
+ // If we're a root instruction, RAUW.
+ if (Roots.count(I))
+ I->replaceAllUsesWith(NewV);
+
+ ConvertedInsts[I] = NewV;
+ return NewV;
+}
+
+// Perform dead code elimination on the instructions we just modified.
+void Float2IntPass::cleanup() {
+ for (auto &I : reverse(ConvertedInsts))
+ I.first->eraseFromParent();
+}
+
+bool Float2IntPass::runImpl(Function &F) {
+ DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
+ // Clear out all state.
+ ECs = EquivalenceClasses<Instruction*>();
+ SeenInsts.clear();
+ ConvertedInsts.clear();
+ Roots.clear();
+
+ Ctx = &F.getParent()->getContext();
+
+ findRoots(F, Roots);
+
+ walkBackwards(Roots);
+ walkForwards();
+
+ bool Modified = validateAndTransform();
+ if (Modified)
+ cleanup();
+ return Modified;
+}
+
+namespace llvm {
+FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); }
+
+PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &) {
+ if (!runImpl(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+} // End namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
new file mode 100644
index 000000000000..e2c1eaf58e43
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -0,0 +1,2667 @@
+//===- GVN.cpp - Eliminate redundant values and loads ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs global value numbering to eliminate fully redundant
+// instructions. It also performs simple dead load elimination.
+//
+// Note that this pass does the value numbering itself; it does not use the
+// ValueNumbering analysis passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::gvn;
+using namespace llvm::VNCoercion;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "gvn"
+
+STATISTIC(NumGVNInstr, "Number of instructions deleted");
+STATISTIC(NumGVNLoad, "Number of loads deleted");
+STATISTIC(NumGVNPRE, "Number of instructions PRE'd");
+STATISTIC(NumGVNBlocks, "Number of blocks merged");
+STATISTIC(NumGVNSimpl, "Number of instructions simplified");
+STATISTIC(NumGVNEqProp, "Number of equalities propagated");
+STATISTIC(NumPRELoad, "Number of loads PRE'd");
+
+static cl::opt<bool> EnablePRE("enable-pre",
+ cl::init(true), cl::Hidden);
+static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true));
+
+// Maximum allowed recursion depth.
+static cl::opt<uint32_t>
+MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore,
+ cl::desc("Max recurse depth (default = 1000)"));
+
+struct llvm::GVN::Expression {
+ uint32_t opcode;
+ Type *type;
+ bool commutative = false;
+ SmallVector<uint32_t, 4> varargs;
+
+ Expression(uint32_t o = ~2U) : opcode(o) {}
+
+ bool operator==(const Expression &other) const {
+ if (opcode != other.opcode)
+ return false;
+ if (opcode == ~0U || opcode == ~1U)
+ return true;
+ if (type != other.type)
+ return false;
+ if (varargs != other.varargs)
+ return false;
+ return true;
+ }
+
+ friend hash_code hash_value(const Expression &Value) {
+ return hash_combine(
+ Value.opcode, Value.type,
+ hash_combine_range(Value.varargs.begin(), Value.varargs.end()));
+ }
+};
+
+namespace llvm {
+
+template <> struct DenseMapInfo<GVN::Expression> {
+ static inline GVN::Expression getEmptyKey() { return ~0U; }
+ static inline GVN::Expression getTombstoneKey() { return ~1U; }
+
+ static unsigned getHashValue(const GVN::Expression &e) {
+ using llvm::hash_value;
+
+ return static_cast<unsigned>(hash_value(e));
+ }
+
+ static bool isEqual(const GVN::Expression &LHS, const GVN::Expression &RHS) {
+ return LHS == RHS;
+ }
+};
+
+} // end namespace llvm
+
+/// Represents a particular available value that we know how to materialize.
+/// Materialization of an AvailableValue never fails. An AvailableValue is
+/// implicitly associated with a rematerialization point which is the
+/// location of the instruction from which it was formed.
+struct llvm::gvn::AvailableValue {
+ enum ValType {
+ SimpleVal, // A simple offsetted value that is accessed.
+ LoadVal, // A value produced by a load.
+ MemIntrin, // A memory intrinsic which is loaded from.
+ UndefVal // An UndefValue representing a value from a dead block (which
+ // is not yet physically removed from the CFG).
+ };
+
+ /// V - The value that is live out of the block.
+ PointerIntPair<Value *, 2, ValType> Val;
+
+ /// Offset - The byte offset in Val that is interesting for the load query.
+ unsigned Offset;
+
+ static AvailableValue get(Value *V, unsigned Offset = 0) {
+ AvailableValue Res;
+ Res.Val.setPointer(V);
+ Res.Val.setInt(SimpleVal);
+ Res.Offset = Offset;
+ return Res;
+ }
+
+ static AvailableValue getMI(MemIntrinsic *MI, unsigned Offset = 0) {
+ AvailableValue Res;
+ Res.Val.setPointer(MI);
+ Res.Val.setInt(MemIntrin);
+ Res.Offset = Offset;
+ return Res;
+ }
+
+ static AvailableValue getLoad(LoadInst *LI, unsigned Offset = 0) {
+ AvailableValue Res;
+ Res.Val.setPointer(LI);
+ Res.Val.setInt(LoadVal);
+ Res.Offset = Offset;
+ return Res;
+ }
+
+ static AvailableValue getUndef() {
+ AvailableValue Res;
+ Res.Val.setPointer(nullptr);
+ Res.Val.setInt(UndefVal);
+ Res.Offset = 0;
+ return Res;
+ }
+
+ bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
+ bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; }
+ bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; }
+ bool isUndefValue() const { return Val.getInt() == UndefVal; }
+
+ Value *getSimpleValue() const {
+ assert(isSimpleValue() && "Wrong accessor");
+ return Val.getPointer();
+ }
+
+ LoadInst *getCoercedLoadValue() const {
+ assert(isCoercedLoadValue() && "Wrong accessor");
+ return cast<LoadInst>(Val.getPointer());
+ }
+
+ MemIntrinsic *getMemIntrinValue() const {
+ assert(isMemIntrinValue() && "Wrong accessor");
+ return cast<MemIntrinsic>(Val.getPointer());
+ }
+
+ /// Emit code at the specified insertion point to adjust the value defined
+ /// here to the specified type. This handles various coercion cases.
+ Value *MaterializeAdjustedValue(LoadInst *LI, Instruction *InsertPt,
+ GVN &gvn) const;
+};
+
+/// Represents an AvailableValue which can be rematerialized at the end of
+/// the associated BasicBlock.
+struct llvm::gvn::AvailableValueInBlock {
+ /// BB - The basic block in question.
+ BasicBlock *BB;
+
+ /// AV - The actual available value
+ AvailableValue AV;
+
+ static AvailableValueInBlock get(BasicBlock *BB, AvailableValue &&AV) {
+ AvailableValueInBlock Res;
+ Res.BB = BB;
+ Res.AV = std::move(AV);
+ return Res;
+ }
+
+ static AvailableValueInBlock get(BasicBlock *BB, Value *V,
+ unsigned Offset = 0) {
+ return get(BB, AvailableValue::get(V, Offset));
+ }
+
+ static AvailableValueInBlock getUndef(BasicBlock *BB) {
+ return get(BB, AvailableValue::getUndef());
+ }
+
+ /// Emit code at the end of this block to adjust the value defined here to
+ /// the specified type. This handles various coercion cases.
+ Value *MaterializeAdjustedValue(LoadInst *LI, GVN &gvn) const {
+ return AV.MaterializeAdjustedValue(LI, BB->getTerminator(), gvn);
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// ValueTable Internal Functions
+//===----------------------------------------------------------------------===//
+
+GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {
+ Expression e;
+ e.type = I->getType();
+ e.opcode = I->getOpcode();
+ for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end();
+ OI != OE; ++OI)
+ e.varargs.push_back(lookupOrAdd(*OI));
+ if (I->isCommutative()) {
+ // Ensure that commutative instructions that only differ by a permutation
+ // of their operands get the same value number by sorting the operand value
+ // numbers. Since all commutative instructions have two operands it is more
+ // efficient to sort by hand rather than using, say, std::sort.
+ assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
+ if (e.varargs[0] > e.varargs[1])
+ std::swap(e.varargs[0], e.varargs[1]);
+ e.commutative = true;
+ }
+
+ if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+ // Sort the operand value numbers so x<y and y>x get the same value number.
+ CmpInst::Predicate Predicate = C->getPredicate();
+ if (e.varargs[0] > e.varargs[1]) {
+ std::swap(e.varargs[0], e.varargs[1]);
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ }
+ e.opcode = (C->getOpcode() << 8) | Predicate;
+ e.commutative = true;
+ } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) {
+ for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end();
+ II != IE; ++II)
+ e.varargs.push_back(*II);
+ }
+
+ return e;
+}
+
+GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode,
+ CmpInst::Predicate Predicate,
+ Value *LHS, Value *RHS) {
+ assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+ "Not a comparison!");
+ Expression e;
+ e.type = CmpInst::makeCmpResultType(LHS->getType());
+ e.varargs.push_back(lookupOrAdd(LHS));
+ e.varargs.push_back(lookupOrAdd(RHS));
+
+ // Sort the operand value numbers so x<y and y>x get the same value number.
+ if (e.varargs[0] > e.varargs[1]) {
+ std::swap(e.varargs[0], e.varargs[1]);
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ }
+ e.opcode = (Opcode << 8) | Predicate;
+ e.commutative = true;
+ return e;
+}
+
+GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
+ assert(EI && "Not an ExtractValueInst?");
+ Expression e;
+ e.type = EI->getType();
+ e.opcode = 0;
+
+ IntrinsicInst *I = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
+ if (I != nullptr && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
+ // EI might be an extract from one of our recognised intrinsics. If it
+ // is we'll synthesize a semantically equivalent expression instead of
+ // an extract value expression.
+ switch (I->getIntrinsicID()) {
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ e.opcode = Instruction::Add;
+ break;
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ e.opcode = Instruction::Sub;
+ break;
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ e.opcode = Instruction::Mul;
+ break;
+ default:
+ break;
+ }
+
+ if (e.opcode != 0) {
+ // Intrinsic recognized. Grab its args to finish building the expression.
+ assert(I->getNumArgOperands() == 2 &&
+ "Expect two args for recognised intrinsics.");
+ e.varargs.push_back(lookupOrAdd(I->getArgOperand(0)));
+ e.varargs.push_back(lookupOrAdd(I->getArgOperand(1)));
+ return e;
+ }
+ }
+
+ // Not a recognised intrinsic. Fall back to producing an extract value
+ // expression.
+ e.opcode = EI->getOpcode();
+ for (Instruction::op_iterator OI = EI->op_begin(), OE = EI->op_end();
+ OI != OE; ++OI)
+ e.varargs.push_back(lookupOrAdd(*OI));
+
+ for (ExtractValueInst::idx_iterator II = EI->idx_begin(), IE = EI->idx_end();
+ II != IE; ++II)
+ e.varargs.push_back(*II);
+
+ return e;
+}
+
+//===----------------------------------------------------------------------===//
+// ValueTable External Functions
+//===----------------------------------------------------------------------===//
+
+GVN::ValueTable::ValueTable() = default;
+GVN::ValueTable::ValueTable(const ValueTable &) = default;
+GVN::ValueTable::ValueTable(ValueTable &&) = default;
+GVN::ValueTable::~ValueTable() = default;
+
+/// add - Insert a value into the table with a specified value number.
+void GVN::ValueTable::add(Value *V, uint32_t num) {
+ valueNumbering.insert(std::make_pair(V, num));
+ if (PHINode *PN = dyn_cast<PHINode>(V))
+ NumberingPhi[num] = PN;
+}
+
+uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {
+ if (AA->doesNotAccessMemory(C)) {
+ Expression exp = createExpr(C);
+ uint32_t e = assignExpNewValueNum(exp).first;
+ valueNumbering[C] = e;
+ return e;
+ } else if (AA->onlyReadsMemory(C)) {
+ Expression exp = createExpr(C);
+ auto ValNum = assignExpNewValueNum(exp);
+ if (ValNum.second) {
+ valueNumbering[C] = ValNum.first;
+ return ValNum.first;
+ }
+ if (!MD) {
+ uint32_t e = assignExpNewValueNum(exp).first;
+ valueNumbering[C] = e;
+ return e;
+ }
+
+ MemDepResult local_dep = MD->getDependency(C);
+
+ if (!local_dep.isDef() && !local_dep.isNonLocal()) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ if (local_dep.isDef()) {
+ CallInst* local_cdep = cast<CallInst>(local_dep.getInst());
+
+ if (local_cdep->getNumArgOperands() != C->getNumArgOperands()) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+ uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
+ uint32_t cd_vn = lookupOrAdd(local_cdep->getArgOperand(i));
+ if (c_vn != cd_vn) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ }
+
+ uint32_t v = lookupOrAdd(local_cdep);
+ valueNumbering[C] = v;
+ return v;
+ }
+
+ // Non-local case.
+ const MemoryDependenceResults::NonLocalDepInfo &deps =
+ MD->getNonLocalCallDependency(CallSite(C));
+ // FIXME: Move the checking logic to MemDep!
+ CallInst* cdep = nullptr;
+
+ // Check to see if we have a single dominating call instruction that is
+ // identical to C.
+ for (unsigned i = 0, e = deps.size(); i != e; ++i) {
+ const NonLocalDepEntry *I = &deps[i];
+ if (I->getResult().isNonLocal())
+ continue;
+
+ // We don't handle non-definitions. If we already have a call, reject
+ // instruction dependencies.
+ if (!I->getResult().isDef() || cdep != nullptr) {
+ cdep = nullptr;
+ break;
+ }
+
+ CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->getResult().getInst());
+ // FIXME: All duplicated with non-local case.
+ if (NonLocalDepCall && DT->properlyDominates(I->getBB(), C->getParent())) {
+ cdep = NonLocalDepCall;
+ continue;
+ }
+
+ cdep = nullptr;
+ break;
+ }
+
+ if (!cdep) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ if (cdep->getNumArgOperands() != C->getNumArgOperands()) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+ uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
+ uint32_t cd_vn = lookupOrAdd(cdep->getArgOperand(i));
+ if (c_vn != cd_vn) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ }
+
+ uint32_t v = lookupOrAdd(cdep);
+ valueNumbering[C] = v;
+ return v;
+ } else {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+}
+
+/// Returns true if a value number exists for the specified value.
+bool GVN::ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; }
+
+/// lookup_or_add - Returns the value number for the specified value, assigning
+/// it a new number if it did not have one before.
+uint32_t GVN::ValueTable::lookupOrAdd(Value *V) {
+ DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+ if (VI != valueNumbering.end())
+ return VI->second;
+
+ if (!isa<Instruction>(V)) {
+ valueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ Instruction* I = cast<Instruction>(V);
+ Expression exp;
+ switch (I->getOpcode()) {
+ case Instruction::Call:
+ return lookupOrAddCall(cast<CallInst>(I));
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ case Instruction::Select:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::ShuffleVector:
+ case Instruction::InsertValue:
+ case Instruction::GetElementPtr:
+ exp = createExpr(I);
+ break;
+ case Instruction::ExtractValue:
+ exp = createExtractvalueExpr(cast<ExtractValueInst>(I));
+ break;
+ case Instruction::PHI:
+ valueNumbering[V] = nextValueNumber;
+ NumberingPhi[nextValueNumber] = cast<PHINode>(V);
+ return nextValueNumber++;
+ default:
+ valueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ uint32_t e = assignExpNewValueNum(exp).first;
+ valueNumbering[V] = e;
+ return e;
+}
+
+/// Returns the value number of the specified value. Fails if
+/// the value has not yet been numbered.
+uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const {
+ DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
+ if (Verify) {
+ assert(VI != valueNumbering.end() && "Value not numbered?");
+ return VI->second;
+ }
+ return (VI != valueNumbering.end()) ? VI->second : 0;
+}
+
+/// Returns the value number of the given comparison,
+/// assigning it a new number if it did not have one before. Useful when
+/// we deduced the result of a comparison, but don't immediately have an
+/// instruction realizing that comparison to hand.
+uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode,
+ CmpInst::Predicate Predicate,
+ Value *LHS, Value *RHS) {
+ Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS);
+ return assignExpNewValueNum(exp).first;
+}
+
+/// Remove all entries from the ValueTable.
+void GVN::ValueTable::clear() {
+ valueNumbering.clear();
+ expressionNumbering.clear();
+ NumberingPhi.clear();
+ PhiTranslateTable.clear();
+ nextValueNumber = 1;
+ Expressions.clear();
+ ExprIdx.clear();
+ nextExprNumber = 0;
+}
+
+/// Remove a value from the value numbering.
+void GVN::ValueTable::erase(Value *V) {
+ uint32_t Num = valueNumbering.lookup(V);
+ valueNumbering.erase(V);
+ // If V is a PHINode, V <--> value number is a one-to-one mapping.
+ if (isa<PHINode>(V))
+ NumberingPhi.erase(Num);
+}
+
+/// verifyRemoved - Verify that the value is removed from all internal data
+/// structures.
+void GVN::ValueTable::verifyRemoved(const Value *V) const {
+ for (DenseMap<Value*, uint32_t>::const_iterator
+ I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
+ assert(I->first != V && "Inst still occurs in value numbering map!");
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// GVN Pass
+//===----------------------------------------------------------------------===//
+
+PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
+ // FIXME: The order of evaluation of these 'getResult' calls is very
+ // significant! Re-ordering these variables will cause GVN when run alone to
+ // be less effective! We should fix memdep and basic-aa to not exhibit this
+ // behavior, but until then don't change the order here.
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &MemDep = AM.getResult<MemoryDependenceAnalysis>(F);
+ auto *LI = AM.getCachedResult<LoopAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ bool Changed = runImpl(F, AC, DT, TLI, AA, &MemDep, LI, &ORE);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<TargetLibraryAnalysis>();
+ return PA;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const {
+ errs() << "{\n";
+ for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
+ E = d.end(); I != E; ++I) {
+ errs() << I->first << "\n";
+ I->second->dump();
+ }
+ errs() << "}\n";
+}
+#endif
+
+/// Return true if we can prove that the value
+/// we're analyzing is fully available in the specified block. As we go, keep
+/// track of which blocks we know are fully alive in FullyAvailableBlocks. This
+/// map records one of four states for each block:
+/// 0) we know the block *is not* fully available.
+/// 1) we know the block *is* fully available.
+/// 2) we do not know whether the block is fully available or not, but we are
+/// currently speculating that it will be.
+/// 3) we are speculating for this block and have used that to speculate for
+/// other blocks.
+static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
+ DenseMap<BasicBlock*, char> &FullyAvailableBlocks,
+ uint32_t RecurseDepth) {
+ if (RecurseDepth > MaxRecurseDepth)
+ return false;
+
+ // Optimistically assume that the block is fully available and check to see
+ // if we already know about this block in one lookup.
+ std::pair<DenseMap<BasicBlock*, char>::iterator, char> IV =
+ FullyAvailableBlocks.insert(std::make_pair(BB, 2));
+
+ // If the entry already existed for this block, return the precomputed value.
+ if (!IV.second) {
+ // If this is a speculative "available" value, mark it as being used for
+ // speculation of other blocks.
+ if (IV.first->second == 2)
+ IV.first->second = 3;
+ return IV.first->second != 0;
+ }
+
+ // Otherwise, see if it is fully available in all predecessors.
+ pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+
+ // If this block has no predecessors, it isn't live-in here.
+ if (PI == PE)
+ goto SpeculationFailure;
+
+ for (; PI != PE; ++PI)
+ // If the value isn't fully available in one of our predecessors, then it
+ // isn't fully available in this block either. Undo our previous
+ // optimistic assumption and bail out.
+ if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks,RecurseDepth+1))
+ goto SpeculationFailure;
+
+ return true;
+
+// If we get here, we found out that this is not, after
+// all, a fully-available block. We have a problem if we speculated on this and
+// used the speculation to mark other blocks as available.
+SpeculationFailure:
+ char &BBVal = FullyAvailableBlocks[BB];
+
+ // If we didn't speculate on this, just return with it set to false.
+ if (BBVal == 2) {
+ BBVal = 0;
+ return false;
+ }
+
+ // If we did speculate on this value, we could have blocks set to 1 that are
+ // incorrect. Walk the (transitive) successors of this block and mark them as
+ // 0 if set to one.
+ SmallVector<BasicBlock*, 32> BBWorklist;
+ BBWorklist.push_back(BB);
+
+ do {
+ BasicBlock *Entry = BBWorklist.pop_back_val();
+ // Note that this sets blocks to 0 (unavailable) if they happen to not
+ // already be in FullyAvailableBlocks. This is safe.
+ char &EntryVal = FullyAvailableBlocks[Entry];
+ if (EntryVal == 0) continue; // Already unavailable.
+
+ // Mark as unavailable.
+ EntryVal = 0;
+
+ BBWorklist.append(succ_begin(Entry), succ_end(Entry));
+ } while (!BBWorklist.empty());
+
+ return false;
+}
+
+/// Given a set of loads specified by ValuesPerBlock,
+/// construct SSA form, allowing us to eliminate LI. This returns the value
+/// that should be used at LI's definition site.
+static Value *ConstructSSAForLoadSet(LoadInst *LI,
+ SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock,
+ GVN &gvn) {
+ // Check for the fully redundant, dominating load case. In this case, we can
+ // just use the dominating value directly.
+ if (ValuesPerBlock.size() == 1 &&
+ gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
+ LI->getParent())) {
+ assert(!ValuesPerBlock[0].AV.isUndefValue() &&
+ "Dead BB dominate this block");
+ return ValuesPerBlock[0].MaterializeAdjustedValue(LI, gvn);
+ }
+
+ // Otherwise, we have to construct SSA form.
+ SmallVector<PHINode*, 8> NewPHIs;
+ SSAUpdater SSAUpdate(&NewPHIs);
+ SSAUpdate.Initialize(LI->getType(), LI->getName());
+
+ for (const AvailableValueInBlock &AV : ValuesPerBlock) {
+ BasicBlock *BB = AV.BB;
+
+ if (SSAUpdate.HasValueForBlock(BB))
+ continue;
+
+ SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LI, gvn));
+ }
+
+ // Perform PHI construction.
+ return SSAUpdate.GetValueInMiddleOfBlock(LI->getParent());
+}
+
+Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
+ Instruction *InsertPt,
+ GVN &gvn) const {
+ Value *Res;
+ Type *LoadTy = LI->getType();
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ if (isSimpleValue()) {
+ Res = getSimpleValue();
+ if (Res->getType() != LoadTy) {
+ Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
+
+ DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " "
+ << *getSimpleValue() << '\n'
+ << *Res << '\n' << "\n\n\n");
+ }
+ } else if (isCoercedLoadValue()) {
+ LoadInst *Load = getCoercedLoadValue();
+ if (Load->getType() == LoadTy && Offset == 0) {
+ Res = Load;
+ } else {
+ Res = getLoadValueForLoad(Load, Offset, LoadTy, InsertPt, DL);
+ // We would like to use gvn.markInstructionForDeletion here, but we can't
+ // because the load is already memoized into the leader map table that GVN
+ // tracks. It is potentially possible to remove the load from the table,
+ // but then all of the operations based on it would need to be
+ // rehashed. Just leave the dead load around.
+ gvn.getMemDep().removeInstruction(Load);
+ DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " "
+ << *getCoercedLoadValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
+ }
+ } else if (isMemIntrinValue()) {
+ Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
+ InsertPt, DL);
+ DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
+ << " " << *getMemIntrinValue() << '\n'
+ << *Res << '\n' << "\n\n\n");
+ } else {
+ assert(isUndefValue() && "Should be UndefVal");
+ DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
+ return UndefValue::get(LoadTy);
+ }
+ assert(Res && "failed to materialize?");
+ return Res;
+}
+
+static bool isLifetimeStart(const Instruction *Inst) {
+ if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst))
+ return II->getIntrinsicID() == Intrinsic::lifetime_start;
+ return false;
+}
+
+/// \brief Try to locate the three instructions involved in a missed
+/// load-elimination case that is due to an intervening store.
+static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo,
+ DominatorTree *DT,
+ OptimizationRemarkEmitter *ORE) {
+ using namespace ore;
+
+ User *OtherAccess = nullptr;
+
+ OptimizationRemarkMissed R(DEBUG_TYPE, "LoadClobbered", LI);
+ R << "load of type " << NV("Type", LI->getType()) << " not eliminated"
+ << setExtraArgs();
+
+ for (auto *U : LI->getPointerOperand()->users())
+ if (U != LI && (isa<LoadInst>(U) || isa<StoreInst>(U)) &&
+ DT->dominates(cast<Instruction>(U), LI)) {
+ // FIXME: for now give up if there are multiple memory accesses that
+ // dominate the load. We need further analysis to decide which one we
+ // are actually forwarding from.
+ if (OtherAccess)
+ OtherAccess = nullptr;
+ else
+ OtherAccess = U;
+ }
+
+ if (OtherAccess)
+ R << " in favor of " << NV("OtherAccess", OtherAccess);
+
+ R << " because it is clobbered by " << NV("ClobberedBy", DepInfo.getInst());
+
+ ORE->emit(R);
+}
+
+bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
+ Value *Address, AvailableValue &Res) {
+ assert((DepInfo.isDef() || DepInfo.isClobber()) &&
+ "expected a local dependence");
+ assert(LI->isUnordered() && "rules below are incorrect for ordered access");
+
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+
+ if (DepInfo.isClobber()) {
+ // If the dependence is to a store that writes to a superset of the bits
+ // read by the load, we can extract the bits we need for the load from the
+ // stored value.
+ if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (Address && LI->isAtomic() <= DepSI->isAtomic()) {
+ int Offset =
+ analyzeLoadFromClobberingStore(LI->getType(), Address, DepSI, DL);
+ if (Offset != -1) {
+ Res = AvailableValue::get(DepSI->getValueOperand(), Offset);
+ return true;
+ }
+ }
+ }
+
+ // Check to see if we have something like this:
+ // load i32* P
+ // load i8* (P+1)
+ // if we have this, replace the latter with an extraction from the former.
+ if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) {
+ // If this is a clobber and LI is the first instruction in its block, then
+ // we have the first instruction in the entry block.
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (DepLI != LI && Address && LI->isAtomic() <= DepLI->isAtomic()) {
+ int Offset =
+ analyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
+
+ if (Offset != -1) {
+ Res = AvailableValue::getLoad(DepLI, Offset);
+ return true;
+ }
+ }
+ }
+
+ // If the clobbering value is a memset/memcpy/memmove, see if we can
+ // forward a value on from it.
+ if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
+ if (Address && !LI->isAtomic()) {
+ int Offset = analyzeLoadFromClobberingMemInst(LI->getType(), Address,
+ DepMI, DL);
+ if (Offset != -1) {
+ Res = AvailableValue::getMI(DepMI, Offset);
+ return true;
+ }
+ }
+ }
+ // Nothing known about this clobber, have to be conservative
+ DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load ";
+ LI->printAsOperand(dbgs());
+ Instruction *I = DepInfo.getInst();
+ dbgs() << " is clobbered by " << *I << '\n';
+ );
+ if (ORE->allowExtraAnalysis(DEBUG_TYPE))
+ reportMayClobberedLoad(LI, DepInfo, DT, ORE);
+
+ return false;
+ }
+ assert(DepInfo.isDef() && "follows from above");
+
+ Instruction *DepInst = DepInfo.getInst();
+
+ // Loading the allocation -> undef.
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
+ // Loading immediately after lifetime begin -> undef.
+ isLifetimeStart(DepInst)) {
+ Res = AvailableValue::get(UndefValue::get(LI->getType()));
+ return true;
+ }
+
+ // Loading from calloc (which zero initializes memory) -> zero
+ if (isCallocLikeFn(DepInst, TLI)) {
+ Res = AvailableValue::get(Constant::getNullValue(LI->getType()));
+ return true;
+ }
+
+ if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
+ // Reject loads and stores that are to the same address but are of
+ // different types if we have to. If the stored value is larger or equal to
+ // the loaded value, we can reuse it.
+ if (S->getValueOperand()->getType() != LI->getType() &&
+ !canCoerceMustAliasedValueToLoad(S->getValueOperand(),
+ LI->getType(), DL))
+ return false;
+
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (S->isAtomic() < LI->isAtomic())
+ return false;
+
+ Res = AvailableValue::get(S->getValueOperand());
+ return true;
+ }
+
+ if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) {
+ // If the types mismatch and we can't handle it, reject reuse of the load.
+ // If the stored value is larger or equal to the loaded value, we can reuse
+ // it.
+ if (LD->getType() != LI->getType() &&
+ !canCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
+ return false;
+
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (LD->isAtomic() < LI->isAtomic())
+ return false;
+
+ Res = AvailableValue::getLoad(LD);
+ return true;
+ }
+
+ // Unknown def - must be conservative
+ DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load ";
+ LI->printAsOperand(dbgs());
+ dbgs() << " has unknown def " << *DepInst << '\n';
+ );
+ return false;
+}
+
+void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
+ AvailValInBlkVect &ValuesPerBlock,
+ UnavailBlkVect &UnavailableBlocks) {
+ // Filter out useless results (non-locals, etc). Keep track of the blocks
+ // where we have a value available in repl, also keep track of whether we see
+ // dependencies that produce an unknown value for the load (such as a call
+ // that could potentially clobber the load).
+ unsigned NumDeps = Deps.size();
+ for (unsigned i = 0, e = NumDeps; i != e; ++i) {
+ BasicBlock *DepBB = Deps[i].getBB();
+ MemDepResult DepInfo = Deps[i].getResult();
+
+ if (DeadBlocks.count(DepBB)) {
+ // A dead dependent mem-op is disguised as a load evaluating the same value
+ // as the load in question.
+ ValuesPerBlock.push_back(AvailableValueInBlock::getUndef(DepBB));
+ continue;
+ }
+
+ if (!DepInfo.isDef() && !DepInfo.isClobber()) {
+ UnavailableBlocks.push_back(DepBB);
+ continue;
+ }
+
+ // The address being loaded in this non-local block may not be the same as
+ // the pointer operand of the load if PHI translation occurs. Make sure
+ // to consider the right address.
+ Value *Address = Deps[i].getAddress();
+
+ AvailableValue AV;
+ if (AnalyzeLoadAvailability(LI, DepInfo, Address, AV)) {
+ // Subtlety: because we know this was a non-local dependency, we know
+ // it's safe to materialize anywhere between the instruction within
+ // DepInfo and the end of its block.
+ ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
+ std::move(AV)));
+ } else {
+ UnavailableBlocks.push_back(DepBB);
+ }
+ }
+
+ assert(NumDeps == ValuesPerBlock.size() + UnavailableBlocks.size() &&
+ "post condition violation");
+}
+
+bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
+ UnavailBlkVect &UnavailableBlocks) {
+ // Okay, we have *some* definitions of the value. This means that the value
+ // is available in some of our (transitive) predecessors. Let's think about
+ // doing PRE of this load. This will involve inserting a new load into the
+ // predecessor when it's not available. We could do this in general, but
+ // prefer to not increase code size. As such, we only do this when we know
+ // that we only have to insert *one* load (which means we're basically moving
+ // the load, not inserting a new one).
+
+ SmallPtrSet<BasicBlock *, 4> Blockers(UnavailableBlocks.begin(),
+ UnavailableBlocks.end());
+
+ // Let's find the first basic block with more than one predecessor. Walk
+ // backwards through predecessors if needed.
+ BasicBlock *LoadBB = LI->getParent();
+ BasicBlock *TmpBB = LoadBB;
+ bool IsSafeToSpeculativelyExecute = isSafeToSpeculativelyExecute(LI);
+
+ // Check that there are no implicit control flow instructions above our load in
+ // its block. If there is an instruction that doesn't always pass the
+ // execution to the following instruction, then moving through it may become
+ // invalid. For example:
+ //
+ // int arr[LEN];
+ // int index = ???;
+ // ...
+ // guard(0 <= index && index < LEN);
+ // use(arr[index]);
+ //
+ // It is illegal to move the array access to any point above the guard,
+ // because if the index is out of bounds we should deoptimize rather than
+ // access the array.
+ // Check that there is no guard in this block above our instruction.
+ if (!IsSafeToSpeculativelyExecute) {
+ auto It = FirstImplicitControlFlowInsts.find(TmpBB);
+ if (It != FirstImplicitControlFlowInsts.end()) {
+ assert(It->second->getParent() == TmpBB &&
+ "Implicit control flow map broken?");
+ if (OI->dominates(It->second, LI))
+ return false;
+ }
+ }
+ while (TmpBB->getSinglePredecessor()) {
+ TmpBB = TmpBB->getSinglePredecessor();
+ if (TmpBB == LoadBB) // Infinite (unreachable) loop.
+ return false;
+ if (Blockers.count(TmpBB))
+ return false;
+
+ // If any of these blocks has more than one successor (i.e. if the edge we
+ // just traversed was critical), then there are other paths through this
+ // block along which the load may not be anticipated. Hoisting the load
+ // above this block would be adding the load to execution paths along
+ // which it was not previously executed.
+ if (TmpBB->getTerminator()->getNumSuccessors() != 1)
+ return false;
+
+ // Check that there is no implicit control flow in a block above.
+ if (!IsSafeToSpeculativelyExecute &&
+ FirstImplicitControlFlowInsts.count(TmpBB))
+ return false;
+ }
+
+ assert(TmpBB);
+ LoadBB = TmpBB;
+
+ // Check to see how many predecessors have the loaded value fully
+ // available.
+ MapVector<BasicBlock *, Value *> PredLoads;
+ DenseMap<BasicBlock*, char> FullyAvailableBlocks;
+ for (const AvailableValueInBlock &AV : ValuesPerBlock)
+ FullyAvailableBlocks[AV.BB] = true;
+ for (BasicBlock *UnavailableBB : UnavailableBlocks)
+ FullyAvailableBlocks[UnavailableBB] = false;
+
+ SmallVector<BasicBlock *, 4> CriticalEdgePred;
+ for (BasicBlock *Pred : predecessors(LoadBB)) {
+ // If any predecessor block is an EH pad that does not allow non-PHI
+ // instructions before the terminator, we can't PRE the load.
+ if (Pred->getTerminator()->isEHPad()) {
+ DEBUG(dbgs()
+ << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
+ if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) {
+ continue;
+ }
+
+ if (Pred->getTerminator()->getNumSuccessors() != 1) {
+ if (isa<IndirectBrInst>(Pred->getTerminator())) {
+ DEBUG(dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
+ if (LoadBB->isEHPad()) {
+ DEBUG(dbgs()
+ << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
+ CriticalEdgePred.push_back(Pred);
+ } else {
+ // Only add the predecessors that will not be split for now.
+ PredLoads[Pred] = nullptr;
+ }
+ }
+
+ // Decide whether PRE is profitable for this load.
+ unsigned NumUnavailablePreds = PredLoads.size() + CriticalEdgePred.size();
+ assert(NumUnavailablePreds != 0 &&
+ "Fully available value should already be eliminated!");
+
+ // If this load is unavailable in multiple predecessors, reject it.
+ // FIXME: If we could restructure the CFG, we could make a common pred with
+ // all the preds that don't have an available LI and insert a new load into
+ // that one block.
+ if (NumUnavailablePreds != 1)
+ return false;
+
+ // Split critical edges, and update the unavailable predecessors accordingly.
+ for (BasicBlock *OrigPred : CriticalEdgePred) {
+ BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB);
+ assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!");
+ PredLoads[NewPred] = nullptr;
+ DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
+ << LoadBB->getName() << '\n');
+ }
+
+ // Check if the load can safely be moved to all the unavailable predecessors.
+ bool CanDoPRE = true;
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ SmallVector<Instruction*, 8> NewInsts;
+ for (auto &PredLoad : PredLoads) {
+ BasicBlock *UnavailablePred = PredLoad.first;
+
+ // Do PHI translation to get its value in the predecessor if necessary. The
+ // returned pointer (if non-null) is guaranteed to dominate UnavailablePred.
+
+ // If all preds have a single successor, then we know it is safe to insert
+ // the load on the pred (?!?), so we can insert code to materialize the
+ // pointer if it is not available.
+ PHITransAddr Address(LI->getPointerOperand(), DL, AC);
+ Value *LoadPtr = nullptr;
+ LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred,
+ *DT, NewInsts);
+
+ // If we couldn't find or insert a computation of this phi translated value,
+ // we fail PRE.
+ if (!LoadPtr) {
+ DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
+ << *LI->getPointerOperand() << "\n");
+ CanDoPRE = false;
+ break;
+ }
+
+ PredLoad.second = LoadPtr;
+ }
+
+ if (!CanDoPRE) {
+ while (!NewInsts.empty()) {
+ Instruction *I = NewInsts.pop_back_val();
+ markInstructionForDeletion(I);
+ }
+ // HINT: Don't revert the edge-splitting, as a following transformation may
+ // also need to split these critical edges.
+ return !CriticalEdgePred.empty();
+ }
+
+ // Okay, we can eliminate this load by inserting a reload in the predecessor
+ // and using PHI construction to get the value in the other predecessors, do
+ // it.
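+ // Illustrative sketch (hypothetical IR; block and value names are made up):
+ //   pred1:                          pred2:
+ //     store i32 0, i32* %p            ; %p is not stored to here
+ //     br label %merge                 br label %merge
+ //   merge:
+ //     %v = load i32, i32* %p
+ // The load is fully available in pred1 (the stored 0) but not in pred2, so
+ // a reload "%v.pre" is inserted at the end of pred2 and %v is rebuilt as
+ //   %v = phi i32 [ 0, %pred1 ], [ %v.pre, %pred2 ]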
+ DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n');
+ DEBUG(if (!NewInsts.empty())
+ dbgs() << "INSERTED " << NewInsts.size() << " INSTS: "
+ << *NewInsts.back() << '\n');
+
+ // Assign value numbers to the new instructions.
+ for (Instruction *I : NewInsts) {
+ // Instructions that have been inserted in predecessor(s) to materialize
+ // the load address do not retain their original debug locations. Doing
+ // so could lead to confusing (but correct) source attributions.
+ // FIXME: How do we retain source locations without causing poor debugging
+ // behavior?
+ I->setDebugLoc(DebugLoc());
+
+ // FIXME: We really _ought_ to insert these value numbers into their
+ // parent's availability map. However, in doing so, we risk getting into
+ // ordering issues. If a block hasn't been processed yet, we would be
+ // marking a value as AVAIL-IN, which isn't what we intend.
+ VN.lookupOrAdd(I);
+ }
+
+ for (const auto &PredLoad : PredLoads) {
+ BasicBlock *UnavailablePred = PredLoad.first;
+ Value *LoadPtr = PredLoad.second;
+
+ auto *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre",
+ LI->isVolatile(), LI->getAlignment(),
+ LI->getOrdering(), LI->getSyncScopeID(),
+ UnavailablePred->getTerminator());
+ NewLoad->setDebugLoc(LI->getDebugLoc());
+
+ // Transfer the old load's AA tags to the new load.
+ AAMDNodes Tags;
+ LI->getAAMetadata(Tags);
+ if (Tags)
+ NewLoad->setAAMetadata(Tags);
+
+ if (auto *MD = LI->getMetadata(LLVMContext::MD_invariant_load))
+ NewLoad->setMetadata(LLVMContext::MD_invariant_load, MD);
+ if (auto *InvGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group))
+ NewLoad->setMetadata(LLVMContext::MD_invariant_group, InvGroupMD);
+ if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range))
+ NewLoad->setMetadata(LLVMContext::MD_range, RangeMD);
+
+ // We do not propagate the old load's debug location, because the new
+ // load now lives in a different BB, and we want to avoid a jumpy line
+ // table.
+ // FIXME: How do we retain source locations without causing poor debugging
+ // behavior?
+
+ // Add the newly created load.
+ ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,
+ NewLoad));
+ MD->invalidateCachedPointerInfo(LoadPtr);
+ DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n');
+ }
+
+ // Perform PHI construction.
+ Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
+ LI->replaceAllUsesWith(V);
+ if (isa<PHINode>(V))
+ V->takeName(LI);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ I->setDebugLoc(LI->getDebugLoc());
+ if (V->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(V);
+ markInstructionForDeletion(LI);
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI)
+ << "load eliminated by PRE";
+ });
+ ++NumPRELoad;
+ return true;
+}
+
+static void reportLoadElim(LoadInst *LI, Value *AvailableValue,
+ OptimizationRemarkEmitter *ORE) {
+ using namespace ore;
+
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "LoadElim", LI)
+ << "load of type " << NV("Type", LI->getType()) << " eliminated"
+ << setExtraArgs() << " in favor of "
+ << NV("InfavorOfValue", AvailableValue);
+ });
+}
+
+/// Attempt to eliminate a load whose dependencies are
+/// non-local by performing PHI construction.
+bool GVN::processNonLocalLoad(LoadInst *LI) {
+ // Non-local speculations are not allowed under ASan or HWASan.
+ if (LI->getParent()->getParent()->hasFnAttribute(
+ Attribute::SanitizeAddress) ||
+ LI->getParent()->getParent()->hasFnAttribute(
+ Attribute::SanitizeHWAddress))
+ return false;
+
+ // Step 1: Find the non-local dependencies of the load.
+ LoadDepVect Deps;
+ MD->getNonLocalPointerDependency(LI, Deps);
+
+ // If we had to process more than one hundred blocks to find the
+ // dependencies, this load isn't worth worrying about. Optimizing
+ // it will be too expensive.
+ unsigned NumDeps = Deps.size();
+ if (NumDeps > 100)
+ return false;
+
+ // If we had a phi translation failure, we'll have a single entry which is a
+ // clobber in the current block. Reject this early.
+ if (NumDeps == 1 &&
+ !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) {
+ DEBUG(
+ dbgs() << "GVN: non-local load ";
+ LI->printAsOperand(dbgs());
+ dbgs() << " has unknown dependencies\n";
+ );
+ return false;
+ }
+
+ // If this load follows a GEP, see if we can PRE the indices before analyzing.
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) {
+ for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(),
+ OE = GEP->idx_end();
+ OI != OE; ++OI)
+ if (Instruction *I = dyn_cast<Instruction>(OI->get()))
+ performScalarPRE(I);
+ }
+
+ // Step 2: Analyze the availability of the load
+ AvailValInBlkVect ValuesPerBlock;
+ UnavailBlkVect UnavailableBlocks;
+ AnalyzeLoadAvailability(LI, Deps, ValuesPerBlock, UnavailableBlocks);
+
+ // If we have no predecessors that produce a known value for this load, exit
+ // early.
+ if (ValuesPerBlock.empty())
+ return false;
+
+ // Step 3: Eliminate full redundancy.
+ //
+ // If all of the instructions we depend on produce a known value for this
+ // load, then it is fully redundant and we can use PHI insertion to compute
+ // its value. Insert PHIs and remove the fully redundant value now.
+ if (UnavailableBlocks.empty()) {
+ DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
+
+ // Perform PHI construction.
+ Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
+ LI->replaceAllUsesWith(V);
+
+ if (isa<PHINode>(V))
+ V->takeName(LI);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ // If instruction I has debug info, then we should not update it.
+ // Also, if I has a null DebugLoc, then it is still potentially incorrect
+ // to propagate LI's DebugLoc because LI may not post-dominate I.
+ if (LI->getDebugLoc() && LI->getParent() == I->getParent())
+ I->setDebugLoc(LI->getDebugLoc());
+ if (V->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(V);
+ markInstructionForDeletion(LI);
+ ++NumGVNLoad;
+ reportLoadElim(LI, V, ORE);
+ return true;
+ }
+
+ // Step 4: Eliminate partial redundancy.
+ if (!EnablePRE || !EnableLoadPRE)
+ return false;
+
+ return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks);
+}
+
+bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
+ assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume &&
+ "This function can only be called with llvm.assume intrinsic");
+ Value *V = IntrinsicI->getArgOperand(0);
+
+ if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) {
+ if (Cond->isZero()) {
+ Type *Int8Ty = Type::getInt8Ty(V->getContext());
+ // Insert a new store to null before the assume intrinsic to indicate
+ // that this code is not reachable. FIXME: We could insert an unreachable
+ // instruction directly because we can modify the CFG.
+ new StoreInst(UndefValue::get(Int8Ty),
+ Constant::getNullValue(Int8Ty->getPointerTo()),
+ IntrinsicI);
+ }
+ markInstructionForDeletion(IntrinsicI);
+ return false;
+ } else if (isa<Constant>(V)) {
+ // If the condition is a non-false constant, it must evaluate to true.
+ // This means our assume is assume(true) and thus pointless, so there is
+ // nothing more to do here.
+ return false;
+ }
+
+ Constant *True = ConstantInt::getTrue(V->getContext());
+ bool Changed = false;
+
+ for (BasicBlock *Successor : successors(IntrinsicI->getParent())) {
+ BasicBlockEdge Edge(IntrinsicI->getParent(), Successor);
+
+ // This property only holds in dominated successors; propagateEquality
+ // will check dominance for us.
+ Changed |= propagateEquality(V, True, Edge, false);
+ }
+
+ // We can replace assume value with true, which covers cases like this:
+ // call void @llvm.assume(i1 %cmp)
+ // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true
+ ReplaceWithConstMap[V] = True;
+
+ // If one operand of an equality comparison is a constant, adding it to
+ // the map covers cases like this:
+ // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen
+ // call void @llvm.assume(i1 %cmp)
+ // ret float %0 ; will change it to ret float 3.000000e+00
+ if (auto *CmpI = dyn_cast<CmpInst>(V)) {
+ if (CmpI->getPredicate() == CmpInst::Predicate::ICMP_EQ ||
+ CmpI->getPredicate() == CmpInst::Predicate::FCMP_OEQ ||
+ (CmpI->getPredicate() == CmpInst::Predicate::FCMP_UEQ &&
+ CmpI->getFastMathFlags().noNaNs())) {
+ Value *CmpLHS = CmpI->getOperand(0);
+ Value *CmpRHS = CmpI->getOperand(1);
+ if (isa<Constant>(CmpLHS))
+ std::swap(CmpLHS, CmpRHS);
+ auto *RHSConst = dyn_cast<Constant>(CmpRHS);
+
+ // If only one operand is constant.
+ if (RHSConst != nullptr && !isa<Constant>(CmpLHS))
+ ReplaceWithConstMap[CmpLHS] = RHSConst;
+ }
+ }
+ return Changed;
+}
+
+static void patchReplacementInstruction(Instruction *I, Value *Repl) {
+ auto *ReplInst = dyn_cast<Instruction>(Repl);
+ if (!ReplInst)
+ return;
+
+ // Patch the replacement so that it is not more restrictive than the value
+ // being replaced.
+ // Note that if 'I' is a load being replaced by some operation,
+ // for example, by an arithmetic operation, then andIRFlags()
+ // would just erase all math flags from the original arithmetic
+ // operation, which is clearly not wanted and not needed.
+ if (!isa<LoadInst>(I))
+ ReplInst->andIRFlags(I);
+
+ // FIXME: If both the original and replacement value are part of the
+ // same control-flow region (meaning that the execution of one
+ // guarantees the execution of the other), then we can combine the
+ // noalias scopes here and do better than the general conservative
+ // answer used in combineMetadata().
+
+ // In general, GVN unifies expressions over different control-flow
+ // regions, and so we need a conservative combination of the noalias
+ // scopes.
+ static const unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(ReplInst, I, KnownIDs);
+}
+
+static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
+ patchReplacementInstruction(I, Repl);
+ I->replaceAllUsesWith(Repl);
+}
+
+/// Attempt to eliminate a load, first by eliminating it
+/// locally, and then attempting non-local elimination if that fails.
+bool GVN::processLoad(LoadInst *L) {
+ if (!MD)
+ return false;
+
+ // This code hasn't been audited for ordered or volatile memory access
+ if (!L->isUnordered())
+ return false;
+
+ if (L->use_empty()) {
+ markInstructionForDeletion(L);
+ return true;
+ }
+
+ // ... to a pointer that has been loaded from before...
+ MemDepResult Dep = MD->getDependency(L);
+
+ // If it is defined in another block, try harder.
+ if (Dep.isNonLocal())
+ return processNonLocalLoad(L);
+
+ // Only handle the local case below
+ if (!Dep.isDef() && !Dep.isClobber()) {
+ // This might be a NonFuncLocal or an Unknown
+ DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load ";
+ L->printAsOperand(dbgs());
+ dbgs() << " has unknown dependence\n";
+ );
+ return false;
+ }
+
+ AvailableValue AV;
+ if (AnalyzeLoadAvailability(L, Dep, L->getPointerOperand(), AV)) {
+ Value *AvailableValue = AV.MaterializeAdjustedValue(L, L, *this);
+
+ // Replace the load!
+ patchAndReplaceAllUsesWith(L, AvailableValue);
+ markInstructionForDeletion(L);
+ ++NumGVNLoad;
+ reportLoadElim(L, AvailableValue, ORE);
+ // Tell MDA to reexamine the reused pointer since we might have more
+ // information after forwarding it.
+ if (MD && AvailableValue->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(AvailableValue);
+ return true;
+ }
+
+ return false;
+}
+
+/// Return a pair whose first field is the value number of \p Exp and whose
+/// second field indicates whether that value number is newly created.
+std::pair<uint32_t, bool>
+GVN::ValueTable::assignExpNewValueNum(Expression &Exp) {
+ uint32_t &e = expressionNumbering[Exp];
+ bool CreateNewValNum = !e;
+ if (CreateNewValNum) {
+ Expressions.push_back(Exp);
+ if (ExprIdx.size() < nextValueNumber + 1)
+ ExprIdx.resize(nextValueNumber * 2);
+ e = nextValueNumber;
+ ExprIdx[nextValueNumber++] = nextExprNumber++;
+ }
+ return {e, CreateNewValNum};
+}
+
+/// Return whether all the values associated with the same \p Num are
+/// defined in \p BB.
+bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB,
+ GVN &Gvn) {
+ LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
+ while (Vals && Vals->BB == BB)
+ Vals = Vals->Next;
+ return !Vals;
+}
+
+/// Wrap phiTranslateImpl to provide caching functionality.
+uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred,
+ const BasicBlock *PhiBlock, uint32_t Num,
+ GVN &Gvn) {
+ auto FindRes = PhiTranslateTable.find({Num, Pred});
+ if (FindRes != PhiTranslateTable.end())
+ return FindRes->second;
+ uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn);
+ PhiTranslateTable.insert({{Num, Pred}, NewNum});
+ return NewNum;
+}
+
+/// Translate value number \p Num using phis, so that it has the values of
+/// the phis in BB.
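+/// For illustration (hypothetical example): if PhiBlock contains
+///   %p = phi i32 [ %a, %Pred ], [ %b, %Other ]
+/// then the value number of "add i32 %p, 1" translated through %Pred becomes
+/// the value number of "add i32 %a, 1", if such an expression is known.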
+uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
+ const BasicBlock *PhiBlock,
+ uint32_t Num, GVN &Gvn) {
+ if (PHINode *PN = NumberingPhi[Num]) {
+ for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+ if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred)
+ if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false))
+ return TransVal;
+ }
+ return Num;
+ }
+
+ // If any value associated with Num is defined in a BB other than PhiBlock,
+ // it cannot depend on a phi in PhiBlock without going through a backedge,
+ // so we can exit early in that case to save compile time.
+ if (!areAllValsInBB(Num, PhiBlock, Gvn))
+ return Num;
+
+ if (Num >= ExprIdx.size() || ExprIdx[Num] == 0)
+ return Num;
+ Expression Exp = Expressions[ExprIdx[Num]];
+
+ for (unsigned i = 0; i < Exp.varargs.size(); i++) {
+ // For InsertValue and ExtractValue, some varargs are index numbers
+ // instead of value numbers. Those index numbers should not be
+ // translated.
+ if ((i > 1 && Exp.opcode == Instruction::InsertValue) ||
+ (i > 0 && Exp.opcode == Instruction::ExtractValue))
+ continue;
+ Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn);
+ }
+
+ if (Exp.commutative) {
+ assert(Exp.varargs.size() == 2 && "Unsupported commutative expression!");
+ if (Exp.varargs[0] > Exp.varargs[1]) {
+ std::swap(Exp.varargs[0], Exp.varargs[1]);
+ uint32_t Opcode = Exp.opcode >> 8;
+ if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)
+ Exp.opcode = (Opcode << 8) |
+ CmpInst::getSwappedPredicate(
+ static_cast<CmpInst::Predicate>(Exp.opcode & 255));
+ }
+ }
+
+ if (uint32_t NewNum = expressionNumbering[Exp])
+ return NewNum;
+ return Num;
+}
+
+/// Erase stale entry from phiTranslate cache so phiTranslate can be computed
+/// again.
+void GVN::ValueTable::eraseTranslateCacheEntry(uint32_t Num,
+ const BasicBlock &CurrBlock) {
+ for (const BasicBlock *Pred : predecessors(&CurrBlock)) {
+ auto FindRes = PhiTranslateTable.find({Num, Pred});
+ if (FindRes != PhiTranslateTable.end())
+ PhiTranslateTable.erase(FindRes);
+ }
+}
+
+// In order to find a leader for a given value number at a
+// specific basic block, we first obtain the list of all Values for that number,
+// and then scan the list to find one whose block dominates the block in
+// question. This is fast because dominator tree queries consist of only
+// a few comparisons of DFS numbers.
+Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) {
+ LeaderTableEntry Vals = LeaderTable[num];
+ if (!Vals.Val) return nullptr;
+
+ Value *Val = nullptr;
+ if (DT->dominates(Vals.BB, BB)) {
+ Val = Vals.Val;
+ if (isa<Constant>(Val)) return Val;
+ }
+
+ LeaderTableEntry* Next = Vals.Next;
+ while (Next) {
+ if (DT->dominates(Next->BB, BB)) {
+ if (isa<Constant>(Next->Val)) return Next->Val;
+ if (!Val) Val = Next->Val;
+ }
+
+ Next = Next->Next;
+ }
+
+ return Val;
+}
+
+/// There is an edge from 'Src' to 'Dst'. Return
+/// true if every path from the entry block to 'Dst' passes via this edge. In
+/// particular 'Dst' must not be reachable via another edge from 'Src'.
+static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
+ DominatorTree *DT) {
+ // While in theory it is interesting to consider the case in which Dst has
+ // more than one predecessor, because Dst might be part of a loop which is
+ // only reachable from Src, in practice it is pointless since at the time
+ // GVN runs all such loops have preheaders, which means that Dst will have
+ // been changed to have only one predecessor, namely Src.
+ const BasicBlock *Pred = E.getEnd()->getSinglePredecessor();
+ assert((!Pred || Pred == E.getStart()) &&
+ "No edge between these basic blocks!");
+ return Pred != nullptr;
+}
+
+void GVN::assignBlockRPONumber(Function &F) {
+ uint32_t NextBlockNumber = 1;
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ for (BasicBlock *BB : RPOT)
+ BlockRPONumber[BB] = NextBlockNumber++;
+}
+
+// Tries to replace an instruction's operands with constants, using
+// information from ReplaceWithConstMap.
+bool GVN::replaceOperandsWithConsts(Instruction *Instr) const {
+ bool Changed = false;
+ for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) {
+ Value *Operand = Instr->getOperand(OpNum);
+ auto it = ReplaceWithConstMap.find(Operand);
+ if (it != ReplaceWithConstMap.end()) {
+ assert(!isa<Constant>(Operand) &&
+ "Replacing constants with constants is invalid");
+ DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second
+ << " in instruction " << *Instr << '\n');
+ Instr->setOperand(OpNum, it->second);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+/// The given values are known to be equal in every block
+/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with
+/// 'RHS' everywhere in the scope. Returns whether a change was made.
+/// If DominatesByEdge is false, then it means that we will propagate the RHS
+/// value starting from the end of Root.Start.
+bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
+ bool DominatesByEdge) {
+ SmallVector<std::pair<Value*, Value*>, 4> Worklist;
+ Worklist.push_back(std::make_pair(LHS, RHS));
+ bool Changed = false;
+ // For speed, compute a conservative fast approximation to
+ // DT->dominates(Root, Root.getEnd());
+ const bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT);
+
+ while (!Worklist.empty()) {
+ std::pair<Value*, Value*> Item = Worklist.pop_back_val();
+ LHS = Item.first; RHS = Item.second;
+
+ if (LHS == RHS)
+ continue;
+ assert(LHS->getType() == RHS->getType() && "Equality but unequal types!");
+
+ // Don't try to propagate equalities between constants.
+ if (isa<Constant>(LHS) && isa<Constant>(RHS))
+ continue;
+
+ // Prefer a constant on the right-hand side, or an Argument if no constants.
+ if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS)))
+ std::swap(LHS, RHS);
+ assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!");
+
+ // If there is no obvious reason to prefer the left-hand side over the
+ // right-hand side, ensure the longest lived term is on the right-hand side,
+ // so the shortest lived term will be replaced by the longest lived.
+ // This tends to expose more simplifications.
+ uint32_t LVN = VN.lookupOrAdd(LHS);
+ if ((isa<Argument>(LHS) && isa<Argument>(RHS)) ||
+ (isa<Instruction>(LHS) && isa<Instruction>(RHS))) {
+ // Move the 'oldest' value to the right-hand side, using the value number
+ // as a proxy for age.
+ uint32_t RVN = VN.lookupOrAdd(RHS);
+ if (LVN < RVN) {
+ std::swap(LHS, RHS);
+ LVN = RVN;
+ }
+ }
+
+ // If value numbering later sees that an instruction in the scope is equal
+ // to 'LHS' then ensure it will be turned into 'RHS'. In order to preserve
+ // the invariant that instructions only occur in the leader table for their
+ // own value number (this is used by removeFromLeaderTable), do not do this
+ // if RHS is an instruction (if an instruction in the scope is morphed into
+ // LHS then it will be turned into RHS by the next GVN iteration anyway, so
+ // using the leader table is about compiling faster, not optimizing better).
+ // The leader table only tracks basic blocks, not edges. Only add to it if
+ // we have the simple case where the edge dominates the end.
+ if (RootDominatesEnd && !isa<Instruction>(RHS))
+ addToLeaderTable(LVN, RHS, Root.getEnd());
+
+ // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As
+ // LHS always has at least one use that is not dominated by Root, this will
+ // never do anything if LHS has only one use.
+ if (!LHS->hasOneUse()) {
+ unsigned NumReplacements =
+ DominatesByEdge
+ ? replaceDominatedUsesWith(LHS, RHS, *DT, Root)
+ : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getStart());
+
+ Changed |= NumReplacements > 0;
+ NumGVNEqProp += NumReplacements;
+ }
+
+ // Now try to deduce additional equalities from this one. For example, if
+ // the known equality was "(A != B)" == "false" then it follows that A and B
+ // are equal in the scope. Only boolean equalities with an explicit true or
+ // false RHS are currently supported.
+ if (!RHS->getType()->isIntegerTy(1))
+ // Not a boolean equality - bail out.
+ continue;
+ ConstantInt *CI = dyn_cast<ConstantInt>(RHS);
+ if (!CI)
+ // RHS neither 'true' nor 'false' - bail out.
+ continue;
+ // Whether RHS equals 'true'. Otherwise it equals 'false'.
+ bool isKnownTrue = CI->isMinusOne();
+ bool isKnownFalse = !isKnownTrue;
+
+ // If "A && B" is known true then both A and B are known true. If "A || B"
+ // is known false then both A and B are known false.
+ Value *A, *B;
+ if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) ||
+ (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) {
+ Worklist.push_back(std::make_pair(A, RHS));
+ Worklist.push_back(std::make_pair(B, RHS));
+ continue;
+ }
+
+ // If we are propagating an equality like "(A == B)" == "true" then also
+ // propagate the equality A == B. When propagating a comparison such as
+ // "(A >= B)" == "true", replace all instances of "A < B" with "false".
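+ // For example (hypothetical IR): once
+ //   %c = icmp sge i32 %a, %b
+ // is known to be true along an edge, any "icmp slt i32 %a, %b" dominated by
+ // that edge can be replaced with false.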
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(LHS)) {
+ Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1);
+
+ // If "A == B" is known true, or "A != B" is known false, then replace
+ // A with B everywhere in the scope.
+ if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) ||
+ (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE))
+ Worklist.push_back(std::make_pair(Op0, Op1));
+
+ // Handle the floating point versions of equality comparisons too.
+ if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) ||
+ (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE)) {
+
+ // Floating point -0.0 and 0.0 compare equal, so we can only
+ // propagate values if we know that we have a constant and that
+ // its value is non-zero.
+
+ // FIXME: We should do this optimization if 'no signed zeros' is
+ // applicable via an instruction-level fast-math-flag or some other
+ // indicator that relaxed FP semantics are being used.
+
+ if (isa<ConstantFP>(Op1) && !cast<ConstantFP>(Op1)->isZero())
+ Worklist.push_back(std::make_pair(Op0, Op1));
+ }
+
+ // If "A >= B" is known true, replace "A < B" with false everywhere.
+ CmpInst::Predicate NotPred = Cmp->getInversePredicate();
+ Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse);
+ // Since we don't have the instruction "A < B" immediately to hand, work
+ // out the value number that it would have and use that to find an
+ // appropriate instruction (if any).
+ uint32_t NextNum = VN.getNextUnusedValueNumber();
+ uint32_t Num = VN.lookupOrAddCmp(Cmp->getOpcode(), NotPred, Op0, Op1);
+ // If the number we were assigned was brand new then there is no point in
+ // looking for an instruction realizing it: there cannot be one!
+ if (Num < NextNum) {
+ Value *NotCmp = findLeader(Root.getEnd(), Num);
+ if (NotCmp && isa<Instruction>(NotCmp)) {
+ unsigned NumReplacements =
+ DominatesByEdge
+ ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root)
+ : replaceDominatedUsesWith(NotCmp, NotVal, *DT,
+ Root.getStart());
+ Changed |= NumReplacements > 0;
+ NumGVNEqProp += NumReplacements;
+ }
+ }
+ // Ensure that any instruction in scope that gets the "A < B" value number
+ // is replaced with false.
+ // The leader table only tracks basic blocks, not edges. Only add to it if
+ // we have the simple case where the edge dominates the end.
+ if (RootDominatesEnd)
+ addToLeaderTable(Num, NotVal, Root.getEnd());
+
+ continue;
+ }
+ }
+
+ return Changed;
+}
+
+/// When calculating availability, handle an instruction
+/// by inserting it into the appropriate sets
+bool GVN::processInstruction(Instruction *I) {
+ // Ignore dbg info intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
+ return false;
+
+ // If the instruction can be easily simplified then do so now in preference
+ // to value numbering it. Value numbering often exposes redundancies, for
+ // example if it determines that %y is equal to %x then the instruction
+ // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ if (Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC})) {
+ bool Changed = false;
+ if (!I->use_empty()) {
+ I->replaceAllUsesWith(V);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(I, TLI)) {
+ markInstructionForDeletion(I);
+ Changed = true;
+ }
+ if (Changed) {
+ if (MD && V->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(V);
+ ++NumGVNSimpl;
+ return true;
+ }
+ }
+
+ if (IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(I))
+ if (IntrinsicI->getIntrinsicID() == Intrinsic::assume)
+ return processAssumeIntrinsic(IntrinsicI);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (processLoad(LI))
+ return true;
+
+ unsigned Num = VN.lookupOrAdd(LI);
+ addToLeaderTable(Num, LI, LI->getParent());
+ return false;
+ }
+
+ // For conditional branches, we can perform simple conditional propagation on
+ // the condition value itself.
+ if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+ if (!BI->isConditional())
+ return false;
+
+ if (isa<Constant>(BI->getCondition()))
+ return processFoldableCondBr(BI);
+
+ Value *BranchCond = BI->getCondition();
+ BasicBlock *TrueSucc = BI->getSuccessor(0);
+ BasicBlock *FalseSucc = BI->getSuccessor(1);
+ // Avoid multiple edges early.
+ if (TrueSucc == FalseSucc)
+ return false;
+
+ BasicBlock *Parent = BI->getParent();
+ bool Changed = false;
+
+ Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext());
+ BasicBlockEdge TrueE(Parent, TrueSucc);
+ Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true);
+
+ Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext());
+ BasicBlockEdge FalseE(Parent, FalseSucc);
+ Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true);
+
+ return Changed;
+ }
+
+ // For switches, propagate the case values into the case destinations.
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ Value *SwitchCond = SI->getCondition();
+ BasicBlock *Parent = SI->getParent();
+ bool Changed = false;
+
+ // Remember how many outgoing edges there are to every successor.
+ SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+ for (unsigned i = 0, n = SI->getNumSuccessors(); i != n; ++i)
+ ++SwitchEdges[SI->getSuccessor(i)];
+
+ for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+ i != e; ++i) {
+ BasicBlock *Dst = i->getCaseSuccessor();
+ // If there is only a single edge, propagate the case value into it.
+ if (SwitchEdges.lookup(Dst) == 1) {
+ BasicBlockEdge E(Parent, Dst);
+ Changed |= propagateEquality(SwitchCond, i->getCaseValue(), E, true);
+ }
+ }
+ return Changed;
+ }
+
+ // Instructions with void type don't return a value, so there's
+ // no point in trying to find redundancies in them.
+ if (I->getType()->isVoidTy())
+ return false;
+
+ uint32_t NextNum = VN.getNextUnusedValueNumber();
+ unsigned Num = VN.lookupOrAdd(I);
+
+ // Allocations are always uniquely numbered, so we can save time and memory
+ // by fast failing them.
+ if (isa<AllocaInst>(I) || isa<TerminatorInst>(I) || isa<PHINode>(I)) {
+ addToLeaderTable(Num, I, I->getParent());
+ return false;
+ }
+
+ // If the number we were assigned was a brand new VN, then we don't
+ // need to do a lookup to see if the number already exists
+ // somewhere in the domtree: it can't!
+ if (Num >= NextNum) {
+ addToLeaderTable(Num, I, I->getParent());
+ return false;
+ }
+
+ // Perform fast-path value-number based elimination of values inherited from
+ // dominators.
+ Value *Repl = findLeader(I->getParent(), Num);
+ if (!Repl) {
+ // Failure, just remember this instance for future use.
+ addToLeaderTable(Num, I, I->getParent());
+ return false;
+ } else if (Repl == I) {
+ // If I was the result of a shortcut PRE, it might already be in the table
+ // and the best replacement for itself. Nothing to do.
+ return false;
+ }
+
+ // Remove it!
+ patchAndReplaceAllUsesWith(I, Repl);
+ if (MD && Repl->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(Repl);
+ markInstructionForDeletion(I);
+ return true;
+}
+
+/// runImpl - This is the main transformation entry point for a function.
+bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
+ const TargetLibraryInfo &RunTLI, AAResults &RunAA,
+ MemoryDependenceResults *RunMD, LoopInfo *LI,
+ OptimizationRemarkEmitter *RunORE) {
+ AC = &RunAC;
+ DT = &RunDT;
+ VN.setDomTree(DT);
+ TLI = &RunTLI;
+ VN.setAliasAnalysis(&RunAA);
+ MD = RunMD;
+ OrderedInstructions OrderedInstrs(DT);
+ OI = &OrderedInstrs;
+ VN.setMemDep(MD);
+ ORE = RunORE;
+
+ bool Changed = false;
+ bool ShouldContinue = true;
+
+ // Merge unconditional branches, allowing PRE to catch more
+ // optimization opportunities.
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
+ BasicBlock *BB = &*FI++;
+
+ bool removedBlock = MergeBlockIntoPredecessor(BB, DT, LI, MD);
+ if (removedBlock)
+ ++NumGVNBlocks;
+
+ Changed |= removedBlock;
+ }
+
+ unsigned Iteration = 0;
+ while (ShouldContinue) {
+ DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
+ ShouldContinue = iterateOnFunction(F);
+ Changed |= ShouldContinue;
+ ++Iteration;
+ }
+
+ if (EnablePRE) {
+ // Fabricate val-num for dead-code in order to suppress assertion in
+ // performPRE().
+ assignValNumForDeadCode();
+ assignBlockRPONumber(F);
+ bool PREChanged = true;
+ while (PREChanged) {
+ PREChanged = performPRE(F);
+ Changed |= PREChanged;
+ }
+ }
+
+ // FIXME: Should perform GVN again after PRE does something. PRE can move
+ // computations into blocks where they become fully redundant. Note that
+ // we can't do this until PRE's critical edge splitting updates memdep.
+ // Actually, when this happens, we should just fully integrate PRE into GVN.
+
+ cleanupGlobalSets();
+ // Do not clean up DeadBlocks in cleanupGlobalSets() as it is called for
+ // each iteration.
+ DeadBlocks.clear();
+
+ return Changed;
+}
+
+bool GVN::processBlock(BasicBlock *BB) {
+ // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function
+ // (and incrementing BI before processing an instruction).
+ assert(InstrsToErase.empty() &&
+ "We expect InstrsToErase to be empty across iterations");
+ if (DeadBlocks.count(BB))
+ return false;
+
+ // Clear the map before every BB because it is only valid for a single BB.
+ ReplaceWithConstMap.clear();
+ bool ChangedFunction = false;
+
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+ BI != BE;) {
+ if (!ReplaceWithConstMap.empty())
+ ChangedFunction |= replaceOperandsWithConsts(&*BI);
+ ChangedFunction |= processInstruction(&*BI);
+
+ if (InstrsToErase.empty()) {
+ ++BI;
+ continue;
+ }
+
+ // If we need some instructions deleted, do it now.
+ NumGVNInstr += InstrsToErase.size();
+
+ // Avoid iterator invalidation.
+ bool AtStart = BI == BB->begin();
+ if (!AtStart)
+ --BI;
+
+ bool InvalidateImplicitCF = false;
+ const Instruction *MaybeFirstICF = FirstImplicitControlFlowInsts.lookup(BB);
+ for (auto *I : InstrsToErase) {
+ assert(I->getParent() == BB && "Removing instruction from wrong block?");
+ DEBUG(dbgs() << "GVN removed: " << *I << '\n');
+ if (MD) MD->removeInstruction(I);
+ DEBUG(verifyRemoved(I));
+ if (MaybeFirstICF == I) {
+ // We have erased the first ICF instruction in the block; the map needs
+ // to be updated.
+ InvalidateImplicitCF = true;
+ // Do not keep dangling pointer on the erased instruction.
+ MaybeFirstICF = nullptr;
+ }
+ I->eraseFromParent();
+ }
+
+ OI->invalidateBlock(BB);
+ InstrsToErase.clear();
+ if (InvalidateImplicitCF)
+ fillImplicitControlFlowInfo(BB);
+
+ if (AtStart)
+ BI = BB->begin();
+ else
+ ++BI;
+ }
+
+ return ChangedFunction;
+}
+
+// Instantiate an expression in a predecessor that lacked it.
+bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
+ BasicBlock *Curr, unsigned int ValNo) {
+ // Because we are going top-down through the block, all value numbers
+ // will be available in the predecessor by the time we need them. Any
+ // that weren't originally present will have been instantiated earlier
+ // in this loop.
+ bool success = true;
+ for (unsigned i = 0, e = Instr->getNumOperands(); i != e; ++i) {
+ Value *Op = Instr->getOperand(i);
+ if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
+ continue;
+ // This could be a newly inserted instruction, in which case we won't
+ // find a value number and should give up before we hurt ourselves.
+ // FIXME: Rewrite the infrastructure to make it easier to value number
+ // and process newly inserted instructions.
+ if (!VN.exists(Op)) {
+ success = false;
+ break;
+ }
+ uint32_t TValNo =
+ VN.phiTranslate(Pred, Curr, VN.lookup(Op), *this);
+ if (Value *V = findLeader(Pred, TValNo)) {
+ Instr->setOperand(i, V);
+ } else {
+ success = false;
+ break;
+ }
+ }
+
+ // Fail out if we encounter an operand that is not available in
+ // the PRE predecessor. This is typically because of loads which
+ // are not value numbered precisely.
+ if (!success)
+ return false;
+
+ Instr->insertBefore(Pred->getTerminator());
+ Instr->setName(Instr->getName() + ".pre");
+ Instr->setDebugLoc(Instr->getDebugLoc());
+
+ unsigned Num = VN.lookupOrAdd(Instr);
+ VN.add(Instr, Num);
+
+ // Update the availability map to include the new instruction.
+ addToLeaderTable(Num, Instr, Pred);
+ return true;
+}
+
+bool GVN::performScalarPRE(Instruction *CurInst) {
+ if (isa<AllocaInst>(CurInst) || isa<TerminatorInst>(CurInst) ||
+ isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
+ CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
+ isa<DbgInfoIntrinsic>(CurInst))
+ return false;
+
+ // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
+ // sinking the compare again, and it would force the code generator to
+ // move the i1 from processor flags or predicate registers into a general
+ // purpose register.
+ if (isa<CmpInst>(CurInst))
+ return false;
+
+ // We don't currently value number ANY inline asm calls.
+ if (CallInst *CallI = dyn_cast<CallInst>(CurInst))
+ if (CallI->isInlineAsm())
+ return false;
+
+ uint32_t ValNo = VN.lookup(CurInst);
+
+ // Look for the predecessors for PRE opportunities. We're
+ // only trying to solve the basic diamond case, where
+ // a value is computed in the successor and one predecessor,
+ // but not the other. We also explicitly disallow cases
+ // where the successor is its own predecessor, because they're
+ // more complicated to get right.
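+ // Diamond sketch (illustrative only, all names are made up):
+ //             entry
+ //            /     \
+ //       pred1       pred2
+ //   %t = add %a,%b  (no add)
+ //            \     /
+ //             merge
+ //        %u = add %a,%b   ; PRE clones the add into pred2 and replaces
+ //                         ; %u with a phi over the two available values.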
+ unsigned NumWith = 0;
+ unsigned NumWithout = 0;
+ BasicBlock *PREPred = nullptr;
+ BasicBlock *CurrentBlock = CurInst->getParent();
+
+ SmallVector<std::pair<Value *, BasicBlock *>, 8> predMap;
+ for (BasicBlock *P : predecessors(CurrentBlock)) {
+ // We're not interested in PRE when a predecessor block is not reachable
+ // from the entry.
+ if (!DT->isReachableFromEntry(P)) {
+ NumWithout = 2;
+ break;
+ }
+ // It is not safe to do PRE when P->CurrentBlock is a loop backedge and
+ // CurInst has an operand defined in CurrentBlock (so it may be defined
+ // by a phi in the loop header).
+ if (BlockRPONumber[P] >= BlockRPONumber[CurrentBlock] &&
+ llvm::any_of(CurInst->operands(), [&](const Use &U) {
+ if (auto *Inst = dyn_cast<Instruction>(U.get()))
+ return Inst->getParent() == CurrentBlock;
+ return false;
+ })) {
+ NumWithout = 2;
+ break;
+ }
+
+ uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this);
+ Value *predV = findLeader(P, TValNo);
+ if (!predV) {
+ predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
+ PREPred = P;
+ ++NumWithout;
+ } else if (predV == CurInst) {
+ /* CurInst dominates this predecessor. */
+ NumWithout = 2;
+ break;
+ } else {
+ predMap.push_back(std::make_pair(predV, P));
+ ++NumWith;
+ }
+ }
+
+ // Don't do PRE when it might increase code size, i.e. when
+ // we would need to insert instructions in more than one pred.
+ if (NumWithout > 1 || NumWith == 0)
+ return false;
+
+ // We may have a case where all predecessors have the instruction,
+ // and we just need to insert a phi node. Otherwise, perform
+ // insertion.
+ Instruction *PREInstr = nullptr;
+
+ if (NumWithout != 0) {
+ if (!isSafeToSpeculativelyExecute(CurInst)) {
+ // It is only valid to insert a new instruction if the current instruction
+ // is always executed. An instruction with implicit control flow could
+ // prevent us from doing it. If we cannot speculate the execution, then
+ // PRE should be prohibited.
+ auto It = FirstImplicitControlFlowInsts.find(CurrentBlock);
+ if (It != FirstImplicitControlFlowInsts.end()) {
+ assert(It->second->getParent() == CurrentBlock &&
+ "Implicit control flow map broken?");
+ if (OI->dominates(It->second, CurInst))
+ return false;
+ }
+ }
+
+ // Don't do PRE across indirect branch.
+ if (isa<IndirectBrInst>(PREPred->getTerminator()))
+ return false;
+
+ // We can't do PRE safely on a critical edge, so instead we schedule
+ // the edge to be split and perform the PRE the next time we iterate
+ // on the function.
+ unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock);
+ if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) {
+ toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum));
+ return false;
+ }
+ // We need to insert somewhere, so let's give it a shot
+ PREInstr = CurInst->clone();
+ if (!performScalarPREInsertion(PREInstr, PREPred, CurrentBlock, ValNo)) {
+ // If we failed insertion, make sure we remove the instruction.
+ DEBUG(verifyRemoved(PREInstr));
+ PREInstr->deleteValue();
+ return false;
+ }
+ }
+
+ // Either we should have filled in the PRE instruction, or we should
+ // not have needed insertions.
+ assert(PREInstr != nullptr || NumWithout == 0);
+
+ ++NumGVNPRE;
+
+ // Create a PHI to make the value available in this block.
+ PHINode *Phi =
+ PHINode::Create(CurInst->getType(), predMap.size(),
+ CurInst->getName() + ".pre-phi", &CurrentBlock->front());
+ for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
+ if (Value *V = predMap[i].first) {
+ // If we use an existing value in this phi, we have to patch the original
+ // value because the phi will be used to replace a later value.
+ patchReplacementInstruction(CurInst, V);
+ Phi->addIncoming(V, predMap[i].second);
+ } else
+ Phi->addIncoming(PREInstr, PREPred);
+ }
+
+ VN.add(Phi, ValNo);
+ // After creating a new PHI for ValNo, the phi translate result for ValNo
+ // will change, so erase the stale entries in the phi translate cache.
+ VN.eraseTranslateCacheEntry(ValNo, *CurrentBlock);
+ addToLeaderTable(ValNo, Phi, CurrentBlock);
+ Phi->setDebugLoc(CurInst->getDebugLoc());
+ CurInst->replaceAllUsesWith(Phi);
+ if (MD && Phi->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(Phi);
+ VN.erase(CurInst);
+ removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
+
+ DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
+ if (MD)
+ MD->removeInstruction(CurInst);
+ DEBUG(verifyRemoved(CurInst));
+ bool InvalidateImplicitCF =
+ FirstImplicitControlFlowInsts.lookup(CurInst->getParent()) == CurInst;
+ // FIXME: Intended to be markInstructionForDeletion(CurInst), but it causes
+ // some assertion failures.
+ OI->invalidateBlock(CurrentBlock);
+ CurInst->eraseFromParent();
+ if (InvalidateImplicitCF)
+ fillImplicitControlFlowInfo(CurrentBlock);
+ ++NumGVNInstr;
+
+ return true;
+}
+
+/// Perform a purely local form of PRE that looks for diamond
+/// control flow patterns and attempts to perform simple PRE at the join point.
+bool GVN::performPRE(Function &F) {
+ bool Changed = false;
+ for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) {
+ // Nothing to PRE in the entry block.
+ if (CurrentBlock == &F.getEntryBlock())
+ continue;
+
+ // Don't perform PRE on an EH pad.
+ if (CurrentBlock->isEHPad())
+ continue;
+
+ for (BasicBlock::iterator BI = CurrentBlock->begin(),
+ BE = CurrentBlock->end();
+ BI != BE;) {
+ Instruction *CurInst = &*BI++;
+ Changed |= performScalarPRE(CurInst);
+ }
+ }
+
+ if (splitCriticalEdges())
+ Changed = true;
+
+ return Changed;
+}
+
+/// Split the critical edge connecting the given two blocks, and return
+/// the block inserted to the critical edge.
+BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
+ BasicBlock *BB =
+ SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT));
+ if (MD)
+ MD->invalidateCachedPredecessors();
+ return BB;
+}
+
+/// Split critical edges found during the previous
+/// iteration that may enable further optimization.
+bool GVN::splitCriticalEdges() {
+ if (toSplit.empty())
+ return false;
+ do {
+ std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val();
+ SplitCriticalEdge(Edge.first, Edge.second,
+ CriticalEdgeSplittingOptions(DT));
+ } while (!toSplit.empty());
+ if (MD) MD->invalidateCachedPredecessors();
+ return true;
+}
+
+/// Executes one iteration of GVN
+bool GVN::iterateOnFunction(Function &F) {
+ cleanupGlobalSets();
+
+ // Top-down walk of the dominator tree
+ bool Changed = false;
+ // Needed for value numbering with phi construction to work.
+ // RPOT walks the graph in its constructor and will not be invalidated during
+ // processBlock.
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+
+ for (BasicBlock *BB : RPOT)
+ fillImplicitControlFlowInfo(BB);
+ for (BasicBlock *BB : RPOT)
+ Changed |= processBlock(BB);
+
+ return Changed;
+}
+
+void GVN::cleanupGlobalSets() {
+ VN.clear();
+ LeaderTable.clear();
+ BlockRPONumber.clear();
+ TableAllocator.Reset();
+ FirstImplicitControlFlowInsts.clear();
+}
+
+void
+GVN::fillImplicitControlFlowInfo(BasicBlock *BB) {
+ // Make sure that all marked instructions are actually deleted by this point,
+ // so that we don't need to care about omitting them.
+ assert(InstrsToErase.empty() && "Filling before removed all marked insns?");
+ auto MayNotTransferExecutionToSuccessor = [&](const Instruction *I) {
+ // If a block's instruction doesn't always pass control to its successor
+ // instruction, mark the block as having implicit control flow. We use
+ // this to avoid wrong assumptions of the sort "if A is executed and B
+ // post-dominates A, then B is also executed". This is not true if there
+ // is an implicit control flow instruction (e.g. a guard) between them.
+ //
+ // TODO: Currently, isGuaranteedToTransferExecutionToSuccessor returns false
+ // for volatile stores and loads because they can trap. The discussion on
+ // whether or not it is correct is still ongoing. We might want to get rid
+ // of this logic in the future. Anyways, trapping instructions shouldn't
+ // introduce implicit control flow, so we explicitly allow them here. This
+ // must be removed once isGuaranteedToTransferExecutionToSuccessor is fixed.
+ if (isGuaranteedToTransferExecutionToSuccessor(I))
+ return false;
+ if (isa<LoadInst>(I)) {
+ assert(cast<LoadInst>(I)->isVolatile() &&
+ "Non-volatile load should transfer execution to successor!");
+ return false;
+ }
+ if (isa<StoreInst>(I)) {
+ assert(cast<StoreInst>(I)->isVolatile() &&
+ "Non-volatile store should transfer execution to successor!");
+ return false;
+ }
+ return true;
+ };
+ FirstImplicitControlFlowInsts.erase(BB);
+
+ for (auto &I : *BB)
+ if (MayNotTransferExecutionToSuccessor(&I)) {
+ FirstImplicitControlFlowInsts[BB] = &I;
+ break;
+ }
+}
+
+/// Verify that the specified instruction does not occur in our
+/// internal data structures.
+void GVN::verifyRemoved(const Instruction *Inst) const {
+ VN.verifyRemoved(Inst);
+
+ // Walk through the value number scope to make sure the instruction isn't
+ // ferreted away in it.
+ for (DenseMap<uint32_t, LeaderTableEntry>::const_iterator
+ I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) {
+ const LeaderTableEntry *Node = &I->second;
+ assert(Node->Val != Inst && "Inst still in value numbering scope!");
+
+ while (Node->Next) {
+ Node = Node->Next;
+ assert(Node->Val != Inst && "Inst still in value numbering scope!");
+ }
+ }
+}
+
+/// BB is declared dead, which implies that other blocks become dead as well.
+/// This function adds all these blocks to "DeadBlocks". For the dead blocks'
+/// live successors, update their phi nodes by replacing the operands
+/// corresponding to dead blocks with UndefVal.
+void GVN::addDeadBlock(BasicBlock *BB) {
+ SmallVector<BasicBlock *, 4> NewDead;
+ SmallSetVector<BasicBlock *, 4> DF;
+
+ NewDead.push_back(BB);
+ while (!NewDead.empty()) {
+ BasicBlock *D = NewDead.pop_back_val();
+ if (DeadBlocks.count(D))
+ continue;
+
+ // All blocks dominated by D are dead.
+ SmallVector<BasicBlock *, 8> Dom;
+ DT->getDescendants(D, Dom);
+ DeadBlocks.insert(Dom.begin(), Dom.end());
+
+ // Figure out the dominance-frontier(D).
+ for (BasicBlock *B : Dom) {
+ for (BasicBlock *S : successors(B)) {
+ if (DeadBlocks.count(S))
+ continue;
+
+ bool AllPredDead = true;
+ for (BasicBlock *P : predecessors(S))
+ if (!DeadBlocks.count(P)) {
+ AllPredDead = false;
+ break;
+ }
+
+ if (!AllPredDead) {
+ // S could be proved dead later on. That is why we don't update phi
+ // operands at this moment.
+ DF.insert(S);
+ } else {
+ // Although S is not dominated by D, it is dead by now. This can happen
+ // if S already had a dead predecessor before D was declared dead.
+ NewDead.push_back(S);
+ }
+ }
+ }
+ }
+
+ // For the dead blocks' live successors, update their phi nodes by replacing
+ // the operands corresponding to dead blocks with UndefVal.
+ for(SmallSetVector<BasicBlock *, 4>::iterator I = DF.begin(), E = DF.end();
+ I != E; I++) {
+ BasicBlock *B = *I;
+ if (DeadBlocks.count(B))
+ continue;
+
+ SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B));
+ for (BasicBlock *P : Preds) {
+ if (!DeadBlocks.count(P))
+ continue;
+
+ if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) {
+ if (BasicBlock *S = splitCriticalEdges(P, B))
+ DeadBlocks.insert(P = S);
+ }
+
+ for (BasicBlock::iterator II = B->begin(); isa<PHINode>(II); ++II) {
+ PHINode &Phi = cast<PHINode>(*II);
+ Phi.setIncomingValue(Phi.getBasicBlockIndex(P),
+ UndefValue::get(Phi.getType()));
+ }
+ }
+ }
+}
+
+// If the given branch is recognized as a foldable branch (i.e. a conditional
+// branch with a constant condition), perform the following analyses and
+// transformation:
+// 1) If the dead outgoing edge is a critical edge, split it. Let
+//    R be the target of the dead outgoing edge.
+// 2) Identify the set of dead blocks implied by the branch's dead outgoing
+//    edge. The result of this step will be {X | X is dominated by R}.
+// 3) Identify those blocks which have at least one dead predecessor. The
+//    result of this step will be dominance-frontier(R).
+// 4) Update the PHIs in DF(R) by replacing the operands corresponding to
+//    dead blocks with "UndefVal" in the hope that these PHIs will be
+//    optimized away.
+//
+// Return true iff *NEW* dead code is found.
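+//
+// For example (hypothetical IR): given
+//   br i1 false, label %bb1, label %bb2
+// the %bb1 successor can never be taken, so %bb1 (after splitting the edge
+// if it is critical) and every block it dominates is added to DeadBlocks,
+// and phis in DF(%bb1) get undef for the incoming dead edges.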
+bool GVN::processFoldableCondBr(BranchInst *BI) {
+ if (!BI || BI->isUnconditional())
+ return false;
+
+ // If a branch has two identical successors, we cannot declare either dead.
+ if (BI->getSuccessor(0) == BI->getSuccessor(1))
+ return false;
+
+ ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+ if (!Cond)
+ return false;
+
+ BasicBlock *DeadRoot =
+ Cond->getZExtValue() ? BI->getSuccessor(1) : BI->getSuccessor(0);
+ if (DeadBlocks.count(DeadRoot))
+ return false;
+
+ if (!DeadRoot->getSinglePredecessor())
+ DeadRoot = splitCriticalEdges(BI->getParent(), DeadRoot);
+
+ addDeadBlock(DeadRoot);
+ return true;
+}
+
+// performPRE() will trigger an assert if it comes across an instruction
+// without an associated value number. As there are normally far more live
+// instructions than dead ones, it makes more sense just to "fabricate" a
+// value number for the dead code than to check whether each instruction
+// involved is dead or not.
+void GVN::assignValNumForDeadCode() {
+ for (BasicBlock *BB : DeadBlocks) {
+ for (Instruction &Inst : *BB) {
+ unsigned ValNum = VN.lookupOrAdd(&Inst);
+ addToLeaderTable(ValNum, &Inst, BB);
+ }
+ }
+}
+
+class llvm::gvn::GVNLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ explicit GVNLegacyPass(bool NoLoads = false)
+ : FunctionPass(ID), NoLoads(NoLoads) {
+ initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+
+ return Impl.runImpl(
+ F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ NoLoads ? nullptr
+ : &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(),
+ LIWP ? &LIWP->getLoopInfo() : nullptr,
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ if (!NoLoads)
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+
+private:
+ bool NoLoads;
+ GVN Impl;
+};
+
+char GVNLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
+
+// The public interface to this file...
+FunctionPass *llvm::createGVNPass(bool NoLoads) {
+ return new GVNLegacyPass(NoLoads);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
new file mode 100644
index 000000000000..026fab5dbd3b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
@@ -0,0 +1,1207 @@
+//===- GVNHoist.cpp - Hoist scalar and load expressions -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists expressions from branches to a common dominator. It uses
+// GVN (global value numbering) to discover expressions computing the same
+// values. The primary goals of code-hoisting are:
+// 1. To reduce the code size.
+// 2. In some cases, to reduce the critical path (by exposing more ILP).
+//
+// The algorithm factors out the reachability of values so that repeated
+// reachability queries are fast. This is based on finding the
+// ANTIC points in the CFG which do not change during hoisting. The ANTIC points
+// are basically the dominance-frontiers in the inverse graph. So we introduce a
+// data structure (CHI nodes) to keep track of values flowing out of a basic
+// block. We only do this for values with multiple occurrences in the function
+// as they are the potential hoistable candidates. This approach allows us to
+// hoist instructions to a basic block with more than two successors, as well as
+// deal with infinite loops in a trivial way.
+//
+// Limitations: This pass does not hoist fully redundant expressions because
+// they are already handled by GVN-PRE. It is advisable to run gvn-hoist before
+// and after gvn-pre because gvn-pre creates opportunities for more instructions
+// to be hoisted.
+//
+// Hoisting may hurt performance in some cases. To mitigate that, hoisting
+// is disabled in the following cases:
+// 1. Scalars across calls.
+// 2. GEPs when the corresponding load/store cannot be hoisted.
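+//
+// Illustrative sketch (hypothetical IR): with
+//   if.then:                        if.else:
+//     %a = add i32 %x, 1              %b = add i32 %x, 1
+// both adds receive the same value number, so the add is hoisted into the
+// common dominator of if.then and if.else, and %a and %b are replaced by
+// the hoisted instruction.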
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gvn-hoist"
+
+STATISTIC(NumHoisted, "Number of instructions hoisted");
+STATISTIC(NumRemoved, "Number of instructions removed");
+STATISTIC(NumLoadsHoisted, "Number of loads hoisted");
+STATISTIC(NumLoadsRemoved, "Number of loads removed");
+STATISTIC(NumStoresHoisted, "Number of stores hoisted");
+STATISTIC(NumStoresRemoved, "Number of stores removed");
+STATISTIC(NumCallsHoisted, "Number of calls hoisted");
+STATISTIC(NumCallsRemoved, "Number of calls removed");
+
+static cl::opt<int>
+ MaxHoistedThreshold("gvn-max-hoisted", cl::Hidden, cl::init(-1),
+ cl::desc("Max number of instructions to hoist "
+ "(default unlimited = -1)"));
+
+static cl::opt<int> MaxNumberOfBBSInPath(
+ "gvn-hoist-max-bbs", cl::Hidden, cl::init(4),
+ cl::desc("Max number of basic blocks on the path between "
+ "hoisting locations (default = 4, unlimited = -1)"));
+
+static cl::opt<int> MaxDepthInBB(
+ "gvn-hoist-max-depth", cl::Hidden, cl::init(100),
+ cl::desc("Hoist instructions from the beginning of the BB up to the "
+ "maximum specified depth (default = 100, unlimited = -1)"));
+
+static cl::opt<int>
+ MaxChainLength("gvn-hoist-max-chain-length", cl::Hidden, cl::init(10),
+ cl::desc("Maximum length of dependent chains to hoist "
+ "(default = 10, unlimited = -1)"));
+
+namespace llvm {
+
+using BBSideEffectsSet = DenseMap<const BasicBlock *, bool>;
+using SmallVecInsn = SmallVector<Instruction *, 4>;
+using SmallVecImplInsn = SmallVectorImpl<Instruction *>;
+
+// Each element of a hoisting list contains the basic block to hoist into and
+// a list of instructions to be hoisted.
+using HoistingPointInfo = std::pair<BasicBlock *, SmallVecInsn>;
+
+using HoistingPointList = SmallVector<HoistingPointInfo, 4>;
+
+// A map from a pair of VNs to all the instructions with those VNs.
+using VNType = std::pair<unsigned, unsigned>;
+
+using VNtoInsns = DenseMap<VNType, SmallVector<Instruction *, 4>>;
+
+// CHI keeps information about values flowing out of a basic block. It is
+// similar to PHI but in the inverse graph, and used for outgoing values on each
+// edge. For conciseness, it is computed only for instructions with multiple
+// occurrences in the CFG because they are the only hoistable candidates.
+// A (CHI[{V, B, I1}, {V, C, I2}]
+// / \
+// / \
+// B(I1) C (I2)
+// The value number for both I1 and I2 is V; the CHI node saves the
+// instruction as well as the edge along which the value flows.
+struct CHIArg {
+ VNType VN;
+
+  // Edge destination (shows the direction of flow); may not be the block
+  // where I is located.
+ BasicBlock *Dest;
+
+ // The instruction (VN) which uses the values flowing out of CHI.
+ Instruction *I;
+
+ bool operator==(const CHIArg &A) { return VN == A.VN; }
+ bool operator!=(const CHIArg &A) { return !(*this == A); }
+};
+
+using CHIIt = SmallVectorImpl<CHIArg>::iterator;
+using CHIArgs = iterator_range<CHIIt>;
+using OutValuesType = DenseMap<BasicBlock *, SmallVector<CHIArg, 2>>;
+using InValuesType =
+ DenseMap<BasicBlock *, SmallVector<std::pair<VNType, Instruction *>, 2>>;
+
+// An invalid value number, used when inserting a single value number into
+// VNtoInsns.
+enum : unsigned { InvalidVN = ~2U };
+
+// Records all scalar instructions that are candidates for code hoisting.
+class InsnInfo {
+ VNtoInsns VNtoScalars;
+
+public:
+ // Inserts I and its value number in VNtoScalars.
+ void insert(Instruction *I, GVN::ValueTable &VN) {
+ // Scalar instruction.
+ unsigned V = VN.lookupOrAdd(I);
+ VNtoScalars[{V, InvalidVN}].push_back(I);
+ }
+
+ const VNtoInsns &getVNTable() const { return VNtoScalars; }
+};
+
+// Records all load instructions that are candidates for code hoisting.
+class LoadInfo {
+ VNtoInsns VNtoLoads;
+
+public:
+ // Insert Load and the value number of its memory address in VNtoLoads.
+ void insert(LoadInst *Load, GVN::ValueTable &VN) {
+ if (Load->isSimple()) {
+ unsigned V = VN.lookupOrAdd(Load->getPointerOperand());
+ VNtoLoads[{V, InvalidVN}].push_back(Load);
+ }
+ }
+
+ const VNtoInsns &getVNTable() const { return VNtoLoads; }
+};
+
+// Records all store instructions that are candidates for code hoisting.
+class StoreInfo {
+ VNtoInsns VNtoStores;
+
+public:
+ // Insert the Store and a hash number of the store address and the stored
+ // value in VNtoStores.
+ void insert(StoreInst *Store, GVN::ValueTable &VN) {
+ if (!Store->isSimple())
+ return;
+ // Hash the store address and the stored value.
+ Value *Ptr = Store->getPointerOperand();
+ Value *Val = Store->getValueOperand();
+ VNtoStores[{VN.lookupOrAdd(Ptr), VN.lookupOrAdd(Val)}].push_back(Store);
+ }
+
+ const VNtoInsns &getVNTable() const { return VNtoStores; }
+};
+
+// Records all call instructions that are candidates for code hoisting.
+class CallInfo {
+ VNtoInsns VNtoCallsScalars;
+ VNtoInsns VNtoCallsLoads;
+ VNtoInsns VNtoCallsStores;
+
+public:
+ // Insert Call and its value numbering in one of the VNtoCalls* containers.
+ void insert(CallInst *Call, GVN::ValueTable &VN) {
+ // A call that doesNotAccessMemory is handled as a Scalar,
+ // onlyReadsMemory will be handled as a Load instruction,
+ // all other calls will be handled as stores.
+ unsigned V = VN.lookupOrAdd(Call);
+ auto Entry = std::make_pair(V, InvalidVN);
+
+ if (Call->doesNotAccessMemory())
+ VNtoCallsScalars[Entry].push_back(Call);
+ else if (Call->onlyReadsMemory())
+ VNtoCallsLoads[Entry].push_back(Call);
+ else
+ VNtoCallsStores[Entry].push_back(Call);
+ }
+
+ const VNtoInsns &getScalarVNTable() const { return VNtoCallsScalars; }
+ const VNtoInsns &getLoadVNTable() const { return VNtoCallsLoads; }
+ const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; }
+};
+
+static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) {
+ static const unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(ReplInst, I, KnownIDs);
+}
+
+// This pass hoists common computations across branches sharing common
+// dominator. The primary goal is to reduce the code size, and in some
+// cases reduce critical path (by exposing more ILP).
+class GVNHoist {
+public:
+ GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, MemorySSA *MSSA)
+ : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA),
+ MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {}
+
+ bool run(Function &F) {
+ NumFuncArgs = F.arg_size();
+ VN.setDomTree(DT);
+ VN.setAliasAnalysis(AA);
+ VN.setMemDep(MD);
+ bool Res = false;
+ // Perform DFS Numbering of instructions.
+ unsigned BBI = 0;
+ for (const BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+ DFSNumber[BB] = ++BBI;
+ unsigned I = 0;
+ for (auto &Inst : *BB)
+ DFSNumber[&Inst] = ++I;
+ }
+
+ int ChainLength = 0;
+
+ // FIXME: use lazy evaluation of VN to avoid the fix-point computation.
+ while (true) {
+ if (MaxChainLength != -1 && ++ChainLength >= MaxChainLength)
+ return Res;
+
+ auto HoistStat = hoistExpressions(F);
+ if (HoistStat.first + HoistStat.second == 0)
+ return Res;
+
+ if (HoistStat.second > 0)
+ // To address a limitation of the current GVN, we need to rerun the
+ // hoisting after we hoisted loads or stores in order to be able to
+ // hoist all scalars dependent on the hoisted ld/st.
+ VN.clear();
+
+ Res = true;
+ }
+
+ return Res;
+ }
+
+ // Copied from NewGVN.cpp
+ // This function provides global ranking of operations so that we can place
+ // them in a canonical order. Note that rank alone is not necessarily enough
+ // for a complete ordering, as constants all have the same rank. However,
+ // generally, we will simplify an operation with all constants so that it
+ // doesn't matter what order they appear in.
+ unsigned int rank(const Value *V) const {
+ // Prefer constants to undef to anything else
+ // Undef is a constant, have to check it first.
+ // Prefer smaller constants to constantexprs
+ if (isa<ConstantExpr>(V))
+ return 2;
+ if (isa<UndefValue>(V))
+ return 1;
+ if (isa<Constant>(V))
+ return 0;
+ else if (auto *A = dyn_cast<Argument>(V))
+ return 3 + A->getArgNo();
+
+ // Need to shift the instruction DFS by number of arguments + 3 to account
+ // for the constant and argument ranking above.
+ auto Result = DFSNumber.lookup(V);
+ if (Result > 0)
+ return 4 + NumFuncArgs + Result;
+ // Unreachable or something else, just return a really large number.
+ return ~0;
+ }
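+
+  // An illustrative sketch (hypothetical function, not from this file): for
+  // 'int f(int a, int b)', rank(42) == 0, rank(undef) == 1, a constantexpr
+  // ranks 2, rank(a) == 3, rank(b) == 4, and every reachable instruction
+  // ranks above 4 + NumFuncArgs according to its DFS number within its block.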
+
+private:
+ GVN::ValueTable VN;
+ DominatorTree *DT;
+ PostDominatorTree *PDT;
+ AliasAnalysis *AA;
+ MemoryDependenceResults *MD;
+ MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
+ DenseMap<const Value *, unsigned> DFSNumber;
+ BBSideEffectsSet BBSideEffects;
+ DenseSet<const BasicBlock *> HoistBarrier;
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ unsigned NumFuncArgs;
+ const bool HoistingGeps = false;
+
+ enum InsKind { Unknown, Scalar, Load, Store };
+
+  // Return true when there is exception handling in BB.
+ bool hasEH(const BasicBlock *BB) {
+ auto It = BBSideEffects.find(BB);
+ if (It != BBSideEffects.end())
+ return It->second;
+
+ if (BB->isEHPad() || BB->hasAddressTaken()) {
+ BBSideEffects[BB] = true;
+ return true;
+ }
+
+ if (BB->getTerminator()->mayThrow()) {
+ BBSideEffects[BB] = true;
+ return true;
+ }
+
+ BBSideEffects[BB] = false;
+ return false;
+ }
+
+ // Return true when a successor of BB dominates A.
+ bool successorDominate(const BasicBlock *BB, const BasicBlock *A) {
+ for (const BasicBlock *Succ : BB->getTerminator()->successors())
+ if (DT->dominates(Succ, A))
+ return true;
+
+ return false;
+ }
+
+ // Return true when I1 appears before I2 in the instructions of BB.
+ bool firstInBB(const Instruction *I1, const Instruction *I2) {
+ assert(I1->getParent() == I2->getParent());
+ unsigned I1DFS = DFSNumber.lookup(I1);
+ unsigned I2DFS = DFSNumber.lookup(I2);
+ assert(I1DFS && I2DFS);
+ return I1DFS < I2DFS;
+ }
+
+ // Return true when there are memory uses of Def in BB.
+ bool hasMemoryUse(const Instruction *NewPt, MemoryDef *Def,
+ const BasicBlock *BB) {
+ const MemorySSA::AccessList *Acc = MSSA->getBlockAccesses(BB);
+ if (!Acc)
+ return false;
+
+ Instruction *OldPt = Def->getMemoryInst();
+ const BasicBlock *OldBB = OldPt->getParent();
+ const BasicBlock *NewBB = NewPt->getParent();
+ bool ReachedNewPt = false;
+
+ for (const MemoryAccess &MA : *Acc)
+ if (const MemoryUse *MU = dyn_cast<MemoryUse>(&MA)) {
+ Instruction *Insn = MU->getMemoryInst();
+
+ // Do not check whether MU aliases Def when MU occurs after OldPt.
+ if (BB == OldBB && firstInBB(OldPt, Insn))
+ break;
+
+ // Do not check whether MU aliases Def when MU occurs before NewPt.
+ if (BB == NewBB) {
+ if (!ReachedNewPt) {
+ if (firstInBB(Insn, NewPt))
+ continue;
+ ReachedNewPt = true;
+ }
+ }
+ if (MemorySSAUtil::defClobbersUseOrDef(Def, MU, *AA))
+ return true;
+ }
+
+ return false;
+ }
+
+ bool hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB,
+ int &NBBsOnAllPaths) {
+ // Stop walk once the limit is reached.
+ if (NBBsOnAllPaths == 0)
+ return true;
+
+ // Impossible to hoist with exceptions on the path.
+ if (hasEH(BB))
+ return true;
+
+    // Instructions after a hoist barrier are never selected for hoisting, so
+    // an instruction selected within a basic block that contains a hoist
+    // barrier must come before the barrier and can still be hoisted. Hence
+    // SrcBB itself is exempt from this check.
+ if ((BB != SrcBB) && HoistBarrier.count(BB))
+ return true;
+
+ return false;
+ }
+
+  // Return true when there is exception handling or a load of the memory Def
+  // between Def and NewPt. This function is only called for stores: Def is
+  // the MemoryDef of the store to be hoisted.
+
+  // Decrement NBBsOnAllPaths by 1 for each block between HoistPt and BB, and
+  // return true when the counter NBBsOnAllPaths reaches 0, except when it is
+  // initialized to -1 which means unlimited.
+ bool hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def,
+ int &NBBsOnAllPaths) {
+ const BasicBlock *NewBB = NewPt->getParent();
+ const BasicBlock *OldBB = Def->getBlock();
+ assert(DT->dominates(NewBB, OldBB) && "invalid path");
+ assert(DT->dominates(Def->getDefiningAccess()->getBlock(), NewBB) &&
+ "def does not dominate new hoisting point");
+
+ // Walk all basic blocks reachable in depth-first iteration on the inverse
+ // CFG from OldBB to NewBB. These blocks are all the blocks that may be
+ // executed between the execution of NewBB and OldBB. Hoisting an expression
+ // from OldBB into NewBB has to be safe on all execution paths.
+ for (auto I = idf_begin(OldBB), E = idf_end(OldBB); I != E;) {
+ const BasicBlock *BB = *I;
+ if (BB == NewBB) {
+ // Stop traversal when reaching HoistPt.
+ I.skipChildren();
+ continue;
+ }
+
+ if (hasEHhelper(BB, OldBB, NBBsOnAllPaths))
+ return true;
+
+ // Check that we do not move a store past loads.
+ if (hasMemoryUse(NewPt, Def, BB))
+ return true;
+
+ // -1 is unlimited number of blocks on all paths.
+ if (NBBsOnAllPaths != -1)
+ --NBBsOnAllPaths;
+
+ ++I;
+ }
+
+ return false;
+ }
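+
+  // Illustrative sketch (hypothetical situation, not from this file): if the
+  // store being hoisted is preceded in its own block by a load of the same
+  // address, moving the store up to the hoisting point would lift it above
+  // that load; hasMemoryUse detects the aliasing load during the inverse-CFG
+  // walk above and the hoist is rejected.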
+
+  // Return true when there is exception handling between HoistPt and BB.
+  // Decrement NBBsOnAllPaths by 1 for each block between HoistPt and BB, and
+  // return true when the counter NBBsOnAllPaths reaches 0, except when it is
+  // initialized to -1 which means unlimited.
+ bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
+ int &NBBsOnAllPaths) {
+ assert(DT->dominates(HoistPt, SrcBB) && "Invalid path");
+
+ // Walk all basic blocks reachable in depth-first iteration on
+ // the inverse CFG from BBInsn to NewHoistPt. These blocks are all the
+ // blocks that may be executed between the execution of NewHoistPt and
+ // BBInsn. Hoisting an expression from BBInsn into NewHoistPt has to be safe
+ // on all execution paths.
+ for (auto I = idf_begin(SrcBB), E = idf_end(SrcBB); I != E;) {
+ const BasicBlock *BB = *I;
+ if (BB == HoistPt) {
+ // Stop traversal when reaching NewHoistPt.
+ I.skipChildren();
+ continue;
+ }
+
+ if (hasEHhelper(BB, SrcBB, NBBsOnAllPaths))
+ return true;
+
+ // -1 is unlimited number of blocks on all paths.
+ if (NBBsOnAllPaths != -1)
+ --NBBsOnAllPaths;
+
+ ++I;
+ }
+
+ return false;
+ }
+
+ // Return true when it is safe to hoist a memory load or store U from OldPt
+ // to NewPt.
+ bool safeToHoistLdSt(const Instruction *NewPt, const Instruction *OldPt,
+ MemoryUseOrDef *U, InsKind K, int &NBBsOnAllPaths) {
+ // In place hoisting is safe.
+ if (NewPt == OldPt)
+ return true;
+
+ const BasicBlock *NewBB = NewPt->getParent();
+ const BasicBlock *OldBB = OldPt->getParent();
+ const BasicBlock *UBB = U->getBlock();
+
+ // Check for dependences on the Memory SSA.
+ MemoryAccess *D = U->getDefiningAccess();
+ BasicBlock *DBB = D->getBlock();
+ if (DT->properlyDominates(NewBB, DBB))
+ // Cannot move the load or store to NewBB above its definition in DBB.
+ return false;
+
+ if (NewBB == DBB && !MSSA->isLiveOnEntryDef(D))
+ if (auto *UD = dyn_cast<MemoryUseOrDef>(D))
+ if (firstInBB(NewPt, UD->getMemoryInst()))
+ // Cannot move the load or store to NewPt above its definition in D.
+ return false;
+
+ // Check for unsafe hoistings due to side effects.
+ if (K == InsKind::Store) {
+ if (hasEHOrLoadsOnPath(NewPt, dyn_cast<MemoryDef>(U), NBBsOnAllPaths))
+ return false;
+ } else if (hasEHOnPath(NewBB, OldBB, NBBsOnAllPaths))
+ return false;
+
+ if (UBB == NewBB) {
+ if (DT->properlyDominates(DBB, NewBB))
+ return true;
+ assert(UBB == DBB);
+ assert(MSSA->locallyDominates(D, U));
+ }
+
+ // No side effects: it is safe to hoist.
+ return true;
+ }
+
+  // Return true when it is safe to hoist a scalar instruction from BB to
+  // HoistBB.
+ bool safeToHoistScalar(const BasicBlock *HoistBB, const BasicBlock *BB,
+ int &NBBsOnAllPaths) {
+ return !hasEHOnPath(HoistBB, BB, NBBsOnAllPaths);
+ }
+
+  // In the inverse CFG, the dominance frontier of a basic block (BB) is the
+  // point where ANTIC needs to be computed for instructions which are going
+  // to be hoisted. Since this point does not change during gvn-hoist,
+  // we compute it only once (on demand).
+  // The idea is inspired by:
+  // "Partial Redundancy Elimination in SSA Form"
+  // ROBERT KENNEDY, SUN CHAN, SHIN-MING LIU, RAYMOND LO, PENG TU and FRED CHOW
+  // They use a similar idea in the forward graph to find fully redundant and
+  // partially redundant expressions; here it is used in the inverse graph to
+  // find fully anticipable instructions at merge points (post-dominators in
+  // the inverse CFG).
+  // Each CHI argument records the edge via which an instruction in BB
+  // receives its value.
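+  //
+  // Illustrative sketch (hypothetical CFG, not from this file):
+  //        A
+  //       / \
+  //      B   C     with I1 in B and I2 in C sharing value number V,
+  // the post-dominance frontier of {B, C} contains A, so an empty CHI for V
+  // is inserted at A. The renaming walk then fills in {V, B, I1} and
+  // {V, C, I2}; since a value for V flows out on every successor edge of A,
+  // V is anticipable at A and I1/I2 become hoistable into A.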
+
+ // Returns true when the values are flowing out to each edge.
+ bool valueAnticipable(CHIArgs C, TerminatorInst *TI) const {
+ if (TI->getNumSuccessors() > (unsigned)std::distance(C.begin(), C.end()))
+ return false; // Not enough args in this CHI.
+
+ for (auto CHI : C) {
+ BasicBlock *Dest = CHI.Dest;
+ // Find if all the edges have values flowing out of BB.
+ bool Found = llvm::any_of(TI->successors(), [Dest](const BasicBlock *BB) {
+ return BB == Dest; });
+ if (!Found)
+ return false;
+ }
+ return true;
+ }
+
+  // Check if it is safe to hoist the values tracked by the CHIs in C and
+  // accumulate the safe ones in Safe.
+ void checkSafety(CHIArgs C, BasicBlock *BB, InsKind K,
+ SmallVectorImpl<CHIArg> &Safe) {
+ int NumBBsOnAllPaths = MaxNumberOfBBSInPath;
+ for (auto CHI : C) {
+ Instruction *Insn = CHI.I;
+ if (!Insn) // No instruction was inserted in this CHI.
+ continue;
+ if (K == InsKind::Scalar) {
+ if (safeToHoistScalar(BB, Insn->getParent(), NumBBsOnAllPaths))
+ Safe.push_back(CHI);
+ } else {
+ MemoryUseOrDef *UD = MSSA->getMemoryAccess(Insn);
+ if (safeToHoistLdSt(BB->getTerminator(), Insn, UD, K, NumBBsOnAllPaths))
+ Safe.push_back(CHI);
+ }
+ }
+ }
+
+ using RenameStackType = DenseMap<VNType, SmallVector<Instruction *, 2>>;
+
+ // Push all the VNs corresponding to BB into RenameStack.
+ void fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs,
+ RenameStackType &RenameStack) {
+ auto it1 = ValueBBs.find(BB);
+ if (it1 != ValueBBs.end()) {
+      // Iterate in reverse order to keep lower-ranked values on top.
+ for (std::pair<VNType, Instruction *> &VI : reverse(it1->second)) {
+ // Get the value of instruction I
+ DEBUG(dbgs() << "\nPushing on stack: " << *VI.second);
+ RenameStack[VI.first].push_back(VI.second);
+ }
+ }
+ }
+
+ void fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs,
+ RenameStackType &RenameStack) {
+ // For each *predecessor* (because Post-DOM) of BB check if it has a CHI
+ for (auto Pred : predecessors(BB)) {
+ auto P = CHIBBs.find(Pred);
+ if (P == CHIBBs.end()) {
+ continue;
+ }
+ DEBUG(dbgs() << "\nLooking at CHIs in: " << Pred->getName(););
+ // A CHI is found (BB -> Pred is an edge in the CFG)
+ // Pop the stack until Top(V) = Ve.
+ auto &VCHI = P->second;
+ for (auto It = VCHI.begin(), E = VCHI.end(); It != E;) {
+ CHIArg &C = *It;
+ if (!C.Dest) {
+ auto si = RenameStack.find(C.VN);
+          // The basic block containing the CHI must dominate the block of the
+          // value we want to track in the CHI. In the PDom walk, there can be
+          // values on the stack which are not control dependent, e.g., in a
+          // nested loop.
+ if (si != RenameStack.end() && si->second.size() &&
+ DT->properlyDominates(Pred, si->second.back()->getParent())) {
+ C.Dest = BB; // Assign the edge
+ C.I = si->second.pop_back_val(); // Assign the argument
+ DEBUG(dbgs() << "\nCHI Inserted in BB: " << C.Dest->getName()
+ << *C.I << ", VN: " << C.VN.first << ", "
+ << C.VN.second);
+ }
+ // Move to next CHI of a different value
+ It = std::find_if(It, VCHI.end(),
+ [It](CHIArg &A) { return A != *It; });
+ } else
+ ++It;
+ }
+ }
+ }
+
+  // Walk the post-dominator tree top-down and use a stack for each value to
+  // store the last value you see. When you hit a CHI from a given edge, the
+  // value to use as the argument is at the top of the stack; add that value
+  // to the CHI and pop the stack.
+ void insertCHI(InValuesType &ValueBBs, OutValuesType &CHIBBs) {
+ auto Root = PDT->getNode(nullptr);
+ if (!Root)
+ return;
+ // Depth first walk on PDom tree to fill the CHIargs at each PDF.
+ RenameStackType RenameStack;
+ for (auto Node : depth_first(Root)) {
+ BasicBlock *BB = Node->getBlock();
+ if (!BB)
+ continue;
+
+ // Collect all values in BB and push to stack.
+ fillRenameStack(BB, ValueBBs, RenameStack);
+
+ // Fill outgoing values in each CHI corresponding to BB.
+ fillChiArgs(BB, CHIBBs, RenameStack);
+ }
+ }
+
+  // Walk all the CHI-nodes to find ones which have an empty entry and remove
+  // them. Then collect all the instructions which are safe to hoist and see
+  // if they form a list of anticipable values. OutValues contains CHIs
+  // corresponding to each basic block.
+ void findHoistableCandidates(OutValuesType &CHIBBs, InsKind K,
+ HoistingPointList &HPL) {
+ auto cmpVN = [](const CHIArg &A, const CHIArg &B) { return A.VN < B.VN; };
+
+ // CHIArgs now have the outgoing values, so check for anticipability and
+ // accumulate hoistable candidates in HPL.
+ for (std::pair<BasicBlock *, SmallVector<CHIArg, 2>> &A : CHIBBs) {
+ BasicBlock *BB = A.first;
+ SmallVectorImpl<CHIArg> &CHIs = A.second;
+      // The vector of CHIs contains CHIs for different instructions.
+      // Sort the args according to their VNs, so that identical
+      // instructions are grouped together.
+ std::stable_sort(CHIs.begin(), CHIs.end(), cmpVN);
+ auto TI = BB->getTerminator();
+ auto B = CHIs.begin();
+      // [PrevIt, PHIIt) forms a range of CHIs which have identical VNs.
+ auto PHIIt = std::find_if(CHIs.begin(), CHIs.end(),
+ [B](CHIArg &A) { return A != *B; });
+ auto PrevIt = CHIs.begin();
+ while (PrevIt != PHIIt) {
+ // Collect values which satisfy safety checks.
+ SmallVector<CHIArg, 2> Safe;
+ // We check for safety first because there might be multiple values in
+ // the same path, some of which are not safe to be hoisted, but overall
+ // each edge has at least one value which can be hoisted, making the
+ // value anticipable along that path.
+ checkSafety(make_range(PrevIt, PHIIt), BB, K, Safe);
+
+ // List of safe values should be anticipable at TI.
+ if (valueAnticipable(make_range(Safe.begin(), Safe.end()), TI)) {
+ HPL.push_back({BB, SmallVecInsn()});
+ SmallVecInsn &V = HPL.back().second;
+ for (auto B : Safe)
+ V.push_back(B.I);
+ }
+
+ // Check other VNs
+ PrevIt = PHIIt;
+ PHIIt = std::find_if(PrevIt, CHIs.end(),
+ [PrevIt](CHIArg &A) { return A != *PrevIt; });
+ }
+ }
+ }
+
+  // Compute an insertion point for each value which can be fully anticipated
+  // at a dominator. HPL contains all such values.
+ void computeInsertionPoints(const VNtoInsns &Map, HoistingPointList &HPL,
+ InsKind K) {
+ // Sort VNs based on their rankings
+ std::vector<VNType> Ranks;
+ for (const auto &Entry : Map) {
+ Ranks.push_back(Entry.first);
+ }
+
+ // TODO: Remove fully-redundant expressions.
+    // Get the instruction from the Map, and assume that all instructions
+    // with the same VN have the same rank (this is an approximation).
+ std::sort(Ranks.begin(), Ranks.end(),
+ [this, &Map](const VNType &r1, const VNType &r2) {
+ return (rank(*Map.lookup(r1).begin()) <
+ rank(*Map.lookup(r2).begin()));
+ });
+
+ // - Sort VNs according to their rank, and start with lowest ranked VN
+ // - Take a VN and for each instruction with same VN
+ // - Find the dominance frontier in the inverse graph (PDF)
+ // - Insert the chi-node at PDF
+ // - Remove the chi-nodes with missing entries
+ // - Remove values from CHI-nodes which do not truly flow out, e.g.,
+ // modified along the path.
+ // - Collect the remaining values that are still anticipable
+ SmallVector<BasicBlock *, 2> IDFBlocks;
+ ReverseIDFCalculator IDFs(*PDT);
+ OutValuesType OutValue;
+ InValuesType InValue;
+ for (const auto &R : Ranks) {
+ const SmallVecInsn &V = Map.lookup(R);
+ if (V.size() < 2)
+ continue;
+ const VNType &VN = R;
+ SmallPtrSet<BasicBlock *, 2> VNBlocks;
+ for (auto &I : V) {
+ BasicBlock *BBI = I->getParent();
+ if (!hasEH(BBI))
+ VNBlocks.insert(BBI);
+ }
+      // Compute the post-dominance frontiers of each basic block. The
+      // dominance frontier of a block X in the reverse control-flow graph is
+      // the set of blocks upon which X is control dependent. The following
+      // computes the blocks where a CHI needs to be inserted for the blocks
+      // in VNBlocks.
+ IDFs.setDefiningBlocks(VNBlocks);
+ IDFs.calculate(IDFBlocks);
+
+ // Make a map of BB vs instructions to be hoisted.
+ for (unsigned i = 0; i < V.size(); ++i) {
+ InValue[V[i]->getParent()].push_back(std::make_pair(VN, V[i]));
+ }
+ // Insert empty CHI node for this VN. This is used to factor out
+ // basic blocks where the ANTIC can potentially change.
+ for (auto IDFB : IDFBlocks) { // TODO: Prune out useless CHI insertions.
+ for (unsigned i = 0; i < V.size(); ++i) {
+ CHIArg C = {VN, nullptr, nullptr};
+ // Ignore spurious PDFs.
+ if (DT->properlyDominates(IDFB, V[i]->getParent())) {
+ OutValue[IDFB].push_back(C);
+ DEBUG(dbgs() << "\nInsertion a CHI for BB: " << IDFB->getName()
+ << ", for Insn: " << *V[i]);
+ }
+ }
+ }
+ }
+
+ // Insert CHI args at each PDF to iterate on factored graph of
+ // control dependence.
+ insertCHI(InValue, OutValue);
+ // Using the CHI args inserted at each PDF, find fully anticipable values.
+ findHoistableCandidates(OutValue, K, HPL);
+ }
+
+  // Return true when all operands of I are available at insertion point
+ // HoistPt. When limiting the number of hoisted expressions, one could hoist
+ // a load without hoisting its access function. So before hoisting any
+ // expression, make sure that all its operands are available at insert point.
+ bool allOperandsAvailable(const Instruction *I,
+ const BasicBlock *HoistPt) const {
+ for (const Use &Op : I->operands())
+ if (const auto *Inst = dyn_cast<Instruction>(&Op))
+ if (!DT->dominates(Inst->getParent(), HoistPt))
+ return false;
+
+ return true;
+ }
+
+ // Same as allOperandsAvailable with recursive check for GEP operands.
+ bool allGepOperandsAvailable(const Instruction *I,
+ const BasicBlock *HoistPt) const {
+ for (const Use &Op : I->operands())
+ if (const auto *Inst = dyn_cast<Instruction>(&Op))
+ if (!DT->dominates(Inst->getParent(), HoistPt)) {
+ if (const GetElementPtrInst *GepOp =
+ dyn_cast<GetElementPtrInst>(Inst)) {
+ if (!allGepOperandsAvailable(GepOp, HoistPt))
+ return false;
+ // Gep is available if all operands of GepOp are available.
+ } else {
+ // Gep is not available if it has operands other than GEPs that are
+ // defined in blocks not dominating HoistPt.
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Make all operands of the GEP available.
+ void makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt,
+ const SmallVecInsn &InstructionsToHoist,
+ Instruction *Gep) const {
+ assert(allGepOperandsAvailable(Gep, HoistPt) &&
+ "GEP operands not available");
+
+ Instruction *ClonedGep = Gep->clone();
+ for (unsigned i = 0, e = Gep->getNumOperands(); i != e; ++i)
+ if (Instruction *Op = dyn_cast<Instruction>(Gep->getOperand(i))) {
+ // Check whether the operand is already available.
+ if (DT->dominates(Op->getParent(), HoistPt))
+ continue;
+
+ // As a GEP can refer to other GEPs, recursively make all the operands
+ // of this GEP available at HoistPt.
+ if (GetElementPtrInst *GepOp = dyn_cast<GetElementPtrInst>(Op))
+ makeGepsAvailable(ClonedGep, HoistPt, InstructionsToHoist, GepOp);
+ }
+
+ // Copy Gep and replace its uses in Repl with ClonedGep.
+ ClonedGep->insertBefore(HoistPt->getTerminator());
+
+ // Conservatively discard any optimization hints, they may differ on the
+ // other paths.
+ ClonedGep->dropUnknownNonDebugMetadata();
+
+ // If we have optimization hints which agree with each other along different
+ // paths, preserve them.
+ for (const Instruction *OtherInst : InstructionsToHoist) {
+ const GetElementPtrInst *OtherGep;
+ if (auto *OtherLd = dyn_cast<LoadInst>(OtherInst))
+ OtherGep = cast<GetElementPtrInst>(OtherLd->getPointerOperand());
+ else
+ OtherGep = cast<GetElementPtrInst>(
+ cast<StoreInst>(OtherInst)->getPointerOperand());
+ ClonedGep->andIRFlags(OtherGep);
+ }
+
+ // Replace uses of Gep with ClonedGep in Repl.
+ Repl->replaceUsesOfWith(Gep, ClonedGep);
+ }
+
+ void updateAlignment(Instruction *I, Instruction *Repl) {
+ if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) {
+ ReplacementLoad->setAlignment(
+ std::min(ReplacementLoad->getAlignment(),
+ cast<LoadInst>(I)->getAlignment()));
+ ++NumLoadsRemoved;
+ } else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) {
+ ReplacementStore->setAlignment(
+ std::min(ReplacementStore->getAlignment(),
+ cast<StoreInst>(I)->getAlignment()));
+ ++NumStoresRemoved;
+ } else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) {
+ ReplacementAlloca->setAlignment(
+ std::max(ReplacementAlloca->getAlignment(),
+ cast<AllocaInst>(I)->getAlignment()));
+ } else if (isa<CallInst>(Repl)) {
+ ++NumCallsRemoved;
+ }
+ }
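+
+  // A note on the min/max choices above (explanatory, not from this file):
+  // two hoisted loads may legitimately carry different alignments, say 8 and
+  // 4, and only the smaller one is guaranteed for the merged access, hence
+  // std::min for loads and stores. For allocas the larger alignment remains
+  // valid, hence std::max.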
+
+  // Remove all the instructions in Candidates and replace their usage with
+  // Repl. Returns the number of instructions removed.
+ unsigned rauw(const SmallVecInsn &Candidates, Instruction *Repl,
+ MemoryUseOrDef *NewMemAcc) {
+ unsigned NR = 0;
+ for (Instruction *I : Candidates) {
+ if (I != Repl) {
+ ++NR;
+ updateAlignment(I, Repl);
+ if (NewMemAcc) {
+ // Update the uses of the old MSSA access with NewMemAcc.
+ MemoryAccess *OldMA = MSSA->getMemoryAccess(I);
+ OldMA->replaceAllUsesWith(NewMemAcc);
+ MSSAUpdater->removeMemoryAccess(OldMA);
+ }
+
+ Repl->andIRFlags(I);
+ combineKnownMetadata(Repl, I);
+ I->replaceAllUsesWith(Repl);
+ // Also invalidate the Alias Analysis cache.
+ MD->removeInstruction(I);
+ I->eraseFromParent();
+ }
+ }
+ return NR;
+ }
+
+ // Replace all Memory PHI usage with NewMemAcc.
+ void raMPHIuw(MemoryUseOrDef *NewMemAcc) {
+ SmallPtrSet<MemoryPhi *, 4> UsePhis;
+ for (User *U : NewMemAcc->users())
+ if (MemoryPhi *Phi = dyn_cast<MemoryPhi>(U))
+ UsePhis.insert(Phi);
+
+ for (MemoryPhi *Phi : UsePhis) {
+ auto In = Phi->incoming_values();
+ if (llvm::all_of(In, [&](Use &U) { return U == NewMemAcc; })) {
+ Phi->replaceAllUsesWith(NewMemAcc);
+ MSSAUpdater->removeMemoryAccess(Phi);
+ }
+ }
+ }
+
+ // Remove all other instructions and replace them with Repl.
+ unsigned removeAndReplace(const SmallVecInsn &Candidates, Instruction *Repl,
+ BasicBlock *DestBB, bool MoveAccess) {
+ MemoryUseOrDef *NewMemAcc = MSSA->getMemoryAccess(Repl);
+ if (MoveAccess && NewMemAcc) {
+ // The definition of this ld/st will not change: ld/st hoisting is
+ // legal when the ld/st is not moved past its current definition.
+ MSSAUpdater->moveToPlace(NewMemAcc, DestBB, MemorySSA::End);
+ }
+
+ // Replace all other instructions with Repl with memory access NewMemAcc.
+ unsigned NR = rauw(Candidates, Repl, NewMemAcc);
+
+ // Remove MemorySSA phi nodes with the same arguments.
+ if (NewMemAcc)
+ raMPHIuw(NewMemAcc);
+ return NR;
+ }
+
+  // In the case Repl is a load or a store, we make all its GEPs
+  // available: GEPs are not hoisted by default, to avoid hoisting the address
+  // computation without the associated load or store.
+ bool makeGepOperandsAvailable(Instruction *Repl, BasicBlock *HoistPt,
+ const SmallVecInsn &InstructionsToHoist) const {
+ // Check whether the GEP of a ld/st can be synthesized at HoistPt.
+ GetElementPtrInst *Gep = nullptr;
+ Instruction *Val = nullptr;
+ if (auto *Ld = dyn_cast<LoadInst>(Repl)) {
+ Gep = dyn_cast<GetElementPtrInst>(Ld->getPointerOperand());
+ } else if (auto *St = dyn_cast<StoreInst>(Repl)) {
+ Gep = dyn_cast<GetElementPtrInst>(St->getPointerOperand());
+ Val = dyn_cast<Instruction>(St->getValueOperand());
+ // Check that the stored value is available.
+ if (Val) {
+ if (isa<GetElementPtrInst>(Val)) {
+ // Check whether we can compute the GEP at HoistPt.
+ if (!allGepOperandsAvailable(Val, HoistPt))
+ return false;
+ } else if (!DT->dominates(Val->getParent(), HoistPt))
+ return false;
+ }
+ }
+
+ // Check whether we can compute the Gep at HoistPt.
+ if (!Gep || !allGepOperandsAvailable(Gep, HoistPt))
+ return false;
+
+ makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Gep);
+
+ if (Val && isa<GetElementPtrInst>(Val))
+ makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Val);
+
+ return true;
+ }
+
+ std::pair<unsigned, unsigned> hoist(HoistingPointList &HPL) {
+ unsigned NI = 0, NL = 0, NS = 0, NC = 0, NR = 0;
+ for (const HoistingPointInfo &HP : HPL) {
+ // Find out whether we already have one of the instructions in HoistPt,
+ // in which case we do not have to move it.
+ BasicBlock *DestBB = HP.first;
+ const SmallVecInsn &InstructionsToHoist = HP.second;
+ Instruction *Repl = nullptr;
+ for (Instruction *I : InstructionsToHoist)
+ if (I->getParent() == DestBB)
+ // If there are two instructions in HoistPt to be hoisted in place:
+ // update Repl to be the first one, such that we can rename the uses
+ // of the second based on the first.
+ if (!Repl || firstInBB(I, Repl))
+ Repl = I;
+
+ // Keep track of whether we moved the instruction so we know whether we
+ // should move the MemoryAccess.
+ bool MoveAccess = true;
+ if (Repl) {
+ // Repl is already in HoistPt: it remains in place.
+ assert(allOperandsAvailable(Repl, DestBB) &&
+ "instruction depends on operands that are not available");
+ MoveAccess = false;
+ } else {
+ // When we do not find Repl in HoistPt, select the first in the list
+ // and move it to HoistPt.
+ Repl = InstructionsToHoist.front();
+
+ // We can move Repl in HoistPt only when all operands are available.
+ // The order in which hoistings are done may influence the availability
+ // of operands.
+ if (!allOperandsAvailable(Repl, DestBB)) {
+ // When HoistingGeps there is nothing more we can do to make the
+ // operands available: just continue.
+ if (HoistingGeps)
+ continue;
+
+ // When not HoistingGeps we need to copy the GEPs.
+ if (!makeGepOperandsAvailable(Repl, DestBB, InstructionsToHoist))
+ continue;
+ }
+
+        // Move the instruction to the end of HoistPt.
+ Instruction *Last = DestBB->getTerminator();
+ MD->removeInstruction(Repl);
+ Repl->moveBefore(Last);
+
+ DFSNumber[Repl] = DFSNumber[Last]++;
+ }
+
+ NR += removeAndReplace(InstructionsToHoist, Repl, DestBB, MoveAccess);
+
+ if (isa<LoadInst>(Repl))
+ ++NL;
+ else if (isa<StoreInst>(Repl))
+ ++NS;
+ else if (isa<CallInst>(Repl))
+ ++NC;
+ else // Scalar
+ ++NI;
+ }
+
+ NumHoisted += NL + NS + NC + NI;
+ NumRemoved += NR;
+ NumLoadsHoisted += NL;
+ NumStoresHoisted += NS;
+ NumCallsHoisted += NC;
+ return {NI, NL + NC + NS};
+ }
+
+  // Hoist all expressions. Returns the number of scalars hoisted and the
+  // number of non-scalars hoisted.
+ std::pair<unsigned, unsigned> hoistExpressions(Function &F) {
+ InsnInfo II;
+ LoadInfo LI;
+ StoreInfo SI;
+ CallInfo CI;
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+ int InstructionNb = 0;
+ for (Instruction &I1 : *BB) {
+        // If I1 cannot guarantee progress, subsequent instructions
+        // in BB cannot be hoisted anyway.
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I1)) {
+ HoistBarrier.insert(BB);
+ break;
+ }
+ // Only hoist the first instructions in BB up to MaxDepthInBB. Hoisting
+ // deeper may increase the register pressure and compilation time.
+ if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB)
+ break;
+
+ // Do not value number terminator instructions.
+ if (isa<TerminatorInst>(&I1))
+ break;
+
+ if (auto *Load = dyn_cast<LoadInst>(&I1))
+ LI.insert(Load, VN);
+ else if (auto *Store = dyn_cast<StoreInst>(&I1))
+ SI.insert(Store, VN);
+ else if (auto *Call = dyn_cast<CallInst>(&I1)) {
+ if (auto *Intr = dyn_cast<IntrinsicInst>(Call)) {
+ if (isa<DbgInfoIntrinsic>(Intr) ||
+ Intr->getIntrinsicID() == Intrinsic::assume ||
+ Intr->getIntrinsicID() == Intrinsic::sideeffect)
+ continue;
+ }
+ if (Call->mayHaveSideEffects())
+ break;
+
+ if (Call->isConvergent())
+ break;
+
+ CI.insert(Call, VN);
+ } else if (HoistingGeps || !isa<GetElementPtrInst>(&I1))
+ // Do not hoist scalars past calls that may write to memory because
+          // that could result in spills later. GEPs are handled separately.
+ // TODO: We can relax this for targets like AArch64 as they have more
+ // registers than X86.
+ II.insert(&I1, VN);
+ }
+ }
+
+ HoistingPointList HPL;
+ computeInsertionPoints(II.getVNTable(), HPL, InsKind::Scalar);
+ computeInsertionPoints(LI.getVNTable(), HPL, InsKind::Load);
+ computeInsertionPoints(SI.getVNTable(), HPL, InsKind::Store);
+ computeInsertionPoints(CI.getScalarVNTable(), HPL, InsKind::Scalar);
+ computeInsertionPoints(CI.getLoadVNTable(), HPL, InsKind::Load);
+ computeInsertionPoints(CI.getStoreVNTable(), HPL, InsKind::Store);
+ return hoist(HPL);
+ }
+};
+
+class GVNHoistLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ GVNHoistLegacyPass() : FunctionPass(ID) {
+ initializeGVNHoistLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto &MD = getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+ auto &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
+
+ GVNHoist G(&DT, &PDT, &AA, &MD, &MSSA);
+ return G.run(F);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end namespace llvm
+
+PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) {
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
+ MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ GVNHoist G(&DT, &PDT, &AA, &MD, &MSSA);
+ if (!G.run(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<MemorySSAAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char GVNHoistLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist",
+ "Early GVN Hoisting of Expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(GVNHoistLegacyPass, "gvn-hoist",
+ "Early GVN Hoisting of Expressions", false, false)
+
+FunctionPass *llvm::createGVNHoistPass() { return new GVNHoistLegacyPass(); }
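+
+// Usage sketch (an assumption about the surrounding tooling, not part of this
+// file): the legacy pass is selected with 'opt -gvn-hoist', the new-PM pass
+// with 'opt -passes=gvn-hoist', or programmatically:
+//   FunctionPassManager FPM;
+//   FPM.addPass(GVNHoistPass());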
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp
new file mode 100644
index 000000000000..5594c29bbd9f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -0,0 +1,922 @@
+//===- GVNSink.cpp - sink expressions into successors ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file GVNSink.cpp
+/// This pass attempts to sink instructions into successors, reducing static
+/// instruction count and enabling if-conversion.
+///
+/// We use a variant of global value numbering to decide what can be sunk.
+/// Consider:
+///
+/// [ %a1 = add i32 %b, 1 ] [ %c1 = add i32 %d, 1 ]
+/// [ %a2 = xor i32 %a1, 1 ] [ %c2 = xor i32 %c1, 1 ]
+/// \ /
+/// [ %e = phi i32 %a2, %c2 ]
+/// [ add i32 %e, 4 ]
+///
+///
+/// GVN would number %a1 and %c1 differently because they compute different
+/// results - the VN of an instruction is a function of its opcode and the
+/// transitive closure of its operands. This is the key property for hoisting
+/// and CSE.
+///
+/// What we want when sinking, however, is a numbering that is a function of
+/// the *uses* of an instruction, which allows us to answer the question "if I
+/// replace %a1 with %c1, will it contribute in an equivalent way to all
+/// successive instructions?". The PostValueTable class in GVN provides this
+/// mapping.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/ArrayRecycler.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/GVNExpression.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gvn-sink"
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+
+namespace llvm {
+namespace GVNExpression {
+
+LLVM_DUMP_METHOD void Expression::dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+}
+
+} // end namespace GVNExpression
+} // end namespace llvm
+
+namespace {
+
+static bool isMemoryInst(const Instruction *I) {
+ return isa<LoadInst>(I) || isa<StoreInst>(I) ||
+ (isa<InvokeInst>(I) && !cast<InvokeInst>(I)->doesNotAccessMemory()) ||
+ (isa<CallInst>(I) && !cast<CallInst>(I)->doesNotAccessMemory());
+}
+
+/// Iterates through instructions in a set of blocks in reverse order from the
+/// first non-terminator. For example (assume all blocks have size n):
+/// LockstepReverseIterator I([B1, B2, B3]);
+/// *I-- = [B1[n], B2[n], B3[n]];
+/// *I-- = [B1[n-1], B2[n-1], B3[n-1]];
+/// *I-- = [B1[n-2], B2[n-2], B3[n-2]];
+/// ...
+///
+/// It continues until all blocks have been exhausted. Use \c getActiveBlocks()
+/// to determine which blocks are still going and the order they appear in the
+/// list returned by operator*.
+class LockstepReverseIterator {
+ ArrayRef<BasicBlock *> Blocks;
+ SmallSetVector<BasicBlock *, 4> ActiveBlocks;
+ SmallVector<Instruction *, 4> Insts;
+ bool Fail;
+
+public:
+ LockstepReverseIterator(ArrayRef<BasicBlock *> Blocks) : Blocks(Blocks) {
+ reset();
+ }
+
+ void reset() {
+ Fail = false;
+ ActiveBlocks.clear();
+ for (BasicBlock *BB : Blocks)
+ ActiveBlocks.insert(BB);
+ Insts.clear();
+ for (BasicBlock *BB : Blocks) {
+ if (BB->size() <= 1) {
+ // Block wasn't big enough - only contained a terminator.
+ ActiveBlocks.remove(BB);
+ continue;
+ }
+ Insts.push_back(BB->getTerminator()->getPrevNode());
+ }
+ if (Insts.empty())
+ Fail = true;
+ }
+
+ bool isValid() const { return !Fail; }
+ ArrayRef<Instruction *> operator*() const { return Insts; }
+
+ // Note: This needs to return a SmallSetVector as the elements of
+ // ActiveBlocks will be later copied to Blocks using std::copy. The
+ // resultant order of elements in Blocks needs to be deterministic.
+ // Using SmallPtrSet instead causes non-deterministic order while
+ // copying. And we cannot simply sort Blocks as they need to match the
+ // corresponding Values.
+ SmallSetVector<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; }
+
+ void restrictToBlocks(SmallSetVector<BasicBlock *, 4> &Blocks) {
+ for (auto II = Insts.begin(); II != Insts.end();) {
+ if (std::find(Blocks.begin(), Blocks.end(), (*II)->getParent()) ==
+ Blocks.end()) {
+ ActiveBlocks.remove((*II)->getParent());
+ II = Insts.erase(II);
+ } else {
+ ++II;
+ }
+ }
+ }
+
+ void operator--() {
+ if (Fail)
+ return;
+ SmallVector<Instruction *, 4> NewInsts;
+ for (auto *Inst : Insts) {
+ if (Inst == &Inst->getParent()->front())
+ ActiveBlocks.remove(Inst->getParent());
+ else
+ NewInsts.push_back(Inst->getPrevNode());
+ }
+ if (NewInsts.empty()) {
+ Fail = true;
+ return;
+ }
+ Insts = NewInsts;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+
+/// Candidate solution for sinking. There may be different ways to
+/// sink instructions, differing in the number of instructions sunk,
+/// the number of predecessors sunk from and the number of PHIs
+/// required.
+struct SinkingInstructionCandidate {
+ unsigned NumBlocks;
+ unsigned NumInstructions;
+ unsigned NumPHIs;
+ unsigned NumMemoryInsts;
+ int Cost = -1;
+ SmallVector<BasicBlock *, 4> Blocks;
+
+ void calculateCost(unsigned NumOrigPHIs, unsigned NumOrigBlocks) {
+ unsigned NumExtraPHIs = NumPHIs - NumOrigPHIs;
+ unsigned SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0;
+ Cost = (NumInstructions * (NumBlocks - 1)) -
+ (NumExtraPHIs *
+ NumExtraPHIs) // PHIs are expensive, so make sure they're worth it.
+ - SplitEdgeCost;
+ }
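+
+  // Worked example (hypothetical numbers, for illustration only): sinking 3
+  // instructions from 2 blocks while introducing 1 extra PHI and splitting no
+  // edges gives Cost = 3 * (2 - 1) - 1 * 1 - 0 = 2, a positive score under
+  // this metric.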
+
+ bool operator>(const SinkingInstructionCandidate &Other) const {
+ return Cost > Other.Cost;
+ }
+};
+
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const SinkingInstructionCandidate &C) {
+ OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks
+ << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">";
+ return OS;
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+
+/// Describes a PHI node that may or may not exist. These track the PHIs
+/// that must be created if we sink a sequence of instructions. The class
+/// provides a hash function for efficient equality comparisons.
+class ModelledPHI {
+ SmallVector<Value *, 4> Values;
+ SmallVector<BasicBlock *, 4> Blocks;
+
+public:
+ ModelledPHI() = default;
+
+ ModelledPHI(const PHINode *PN) {
+    // BasicBlock comes first so we sort by basic block pointer order,
+    // then by value pointer order.
+ SmallVector<std::pair<BasicBlock *, Value *>, 4> Ops;
+ for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I)
+ Ops.push_back({PN->getIncomingBlock(I), PN->getIncomingValue(I)});
+ std::sort(Ops.begin(), Ops.end());
+ for (auto &P : Ops) {
+ Blocks.push_back(P.first);
+ Values.push_back(P.second);
+ }
+ }
+
+  /// Create a dummy ModelledPHI that will compare unequal to any other
+  /// ModelledPHI without the same ID.
+ /// \note This is specifically for DenseMapInfo - do not use this!
+ static ModelledPHI createDummy(size_t ID) {
+ ModelledPHI M;
+ M.Values.push_back(reinterpret_cast<Value*>(ID));
+ return M;
+ }
+
+ /// Create a PHI from an array of incoming values and incoming blocks.
+ template <typename VArray, typename BArray>
+ ModelledPHI(const VArray &V, const BArray &B) {
+ std::copy(V.begin(), V.end(), std::back_inserter(Values));
+ std::copy(B.begin(), B.end(), std::back_inserter(Blocks));
+ }
+
+ /// Create a PHI from [I[OpNum] for I in Insts].
+ template <typename BArray>
+ ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum, const BArray &B) {
+ std::copy(B.begin(), B.end(), std::back_inserter(Blocks));
+ for (auto *I : Insts)
+ Values.push_back(I->getOperand(OpNum));
+ }
+
+ /// Restrict the PHI's contents down to only \c NewBlocks.
+ /// \c NewBlocks must be a subset of \c this->Blocks.
+ void restrictToBlocks(const SmallSetVector<BasicBlock *, 4> &NewBlocks) {
+ auto BI = Blocks.begin();
+ auto VI = Values.begin();
+ while (BI != Blocks.end()) {
+ assert(VI != Values.end());
+ if (std::find(NewBlocks.begin(), NewBlocks.end(), *BI) ==
+ NewBlocks.end()) {
+ BI = Blocks.erase(BI);
+ VI = Values.erase(VI);
+ } else {
+ ++BI;
+ ++VI;
+ }
+ }
+ assert(Blocks.size() == NewBlocks.size());
+ }
+
+ ArrayRef<Value *> getValues() const { return Values; }
+
+ bool areAllIncomingValuesSame() const {
+ return llvm::all_of(Values, [&](Value *V) { return V == Values[0]; });
+ }
+
+ bool areAllIncomingValuesSameType() const {
+ return llvm::all_of(
+ Values, [&](Value *V) { return V->getType() == Values[0]->getType(); });
+ }
+
+ bool areAnyIncomingValuesConstant() const {
+ return llvm::any_of(Values, [&](Value *V) { return isa<Constant>(V); });
+ }
+
+ // Hash functor
+ unsigned hash() const {
+ return (unsigned)hash_combine_range(Values.begin(), Values.end());
+ }
+
+ bool operator==(const ModelledPHI &Other) const {
+ return Values == Other.Values && Blocks == Other.Blocks;
+ }
+};
+
+template <typename ModelledPHI> struct DenseMapInfo {
+ static inline ModelledPHI &getEmptyKey() {
+ static ModelledPHI Dummy = ModelledPHI::createDummy(0);
+ return Dummy;
+ }
+
+ static inline ModelledPHI &getTombstoneKey() {
+ static ModelledPHI Dummy = ModelledPHI::createDummy(1);
+ return Dummy;
+ }
+
+ static unsigned getHashValue(const ModelledPHI &V) { return V.hash(); }
+
+ static bool isEqual(const ModelledPHI &LHS, const ModelledPHI &RHS) {
+ return LHS == RHS;
+ }
+};
+
+using ModelledPHISet = DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>>;
+
+//===----------------------------------------------------------------------===//
+// ValueTable
+//===----------------------------------------------------------------------===//
+// This is a value number table where the value number is a function of the
+// *uses* of a value, rather than its operands. Thus, if VN(A) == VN(B) we know
+// that the program would be equivalent if we replaced A with PHI(A, B).
+//===----------------------------------------------------------------------===//
+
+/// A GVN expression describing how an instruction is used. The operands
+/// field of BasicExpression is used to store uses, not operands.
+///
+/// This class also contains fields for discriminators used when determining
+/// equivalence of instructions with sideeffects.
+class InstructionUseExpr : public GVNExpression::BasicExpression {
+ unsigned MemoryUseOrder = -1;
+ bool Volatile = false;
+
+public:
+ InstructionUseExpr(Instruction *I, ArrayRecycler<Value *> &R,
+ BumpPtrAllocator &A)
+ : GVNExpression::BasicExpression(I->getNumUses()) {
+ allocateOperands(R, A);
+ setOpcode(I->getOpcode());
+ setType(I->getType());
+
+ for (auto &U : I->uses())
+ op_push_back(U.getUser());
+ std::sort(op_begin(), op_end());
+ }
+
+ void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; }
+ void setVolatile(bool V) { Volatile = V; }
+
+ hash_code getHashValue() const override {
+ return hash_combine(GVNExpression::BasicExpression::getHashValue(),
+ MemoryUseOrder, Volatile);
+ }
+
+ template <typename Function> hash_code getHashValue(Function MapFn) {
+ hash_code H =
+ hash_combine(getOpcode(), getType(), MemoryUseOrder, Volatile);
+ for (auto *V : operands())
+ H = hash_combine(H, MapFn(V));
+ return H;
+ }
+};
+
+class ValueTable {
+ DenseMap<Value *, uint32_t> ValueNumbering;
+ DenseMap<GVNExpression::Expression *, uint32_t> ExpressionNumbering;
+ DenseMap<size_t, uint32_t> HashNumbering;
+ BumpPtrAllocator Allocator;
+ ArrayRecycler<Value *> Recycler;
+ uint32_t nextValueNumber = 1;
+
+ /// Create an expression for I based on its opcode and its uses. If I
+ /// touches or reads memory, the expression is also based upon its memory
+ /// order - see \c getMemoryUseOrder().
+ InstructionUseExpr *createExpr(Instruction *I) {
+ InstructionUseExpr *E =
+ new (Allocator) InstructionUseExpr(I, Recycler, Allocator);
+ if (isMemoryInst(I))
+ E->setMemoryUseOrder(getMemoryUseOrder(I));
+
+ if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+ CmpInst::Predicate Predicate = C->getPredicate();
+ E->setOpcode((C->getOpcode() << 8) | Predicate);
+ }
+ return E;
+ }
+
+ /// Helper to compute the value number for a memory instruction
+ /// (LoadInst/StoreInst), including checking the memory ordering and
+ /// volatility.
+ template <class Inst> InstructionUseExpr *createMemoryExpr(Inst *I) {
+ if (isStrongerThanUnordered(I->getOrdering()) || I->isAtomic())
+ return nullptr;
+ InstructionUseExpr *E = createExpr(I);
+ E->setVolatile(I->isVolatile());
+ return E;
+ }
+
+public:
+ ValueTable() = default;
+
+ /// Returns the value number for the specified value, assigning
+ /// it a new number if it did not have one before.
+ uint32_t lookupOrAdd(Value *V) {
+ auto VI = ValueNumbering.find(V);
+ if (VI != ValueNumbering.end())
+ return VI->second;
+
+ if (!isa<Instruction>(V)) {
+ ValueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ Instruction *I = cast<Instruction>(V);
+ InstructionUseExpr *exp = nullptr;
+ switch (I->getOpcode()) {
+ case Instruction::Load:
+ exp = createMemoryExpr(cast<LoadInst>(I));
+ break;
+ case Instruction::Store:
+ exp = createMemoryExpr(cast<StoreInst>(I));
+ break;
+ case Instruction::Call:
+ case Instruction::Invoke:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ case Instruction::Select:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::ShuffleVector:
+ case Instruction::InsertValue:
+ case Instruction::GetElementPtr:
+ exp = createExpr(I);
+ break;
+ default:
+ break;
+ }
+
+ if (!exp) {
+ ValueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ uint32_t e = ExpressionNumbering[exp];
+ if (!e) {
+ hash_code H = exp->getHashValue([=](Value *V) { return lookupOrAdd(V); });
+ auto I = HashNumbering.find(H);
+ if (I != HashNumbering.end()) {
+ e = I->second;
+ } else {
+ e = nextValueNumber++;
+ HashNumbering[H] = e;
+ ExpressionNumbering[exp] = e;
+ }
+ }
+ ValueNumbering[V] = e;
+ return e;
+ }
+
+ /// Returns the value number of the specified value. Fails if the value has
+ /// not yet been numbered.
+ uint32_t lookup(Value *V) const {
+ auto VI = ValueNumbering.find(V);
+ assert(VI != ValueNumbering.end() && "Value not numbered?");
+ return VI->second;
+ }
+
+ /// Removes all value numberings and resets the value table.
+ void clear() {
+ ValueNumbering.clear();
+ ExpressionNumbering.clear();
+ HashNumbering.clear();
+ Recycler.clear(Allocator);
+ nextValueNumber = 1;
+ }
+
+ /// \c Inst uses or touches memory. Return an ID describing the memory state
+ /// at \c Inst such that if getMemoryUseOrder(I1) == getMemoryUseOrder(I2),
+ /// the exact same memory operations happen after I1 and I2.
+ ///
+  /// This is a very hard problem in general, so we use domain-specific
+  /// knowledge: we only ever check for equivalence between blocks sharing a
+  /// single common immediate successor, and when determining if I1 == I2 we
+  /// will have already determined that next(I1) == next(I2). This inductive
+  /// property allows us to simply return the value number of the next
+  /// instruction that defines memory.
+ uint32_t getMemoryUseOrder(Instruction *Inst) {
+ auto *BB = Inst->getParent();
+ for (auto I = std::next(Inst->getIterator()), E = BB->end();
+ I != E && !I->isTerminator(); ++I) {
+ if (!isMemoryInst(&*I))
+ continue;
+ if (isa<LoadInst>(&*I))
+ continue;
+ CallInst *CI = dyn_cast<CallInst>(&*I);
+ if (CI && CI->onlyReadsMemory())
+ continue;
+ InvokeInst *II = dyn_cast<InvokeInst>(&*I);
+ if (II && II->onlyReadsMemory())
+ continue;
+ return lookupOrAdd(&*I);
+ }
+ return 0;
+ }
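+
+  // Illustrative sketch (hypothetical IR, not from this file): in a block
+  //   %x = load i32, i32* %p
+  //   %y = load i32, i32* %q
+  //   store i32 %v, i32* %r
+  // both loads skip over the other load and return the value number of the
+  // store, so they share the same memory-use order; a store placed between
+  // the two loads would give them different orders.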
+};
+
+//===----------------------------------------------------------------------===//
+
+class GVNSink {
+public:
+ GVNSink() = default;
+
+ bool run(Function &F) {
+ DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n");
+
+ unsigned NumSunk = 0;
+ ReversePostOrderTraversal<Function*> RPOT(&F);
+ for (auto *N : RPOT)
+ NumSunk += sinkBB(N);
+
+ return NumSunk > 0;
+ }
+
+private:
+ ValueTable VN;
+
+ bool isInstructionBlacklisted(Instruction *I) {
+ // These instructions may change or break semantics if moved.
+ if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
+ I->getType()->isTokenTy())
+ return true;
+ return false;
+ }
+
+ /// The main heuristic function. Analyze the set of instructions pointed to by
+ /// LRI and return a candidate solution if these instructions can be sunk, or
+ /// None otherwise.
+ Optional<SinkingInstructionCandidate> analyzeInstructionForSinking(
+ LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
+ ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents);
+
+ /// Create a ModelledPHI for each PHI in BB, adding to PHIs.
+ void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs,
+ SmallPtrSetImpl<Value *> &PHIContents) {
+ for (PHINode &PN : BB->phis()) {
+ auto MPHI = ModelledPHI(&PN);
+ PHIs.insert(MPHI);
+ for (auto *V : MPHI.getValues())
+ PHIContents.insert(V);
+ }
+ }
+
+ /// The main instruction sinking driver. Set up state and try to sink
+ /// instructions into BBEnd from its predecessors.
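+ ///
+ /// For illustration (hypothetical IR): if every predecessor of BBEnd ends in
+ ///   %a = add i32 %x, 1
+ ///   br label %bbend
+ /// the adds receive the same value number, so a single copy of the add can
+ /// be sunk into BBEnd, with PHIs created only for operands that differ
+ /// between the predecessors.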
+ unsigned sinkBB(BasicBlock *BBEnd);
+
+ /// Perform the actual mechanics of sinking an instruction from Blocks into
+ /// BBEnd, which is their only successor.
+ void sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, BasicBlock *BBEnd);
+
+ /// Remove PHIs that all have the same incoming value.
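+ ///
+ /// For illustration (hypothetical IR): %p = phi i32 [ %v, %a ], [ %v, %b ]
+ /// is replaced by %v, while a self-referential %p = phi i32 [ %p, %a ],
+ /// [ %p, %b ] is replaced by undef.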
+ void foldPointlessPHINodes(BasicBlock *BB) {
+ auto I = BB->begin();
+ while (PHINode *PN = dyn_cast<PHINode>(I++)) {
+ if (!llvm::all_of(PN->incoming_values(), [&](const Value *V) {
+ return V == PN->getIncomingValue(0);
+ }))
+ continue;
+ if (PN->getIncomingValue(0) != PN)
+ PN->replaceAllUsesWith(PN->getIncomingValue(0));
+ else
+ PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+ PN->eraseFromParent();
+ }
+ }
+};
+
+Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
+ LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
+ ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents) {
+ auto Insts = *LRI;
+ DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I
+ : Insts) {
+ I->dump();
+ } dbgs() << " ]\n";);
+
+ DenseMap<uint32_t, unsigned> VNums;
+ for (auto *I : Insts) {
+ uint32_t N = VN.lookupOrAdd(I);
+ DEBUG(dbgs() << " VN=" << Twine::utohexstr(N) << " for" << *I << "\n");
+ if (N == ~0U)
+ return None;
+ VNums[N]++;
+ }
+ unsigned VNumToSink =
+ std::max_element(VNums.begin(), VNums.end(),
+ [](const std::pair<uint32_t, unsigned> &I,
+ const std::pair<uint32_t, unsigned> &J) {
+ return I.second < J.second;
+ })
+ ->first;
+
+ if (VNums[VNumToSink] == 1)
+ // Can't sink anything!
+ return None;
+
+ // Now restrict the number of incoming blocks down to only those with
+ // VNumToSink.
+ auto &ActivePreds = LRI.getActiveBlocks();
+ unsigned InitialActivePredSize = ActivePreds.size();
+ SmallVector<Instruction *, 4> NewInsts;
+ for (auto *I : Insts) {
+ if (VN.lookup(I) != VNumToSink)
+ ActivePreds.remove(I->getParent());
+ else
+ NewInsts.push_back(I);
+ }
+ for (auto *I : NewInsts)
+ if (isInstructionBlacklisted(I))
+ return None;
+
+ // If we've restricted the incoming blocks, restrict all needed PHIs also
+ // to that set.
+ bool RecomputePHIContents = false;
+ if (ActivePreds.size() != InitialActivePredSize) {
+ ModelledPHISet NewNeededPHIs;
+ for (auto P : NeededPHIs) {
+ P.restrictToBlocks(ActivePreds);
+ NewNeededPHIs.insert(P);
+ }
+ NeededPHIs = NewNeededPHIs;
+ LRI.restrictToBlocks(ActivePreds);
+ RecomputePHIContents = true;
+ }
+
+ // The sunk instruction's results.
+ ModelledPHI NewPHI(NewInsts, ActivePreds);
+
+ // Does sinking this instruction render previous PHIs redundant?
+ if (NeededPHIs.find(NewPHI) != NeededPHIs.end()) {
+ NeededPHIs.erase(NewPHI);
+ RecomputePHIContents = true;
+ }
+
+ if (RecomputePHIContents) {
+ // The needed PHIs have changed, so recompute the set of all needed
+ // values.
+ PHIContents.clear();
+ for (auto &PHI : NeededPHIs)
+ PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
+ }
+
+ // Is this instruction required by a later PHI that doesn't match this PHI?
+ // If so, we can't sink this instruction.
+ for (auto *V : NewPHI.getValues())
+ if (PHIContents.count(V))
+ // V exists in this PHI, but the whole PHI is different to NewPHI
+ // (else it would have been removed earlier). We cannot continue
+ // because this isn't representable.
+ return None;
+
+ // Which operands need PHIs?
+ // FIXME: If any of these fail, we should partition up the candidates to
+ // try and continue making progress.
+ Instruction *I0 = NewInsts[0];
+ for (unsigned OpNum = 0, E = I0->getNumOperands(); OpNum != E; ++OpNum) {
+ ModelledPHI PHI(NewInsts, OpNum, ActivePreds);
+ if (PHI.areAllIncomingValuesSame())
+ continue;
+ if (!canReplaceOperandWithVariable(I0, OpNum))
+ // We can't create a PHI from this instruction!
+ return None;
+ if (NeededPHIs.count(PHI))
+ continue;
+ if (!PHI.areAllIncomingValuesSameType())
+ return None;
+ // Don't create indirect calls! The called value is the final operand.
+ if ((isa<CallInst>(I0) || isa<InvokeInst>(I0)) && OpNum == E - 1 &&
+ PHI.areAnyIncomingValuesConstant())
+ return None;
+
+ NeededPHIs.reserve(NeededPHIs.size());
+ NeededPHIs.insert(PHI);
+ PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
+ }
+
+ if (isMemoryInst(NewInsts[0]))
+ ++MemoryInstNum;
+
+ SinkingInstructionCandidate Cand;
+ Cand.NumInstructions = ++InstNum;
+ Cand.NumMemoryInsts = MemoryInstNum;
+ Cand.NumBlocks = ActivePreds.size();
+ Cand.NumPHIs = NeededPHIs.size();
+ for (auto *C : ActivePreds)
+ Cand.Blocks.push_back(C);
+
+ return Cand;
+}
+
+unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
+ DEBUG(dbgs() << "GVNSink: running on basic block ";
+ BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
+ SmallVector<BasicBlock *, 4> Preds;
+ for (auto *B : predecessors(BBEnd)) {
+ auto *T = B->getTerminator();
+ if (isa<BranchInst>(T) || isa<SwitchInst>(T))
+ Preds.push_back(B);
+ else
+ return 0;
+ }
+ if (Preds.size() < 2)
+ return 0;
+ std::sort(Preds.begin(), Preds.end());
+
+ unsigned NumOrigPreds = Preds.size();
+ // We can only sink instructions through unconditional branches.
+ for (auto I = Preds.begin(); I != Preds.end();) {
+ if ((*I)->getTerminator()->getNumSuccessors() != 1)
+ I = Preds.erase(I);
+ else
+ ++I;
+ }
+
+ LockstepReverseIterator LRI(Preds);
+ SmallVector<SinkingInstructionCandidate, 4> Candidates;
+ unsigned InstNum = 0, MemoryInstNum = 0;
+ ModelledPHISet NeededPHIs;
+ SmallPtrSet<Value *, 4> PHIContents;
+ analyzeInitialPHIs(BBEnd, NeededPHIs, PHIContents);
+ unsigned NumOrigPHIs = NeededPHIs.size();
+
+ while (LRI.isValid()) {
+ auto Cand = analyzeInstructionForSinking(LRI, InstNum, MemoryInstNum,
+ NeededPHIs, PHIContents);
+ if (!Cand)
+ break;
+ Cand->calculateCost(NumOrigPHIs, Preds.size());
+ Candidates.emplace_back(*Cand);
+ --LRI;
+ }
+
+ std::stable_sort(
+ Candidates.begin(), Candidates.end(),
+ [](const SinkingInstructionCandidate &A,
+ const SinkingInstructionCandidate &B) { return A > B; });
+ DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C
+ : Candidates) dbgs()
+ << " " << C << "\n";);
+
+ // Pick the top candidate, as long as it is positive!
+ if (Candidates.empty() || Candidates.front().Cost <= 0)
+ return 0;
+ auto C = Candidates.front();
+
+ DEBUG(dbgs() << " -- Sinking: " << C << "\n");
+ BasicBlock *InsertBB = BBEnd;
+ if (C.Blocks.size() < NumOrigPreds) {
+ DEBUG(dbgs() << " -- Splitting edge to "; BBEnd->printAsOperand(dbgs());
+ dbgs() << "\n");
+ InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split");
+ if (!InsertBB) {
+ DEBUG(dbgs() << " -- FAILED to split edge!\n");
+ // Edge couldn't be split.
+ return 0;
+ }
+ }
+
+ for (unsigned I = 0; I < C.NumInstructions; ++I)
+ sinkLastInstruction(C.Blocks, InsertBB);
+
+ return C.NumInstructions;
+}
+
+void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks,
+ BasicBlock *BBEnd) {
+ SmallVector<Instruction *, 4> Insts;
+ for (BasicBlock *BB : Blocks)
+ Insts.push_back(BB->getTerminator()->getPrevNode());
+ Instruction *I0 = Insts.front();
+
+ SmallVector<Value *, 4> NewOperands;
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
+ bool NeedPHI = llvm::any_of(Insts, [&I0, O](const Instruction *I) {
+ return I->getOperand(O) != I0->getOperand(O);
+ });
+ if (!NeedPHI) {
+ NewOperands.push_back(I0->getOperand(O));
+ continue;
+ }
+
+ // Create a new PHI in the successor block and populate it.
+ auto *Op = I0->getOperand(O);
+ assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!");
+ auto *PN = PHINode::Create(Op->getType(), Insts.size(),
+ Op->getName() + ".sink", &BBEnd->front());
+ for (auto *I : Insts)
+ PN->addIncoming(I->getOperand(O), I->getParent());
+ NewOperands.push_back(PN);
+ }
+
+ // Arbitrarily use I0 as the new "common" instruction; remap its operands
+ // and move it to the start of the successor block.
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O)
+ I0->getOperandUse(O).set(NewOperands[O]);
+ I0->moveBefore(&*BBEnd->getFirstInsertionPt());
+
+ // Update metadata and IR flags.
+ for (auto *I : Insts)
+ if (I != I0) {
+ combineMetadataForCSE(I0, I);
+ I0->andIRFlags(I);
+ }
+
+ for (auto *I : Insts)
+ if (I != I0)
+ I->replaceAllUsesWith(I0);
+ foldPointlessPHINodes(BBEnd);
+
+ // Finally nuke all instructions apart from the common instruction.
+ for (auto *I : Insts)
+ if (I != I0)
+ I->eraseFromParent();
+
+ NumRemoved += Insts.size() - 1;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Pass machinery / boilerplate
+
+class GVNSinkLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ GVNSinkLegacyPass() : FunctionPass(ID) {
+ initializeGVNSinkLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ GVNSink G;
+ return G.run(F);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) {
+ GVNSink G;
+ if (!G.run(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char GVNSinkLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink",
+ "Early GVN sinking of Expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(GVNSinkLegacyPass, "gvn-sink",
+ "Early GVN sinking of Expressions", false, false)
+
+FunctionPass *llvm::createGVNSinkPass() { return new GVNSinkLegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
new file mode 100644
index 000000000000..c4aeccb85ca7
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -0,0 +1,696 @@
+//===- GuardWidening.cpp - ---- Guard widening ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the guard widening pass. The semantics of the
+// @llvm.experimental.guard intrinsic allow LLVM to transform it so that it
+// fails more often than it did before the transform. This optimization is
+// called "widening" and can be used to hoist and common runtime checks in
+// situations like these:
+//
+// %cmp0 = 7 u< Length
+// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
+// call @unknown_side_effects()
+// %cmp1 = 9 u< Length
+// call @llvm.experimental.guard(i1 %cmp1) [ "deopt"(...) ]
+// ...
+//
+// =>
+//
+// %cmp0 = 9 u< Length
+// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
+// call @unknown_side_effects()
+// ...
+//
+// If %cmp0 is false, @llvm.experimental.guard will "deoptimize" back to a
+// generic implementation of the same function, which will have the correct
+// semantics from that point onward. It is always _legal_ to deoptimize (so
+// replacing %cmp0 with false is "correct"), though it may not always be
+// profitable to do so.
+//
+// NB! This pass is a work in progress. It hasn't been tuned to be "production
+// ready" yet. It is known to have quadriatic running time and will not scale
+// to large numbers of guards
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/GuardWidening.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "guard-widening"
+
+namespace {
+
+class GuardWideningImpl {
+ DominatorTree &DT;
+ PostDominatorTree &PDT;
+ LoopInfo &LI;
+
+ /// The set of guards whose conditions have been widened into dominating
+ /// guards.
+ SmallVector<IntrinsicInst *, 16> EliminatedGuards;
+
+ /// The set of guards which have been widened to include conditions to other
+ /// guards.
+ DenseSet<IntrinsicInst *> WidenedGuards;
+
+ /// Try to eliminate guard \p Guard by widening it into an earlier dominating
+ /// guard. \p DFSI is the DFS iterator on the dominator tree that is
+ /// currently visiting the block containing \p Guard, and \p GuardsPerBlock
+ /// maps BasicBlocks to the set of guards seen in that block.
+ bool eliminateGuardViaWidening(
+ IntrinsicInst *Guard, const df_iterator<DomTreeNode *> &DFSI,
+ const DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> &
+ GuardsPerBlock);
+
+ /// Used to keep track of which widening potential is more effective.
+ enum WideningScore {
+ /// Don't widen.
+ WS_IllegalOrNegative,
+
+ /// Widening is performance neutral as far as the cycles spent in check
+ /// conditions go (but can still help, e.g., code layout, having less
+ /// deopt state).
+ WS_Neutral,
+
+ /// Widening is profitable.
+ WS_Positive,
+
+ /// Widening is very profitable. Not significantly different from \c
+ /// WS_Positive, except by the order.
+ WS_VeryPositive
+ };
+
+ static StringRef scoreTypeToString(WideningScore WS);
+
+ /// Compute the score for widening the condition in \p DominatedGuard
+ /// (contained in \p DominatedGuardLoop) into \p DominatingGuard (contained in
+ /// \p DominatingGuardLoop).
+ WideningScore computeWideningScore(IntrinsicInst *DominatedGuard,
+ Loop *DominatedGuardLoop,
+ IntrinsicInst *DominatingGuard,
+ Loop *DominatingGuardLoop);
+
+ /// Helper to check if \p V can be hoisted to \p InsertPos.
+ bool isAvailableAt(Value *V, Instruction *InsertPos) {
+ SmallPtrSet<Instruction *, 8> Visited;
+ return isAvailableAt(V, InsertPos, Visited);
+ }
+
+ bool isAvailableAt(Value *V, Instruction *InsertPos,
+ SmallPtrSetImpl<Instruction *> &Visited);
+
+ /// Helper to hoist \p V to \p InsertPos. Guaranteed to succeed if \c
+ /// isAvailableAt returned true.
+ void makeAvailableAt(Value *V, Instruction *InsertPos);
+
+ /// Common helper used by \c widenGuard and \c isWideningCondProfitable. Try
+ /// to generate an expression computing the logical AND of \p Cond0 and \p
+ /// Cond1. Return true if the expression computing the AND is only as
+ /// expensive as computing one of the two. If \p InsertPt is non-null then
+ /// actually generate the resulting expression, make it available at \p
+ /// InsertPt and return it in \p Result (else no change to the IR is made).
+ bool widenCondCommon(Value *Cond0, Value *Cond1, Instruction *InsertPt,
+ Value *&Result);
+
+ /// Represents a range check of the form \c Base + \c Offset u< \c Length,
+ /// with the constraint that \c Length is not negative. \c CheckInst is the
+ /// pre-existing instruction in the IR that computes the result of this range
+ /// check.
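+ ///
+ /// For illustration (hypothetical values): the bounds check "(i + 1) u< len"
+ /// guarding an access like a[i + 1], with len known non-negative, would be
+ /// modelled as Base = i, Offset = 1, Length = len.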
+ class RangeCheck {
+ Value *Base;
+ ConstantInt *Offset;
+ Value *Length;
+ ICmpInst *CheckInst;
+
+ public:
+ explicit RangeCheck(Value *Base, ConstantInt *Offset, Value *Length,
+ ICmpInst *CheckInst)
+ : Base(Base), Offset(Offset), Length(Length), CheckInst(CheckInst) {}
+
+ void setBase(Value *NewBase) { Base = NewBase; }
+ void setOffset(ConstantInt *NewOffset) { Offset = NewOffset; }
+
+ Value *getBase() const { return Base; }
+ ConstantInt *getOffset() const { return Offset; }
+ const APInt &getOffsetValue() const { return getOffset()->getValue(); }
+ Value *getLength() const { return Length; };
+ ICmpInst *getCheckInst() const { return CheckInst; }
+
+ void print(raw_ostream &OS, bool PrintTypes = false) {
+ OS << "Base: ";
+ Base->printAsOperand(OS, PrintTypes);
+ OS << " Offset: ";
+ Offset->printAsOperand(OS, PrintTypes);
+ OS << " Length: ";
+ Length->printAsOperand(OS, PrintTypes);
+ }
+
+ LLVM_DUMP_METHOD void dump() {
+ print(dbgs());
+ dbgs() << "\n";
+ }
+ };
+
+ /// Parse \p CheckCond into a conjunction (logical-and) of range checks and
+ /// append them to \p Checks. Returns true on success; may clobber \c Checks
+ /// on failure.
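+ ///
+ /// For illustration (hypothetical values): a condition of the form
+ /// ((i + 1) u< len) & (j u< len) is parsed into two RangeChecks, one for
+ /// each conjunct of the logical-and.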
+ bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks) {
+ SmallPtrSet<Value *, 8> Visited;
+ return parseRangeChecks(CheckCond, Checks, Visited);
+ }
+
+ bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited);
+
+ /// Combine the checks in \p Checks into a smaller set of checks and append
+ /// them into \p CombinedChecks. Return true on success (i.e. all of the
+ /// checks in \p Checks were combined into \p CombinedChecks). Clobbers
+ /// \p Checks and \p CombinedChecks on success and on failure.
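+ ///
+ /// For illustration (hypothetical values): three checks "i u< len",
+ /// "i+1 u< len" and "i+2 u< len" share a base and length, so only the
+ /// outermost pair, "i u< len" and "i+2 u< len", needs to be kept.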
+ bool combineRangeChecks(SmallVectorImpl<RangeCheck> &Checks,
+ SmallVectorImpl<RangeCheck> &CombinedChecks);
+
+ /// Can we compute the logical AND of \p Cond0 and \p Cond1 for the price of
+ /// computing only one of the two expressions?
+ bool isWideningCondProfitable(Value *Cond0, Value *Cond1) {
+ Value *ResultUnused;
+ return widenCondCommon(Cond0, Cond1, /*InsertPt=*/nullptr, ResultUnused);
+ }
+
+ /// Widen \p ToWiden to fail if \p NewCondition is false (in addition to
+ /// whatever it is already checking).
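+ ///
+ /// For illustration (hypothetical IR): widening guard(%cmp0) with %cmp1
+ /// yields guard(%cmp0 & %cmp1), or a single combined check when
+ /// \c widenCondCommon can fold the two conditions into one.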
+ void widenGuard(IntrinsicInst *ToWiden, Value *NewCondition) {
+ Value *Result;
+ widenCondCommon(ToWiden->getArgOperand(0), NewCondition, ToWiden, Result);
+ ToWiden->setArgOperand(0, Result);
+ }
+
+public:
+ explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree &PDT,
+ LoopInfo &LI)
+ : DT(DT), PDT(PDT), LI(LI) {}
+
+ /// The entry point for this pass.
+ bool run();
+};
+
+struct GuardWideningLegacyPass : public FunctionPass {
+ static char ID;
+ GuardWideningPass Impl;
+
+ GuardWideningLegacyPass() : FunctionPass(ID) {
+ initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ return GuardWideningImpl(
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(),
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo()).run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ }
+};
+
+}
+
+bool GuardWideningImpl::run() {
+ using namespace llvm::PatternMatch;
+
+ DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> GuardsInBlock;
+ bool Changed = false;
+
+ for (auto DFI = df_begin(DT.getRootNode()), DFE = df_end(DT.getRootNode());
+ DFI != DFE; ++DFI) {
+ auto *BB = (*DFI)->getBlock();
+ auto &CurrentList = GuardsInBlock[BB];
+
+ for (auto &I : *BB)
+ if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>()))
+ CurrentList.push_back(cast<IntrinsicInst>(&I));
+
+ for (auto *II : CurrentList)
+ Changed |= eliminateGuardViaWidening(II, DFI, GuardsInBlock);
+ }
+
+ for (auto *II : EliminatedGuards)
+ if (!WidenedGuards.count(II))
+ II->eraseFromParent();
+
+ return Changed;
+}
+
+bool GuardWideningImpl::eliminateGuardViaWidening(
+ IntrinsicInst *GuardInst, const df_iterator<DomTreeNode *> &DFSI,
+ const DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> &
+ GuardsInBlock) {
+ IntrinsicInst *BestSoFar = nullptr;
+ auto BestScoreSoFar = WS_IllegalOrNegative;
+ auto *GuardInstLoop = LI.getLoopFor(GuardInst->getParent());
+
+ // In the set of dominating guards, find the one we can merge GuardInst with
+ // for the most profit.
+ for (unsigned i = 0, e = DFSI.getPathLength(); i != e; ++i) {
+ auto *CurBB = DFSI.getPath(i)->getBlock();
+ auto *CurLoop = LI.getLoopFor(CurBB);
+ assert(GuardsInBlock.count(CurBB) && "Must have been populated by now!");
+ const auto &GuardsInCurBB = GuardsInBlock.find(CurBB)->second;
+
+ auto I = GuardsInCurBB.begin();
+ auto E = GuardsInCurBB.end();
+
+#ifndef NDEBUG
+ {
+ unsigned Index = 0;
+ for (auto &I : *CurBB) {
+ if (Index == GuardsInCurBB.size())
+ break;
+ if (GuardsInCurBB[Index] == &I)
+ Index++;
+ }
+ assert(Index == GuardsInCurBB.size() &&
+ "Guards expected to be in order!");
+ }
+#endif
+
+ assert((i == (e - 1)) == (GuardInst->getParent() == CurBB) && "Bad DFS?");
+
+ if (i == (e - 1)) {
+ // Corner case: make sure we're only looking at guards strictly dominating
+ // GuardInst when visiting GuardInst->getParent().
+ auto NewEnd = std::find(I, E, GuardInst);
+ assert(NewEnd != E && "GuardInst not in its own block?");
+ E = NewEnd;
+ }
+
+ for (auto *Candidate : make_range(I, E)) {
+ auto Score =
+ computeWideningScore(GuardInst, GuardInstLoop, Candidate, CurLoop);
+ DEBUG(dbgs() << "Score between " << *GuardInst->getArgOperand(0)
+ << " and " << *Candidate->getArgOperand(0) << " is "
+ << scoreTypeToString(Score) << "\n");
+ if (Score > BestScoreSoFar) {
+ BestScoreSoFar = Score;
+ BestSoFar = Candidate;
+ }
+ }
+ }
+
+ if (BestScoreSoFar == WS_IllegalOrNegative) {
+ DEBUG(dbgs() << "Did not eliminate guard " << *GuardInst << "\n");
+ return false;
+ }
+
+ assert(BestSoFar != GuardInst && "Should have never visited same guard!");
+ assert(DT.dominates(BestSoFar, GuardInst) && "Should be!");
+
+ DEBUG(dbgs() << "Widening " << *GuardInst << " into " << *BestSoFar
+ << " with score " << scoreTypeToString(BestScoreSoFar) << "\n");
+ widenGuard(BestSoFar, GuardInst->getArgOperand(0));
+ GuardInst->setArgOperand(0, ConstantInt::getTrue(GuardInst->getContext()));
+ EliminatedGuards.push_back(GuardInst);
+ WidenedGuards.insert(BestSoFar);
+ return true;
+}
+
+GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore(
+ IntrinsicInst *DominatedGuard, Loop *DominatedGuardLoop,
+ IntrinsicInst *DominatingGuard, Loop *DominatingGuardLoop) {
+ bool HoistingOutOfLoop = false;
+
+ if (DominatingGuardLoop != DominatedGuardLoop) {
+ if (DominatingGuardLoop &&
+ !DominatingGuardLoop->contains(DominatedGuardLoop))
+ return WS_IllegalOrNegative;
+
+ HoistingOutOfLoop = true;
+ }
+
+ if (!isAvailableAt(DominatedGuard->getArgOperand(0), DominatingGuard))
+ return WS_IllegalOrNegative;
+
+ bool HoistingOutOfIf =
+ !PDT.dominates(DominatedGuard->getParent(), DominatingGuard->getParent());
+
+ if (isWideningCondProfitable(DominatedGuard->getArgOperand(0),
+ DominatingGuard->getArgOperand(0)))
+ return HoistingOutOfLoop ? WS_VeryPositive : WS_Positive;
+
+ if (HoistingOutOfLoop)
+ return WS_Positive;
+
+ return HoistingOutOfIf ? WS_IllegalOrNegative : WS_Neutral;
+}
+
+bool GuardWideningImpl::isAvailableAt(Value *V, Instruction *Loc,
+ SmallPtrSetImpl<Instruction *> &Visited) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || DT.dominates(Inst, Loc) || Visited.count(Inst))
+ return true;
+
+ if (!isSafeToSpeculativelyExecute(Inst, Loc, &DT) ||
+ Inst->mayReadFromMemory())
+ return false;
+
+ Visited.insert(Inst);
+
+ // We only want to go _up_ the dominance chain when recursing.
+ assert(!isa<PHINode>(Loc) &&
+ "PHIs should return false for isSafeToSpeculativelyExecute");
+ assert(DT.isReachableFromEntry(Inst->getParent()) &&
+ "We did a DFS from the block entry!");
+ return all_of(Inst->operands(),
+ [&](Value *Op) { return isAvailableAt(Op, Loc, Visited); });
+}
+
+void GuardWideningImpl::makeAvailableAt(Value *V, Instruction *Loc) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || DT.dominates(Inst, Loc))
+ return;
+
+ assert(isSafeToSpeculativelyExecute(Inst, Loc, &DT) &&
+ !Inst->mayReadFromMemory() && "Should've checked with isAvailableAt!");
+
+ for (Value *Op : Inst->operands())
+ makeAvailableAt(Op, Loc);
+
+ Inst->moveBefore(Loc);
+}
+
+bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1,
+ Instruction *InsertPt, Value *&Result) {
+ using namespace llvm::PatternMatch;
+
+ {
+ // L >u C0 && L >u C1 -> L >u max(C0, C1)
+ ConstantInt *RHS0, *RHS1;
+ Value *LHS;
+ ICmpInst::Predicate Pred0, Pred1;
+ if (match(Cond0, m_ICmp(Pred0, m_Value(LHS), m_ConstantInt(RHS0))) &&
+ match(Cond1, m_ICmp(Pred1, m_Specific(LHS), m_ConstantInt(RHS1)))) {
+
+ ConstantRange CR0 =
+ ConstantRange::makeExactICmpRegion(Pred0, RHS0->getValue());
+ ConstantRange CR1 =
+ ConstantRange::makeExactICmpRegion(Pred1, RHS1->getValue());
+
+ // SubsetIntersect is a subset of the actual mathematical intersection of
+ // CR0 and CR1, while SupersetIntersect is a superset of the actual
+ // mathematical intersection. If these two ConstantRanges are equal, then
+ // we know we were able to represent the actual mathematical intersection
+ // of CR0 and CR1, and can use the same to generate an icmp instruction.
+ //
+ // Given what we're doing here and the semantics of guards, it would
+ // actually be correct to just use SubsetIntersect, but that may be too
+ // aggressive in cases we care about.
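+ //
+ // For illustration (hypothetical values): for "x u< 10" and "x u< 20",
+ // CR0 = [0, 10) and CR1 = [0, 20); both intersections equal [0, 10), so the
+ // two conditions fold into the single check "x u< 10".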
+ auto SubsetIntersect = CR0.inverse().unionWith(CR1.inverse()).inverse();
+ auto SupersetIntersect = CR0.intersectWith(CR1);
+
+ APInt NewRHSAP;
+ CmpInst::Predicate Pred;
+ if (SubsetIntersect == SupersetIntersect &&
+ SubsetIntersect.getEquivalentICmp(Pred, NewRHSAP)) {
+ if (InsertPt) {
+ ConstantInt *NewRHS = ConstantInt::get(Cond0->getContext(), NewRHSAP);
+ Result = new ICmpInst(InsertPt, Pred, LHS, NewRHS, "wide.chk");
+ }
+ return true;
+ }
+ }
+ }
+
+ {
+ SmallVector<GuardWideningImpl::RangeCheck, 4> Checks, CombinedChecks;
+ if (parseRangeChecks(Cond0, Checks) && parseRangeChecks(Cond1, Checks) &&
+ combineRangeChecks(Checks, CombinedChecks)) {
+ if (InsertPt) {
+ Result = nullptr;
+ for (auto &RC : CombinedChecks) {
+ makeAvailableAt(RC.getCheckInst(), InsertPt);
+ if (Result)
+ Result = BinaryOperator::CreateAnd(RC.getCheckInst(), Result, "",
+ InsertPt);
+ else
+ Result = RC.getCheckInst();
+ }
+
+ Result->setName("wide.chk");
+ }
+ return true;
+ }
+ }
+
+ // Base case -- just logical-and the two conditions together.
+
+ if (InsertPt) {
+ makeAvailableAt(Cond0, InsertPt);
+ makeAvailableAt(Cond1, InsertPt);
+
+ Result = BinaryOperator::CreateAnd(Cond0, Cond1, "wide.chk", InsertPt);
+ }
+
+ // We were not able to compute Cond0 AND Cond1 for the price of one.
+ return false;
+}
+
+bool GuardWideningImpl::parseRangeChecks(
+ Value *CheckCond, SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited) {
+ if (!Visited.insert(CheckCond).second)
+ return true;
+
+ using namespace llvm::PatternMatch;
+
+ {
+ Value *AndLHS, *AndRHS;
+ if (match(CheckCond, m_And(m_Value(AndLHS), m_Value(AndRHS))))
+ return parseRangeChecks(AndLHS, Checks) &&
+ parseRangeChecks(AndRHS, Checks);
+ }
+
+ auto *IC = dyn_cast<ICmpInst>(CheckCond);
+ if (!IC || !IC->getOperand(0)->getType()->isIntegerTy() ||
+ (IC->getPredicate() != ICmpInst::ICMP_ULT &&
+ IC->getPredicate() != ICmpInst::ICMP_UGT))
+ return false;
+
+ Value *CmpLHS = IC->getOperand(0), *CmpRHS = IC->getOperand(1);
+ if (IC->getPredicate() == ICmpInst::ICMP_UGT)
+ std::swap(CmpLHS, CmpRHS);
+
+ auto &DL = IC->getModule()->getDataLayout();
+
+ GuardWideningImpl::RangeCheck Check(
+ CmpLHS, cast<ConstantInt>(ConstantInt::getNullValue(CmpRHS->getType())),
+ CmpRHS, IC);
+
+ if (!isKnownNonNegative(Check.getLength(), DL))
+ return false;
+
+ // What we have in \c Check now is a correct interpretation of \p CheckCond.
+ // Try to see if we can move some constant offsets into the \c Offset field.
+
+ bool Changed;
+ auto &Ctx = CheckCond->getContext();
+
+ do {
+ Value *OpLHS;
+ ConstantInt *OpRHS;
+ Changed = false;
+
+#ifndef NDEBUG
+ auto *BaseInst = dyn_cast<Instruction>(Check.getBase());
+ assert((!BaseInst || DT.isReachableFromEntry(BaseInst->getParent())) &&
+ "Unreachable instruction?");
+#endif
+
+ if (match(Check.getBase(), m_Add(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
+ Check.setBase(OpLHS);
+ APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
+ Check.setOffset(ConstantInt::get(Ctx, NewOffset));
+ Changed = true;
+ } else if (match(Check.getBase(),
+ m_Or(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
+ KnownBits Known = computeKnownBits(OpLHS, DL);
+ if ((OpRHS->getValue() & Known.Zero) == OpRHS->getValue()) {
+ Check.setBase(OpLHS);
+ APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
+ Check.setOffset(ConstantInt::get(Ctx, NewOffset));
+ Changed = true;
+ }
+ }
+ } while (Changed);
+
+ Checks.push_back(Check);
+ return true;
+}
+
+bool GuardWideningImpl::combineRangeChecks(
+ SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks,
+ SmallVectorImpl<GuardWideningImpl::RangeCheck> &RangeChecksOut) {
+ unsigned OldCount = Checks.size();
+ while (!Checks.empty()) {
+ // Pick all of the range checks with a specific base and length, and try to
+ // merge them.
+ Value *CurrentBase = Checks.front().getBase();
+ Value *CurrentLength = Checks.front().getLength();
+
+ SmallVector<GuardWideningImpl::RangeCheck, 3> CurrentChecks;
+
+ auto IsCurrentCheck = [&](GuardWideningImpl::RangeCheck &RC) {
+ return RC.getBase() == CurrentBase && RC.getLength() == CurrentLength;
+ };
+
+ copy_if(Checks, std::back_inserter(CurrentChecks), IsCurrentCheck);
+ Checks.erase(remove_if(Checks, IsCurrentCheck), Checks.end());
+
+ assert(CurrentChecks.size() != 0 && "We know we have at least one!");
+
+ if (CurrentChecks.size() < 3) {
+ RangeChecksOut.insert(RangeChecksOut.end(), CurrentChecks.begin(),
+ CurrentChecks.end());
+ continue;
+ }
+
+ // CurrentChecks.size() will typically be 3 here, but so far there has been
+ // no need to hard-code that fact.
+
+ std::sort(CurrentChecks.begin(), CurrentChecks.end(),
+ [&](const GuardWideningImpl::RangeCheck &LHS,
+ const GuardWideningImpl::RangeCheck &RHS) {
+ return LHS.getOffsetValue().slt(RHS.getOffsetValue());
+ });
+
+ // Note: std::sort does not invalidate iterators into CurrentChecks.
+
+ ConstantInt *MinOffset = CurrentChecks.front().getOffset(),
+ *MaxOffset = CurrentChecks.back().getOffset();
+
+ unsigned BitWidth = MaxOffset->getValue().getBitWidth();
+ if ((MaxOffset->getValue() - MinOffset->getValue())
+ .ugt(APInt::getSignedMinValue(BitWidth)))
+ return false;
+
+ APInt MaxDiff = MaxOffset->getValue() - MinOffset->getValue();
+ const APInt &HighOffset = MaxOffset->getValue();
+ auto OffsetOK = [&](const GuardWideningImpl::RangeCheck &RC) {
+ return (HighOffset - RC.getOffsetValue()).ult(MaxDiff);
+ };
+
+ if (MaxDiff.isMinValue() ||
+ !std::all_of(std::next(CurrentChecks.begin()), CurrentChecks.end(),
+ OffsetOK))
+ return false;
+
+ // We have a series of f+1 checks as:
+ //
+ // I+k_0 u< L ... Chk_0
+ // I+k_1 u< L ... Chk_1
+ // ...
+ // I+k_f u< L ... Chk_f
+ //
+ // with forall i in [0,f]: k_f-k_i u< k_f-k_0 ... Precond_0
+ // k_f-k_0 u< INT_MIN+k_f ... Precond_1
+ // k_f != k_0 ... Precond_2
+ //
+ // Claim:
+ // Chk_0 AND Chk_f implies all the other checks
+ //
+ // Informal proof sketch:
+ //
+ // We will show that the integer range [I+k_0,I+k_f] does not unsigned-wrap
+ // (i.e. going from I+k_0 to I+k_f does not cross the -1,0 boundary) and
+ // thus I+k_f is the greatest unsigned value in that range.
+ //
+ // This combined with Chk_f shows that everything in that range is u< L.
+ // Via Precond_0 we know that all of the indices in Chk_0 through Chk_f
+ // lie in [I+k_0,I+k_f], thus proving our claim.
+ //
+ // To see that [I+k_0,I+k_f] is not a wrapping range, note that there are
+ // two possibilities: I+k_0 u< I+k_f or I+k_0 >u I+k_f (they can't be equal
+ // since k_0 != k_f). In the former case, [I+k_0,I+k_f] is not a wrapping
+ // range by definition, and the latter case is impossible:
+ //
+ // 0-----I+k_f---I+k_0----L---INT_MAX,INT_MIN------------------(-1)
+ // xxxxxx        xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+ //
+ // For Chk_0 to succeed, we'd need k_f-k_0 (the range highlighted
+ // with 'x' above) to be at least >u INT_MIN.
+
+ RangeChecksOut.emplace_back(CurrentChecks.front());
+ RangeChecksOut.emplace_back(CurrentChecks.back());
+ }
+
+ assert(RangeChecksOut.size() <= OldCount && "We pessimized!");
+ return RangeChecksOut.size() != OldCount;
+}
+
+PreservedAnalyses GuardWideningPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ if (!GuardWideningImpl(DT, PDT, LI).run())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+#ifndef NDEBUG
+StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
+ switch (WS) {
+ case WS_IllegalOrNegative:
+ return "IllegalOrNegative";
+ case WS_Neutral:
+ return "Neutral";
+ case WS_Positive:
+ return "Positive";
+ case WS_VeryPositive:
+ return "VeryPositive";
+ }
+
+ llvm_unreachable("Fully covered switch above!");
+}
+#endif
+
+char GuardWideningLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards",
+ false, false)
+
+FunctionPass *llvm::createGuardWideningPass() {
+ return new GuardWideningLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp b/contrib/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp
new file mode 100644
index 000000000000..807593379283
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp
@@ -0,0 +1,22 @@
+//===- IVUsersPrinter.cpp - Induction Variable Users Printer ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/IVUsersPrinter.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "iv-users"
+
+PreservedAnalyses IVUsersPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ AM.getResult<IVUsersAnalysis>(L, AR).print(OS);
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
new file mode 100644
index 000000000000..221fe57581ca
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -0,0 +1,2597 @@
+//===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into simpler forms suitable for subsequent
+// analysis and transformation.
+//
+// If the trip count of a loop is computable, this pass also makes the following
+// changes:
+// 1. The exit condition for the loop is canonicalized to compare the
+// induction value against the exit value. This turns loops like:
+// 'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)'
+// 2. Any use outside of the loop of an expression derived from the indvar
+// is changed to compute the derived value outside of the loop, eliminating
+// the dependence on the exit value of the induction variable. If the only
+// purpose of the loop is to compute the exit value of some derived
+// expression, this transformation will make the loop dead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/IndVarSimplify.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "indvars"
+
+STATISTIC(NumWidened , "Number of indvars widened");
+STATISTIC(NumReplaced , "Number of exit values replaced");
+STATISTIC(NumLFTR , "Number of loop exit tests replaced");
+STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated");
+STATISTIC(NumElimIV , "Number of congruent IVs eliminated");
+
+// Trip count verification can be enabled by default under NDEBUG if we
+// implement a strong expression equivalence checker in SCEV. Until then, we
+// use the verify-indvars flag, which may assert in some cases.
+static cl::opt<bool> VerifyIndvars(
+ "verify-indvars", cl::Hidden,
+ cl::desc("Verify the ScalarEvolution result after running indvars"));
+
+enum ReplaceExitVal { NeverRepl, OnlyCheapRepl, AlwaysRepl };
+
+static cl::opt<ReplaceExitVal> ReplaceExitValue(
+ "replexitval", cl::Hidden, cl::init(OnlyCheapRepl),
+ cl::desc("Choose the strategy to replace exit value in IndVarSimplify"),
+ cl::values(clEnumValN(NeverRepl, "never", "never replace exit value"),
+ clEnumValN(OnlyCheapRepl, "cheap",
+ "only replace exit value when the cost is cheap"),
+ clEnumValN(AlwaysRepl, "always",
+ "always replace exit value whenever possible")));
+
+static cl::opt<bool> UsePostIncrementRanges(
+ "indvars-post-increment-ranges", cl::Hidden,
+ cl::desc("Use post increment control-dependent ranges in IndVarSimplify"),
+ cl::init(true));
+
+static cl::opt<bool>
+DisableLFTR("disable-lftr", cl::Hidden, cl::init(false),
+ cl::desc("Disable Linear Function Test Replace optimization"));
+
+namespace {
+
+struct RewritePhi;
+
+class IndVarSimplify {
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ DominatorTree *DT;
+ const DataLayout &DL;
+ TargetLibraryInfo *TLI;
+ const TargetTransformInfo *TTI;
+
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ bool Changed = false;
+
+ bool isValidRewrite(Value *FromVal, Value *ToVal);
+
+ void handleFloatingPointIV(Loop *L, PHINode *PH);
+ void rewriteNonIntegerIVs(Loop *L);
+
+ void simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI);
+
+ bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet);
+ void rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);
+ void rewriteFirstIterationLoopExitValues(Loop *L);
+
+ Value *linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount,
+ PHINode *IndVar, SCEVExpander &Rewriter);
+
+ void sinkUnusedInvariants(Loop *L);
+
+ Value *expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L,
+ Instruction *InsertPt, Type *Ty);
+
+public:
+ IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+ const DataLayout &DL, TargetLibraryInfo *TLI,
+ TargetTransformInfo *TTI)
+ : LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI) {}
+
+ bool run(Loop *L);
+};
+
+} // end anonymous namespace
+
+/// Return true if the SCEV expansion generated by the rewriter can replace the
+/// original value. SCEV guarantees that it produces the same value, but the way
+/// it is produced may be illegal IR. Ideally, this function will only be
+/// called for verification.
+bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) {
+ // If an SCEV expression subsumed multiple pointers, its expansion could
+ // reassociate the GEP changing the base pointer. This is illegal because the
+ // final address produced by a GEP chain must be inbounds relative to its
+ // underlying object. Otherwise basic alias analysis, among other things,
+ // could fail in a dangerous way. Ultimately, SCEV will be improved to avoid
+ // producing an expression involving multiple pointers. Until then, we must
+ // bail out here.
+ //
+ // Retrieve the pointer operand of the GEP. Don't use GetUnderlyingObject
+ // because it understands lcssa phis while SCEV does not.
+ Value *FromPtr = FromVal;
+ Value *ToPtr = ToVal;
+ if (auto *GEP = dyn_cast<GEPOperator>(FromVal)) {
+ FromPtr = GEP->getPointerOperand();
+ }
+ if (auto *GEP = dyn_cast<GEPOperator>(ToVal)) {
+ ToPtr = GEP->getPointerOperand();
+ }
+ if (FromPtr != FromVal || ToPtr != ToVal) {
+ // Quickly check the common case
+ if (FromPtr == ToPtr)
+ return true;
+
+ // SCEV may have rewritten an expression that produces the GEP's pointer
+ // operand. That's ok as long as the pointer operand has the same base
+ // pointer. Unlike GetUnderlyingObject(), getPointerBase() will find the
+ // base of a recurrence. This handles the case in which SCEV expansion
+ // converts a pointer type recurrence into a nonrecurrent pointer base
+ // indexed by an integer recurrence.
+
+ // If the GEP base pointer is a vector of pointers, abort.
+ if (!FromPtr->getType()->isPointerTy() || !ToPtr->getType()->isPointerTy())
+ return false;
+
+ const SCEV *FromBase = SE->getPointerBase(SE->getSCEV(FromPtr));
+ const SCEV *ToBase = SE->getPointerBase(SE->getSCEV(ToPtr));
+ if (FromBase == ToBase)
+ return true;
+
+ DEBUG(dbgs() << "INDVARS: GEP rewrite bail out "
+ << *FromBase << " != " << *ToBase << "\n");
+
+ return false;
+ }
+ return true;
+}
+
+/// Determine the insertion point for this user. By default, insert immediately
+/// before the user. SCEVExpander or LICM will hoist loop invariants out of the
+/// loop. For PHI nodes, there may be multiple uses, so compute the nearest
+/// common dominator for the incoming blocks.
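+///
+/// For illustration (hypothetical blocks): if \p Def reaches the phi through
+/// two incoming blocks %a and %b, the chosen insertion point is the terminator
+/// of the nearest common dominator of %a and %b rather than a point directly
+/// before the phi itself.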
+static Instruction *getInsertPointForUses(Instruction *User, Value *Def,
+ DominatorTree *DT, LoopInfo *LI) {
+ PHINode *PHI = dyn_cast<PHINode>(User);
+ if (!PHI)
+ return User;
+
+ Instruction *InsertPt = nullptr;
+ for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) {
+ if (PHI->getIncomingValue(i) != Def)
+ continue;
+
+ BasicBlock *InsertBB = PHI->getIncomingBlock(i);
+ if (!InsertPt) {
+ InsertPt = InsertBB->getTerminator();
+ continue;
+ }
+ InsertBB = DT->findNearestCommonDominator(InsertPt->getParent(), InsertBB);
+ InsertPt = InsertBB->getTerminator();
+ }
+ assert(InsertPt && "Missing phi operand");
+
+ auto *DefI = dyn_cast<Instruction>(Def);
+ if (!DefI)
+ return InsertPt;
+
+ assert(DT->dominates(DefI, InsertPt) && "def does not dominate all uses");
+
+ auto *L = LI->getLoopFor(DefI->getParent());
+ assert(!L || L->contains(LI->getLoopFor(InsertPt->getParent())));
+
+ for (auto *DTN = (*DT)[InsertPt->getParent()]; DTN; DTN = DTN->getIDom())
+ if (LI->getLoopFor(DTN->getBlock()) == L)
+ return DTN->getBlock()->getTerminator();
+
+ llvm_unreachable("DefI dominates InsertPt!");
+}
+
+//===----------------------------------------------------------------------===//
+// rewriteNonIntegerIVs and helpers. Prefer integer IVs.
+//===----------------------------------------------------------------------===//
+
+/// Convert APF to an integer, if possible.
+static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
+ bool isExact = false;
+ // See if we can convert this to an int64_t
+ uint64_t UIntVal;
+ if (APF.convertToInteger(makeMutableArrayRef(UIntVal), 64, true,
+ APFloat::rmTowardZero, &isExact) != APFloat::opOK ||
+ !isExact)
+ return false;
+ IntVal = UIntVal;
+ return true;
+}
+
+/// If the loop has a floating-point induction variable, then insert a
+/// corresponding integer induction variable if possible.
+/// For example,
+/// for(double i = 0; i < 10000; ++i)
+/// bar(i)
+/// is converted into
+/// for(int i = 0; i < 10000; ++i)
+/// bar((double)i);
+void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
+ unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0));
+ unsigned BackEdge = IncomingEdge^1;
+
+ // Check incoming value.
+ auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge));
+
+ int64_t InitValue;
+ if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue))
+ return;
+
+ // Check IV increment. Reject this PN if the increment operation is not
+ // an add or the increment value cannot be represented by an integer.
+ auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge));
+ if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return;
+
+ // If this is not an add of the PHI with a constantfp, or if the constant fp
+ // is not an integer, bail out.
+ ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1));
+ int64_t IncValue;
+ if (IncValueVal == nullptr || Incr->getOperand(0) != PN ||
+ !ConvertToSInt(IncValueVal->getValueAPF(), IncValue))
+ return;
+
+ // Check Incr uses. One user is PN and the other user is an exit condition
+ // used by the conditional terminator.
+ Value::user_iterator IncrUse = Incr->user_begin();
+ Instruction *U1 = cast<Instruction>(*IncrUse++);
+ if (IncrUse == Incr->user_end()) return;
+ Instruction *U2 = cast<Instruction>(*IncrUse++);
+ if (IncrUse != Incr->user_end()) return;
+
+ // Find exit condition, which is an fcmp. If it doesn't exist, or if it isn't
+ // only used by a branch, we can't transform it.
+ FCmpInst *Compare = dyn_cast<FCmpInst>(U1);
+ if (!Compare)
+ Compare = dyn_cast<FCmpInst>(U2);
+ if (!Compare || !Compare->hasOneUse() ||
+ !isa<BranchInst>(Compare->user_back()))
+ return;
+
+ BranchInst *TheBr = cast<BranchInst>(Compare->user_back());
+
+ // We need to verify that the branch actually controls the iteration count
+ // of the loop. If not, the new IV can overflow and no one will notice.
+ // The branch block must be in the loop and one of the successors must be out
+ // of the loop.
+ assert(TheBr->isConditional() && "Can't use fcmp if not conditional");
+ if (!L->contains(TheBr->getParent()) ||
+ (L->contains(TheBr->getSuccessor(0)) &&
+ L->contains(TheBr->getSuccessor(1))))
+ return;
+
+ // If it isn't a comparison with an integer-as-fp (the exit value), we can't
+ // transform it.
+ ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1));
+ int64_t ExitValue;
+ if (ExitValueVal == nullptr ||
+ !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue))
+ return;
+
+ // Find new predicate for integer comparison.
+ CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE;
+ switch (Compare->getPredicate()) {
+ default: return; // Unknown comparison.
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break;
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break;
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break;
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break;
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break;
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break;
+ }
+
+ // We convert the floating point induction variable to a signed i32 value if
+ // we can. This is only safe if the comparison will not overflow in a way
+ // that won't be trapped by the integer equivalent operations. Check for this
+ // now.
+ // TODO: We could use i64 if it is native and the range requires it.
+
+ // The start/stride/exit values must all fit in signed i32.
+ if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue))
+ return;
+
+ // If not actually striding (add x, 0.0), avoid touching the code.
+ if (IncValue == 0)
+ return;
+
+ // Positive and negative strides have different safety conditions.
+ if (IncValue > 0) {
+ // If we have a positive stride, we require the init to be less than the
+ // exit value.
+ if (InitValue >= ExitValue)
+ return;
+
+ uint32_t Range = uint32_t(ExitValue-InitValue);
+ // Check for infinite loop, either:
+ // while (i <= Exit) or until (i > Exit)
+ if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) {
+ if (++Range == 0) return; // Range overflows.
+ }
+
+ unsigned Leftover = Range % uint32_t(IncValue);
+
+ // If this is an equality comparison, we require that the strided value
+ // exactly land on the exit value, otherwise the IV condition will wrap
+ // around and do things the fp IV wouldn't.
+ if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
+ Leftover != 0)
+ return;
+
+ // If the stride would wrap around the i32 before exiting, we can't
+ // transform the IV.
+ if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue)
+ return;
+ } else {
+ // If we have a negative stride, we require the init to be greater than the
+ // exit value.
+ if (InitValue <= ExitValue)
+ return;
+
+ uint32_t Range = uint32_t(InitValue-ExitValue);
+ // Check for infinite loop, either:
+ // while (i >= Exit) or until (i < Exit)
+ if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) {
+ if (++Range == 0) return; // Range overflows.
+ }
+
+ unsigned Leftover = Range % uint32_t(-IncValue);
+
+ // If this is an equality comparison, we require that the strided value
+ // exactly land on the exit value, otherwise the IV condition will wrap
+ // around and do things the fp IV wouldn't.
+ if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
+ Leftover != 0)
+ return;
+
+ // If the stride would wrap around the i32 before exiting, we can't
+ // transform the IV.
+ if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue)
+ return;
+ }
+
+ IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext());
+
+ // Insert new integer induction variable.
+ PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN);
+ NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue),
+ PN->getIncomingBlock(IncomingEdge));
+
+ Value *NewAdd =
+ BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue),
+ Incr->getName()+".int", Incr);
+ NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge));
+
+ ICmpInst *NewCompare = new ICmpInst(TheBr, NewPred, NewAdd,
+ ConstantInt::get(Int32Ty, ExitValue),
+ Compare->getName());
+
+ // In the following deletions, PN may become dead and may be deleted.
+ // Use a WeakTrackingVH to observe whether this happens.
+ WeakTrackingVH WeakPH = PN;
+
+ // Delete the old floating point exit comparison. The branch starts using the
+ // new comparison.
+ NewCompare->takeName(Compare);
+ Compare->replaceAllUsesWith(NewCompare);
+ RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI);
+
+ // Delete the old floating point increment.
+ Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
+ RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI);
+
+ // If the FP induction variable still has uses, this is because something else
+ // in the loop uses its value. In order to canonicalize the induction
+ // variable, we chose to eliminate the IV and rewrite it in terms of an
+ // int->fp cast.
+ //
+ // We give preference to sitofp over uitofp because it is faster on most
+ // platforms.
+ if (WeakPH) {
+ Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
+ &*PN->getParent()->getFirstInsertionPt());
+ PN->replaceAllUsesWith(Conv);
+ RecursivelyDeleteTriviallyDeadInstructions(PN, TLI);
+ }
+ Changed = true;
+}
+
+void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
+ // First step. Check to see if there are any floating-point recurrences.
+ // If there are, change them into integer recurrences, permitting analysis by
+ // the SCEV routines.
+ BasicBlock *Header = L->getHeader();
+
+ SmallVector<WeakTrackingVH, 8> PHIs;
+ for (PHINode &PN : Header->phis())
+ PHIs.push_back(&PN);
+
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+ if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i]))
+ handleFloatingPointIV(L, PN);
+
+ // If the loop previously had a floating-point IV, ScalarEvolution
+ // may not have been able to compute a trip count. Now that we've done some
+ // re-writing, the trip count may be computable.
+ if (Changed)
+ SE->forgetLoop(L);
+}
+
+namespace {
+
+// Collect information about PHI nodes which can be transformed in
+// rewriteLoopExitValues.
+struct RewritePhi {
+ PHINode *PN;
+
+ // Ith incoming value.
+ unsigned Ith;
+
+ // Exit value after expansion.
+ Value *Val;
+
+ // Whether expanding the exit value is high cost.
+ bool HighCost;
+
+ RewritePhi(PHINode *P, unsigned I, Value *V, bool H)
+ : PN(P), Ith(I), Val(V), HighCost(H) {}
+};
+
+} // end anonymous namespace
+
+Value *IndVarSimplify::expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S,
+ Loop *L, Instruction *InsertPt,
+ Type *ResultTy) {
+ // Before expanding S into an expensive LLVM expression, see if we can use an
+ // already existing value as the expansion for S.
+ if (Value *ExistingValue = Rewriter.getExactExistingExpansion(S, InsertPt, L))
+ if (ExistingValue->getType() == ResultTy)
+ return ExistingValue;
+
+ // We didn't find anything, fall back to using SCEVExpander.
+ return Rewriter.expandCodeFor(S, ResultTy, InsertPt);
+}
+
+//===----------------------------------------------------------------------===//
+// rewriteLoopExitValues - Optimize IV users outside the loop.
+// As a side effect, reduces the amount of IV processing within the loop.
+//===----------------------------------------------------------------------===//
+
+/// Check to see if this loop has a computable loop-invariant execution count.
+/// If so, this means that we can compute the final value of any expressions
+/// that are recurrent in the loop, and substitute the exit values from the loop
+/// into any instructions outside of the loop that use the final values of the
+/// current expressions.
+///
+/// This is mostly redundant with the regular IndVarSimplify activities that
+/// happen later, except that it's more powerful in some cases, because it's
+/// able to brute-force evaluate arbitrary instructions as long as they have
+/// constant operands at the beginning of the loop.
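+///
+/// For illustration (hypothetical source): in
+///   for (sum = 0, i = 0; i != 100; ++i) sum += 2;
+///   use(sum);
+/// the LCSSA phi for 'sum' in the exit block can be rewritten to the constant
+/// 200, removing the out-of-loop use of the in-loop computation.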
+void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
+ // Check a pre-condition.
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Indvars did not preserve LCSSA!");
+
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ SmallVector<RewritePhi, 8> RewritePhiSet;
+ // Find all values that are computed inside the loop, but used outside of it.
+ // Because of LCSSA, these values will only occur in LCSSA PHI Nodes. Scan
+ // the exit blocks of the loop to find them.
+ for (BasicBlock *ExitBB : ExitBlocks) {
+ // If there are no PHI nodes in this exit block, then no values defined
+ // inside the loop are used on this path, skip it.
+ PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
+ if (!PN) continue;
+
+ unsigned NumPreds = PN->getNumIncomingValues();
+
+ // Iterate over all of the PHI nodes.
+ BasicBlock::iterator BBI = ExitBB->begin();
+ while ((PN = dyn_cast<PHINode>(BBI++))) {
+ if (PN->use_empty())
+ continue; // dead use, don't replace it
+
+ if (!SE->isSCEVable(PN->getType()))
+ continue;
+
+ // It's necessary to tell ScalarEvolution about this explicitly so that
+ // it can walk the def-use list and forget all SCEVs, as it may not be
+ // watching the PHI itself. Once the new exit value is in place, there
+ // may not be a def-use connection between the loop and every instruction
+ // which got a SCEVAddRecExpr for that loop.
+ SE->forgetValue(PN);
+
+ // Iterate over all of the values in all the PHI nodes.
+ for (unsigned i = 0; i != NumPreds; ++i) {
+ // If the value being merged in is not integer or is not defined
+ // in the loop, skip it.
+ Value *InVal = PN->getIncomingValue(i);
+ if (!isa<Instruction>(InVal))
+ continue;
+
+ // If this pred is for a subloop, not L itself, skip it.
+ if (LI->getLoopFor(PN->getIncomingBlock(i)) != L)
+ continue; // The Block is in a subloop, skip it.
+
+ // Check that InVal is defined in the loop.
+ Instruction *Inst = cast<Instruction>(InVal);
+ if (!L->contains(Inst))
+ continue;
+
+ // Okay, this instruction has a user outside of the current loop
+ // and varies predictably *inside* the loop. Evaluate the value it
+ // contains when the loop exits, if possible.
+ const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
+ if (!SE->isLoopInvariant(ExitValue, L) ||
+ !isSafeToExpand(ExitValue, *SE))
+ continue;
+
+ // Computing the value outside of the loop brings no benefit if:
+ // - it is definitely used inside the loop in a way which cannot be
+ // optimized away.
+ // - no use outside of the loop can take advantage of hoisting the
+ // computation out of the loop.
+ if (ExitValue->getSCEVType() >= scMulExpr) {
+ unsigned NumHardInternalUses = 0;
+ unsigned NumSoftExternalUses = 0;
+ unsigned NumUses = 0;
+ for (auto IB = Inst->user_begin(), IE = Inst->user_end();
+ IB != IE && NumUses <= 6; ++IB) {
+ Instruction *UseInstr = cast<Instruction>(*IB);
+ unsigned Opc = UseInstr->getOpcode();
+ NumUses++;
+ if (L->contains(UseInstr)) {
+ if (Opc == Instruction::Call || Opc == Instruction::Ret)
+ NumHardInternalUses++;
+ } else {
+ if (Opc == Instruction::PHI) {
+ // Do not count the Phi as a use. LCSSA may have inserted
+ // plenty of trivial ones.
+ NumUses--;
+ for (auto PB = UseInstr->user_begin(),
+ PE = UseInstr->user_end();
+ PB != PE && NumUses <= 6; ++PB, ++NumUses) {
+ unsigned PhiOpc = cast<Instruction>(*PB)->getOpcode();
+ if (PhiOpc != Instruction::Call && PhiOpc != Instruction::Ret)
+ NumSoftExternalUses++;
+ }
+ continue;
+ }
+ if (Opc != Instruction::Call && Opc != Instruction::Ret)
+ NumSoftExternalUses++;
+ }
+ }
+ if (NumUses <= 6 && NumHardInternalUses && !NumSoftExternalUses)
+ continue;
+ }
+
+ bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst);
+ Value *ExitVal =
+ expandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, PN->getType());
+
+ DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n'
+ << " LoopVal = " << *Inst << "\n");
+
+ if (!isValidRewrite(Inst, ExitVal)) {
+ DeadInsts.push_back(ExitVal);
+ continue;
+ }
+
+ // Collect all the candidate PHINodes to be rewritten.
+ RewritePhiSet.emplace_back(PN, i, ExitVal, HighCost);
+ }
+ }
+ }
+
+ bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet);
+
+ // Transformation.
+ for (const RewritePhi &Phi : RewritePhiSet) {
+ PHINode *PN = Phi.PN;
+ Value *ExitVal = Phi.Val;
+
+ // Only do the rewrite when the ExitValue can be expanded cheaply.
+ // If LoopCanBeDel is true, rewrite exit value aggressively.
+ if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost) {
+ DeadInsts.push_back(ExitVal);
+ continue;
+ }
+
+ Changed = true;
+ ++NumReplaced;
+ Instruction *Inst = cast<Instruction>(PN->getIncomingValue(Phi.Ith));
+ PN->setIncomingValue(Phi.Ith, ExitVal);
+
+ // If this instruction is dead now, delete it. Don't do it now to avoid
+ // invalidating iterators.
+ if (isInstructionTriviallyDead(Inst, TLI))
+ DeadInsts.push_back(Inst);
+
+ // Replace PN with ExitVal if that is legal and does not break LCSSA.
+ if (PN->getNumIncomingValues() == 1 &&
+ LI->replacementPreservesLCSSAForm(PN, ExitVal)) {
+ PN->replaceAllUsesWith(ExitVal);
+ PN->eraseFromParent();
+ }
+ }
+
+ // The insertion point instruction may have been deleted; clear it out
+ // so that the rewriter doesn't trip over it later.
+ Rewriter.clearInsertPoint();
+}
+
+//===---------------------------------------------------------------------===//
+// rewriteFirstIterationLoopExitValues: Rewrite loop exit values if we know
+// they will exit at the first iteration.
+//===---------------------------------------------------------------------===//
+
+/// Check to see if this loop has loop invariant conditions which lead to loop
+/// exits. If so, we know that if the exit path is taken, it is at the first
+/// loop iteration. This lets us predict exit values of PHI nodes that live in
+/// loop header.
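+ ///
+ /// Illustrative sketch (not from the original source): given a header such as
+ ///   %p = phi i32 [ %init, %preheader ], [ %p.next, %latch ]
+ ///   br i1 %inv.cond, label %exit, label %body
+ /// with %inv.cond loop invariant, an exit phi whose incoming value is %p can
+ /// be rewritten to use %init, because this exit, if taken at all, is taken on
+ /// the first iteration. Names are made up for illustration.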
+void IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
+ // Verify the input to the pass is already in LCSSA form.
+ assert(L->isLCSSAForm(*DT));
+
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+ auto *LoopHeader = L->getHeader();
+ assert(LoopHeader && "Invalid loop");
+
+ for (auto *ExitBB : ExitBlocks) {
+ // If there are no more PHI nodes in this exit block, then no more
+ // values defined inside the loop are used on this path.
+ for (PHINode &PN : ExitBB->phis()) {
+ for (unsigned IncomingValIdx = 0, E = PN.getNumIncomingValues();
+ IncomingValIdx != E; ++IncomingValIdx) {
+ auto *IncomingBB = PN.getIncomingBlock(IncomingValIdx);
+
+ // We currently only support loop exits from the loop header. If the
+ // incoming block is not the loop header, we would need to recursively
+ // check that all conditions starting from the loop header are loop
+ // invariant. Additional support might be added in the future.
+ if (IncomingBB != LoopHeader)
+ continue;
+
+ // Get condition that leads to the exit path.
+ auto *TermInst = IncomingBB->getTerminator();
+
+ Value *Cond = nullptr;
+ if (auto *BI = dyn_cast<BranchInst>(TermInst)) {
+ // Must be a conditional branch, otherwise the block
+ // should not be in the loop.
+ Cond = BI->getCondition();
+ } else if (auto *SI = dyn_cast<SwitchInst>(TermInst))
+ Cond = SI->getCondition();
+ else
+ continue;
+
+ if (!L->isLoopInvariant(Cond))
+ continue;
+
+ auto *ExitVal = dyn_cast<PHINode>(PN.getIncomingValue(IncomingValIdx));
+
+ // Only deal with PHIs.
+ if (!ExitVal)
+ continue;
+
+ // If ExitVal is a PHI on the loop header, then we know its
+ // value along this exit because the exit can only be taken
+ // on the first iteration.
+ auto *LoopPreheader = L->getLoopPreheader();
+ assert(LoopPreheader && "Invalid loop");
+ int PreheaderIdx = ExitVal->getBasicBlockIndex(LoopPreheader);
+ if (PreheaderIdx != -1) {
+ assert(ExitVal->getParent() == LoopHeader &&
+ "ExitVal must be in loop header");
+ PN.setIncomingValue(IncomingValIdx,
+ ExitVal->getIncomingValue(PreheaderIdx));
+ }
+ }
+ }
+ }
+}
+
+/// Check whether it is possible to delete the loop after rewriting exit
+/// value. If it is possible, ignore ReplaceExitValue and do rewriting
+/// aggressively.
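+ ///
+ /// Illustrative sketch (not from the original source): a loop like
+ ///   for (i = 0; i != n; ++i) sum += a;   // no loads, stores or calls
+ ///   use(sum);                            // only live-out value
+ /// becomes removable once the exit value of sum is rewritten to its closed
+ /// form, so the rewrite is done even if the expansion is considered high
+ /// cost. The example is schematic; the checks below are purely structural.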
+bool IndVarSimplify::canLoopBeDeleted(
+ Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) {
+ BasicBlock *Preheader = L->getLoopPreheader();
+ // If there is no preheader, the loop will not be deleted.
+ if (!Preheader)
+ return false;
+
+ // In the LoopDeletion pass, a loop can be deleted even when
+ // ExitingBlocks.size() > 1. We skip the multiple-ExitingBlocks case here for
+ // simplicity.
+ // TODO: If we see a testcase where a loop with multiple ExitingBlocks can be
+ // deleted after exit value rewriting, we can enhance the logic here.
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+ if (ExitBlocks.size() > 1 || ExitingBlocks.size() > 1)
+ return false;
+
+ BasicBlock *ExitBlock = ExitBlocks[0];
+ BasicBlock::iterator BI = ExitBlock->begin();
+ while (PHINode *P = dyn_cast<PHINode>(BI)) {
+ Value *Incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
+
+ // If the Incoming value of P is found in RewritePhiSet, we know it
+ // could be rewritten to use a loop invariant value in transformation
+ // phase later. Skip it in the loop invariant check below.
+ bool found = false;
+ for (const RewritePhi &Phi : RewritePhiSet) {
+ unsigned i = Phi.Ith;
+ if (Phi.PN == P && (Phi.PN)->getIncomingValue(i) == Incoming) {
+ found = true;
+ break;
+ }
+ }
+
+ Instruction *I;
+ if (!found && (I = dyn_cast<Instruction>(Incoming)))
+ if (!L->hasLoopInvariantOperands(I))
+ return false;
+
+ ++BI;
+ }
+
+ for (auto *BB : L->blocks())
+ if (llvm::any_of(*BB, [](Instruction &I) {
+ return I.mayHaveSideEffects();
+ }))
+ return false;
+
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// IV Widening - Extend the width of an IV to cover its widest uses.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+// Collect information about induction variables that are used by sign/zero
+// extend operations. This information is recorded by CollectExtend and provides
+// the input to WidenIV.
+struct WideIVInfo {
+ PHINode *NarrowIV = nullptr;
+
+ // Widest integer type created by [sz]ext.
+ Type *WidestNativeType = nullptr;
+
+ // Was a sext user seen before a zext?
+ bool IsSigned = false;
+};
+
+} // end anonymous namespace
+
+/// Update information about the induction variable that is extended by this
+/// sign or zero extend operation. This is used to determine the final width of
+/// the IV before actually widening it.
+static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE,
+ const TargetTransformInfo *TTI) {
+ bool IsSigned = Cast->getOpcode() == Instruction::SExt;
+ if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
+ return;
+
+ Type *Ty = Cast->getType();
+ uint64_t Width = SE->getTypeSizeInBits(Ty);
+ if (!Cast->getModule()->getDataLayout().isLegalInteger(Width))
+ return;
+
+ // Check that `Cast` actually extends the induction variable (we rely on this
+ // later). This takes care of cases where `Cast` is extending a truncation of
+ // the narrow induction variable, and thus can end up being narrower than the
+ // "narrow" induction variable.
+ uint64_t NarrowIVWidth = SE->getTypeSizeInBits(WI.NarrowIV->getType());
+ if (NarrowIVWidth >= Width)
+ return;
+
+ // Cast is either an sext or zext up to this point.
+ // We should not widen an indvar if arithmetic on the wider indvar is more
+ // expensive than that on the narrower indvar. We check only the cost of ADD
+ // because at least an ADD is required to increment the induction variable. We
+ // could compute more comprehensively the cost of all instructions on the
+ // induction variable when necessary.
+ if (TTI &&
+ TTI->getArithmeticInstrCost(Instruction::Add, Ty) >
+ TTI->getArithmeticInstrCost(Instruction::Add,
+ Cast->getOperand(0)->getType())) {
+ return;
+ }
+
+ if (!WI.WidestNativeType) {
+ WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+ WI.IsSigned = IsSigned;
+ return;
+ }
+
+ // We extend the IV to satisfy the sign of its first user, arbitrarily.
+ if (WI.IsSigned != IsSigned)
+ return;
+
+ if (Width > SE->getTypeSizeInBits(WI.WidestNativeType))
+ WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+}
+
+namespace {
+
+/// Record a link in the Narrow IV def-use chain along with the WideIV that
+/// computes the same value as the Narrow IV def. This avoids caching Use*
+/// pointers.
+struct NarrowIVDefUse {
+ Instruction *NarrowDef = nullptr;
+ Instruction *NarrowUse = nullptr;
+ Instruction *WideDef = nullptr;
+
+ // True if the narrow def is never negative. Tracking this information lets
+ // us use a sign extension instead of a zero extension or vice versa, when
+ // profitable and legal.
+ bool NeverNegative = false;
+
+ NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD,
+ bool NeverNegative)
+ : NarrowDef(ND), NarrowUse(NU), WideDef(WD),
+ NeverNegative(NeverNegative) {}
+};
+
+/// The goal of this transform is to remove sign and zero extends without
+/// creating any new induction variables. To do this, it creates a new phi of
+/// the wider type and redirects all users, either removing extends or inserting
+/// truncs whenever we stop propagating the type.
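+ ///
+ /// Illustrative sketch (not from the original source): starting from
+ ///   %iv = phi i32 [ 0, %ph ], [ %iv.next, %latch ]
+ ///   %iv.next = add nsw i32 %iv, 1
+ ///   %idx = sext i32 %iv to i64
+ /// the transform builds a single i64 phi/increment pair and redirects the
+ /// users of %idx to it, eliminating the sext; a trunc is emitted only for
+ /// users that genuinely need the i32 value. Names are illustrative.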
+class WidenIV {
+ // Parameters
+ PHINode *OrigPhi;
+ Type *WideType;
+
+ // Context
+ LoopInfo *LI;
+ Loop *L;
+ ScalarEvolution *SE;
+ DominatorTree *DT;
+
+ // Does the module have any calls to the llvm.experimental.guard intrinsic
+ // at all? If not, we can avoid scanning instructions looking for guards.
+ bool HasGuards;
+
+ // Result
+ PHINode *WidePhi = nullptr;
+ Instruction *WideInc = nullptr;
+ const SCEV *WideIncExpr = nullptr;
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts;
+
+ SmallPtrSet<Instruction *,16> Widened;
+ SmallVector<NarrowIVDefUse, 8> NarrowIVUsers;
+
+ enum ExtendKind { ZeroExtended, SignExtended, Unknown };
+
+ // A map tracking the kind of extension used to widen each narrow IV
+ // and narrow IV user.
+ // Key: pointer to a narrow IV or IV user.
+ // Value: the kind of extension used to widen this Instruction.
+ DenseMap<AssertingVH<Instruction>, ExtendKind> ExtendKindMap;
+
+ using DefUserPair = std::pair<AssertingVH<Value>, AssertingVH<Instruction>>;
+
+ // A map with control-dependent ranges for post increment IV uses. The key is
+ // a pair of IV def and a use of this def denoting the context. The value is
+ // a ConstantRange representing possible values of the def at the given
+ // context.
+ DenseMap<DefUserPair, ConstantRange> PostIncRangeInfos;
+
+ Optional<ConstantRange> getPostIncRangeInfo(Value *Def,
+ Instruction *UseI) {
+ DefUserPair Key(Def, UseI);
+ auto It = PostIncRangeInfos.find(Key);
+ return It == PostIncRangeInfos.end()
+ ? Optional<ConstantRange>(None)
+ : Optional<ConstantRange>(It->second);
+ }
+
+ void calculatePostIncRanges(PHINode *OrigPhi);
+ void calculatePostIncRange(Instruction *NarrowDef, Instruction *NarrowUser);
+
+ void updatePostIncRangeInfo(Value *Def, Instruction *UseI, ConstantRange R) {
+ DefUserPair Key(Def, UseI);
+ auto It = PostIncRangeInfos.find(Key);
+ if (It == PostIncRangeInfos.end())
+ PostIncRangeInfos.insert({Key, R});
+ else
+ It->second = R.intersectWith(It->second);
+ }
+
+public:
+ WidenIV(const WideIVInfo &WI, LoopInfo *LInfo, ScalarEvolution *SEv,
+ DominatorTree *DTree, SmallVectorImpl<WeakTrackingVH> &DI,
+ bool HasGuards)
+ : OrigPhi(WI.NarrowIV), WideType(WI.WidestNativeType), LI(LInfo),
+ L(LI->getLoopFor(OrigPhi->getParent())), SE(SEv), DT(DTree),
+ HasGuards(HasGuards), DeadInsts(DI) {
+ assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV");
+ ExtendKindMap[OrigPhi] = WI.IsSigned ? SignExtended : ZeroExtended;
+ }
+
+ PHINode *createWideIV(SCEVExpander &Rewriter);
+
+protected:
+ Value *createExtendInst(Value *NarrowOper, Type *WideType, bool IsSigned,
+ Instruction *Use);
+
+ Instruction *cloneIVUser(NarrowIVDefUse DU, const SCEVAddRecExpr *WideAR);
+ Instruction *cloneArithmeticIVUser(NarrowIVDefUse DU,
+ const SCEVAddRecExpr *WideAR);
+ Instruction *cloneBitwiseIVUser(NarrowIVDefUse DU);
+
+ ExtendKind getExtendKind(Instruction *I);
+
+ using WidenedRecTy = std::pair<const SCEVAddRecExpr *, ExtendKind>;
+
+ WidenedRecTy getWideRecurrence(NarrowIVDefUse DU);
+
+ WidenedRecTy getExtendedOperandRecurrence(NarrowIVDefUse DU);
+
+ const SCEV *getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
+ unsigned OpCode) const;
+
+ Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter);
+
+ bool widenLoopCompare(NarrowIVDefUse DU);
+
+ void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
+};
+
+} // end anonymous namespace
+
+/// Perform a quick domtree based check for loop invariance assuming that V is
+/// used within the loop. LoopInfo::isLoopInvariant() seems gratuitous for this
+/// purpose.
+static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) {
+ Instruction *Inst = dyn_cast<Instruction>(V);
+ if (!Inst)
+ return true;
+
+ return DT->properlyDominates(Inst->getParent(), L->getHeader());
+}
+
+Value *WidenIV::createExtendInst(Value *NarrowOper, Type *WideType,
+ bool IsSigned, Instruction *Use) {
+ // Set the debug location and conservative insertion point.
+ IRBuilder<> Builder(Use);
+ // Hoist the insertion point into loop preheaders as far as possible.
+ for (const Loop *L = LI->getLoopFor(Use->getParent());
+ L && L->getLoopPreheader() && isLoopInvariant(NarrowOper, L, DT);
+ L = L->getParentLoop())
+ Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
+
+ return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) :
+ Builder.CreateZExt(NarrowOper, WideType);
+}
+
+ /// Instantiate a wide operation to replace a narrow operation. This only needs
+ /// to handle operations that can evaluate to SCEVAddRec. It can safely return
+ /// nullptr for any operation we decide not to clone.
+Instruction *WidenIV::cloneIVUser(NarrowIVDefUse DU,
+ const SCEVAddRecExpr *WideAR) {
+ unsigned Opcode = DU.NarrowUse->getOpcode();
+ switch (Opcode) {
+ default:
+ return nullptr;
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::Sub:
+ return cloneArithmeticIVUser(DU, WideAR);
+
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ return cloneBitwiseIVUser(DU);
+ }
+}
+
+Instruction *WidenIV::cloneBitwiseIVUser(NarrowIVDefUse DU) {
+ Instruction *NarrowUse = DU.NarrowUse;
+ Instruction *NarrowDef = DU.NarrowDef;
+ Instruction *WideDef = DU.WideDef;
+
+ DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n");
+
+ // Replace NarrowDef operands with WideDef. Otherwise, we don't know anything
+ // about the narrow operand yet so must insert a [sz]ext. It is probably loop
+ // invariant and will be folded or hoisted. If it actually comes from a
+ // widened IV, it should be removed during a future call to widenIVUse.
+ bool IsSigned = getExtendKind(NarrowDef) == SignExtended;
+ Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
+ ? WideDef
+ : createExtendInst(NarrowUse->getOperand(0), WideType,
+ IsSigned, NarrowUse);
+ Value *RHS = (NarrowUse->getOperand(1) == NarrowDef)
+ ? WideDef
+ : createExtendInst(NarrowUse->getOperand(1), WideType,
+ IsSigned, NarrowUse);
+
+ auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
+ auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
+ NarrowBO->getName());
+ IRBuilder<> Builder(NarrowUse);
+ Builder.Insert(WideBO);
+ WideBO->copyIRFlags(NarrowBO);
+ return WideBO;
+}
+
+Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU,
+ const SCEVAddRecExpr *WideAR) {
+ Instruction *NarrowUse = DU.NarrowUse;
+ Instruction *NarrowDef = DU.NarrowDef;
+ Instruction *WideDef = DU.WideDef;
+
+ DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");
+
+ unsigned IVOpIdx = (NarrowUse->getOperand(0) == NarrowDef) ? 0 : 1;
+
+ // We're trying to find X such that
+ //
+ // Widen(NarrowDef `op` NonIVNarrowDef) == WideAR == WideDef `op.wide` X
+ //
+ // We guess two solutions to X, sext(NonIVNarrowDef) and zext(NonIVNarrowDef),
+ // and check using SCEV if any of them are correct.
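+ //
+ // Illustrative example (assumed, not from the original source): for
+ //   %narrowuse = sub i32 %iv, %step
+ // we try X = sext(%step) and X = zext(%step), keeping whichever makes
+ // 'WideDef sub X' match WideAR according to SCEV.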
+
+ // Returns true if extending NonIVNarrowDef according to `SignExt` is a
+ // correct solution to X.
+ auto GuessNonIVOperand = [&](bool SignExt) {
+ const SCEV *WideLHS;
+ const SCEV *WideRHS;
+
+ auto GetExtend = [this, SignExt](const SCEV *S, Type *Ty) {
+ if (SignExt)
+ return SE->getSignExtendExpr(S, Ty);
+ return SE->getZeroExtendExpr(S, Ty);
+ };
+
+ if (IVOpIdx == 0) {
+ WideLHS = SE->getSCEV(WideDef);
+ const SCEV *NarrowRHS = SE->getSCEV(NarrowUse->getOperand(1));
+ WideRHS = GetExtend(NarrowRHS, WideType);
+ } else {
+ const SCEV *NarrowLHS = SE->getSCEV(NarrowUse->getOperand(0));
+ WideLHS = GetExtend(NarrowLHS, WideType);
+ WideRHS = SE->getSCEV(WideDef);
+ }
+
+ // WideUse is "WideDef `op.wide` X" as described in the comment.
+ const SCEV *WideUse = nullptr;
+
+ switch (NarrowUse->getOpcode()) {
+ default:
+ llvm_unreachable("No other possibility!");
+
+ case Instruction::Add:
+ WideUse = SE->getAddExpr(WideLHS, WideRHS);
+ break;
+
+ case Instruction::Mul:
+ WideUse = SE->getMulExpr(WideLHS, WideRHS);
+ break;
+
+ case Instruction::UDiv:
+ WideUse = SE->getUDivExpr(WideLHS, WideRHS);
+ break;
+
+ case Instruction::Sub:
+ WideUse = SE->getMinusSCEV(WideLHS, WideRHS);
+ break;
+ }
+
+ return WideUse == WideAR;
+ };
+
+ bool SignExtend = getExtendKind(NarrowDef) == SignExtended;
+ if (!GuessNonIVOperand(SignExtend)) {
+ SignExtend = !SignExtend;
+ if (!GuessNonIVOperand(SignExtend))
+ return nullptr;
+ }
+
+ Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
+ ? WideDef
+ : createExtendInst(NarrowUse->getOperand(0), WideType,
+ SignExtend, NarrowUse);
+ Value *RHS = (NarrowUse->getOperand(1) == NarrowDef)
+ ? WideDef
+ : createExtendInst(NarrowUse->getOperand(1), WideType,
+ SignExtend, NarrowUse);
+
+ auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
+ auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
+ NarrowBO->getName());
+
+ IRBuilder<> Builder(NarrowUse);
+ Builder.Insert(WideBO);
+ WideBO->copyIRFlags(NarrowBO);
+ return WideBO;
+}
+
+WidenIV::ExtendKind WidenIV::getExtendKind(Instruction *I) {
+ auto It = ExtendKindMap.find(I);
+ assert(It != ExtendKindMap.end() && "Instruction not yet extended!");
+ return It->second;
+}
+
+const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
+ unsigned OpCode) const {
+ if (OpCode == Instruction::Add)
+ return SE->getAddExpr(LHS, RHS);
+ if (OpCode == Instruction::Sub)
+ return SE->getMinusSCEV(LHS, RHS);
+ if (OpCode == Instruction::Mul)
+ return SE->getMulExpr(LHS, RHS);
+
+ llvm_unreachable("Unsupported opcode.");
+}
+
+/// No-wrap operations can transfer sign extension of their result to their
+/// operands. Generate the SCEV value for the widened operation without
+/// actually modifying the IR yet. If the expression after extending the
+/// operands is an AddRec for this loop, return the AddRec and the kind of
+/// extension used.
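+ ///
+ /// Illustrative sketch (not from the original source): for a use such as
+ ///   %use = add nsw i32 %iv, %inv
+ /// where %iv has already been sign-extended to a wide AddRec, the nsw flag
+ /// lets us form sext(%iv) + sext(%inv) in SCEV; if that folds back into an
+ /// AddRec for this loop, the use itself can be widened rather than truncated.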
+WidenIV::WidenedRecTy WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) {
+ // Handle the common case of add<nsw/nuw>
+ const unsigned OpCode = DU.NarrowUse->getOpcode();
+ // Only Add/Sub/Mul instructions supported yet.
+ if (OpCode != Instruction::Add && OpCode != Instruction::Sub &&
+ OpCode != Instruction::Mul)
+ return {nullptr, Unknown};
+
+ // One operand (NarrowDef) has already been extended to WideDef. Now determine
+ // if extending the other will lead to a recurrence.
+ const unsigned ExtendOperIdx =
+ DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0;
+ assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU");
+
+ const SCEV *ExtendOperExpr = nullptr;
+ const OverflowingBinaryOperator *OBO =
+ cast<OverflowingBinaryOperator>(DU.NarrowUse);
+ ExtendKind ExtKind = getExtendKind(DU.NarrowDef);
+ if (ExtKind == SignExtended && OBO->hasNoSignedWrap())
+ ExtendOperExpr = SE->getSignExtendExpr(
+ SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
+ else if (ExtKind == ZeroExtended && OBO->hasNoUnsignedWrap())
+ ExtendOperExpr = SE->getZeroExtendExpr(
+ SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
+ else
+ return {nullptr, Unknown};
+
+ // When creating this SCEV expr, don't apply the current operation's NSW or NUW
+ // flags. This instruction may be guarded by control flow that the no-wrap
+ // behavior depends on. Non-control-equivalent instructions can be mapped to
+ // the same SCEV expression, and it would be incorrect to transfer NSW/NUW
+ // semantics to those operations.
+ const SCEV *lhs = SE->getSCEV(DU.WideDef);
+ const SCEV *rhs = ExtendOperExpr;
+
+ // Let's swap operands to the initial order for the case of non-commutative
+ // operations, like SUB. See PR21014.
+ if (ExtendOperIdx == 0)
+ std::swap(lhs, rhs);
+ const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, OpCode));
+
+ if (!AddRec || AddRec->getLoop() != L)
+ return {nullptr, Unknown};
+
+ return {AddRec, ExtKind};
+}
+
+ /// Is this instruction potentially interesting for further simplification after
+ /// widening its type? In other words, can the extend be safely hoisted out of
+ /// the loop with SCEV reducing the value to a recurrence on the same loop? If
+ /// so, return the extended recurrence and the kind of extension used. Otherwise
+ /// return {nullptr, Unknown}.
+WidenIV::WidenedRecTy WidenIV::getWideRecurrence(NarrowIVDefUse DU) {
+ if (!SE->isSCEVable(DU.NarrowUse->getType()))
+ return {nullptr, Unknown};
+
+ const SCEV *NarrowExpr = SE->getSCEV(DU.NarrowUse);
+ if (SE->getTypeSizeInBits(NarrowExpr->getType()) >=
+ SE->getTypeSizeInBits(WideType)) {
+ // NarrowUse implicitly widens its operand. e.g. a gep with a narrow
+ // index. So don't follow this use.
+ return {nullptr, Unknown};
+ }
+
+ const SCEV *WideExpr;
+ ExtendKind ExtKind;
+ if (DU.NeverNegative) {
+ WideExpr = SE->getSignExtendExpr(NarrowExpr, WideType);
+ if (isa<SCEVAddRecExpr>(WideExpr))
+ ExtKind = SignExtended;
+ else {
+ WideExpr = SE->getZeroExtendExpr(NarrowExpr, WideType);
+ ExtKind = ZeroExtended;
+ }
+ } else if (getExtendKind(DU.NarrowDef) == SignExtended) {
+ WideExpr = SE->getSignExtendExpr(NarrowExpr, WideType);
+ ExtKind = SignExtended;
+ } else {
+ WideExpr = SE->getZeroExtendExpr(NarrowExpr, WideType);
+ ExtKind = ZeroExtended;
+ }
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr);
+ if (!AddRec || AddRec->getLoop() != L)
+ return {nullptr, Unknown};
+ return {AddRec, ExtKind};
+}
+
+ /// This IV user cannot be widened. Replace this use of the original narrow IV
+ /// with a truncation of the new wide IV to isolate and eliminate the narrow IV.
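+ ///
+ /// Illustrative sketch (not from the original source): a use such as
+ ///   %r = urem i32 %iv, 7
+ /// is not widened; instead it is rewritten to consume
+ ///   %trunc = trunc i64 %wide.iv to i32
+ /// so the narrow IV eventually loses all of its users and can be deleted.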
+static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) {
+ DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef
+ << " for user " << *DU.NarrowUse << "\n");
+ IRBuilder<> Builder(
+ getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI));
+ Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType());
+ DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);
+}
+
+ /// If the narrow use is a compare instruction, then widen the compare
+ /// (and possibly the other operand). The extend operation is hoisted into the
+ /// loop preheader as far as possible.
+bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) {
+ ICmpInst *Cmp = dyn_cast<ICmpInst>(DU.NarrowUse);
+ if (!Cmp)
+ return false;
+
+ // We can legally widen the comparison in the following two cases:
+ //
+ // - The signedness of the IV extension and comparison match
+ //
+ // - The narrow IV is always positive (and thus its sign extension is equal
+ // to its zero extension). For instance, let's say we're zero extending
+ // %narrow for the following use
+ //
+ // icmp slt i32 %narrow, %val ... (A)
+ //
+ // and %narrow is always positive. Then
+ //
+ // (A) == icmp slt i32 sext(%narrow), sext(%val)
+ // == icmp slt i32 zext(%narrow), sext(%val)
+ bool IsSigned = getExtendKind(DU.NarrowDef) == SignExtended;
+ if (!(DU.NeverNegative || IsSigned == Cmp->isSigned()))
+ return false;
+
+ Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0);
+ unsigned CastWidth = SE->getTypeSizeInBits(Op->getType());
+ unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+ assert(CastWidth <= IVWidth && "Unexpected width while widening compare.");
+
+ // Widen the compare instruction.
+ IRBuilder<> Builder(
+ getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI));
+ DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
+
+ // Widen the other operand of the compare, if necessary.
+ if (CastWidth < IVWidth) {
+ Value *ExtOp = createExtendInst(Op, WideType, Cmp->isSigned(), Cmp);
+ DU.NarrowUse->replaceUsesOfWith(Op, ExtOp);
+ }
+ return true;
+}
+
+/// Determine whether an individual user of the narrow IV can be widened. If so,
+/// return the wide clone of the user.
+Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
+ assert(ExtendKindMap.count(DU.NarrowDef) &&
+ "Should already know the kind of extension used to widen NarrowDef");
+
+ // Stop traversing the def-use chain at inner-loop phis or post-loop phis.
+ if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) {
+ if (LI->getLoopFor(UsePhi->getParent()) != L) {
+ // For LCSSA phis, sink the truncate outside the loop.
+ // After SimplifyCFG most loop exit targets have a single predecessor.
+ // Otherwise fall back to a truncate within the loop.
+ if (UsePhi->getNumOperands() != 1)
+ truncateIVUse(DU, DT, LI);
+ else {
+ // Widening the PHI requires us to insert a trunc. The logical place
+ // for this trunc is in the same BB as the PHI. This is not possible if
+ // the BB is terminated by a catchswitch.
+ if (isa<CatchSwitchInst>(UsePhi->getParent()->getTerminator()))
+ return nullptr;
+
+ PHINode *WidePhi =
+ PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide",
+ UsePhi);
+ WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0));
+ IRBuilder<> Builder(&*WidePhi->getParent()->getFirstInsertionPt());
+ Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType());
+ UsePhi->replaceAllUsesWith(Trunc);
+ DeadInsts.emplace_back(UsePhi);
+ DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi
+ << " to " << *WidePhi << "\n");
+ }
+ return nullptr;
+ }
+ }
+
+ // This narrow use can be widened by a sext if it's non-negative or its narrow
+ // def was widened by a sext. Same for zext.
+ auto canWidenBySExt = [&]() {
+ return DU.NeverNegative || getExtendKind(DU.NarrowDef) == SignExtended;
+ };
+ auto canWidenByZExt = [&]() {
+ return DU.NeverNegative || getExtendKind(DU.NarrowDef) == ZeroExtended;
+ };
+
+ // Our raison d'etre! Eliminate sign and zero extension.
+ if ((isa<SExtInst>(DU.NarrowUse) && canWidenBySExt()) ||
+ (isa<ZExtInst>(DU.NarrowUse) && canWidenByZExt())) {
+ Value *NewDef = DU.WideDef;
+ if (DU.NarrowUse->getType() != WideType) {
+ unsigned CastWidth = SE->getTypeSizeInBits(DU.NarrowUse->getType());
+ unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+ if (CastWidth < IVWidth) {
+ // The cast isn't as wide as the IV, so insert a Trunc.
+ IRBuilder<> Builder(DU.NarrowUse);
+ NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType());
+ }
+ else {
+ // A wider extend was hidden behind a narrower one. This may induce
+ // another round of IV widening in which the intermediate IV becomes
+ // dead. It should be very rare.
+ DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
+ << " not wide enough to subsume " << *DU.NarrowUse << "\n");
+ DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
+ NewDef = DU.NarrowUse;
+ }
+ }
+ if (NewDef != DU.NarrowUse) {
+ DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse
+ << " replaced by " << *DU.WideDef << "\n");
+ ++NumElimExt;
+ DU.NarrowUse->replaceAllUsesWith(NewDef);
+ DeadInsts.emplace_back(DU.NarrowUse);
+ }
+ // Now that the extend is gone, we want to expose its uses for potential
+ // further simplification. We don't need to directly inform SimplifyIVUsers
+ // of the new users, because their parent IV will be processed later as a
+ // new loop phi. If we preserved IVUsers analysis, we would also want to
+ // push the uses of WideDef here.
+
+ // No further widening is needed. The deceased [sz]ext had done it for us.
+ return nullptr;
+ }
+
+ // Does this user itself evaluate to a recurrence after widening?
+ WidenedRecTy WideAddRec = getExtendedOperandRecurrence(DU);
+ if (!WideAddRec.first)
+ WideAddRec = getWideRecurrence(DU);
+
+ assert((WideAddRec.first == nullptr) == (WideAddRec.second == Unknown));
+ if (!WideAddRec.first) {
+ // If use is a loop condition, try to promote the condition instead of
+ // truncating the IV first.
+ if (widenLoopCompare(DU))
+ return nullptr;
+
+ // This user does not evaluate to a recurrence after widening, so don't
+ // follow it. Instead insert a Trunc to kill off the original use,
+ // eventually isolating the original narrow IV so it can be removed.
+ truncateIVUse(DU, DT, LI);
+ return nullptr;
+ }
+ // Assume block terminators cannot evaluate to a recurrence. We can't insert
+ // a Trunc after a terminator if there happens to be a critical edge.
+ assert(DU.NarrowUse != DU.NarrowUse->getParent()->getTerminator() &&
+ "SCEV is not expected to evaluate a block terminator");
+
+ // Reuse the IV increment that SCEVExpander created as long as it dominates
+ // NarrowUse.
+ Instruction *WideUse = nullptr;
+ if (WideAddRec.first == WideIncExpr &&
+ Rewriter.hoistIVInc(WideInc, DU.NarrowUse))
+ WideUse = WideInc;
+ else {
+ WideUse = cloneIVUser(DU, WideAddRec.first);
+ if (!WideUse)
+ return nullptr;
+ }
+ // Evaluation of WideAddRec ensured that the narrow expression could be
+ // extended outside the loop without overflow. This suggests that the wide use
+ // evaluates to the same expression as the extended narrow use, but doesn't
+ // absolutely guarantee it. Hence the following failsafe check. In rare cases
+ // where it fails, we simply throw away the newly created wide use.
+ if (WideAddRec.first != SE->getSCEV(WideUse)) {
+ DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse
+ << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec.first << "\n");
+ DeadInsts.emplace_back(WideUse);
+ return nullptr;
+ }
+
+ ExtendKindMap[DU.NarrowUse] = WideAddRec.second;
+ // Returning WideUse pushes it on the worklist.
+ return WideUse;
+}
+
+/// Add eligible users of NarrowDef to NarrowIVUsers.
+void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {
+ const SCEV *NarrowSCEV = SE->getSCEV(NarrowDef);
+ bool NonNegativeDef =
+ SE->isKnownPredicate(ICmpInst::ICMP_SGE, NarrowSCEV,
+ SE->getConstant(NarrowSCEV->getType(), 0));
+ for (User *U : NarrowDef->users()) {
+ Instruction *NarrowUser = cast<Instruction>(U);
+
+ // Handle data flow merges and bizarre phi cycles.
+ if (!Widened.insert(NarrowUser).second)
+ continue;
+
+ bool NonNegativeUse = false;
+ if (!NonNegativeDef) {
+ // We might have a control-dependent range information for this context.
+ if (auto RangeInfo = getPostIncRangeInfo(NarrowDef, NarrowUser))
+ NonNegativeUse = RangeInfo->getSignedMin().isNonNegative();
+ }
+
+ NarrowIVUsers.emplace_back(NarrowDef, NarrowUser, WideDef,
+ NonNegativeDef || NonNegativeUse);
+ }
+}
+
+/// Process a single induction variable. First use the SCEVExpander to create a
+/// wide induction variable that evaluates to the same recurrence as the
+/// original narrow IV. Then use a worklist to forward traverse the narrow IV's
+/// def-use chain. After widenIVUse has processed all interesting IV users, the
+/// narrow IV will be isolated for removal by DeleteDeadPHIs.
+///
+/// It would be simpler to delete uses as they are processed, but we must avoid
+/// invalidating SCEV expressions.
+PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
+ // Is this phi an induction variable?
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi));
+ if (!AddRec)
+ return nullptr;
+
+ // Widen the induction variable expression.
+ const SCEV *WideIVExpr = getExtendKind(OrigPhi) == SignExtended
+ ? SE->getSignExtendExpr(AddRec, WideType)
+ : SE->getZeroExtendExpr(AddRec, WideType);
+
+ assert(SE->getEffectiveSCEVType(WideIVExpr->getType()) == WideType &&
+ "Expect the new IV expression to preserve its type");
+
+ // Can the IV be extended outside the loop without overflow?
+ AddRec = dyn_cast<SCEVAddRecExpr>(WideIVExpr);
+ if (!AddRec || AddRec->getLoop() != L)
+ return nullptr;
+
+ // An AddRec must have loop-invariant operands. Since this AddRec is
+ // materialized by a loop header phi, the expression cannot have any post-loop
+ // operands, so they must dominate the loop header.
+ assert(
+ SE->properlyDominates(AddRec->getStart(), L->getHeader()) &&
+ SE->properlyDominates(AddRec->getStepRecurrence(*SE), L->getHeader()) &&
+ "Loop header phi recurrence inputs do not dominate the loop");
+
+ // Iterate over IV uses (including transitive ones) looking for IV increments
+ // of the form 'add nsw %iv, <const>'. For each increment and each use of
+ // the increment, calculate control-dependent range information based on
+ // dominating conditions inside of the loop (e.g. a range check inside of the
+ // loop). Calculated ranges are stored in the PostIncRangeInfos map.
+ //
+ // Control-dependent range information is later used to prove that a narrow
+ // definition is not negative (see pushNarrowIVUsers). It's difficult to do
+ // this on demand because when pushNarrowIVUsers needs this information some
+ // of the dominating conditions might be already widened.
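+ //
+ // Illustrative sketch (not from the original source): given
+ //   %iv.next = add nsw i32 %iv, 1
+ //   %c = icmp sgt i32 %iv, -1
+ //   br i1 %c, label %guarded, label %other
+ // a use of %iv.next inside %guarded sees %iv >= 0 on that path, hence
+ // %iv.next >= 1, so %iv.next is known non-negative in that context even if
+ // it is not known non-negative globally. Names are illustrative.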
+ if (UsePostIncrementRanges)
+ calculatePostIncRanges(OrigPhi);
+
+ // The rewriter provides a value for the desired IV expression. This may
+ // either find an existing phi or materialize a new one. Either way, we
+ // expect a well-formed cyclic phi-with-increments. i.e. any operand not part
+ // of the phi-SCC dominates the loop entry.
+ Instruction *InsertPt = &L->getHeader()->front();
+ WidePhi = cast<PHINode>(Rewriter.expandCodeFor(AddRec, WideType, InsertPt));
+
+ // Remembering the WideIV increment generated by SCEVExpander allows
+ // widenIVUse to reuse it when widening the narrow IV's increment. We don't
+ // employ a general reuse mechanism because the call above is the only call to
+ // SCEVExpander. Henceforth, we produce 1-to-1 narrow to wide uses.
+ if (BasicBlock *LatchBlock = L->getLoopLatch()) {
+ WideInc =
+ cast<Instruction>(WidePhi->getIncomingValueForBlock(LatchBlock));
+ WideIncExpr = SE->getSCEV(WideInc);
+ // Propagate the debug location associated with the original loop increment
+ // to the new (widened) increment.
+ auto *OrigInc =
+ cast<Instruction>(OrigPhi->getIncomingValueForBlock(LatchBlock));
+ WideInc->setDebugLoc(OrigInc->getDebugLoc());
+ }
+
+ DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n");
+ ++NumWidened;
+
+ // Traverse the def-use chain using a worklist starting at the original IV.
+ assert(Widened.empty() && NarrowIVUsers.empty() && "expect initial state" );
+
+ Widened.insert(OrigPhi);
+ pushNarrowIVUsers(OrigPhi, WidePhi);
+
+ while (!NarrowIVUsers.empty()) {
+ NarrowIVDefUse DU = NarrowIVUsers.pop_back_val();
+
+ // Process a def-use edge. This may replace the use, so don't hold a
+ // use_iterator across it.
+ Instruction *WideUse = widenIVUse(DU, Rewriter);
+
+ // Follow all def-use edges from the previous narrow use.
+ if (WideUse)
+ pushNarrowIVUsers(DU.NarrowUse, WideUse);
+
+ // widenIVUse may have removed the def-use edge.
+ if (DU.NarrowDef->use_empty())
+ DeadInsts.emplace_back(DU.NarrowDef);
+ }
+
+ // Attach any debug information to the new PHI. Since OrigPhi and WidePHI
+ // evaluate the same recurrence, we can just copy the debug info over.
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ llvm::findDbgValues(DbgValues, OrigPhi);
+ auto *MDPhi = MetadataAsValue::get(WidePhi->getContext(),
+ ValueAsMetadata::get(WidePhi));
+ for (auto &DbgValue : DbgValues)
+ DbgValue->setOperand(0, MDPhi);
+ return WidePhi;
+}
+
+ /// Calculates the control-dependent range for the given def at the given
+ /// context by looking at dominating conditions inside of the loop.
+void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
+ Instruction *NarrowUser) {
+ using namespace llvm::PatternMatch;
+
+ Value *NarrowDefLHS;
+ const APInt *NarrowDefRHS;
+ if (!match(NarrowDef, m_NSWAdd(m_Value(NarrowDefLHS),
+ m_APInt(NarrowDefRHS))) ||
+ !NarrowDefRHS->isNonNegative())
+ return;
+
+ auto UpdateRangeFromCondition = [&] (Value *Condition,
+ bool TrueDest) {
+ CmpInst::Predicate Pred;
+ Value *CmpRHS;
+ if (!match(Condition, m_ICmp(Pred, m_Specific(NarrowDefLHS),
+ m_Value(CmpRHS))))
+ return;
+
+ CmpInst::Predicate P =
+ TrueDest ? Pred : CmpInst::getInversePredicate(Pred);
+
+ auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS));
+ auto CmpConstrainedLHSRange =
+ ConstantRange::makeAllowedICmpRegion(P, CmpRHSRange);
+ auto NarrowDefRange =
+ CmpConstrainedLHSRange.addWithNoSignedWrap(*NarrowDefRHS);
+
+ updatePostIncRangeInfo(NarrowDef, NarrowUser, NarrowDefRange);
+ };
+
+ auto UpdateRangeFromGuards = [&](Instruction *Ctx) {
+ if (!HasGuards)
+ return;
+
+ for (Instruction &I : make_range(Ctx->getIterator().getReverse(),
+ Ctx->getParent()->rend())) {
+ Value *C = nullptr;
+ if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>(m_Value(C))))
+ UpdateRangeFromCondition(C, /*TrueDest=*/true);
+ }
+ };
+
+ UpdateRangeFromGuards(NarrowUser);
+
+ BasicBlock *NarrowUserBB = NarrowUser->getParent();
+ // If NarrowUserBB is statically unreachable asking dominator queries may
+ // yield surprising results. (e.g. the block may not have a dom tree node)
+ if (!DT->isReachableFromEntry(NarrowUserBB))
+ return;
+
+ for (auto *DTB = (*DT)[NarrowUserBB]->getIDom();
+ L->contains(DTB->getBlock());
+ DTB = DTB->getIDom()) {
+ auto *BB = DTB->getBlock();
+ auto *TI = BB->getTerminator();
+ UpdateRangeFromGuards(TI);
+
+ auto *BI = dyn_cast<BranchInst>(TI);
+ if (!BI || !BI->isConditional())
+ continue;
+
+ auto *TrueSuccessor = BI->getSuccessor(0);
+ auto *FalseSuccessor = BI->getSuccessor(1);
+
+ auto DominatesNarrowUser = [this, NarrowUser] (BasicBlockEdge BBE) {
+ return BBE.isSingleEdge() &&
+ DT->dominates(BBE, NarrowUser->getParent());
+ };
+
+ if (DominatesNarrowUser(BasicBlockEdge(BB, TrueSuccessor)))
+ UpdateRangeFromCondition(BI->getCondition(), /*TrueDest=*/true);
+
+ if (DominatesNarrowUser(BasicBlockEdge(BB, FalseSuccessor)))
+ UpdateRangeFromCondition(BI->getCondition(), /*TrueDest=*/false);
+ }
+}
+
+ /// Calculates the PostIncRangeInfos map for the given IV.
+void WidenIV::calculatePostIncRanges(PHINode *OrigPhi) {
+ SmallPtrSet<Instruction *, 16> Visited;
+ SmallVector<Instruction *, 6> Worklist;
+ Worklist.push_back(OrigPhi);
+ Visited.insert(OrigPhi);
+
+ while (!Worklist.empty()) {
+ Instruction *NarrowDef = Worklist.pop_back_val();
+
+ for (Use &U : NarrowDef->uses()) {
+ auto *NarrowUser = cast<Instruction>(U.getUser());
+
+ // Don't go looking outside the current loop.
+ auto *NarrowUserLoop = (*LI)[NarrowUser->getParent()];
+ if (!NarrowUserLoop || !L->contains(NarrowUserLoop))
+ continue;
+
+ if (!Visited.insert(NarrowUser).second)
+ continue;
+
+ Worklist.push_back(NarrowUser);
+
+ calculatePostIncRange(NarrowDef, NarrowUser);
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Live IV Reduction - Minimize IVs live across the loop.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Simplification of IV users based on SCEV evaluation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class IndVarSimplifyVisitor : public IVVisitor {
+ ScalarEvolution *SE;
+ const TargetTransformInfo *TTI;
+ PHINode *IVPhi;
+
+public:
+ WideIVInfo WI;
+
+ IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV,
+ const TargetTransformInfo *TTI,
+ const DominatorTree *DTree)
+ : SE(SCEV), TTI(TTI), IVPhi(IV) {
+ DT = DTree;
+ WI.NarrowIV = IVPhi;
+ }
+
+ // Implement the interface used by simplifyUsersOfIV.
+ void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); }
+};
+
+} // end anonymous namespace
+
+/// Iteratively perform simplification on a worklist of IV users. Each
+/// successive simplification may push more users which may themselves be
+/// candidates for simplification.
+///
+/// Sign/Zero extend elimination is interleaved with IV simplification.
+void IndVarSimplify::simplifyAndExtend(Loop *L,
+ SCEVExpander &Rewriter,
+ LoopInfo *LI) {
+ SmallVector<WideIVInfo, 8> WideIVs;
+
+ auto *GuardDecl = L->getBlocks()[0]->getModule()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ bool HasGuards = GuardDecl && !GuardDecl->use_empty();
+
+ SmallVector<PHINode*, 8> LoopPhis;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ LoopPhis.push_back(cast<PHINode>(I));
+ }
+ // Each round of simplification iterates through the SimplifyIVUsers worklist
+ // for all current phis, then determines whether any IVs can be
+ // widened. Widening adds new phis to LoopPhis, inducing another round of
+ // simplification on the wide IVs.
+ while (!LoopPhis.empty()) {
+ // Evaluate as many IV expressions as possible before widening any IVs. This
+ // forces SCEV to set no-wrap flags before evaluating sign/zero
+ // extension. The first time SCEV attempts to normalize sign/zero extension,
+ // the result becomes final. So for the most predictable results, we delay
+ // evaluation of sign/zero extensions until needed, and avoid running
+ // other SCEV based analysis prior to simplifyAndExtend.
+ do {
+ PHINode *CurrIV = LoopPhis.pop_back_val();
+
+ // Information about sign/zero extensions of CurrIV.
+ IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT);
+
+ Changed |=
+ simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, Rewriter, &Visitor);
+
+ if (Visitor.WI.WidestNativeType) {
+ WideIVs.push_back(Visitor.WI);
+ }
+ } while (!LoopPhis.empty());
+
+ for (; !WideIVs.empty(); WideIVs.pop_back()) {
+ WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts, HasGuards);
+ if (PHINode *WidePhi = Widener.createWideIV(Rewriter)) {
+ Changed = true;
+ LoopPhis.push_back(WidePhi);
+ }
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// linearFunctionTestReplace and its kin. Rewrite the loop exit condition.
+//===----------------------------------------------------------------------===//
+
+/// Return true if this loop's backedge taken count expression can be safely and
+/// cheaply expanded into an instruction sequence that can be used by
+/// linearFunctionTestReplace.
+///
+/// TODO: This fails for pointer-type loop counters with greater than one byte
+/// strides, consequently preventing LFTR from running. For the purpose of LFTR
+/// we could skip this check in the case that the LFTR loop counter (chosen by
+/// FindLoopCounter) is also pointer type. Instead, we could directly convert
+/// the loop test to an inequality test by checking the target data's alignment
+/// of element types (given that the initial pointer value originates from or is
+ /// used by an ABI constrained operation, as opposed to inttoptr/ptrtoint).
+/// However, we don't yet have a strong motivation for converting loop tests
+/// into inequality tests.
+static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE,
+ SCEVExpander &Rewriter) {
+ const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) ||
+ BackedgeTakenCount->isZero())
+ return false;
+
+ if (!L->getExitingBlock())
+ return false;
+
+ // Can't rewrite non-branch yet.
+ if (!isa<BranchInst>(L->getExitingBlock()->getTerminator()))
+ return false;
+
+ if (Rewriter.isHighCostExpansion(BackedgeTakenCount, L))
+ return false;
+
+ return true;
+}
+
+/// Return the loop header phi IFF IncV adds a loop invariant value to the phi.
+static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) {
+ Instruction *IncI = dyn_cast<Instruction>(IncV);
+ if (!IncI)
+ return nullptr;
+
+ switch (IncI->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ break;
+ case Instruction::GetElementPtr:
+ // An IV counter must preserve its type.
+ if (IncI->getNumOperands() == 2)
+ break;
+ LLVM_FALLTHROUGH;
+ default:
+ return nullptr;
+ }
+
+ PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0));
+ if (Phi && Phi->getParent() == L->getHeader()) {
+ if (isLoopInvariant(IncI->getOperand(1), L, DT))
+ return Phi;
+ return nullptr;
+ }
+ if (IncI->getOpcode() == Instruction::GetElementPtr)
+ return nullptr;
+
+ // Allow add/sub to be commuted.
+ Phi = dyn_cast<PHINode>(IncI->getOperand(1));
+ if (Phi && Phi->getParent() == L->getHeader()) {
+ if (isLoopInvariant(IncI->getOperand(0), L, DT))
+ return Phi;
+ }
+ return nullptr;
+}
+
+/// Return the compare guarding the loop latch, or NULL for unrecognized tests.
+static ICmpInst *getLoopTest(Loop *L) {
+ assert(L->getExitingBlock() && "expected loop exit");
+
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ // Don't bother with LFTR if the loop is not properly simplified.
+ if (!LatchBlock)
+ return nullptr;
+
+ BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator());
+ assert(BI && "expected exit branch");
+
+ return dyn_cast<ICmpInst>(BI->getCondition());
+}
+
+/// linearFunctionTestReplace policy. Return true unless we can show that the
+/// current exit test is already sufficiently canonical.
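+ ///
+ /// Illustrative sketch (not from the original source): an exit test such as
+ ///   %exitcond = icmp ne i32 %iv.next, %limit
+ /// over a simple counter with a loop-invariant %limit is already what LFTR
+ /// would produce and is left alone, whereas something like
+ ///   %cmp = icmp slt i32 %iv, %n
+ /// is not in canonical form and is worth rewriting. Names are illustrative.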
+static bool needsLFTR(Loop *L, DominatorTree *DT) {
+ // Do LFTR to simplify the exit condition to an ICMP.
+ ICmpInst *Cond = getLoopTest(L);
+ if (!Cond)
+ return true;
+
+ // Do LFTR to simplify the exit ICMP to EQ/NE
+ ICmpInst::Predicate Pred = Cond->getPredicate();
+ if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ)
+ return true;
+
+ // Look for a loop invariant RHS
+ Value *LHS = Cond->getOperand(0);
+ Value *RHS = Cond->getOperand(1);
+ if (!isLoopInvariant(RHS, L, DT)) {
+ if (!isLoopInvariant(LHS, L, DT))
+ return true;
+ std::swap(LHS, RHS);
+ }
+ // Look for a simple IV counter LHS
+ PHINode *Phi = dyn_cast<PHINode>(LHS);
+ if (!Phi)
+ Phi = getLoopPhiForCounter(LHS, L, DT);
+
+ if (!Phi)
+ return true;
+
+ // Do LFTR if PHI node is defined in the loop, but is *not* a counter.
+ int Idx = Phi->getBasicBlockIndex(L->getLoopLatch());
+ if (Idx < 0)
+ return true;
+
+ // Do LFTR if the exit condition's IV is *not* a simple counter.
+ Value *IncV = Phi->getIncomingValue(Idx);
+ return Phi != getLoopPhiForCounter(IncV, L, DT);
+}
+
+/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
+/// down to checking that all operands are constant and listing instructions
+/// that may hide undef.
+static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited,
+ unsigned Depth) {
+ if (isa<Constant>(V))
+ return !isa<UndefValue>(V);
+
+ if (Depth >= 6)
+ return false;
+
+ // Conservatively handle non-constant non-instructions. For example, Arguments
+ // may be undef.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ // Load and return values may be undef.
+ if (I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I))
+ return false;
+
+ // Optimistically handle other instructions.
+ for (Value *Op : I->operands()) {
+ if (!Visited.insert(Op).second)
+ continue;
+ if (!hasConcreteDefImpl(Op, Visited, Depth+1))
+ return false;
+ }
+ return true;
+}
+
+/// Return true if the given value is concrete. We must prove that undef can
+/// never reach it.
+///
+/// TODO: If we decide that this is a good approach to checking for undef, we
+/// may factor it into a common location.
+static bool hasConcreteDef(Value *V) {
+ SmallPtrSet<Value*, 8> Visited;
+ Visited.insert(V);
+ return hasConcreteDefImpl(V, Visited, 0);
+}
+
+/// Return true if this IV has any uses other than the (soon to be rewritten)
+/// loop exit test.
+static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
+ int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
+ Value *IncV = Phi->getIncomingValue(LatchIdx);
+
+ for (User *U : Phi->users())
+ if (U != Cond && U != IncV) return false;
+
+ for (User *U : IncV->users())
+ if (U != Cond && U != Phi) return false;
+ return true;
+}
+
+/// Find an affine IV in canonical form.
+///
+ /// BECount may be an i8* pointer type. The pointer difference is already a
+ /// valid count without scaling the address stride, so it remains a pointer
+/// expression as far as SCEV is concerned.
+///
+ /// Currently only valid for LFTR. See the comments on hasConcreteDef above.
+///
+/// FIXME: Accept -1 stride and set IVLimit = IVInit - BECount
+///
+/// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride.
+/// This is difficult in general for SCEV because of potential overflow. But we
+/// could at least handle constant BECounts.
+static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount,
+ ScalarEvolution *SE, DominatorTree *DT) {
+ uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
+
+ Value *Cond =
+ cast<BranchInst>(L->getExitingBlock()->getTerminator())->getCondition();
+
+ // Loop over all of the PHI nodes, looking for a simple counter.
+ PHINode *BestPhi = nullptr;
+ const SCEV *BestInit = nullptr;
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ assert(LatchBlock && "needsLFTR should guarantee a loop latch");
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ PHINode *Phi = cast<PHINode>(I);
+ if (!SE->isSCEVable(Phi->getType()))
+ continue;
+
+ // Avoid comparing an integer IV against a pointer Limit.
+ if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy())
+ continue;
+
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
+ if (!AR || AR->getLoop() != L || !AR->isAffine())
+ continue;
+
+ // AR may be a pointer type, while BECount is an integer type.
+ // AR may be wider than BECount. With eq/ne tests overflow is immaterial.
+ // AR may not be a narrower type, or we may never exit.
+ uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType());
+ if (PhiWidth < BCWidth || !DL.isLegalInteger(PhiWidth))
+ continue;
+
+ const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+ if (!Step || !Step->isOne())
+ continue;
+
+ int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
+ Value *IncV = Phi->getIncomingValue(LatchIdx);
+ if (getLoopPhiForCounter(IncV, L, DT) != Phi)
+ continue;
+
+ // Avoid reusing a potentially undef value to compute other values that may
+ // have originally had a concrete definition.
+ if (!hasConcreteDef(Phi)) {
+ // We explicitly allow unknown phis as long as they are already used by
+ // the loop test. In this case we assume that performing LFTR could not
+ // increase the number of undef users.
+ if (ICmpInst *Cond = getLoopTest(L)) {
+ if (Phi != getLoopPhiForCounter(Cond->getOperand(0), L, DT) &&
+ Phi != getLoopPhiForCounter(Cond->getOperand(1), L, DT)) {
+ continue;
+ }
+ }
+ }
+ const SCEV *Init = AR->getStart();
+
+ if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) {
+ // Don't force a live loop counter if another IV can be used.
+ if (AlmostDeadIV(Phi, LatchBlock, Cond))
+ continue;
+
+ // Prefer to count-from-zero. This is a more "canonical" counter form. It
+ // also prefers integer to pointer IVs.
+ if (BestInit->isZero() != Init->isZero()) {
+ if (BestInit->isZero())
+ continue;
+ }
+ // If two IVs both count from zero or both count from nonzero then the
+ // narrower is likely a dead phi that has been widened. Use the wider phi
+ // to allow the other to be eliminated.
+ else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType()))
+ continue;
+ }
+ BestPhi = Phi;
+ BestInit = Init;
+ }
+ return BestPhi;
+}
+
+/// Help linearFunctionTestReplace by generating a value that holds the RHS of
+/// the new loop test.
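+ ///
+ /// Illustrative sketch (not from the original source): with a unit-stride
+ /// counter starting at %init, the limit is simply %init + IVCount, e.g.
+ ///   %lftr.limit = add i32 %init, %count
+ /// and for an i8* counter the same offset is applied through a GEP. Whether
+ /// IVCount is BECount or BECount+1 is decided by the caller; the names above
+ /// are illustrative only.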
+static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
+ SCEVExpander &Rewriter, ScalarEvolution *SE) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
+ assert(AR && AR->getLoop() == L && AR->isAffine() && "bad loop counter");
+ const SCEV *IVInit = AR->getStart();
+
+ // IVInit may be a pointer while IVCount is an integer when FindLoopCounter
+ // finds a valid pointer IV. Sign extend BECount in order to materialize a
+ // GEP. Avoid running SCEVExpander on a new pointer value, instead reusing
+ // the existing GEPs whenever possible.
+ if (IndVar->getType()->isPointerTy() && !IVCount->getType()->isPointerTy()) {
+ // IVOffset will be the new GEP offset that is interpreted by GEP as a
+ // signed value. IVCount on the other hand represents the loop trip count,
+ // which is an unsigned value. FindLoopCounter only allows induction
+ // variables that have a positive unit stride of one. This means we don't
+ // have to handle the case of negative offsets (yet) and just need to zero
+ // extend IVCount.
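+ // For illustration only (a sketch with made-up names): for an i8* IV based
+ // on %base and an integer IVCount %n, the code below materializes roughly
+ //   %lftr.limit = getelementptr i8, i8* %base, i64 %n
+ // where %n has first been zero-extended or truncated to the pointer-width
+ // offset type.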
+ Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType());
+ const SCEV *IVOffset = SE->getTruncateOrZeroExtend(IVCount, OfsTy);
+
+ // Expand the code for the iteration count.
+ assert(SE->isLoopInvariant(IVOffset, L) &&
+ "Computed iteration count is not loop invariant!");
+ BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+ Value *GEPOffset = Rewriter.expandCodeFor(IVOffset, OfsTy, BI);
+
+ Value *GEPBase = IndVar->getIncomingValueForBlock(L->getLoopPreheader());
+ assert(AR->getStart() == SE->getSCEV(GEPBase) && "bad loop counter");
+ // We could handle pointer IVs other than i8*, but we need to compensate for
+ // gep index scaling. See canExpandBackedgeTakenCount comments.
+ assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()),
+ cast<PointerType>(GEPBase->getType())
+ ->getElementType())->isOne() &&
+ "unit stride pointer IV must be i8*");
+
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+ return Builder.CreateGEP(nullptr, GEPBase, GEPOffset, "lftr.limit");
+ } else {
+ // In any other case, convert both IVInit and IVCount to integers before
+ // comparing. This may result in SCEV expansion of pointers, but in practice
+ // SCEV will fold the pointer arithmetic away as such:
+ // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc).
+ //
+ // Valid Cases: (1) both are integers, which is the most common; (2) both
+ // may be pointers for simple memset-style loops.
+ //
+ // IVInit integer and IVCount pointer would only occur if a canonical IV
+ // were generated on top of case #2, which is not expected.
+
+ const SCEV *IVLimit = nullptr;
+ // For unit stride, IVCount = Start + BECount with 2's complement overflow.
+ // For non-zero Start, compute IVCount here.
+ if (AR->getStart()->isZero())
+ IVLimit = IVCount;
+ else {
+ assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");
+ const SCEV *IVInit = AR->getStart();
+
+ // For integer IVs, truncate the IV before computing IVInit + BECount.
+ if (SE->getTypeSizeInBits(IVInit->getType())
+ > SE->getTypeSizeInBits(IVCount->getType()))
+ IVInit = SE->getTruncateExpr(IVInit, IVCount->getType());
+
+ IVLimit = SE->getAddExpr(IVInit, IVCount);
+ }
+ // Expand the code for the iteration count.
+ BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+ IRBuilder<> Builder(BI);
+ assert(SE->isLoopInvariant(IVLimit, L) &&
+ "Computed iteration count is not loop invariant!");
+ // Ensure that we generate the same type as IndVar, or a smaller integer
+ // type. In the presence of null pointer values, we have an integer type
+ // SCEV expression (IVInit) for a pointer type IV value (IndVar).
+ Type *LimitTy = IVCount->getType()->isPointerTy() ?
+ IndVar->getType() : IVCount->getType();
+ return Rewriter.expandCodeFor(IVLimit, LimitTy, BI);
+ }
+}
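+
+// Worked example for the integer path above (illustrative only): for a unit
+// stride IV starting at 2 with IVCount == n, genLoopLimit returns code for
+// (2 + n), and linearFunctionTestReplace below compares the chosen IV against
+// that limit with an eq/ne test.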
+
+/// This method rewrites the exit condition of the loop to be a canonical !=
+/// comparison against the incremented loop induction variable. This pass is
+/// able to rewrite the exit tests of any loop where the SCEV analysis can
+/// determine a loop-invariant trip count of the loop, which is actually a much
+/// broader range than just linear tests.
+Value *IndVarSimplify::
+linearFunctionTestReplace(Loop *L,
+ const SCEV *BackedgeTakenCount,
+ PHINode *IndVar,
+ SCEVExpander &Rewriter) {
+ assert(canExpandBackedgeTakenCount(L, SE, Rewriter) && "precondition");
+
+ // Initialize CmpIndVar and IVCount to their preincremented values.
+ Value *CmpIndVar = IndVar;
+ const SCEV *IVCount = BackedgeTakenCount;
+
+ assert(L->getLoopLatch() && "Loop no longer in simplified form?");
+
+ // If the exiting block is the same as the backedge block, we prefer to
+ // compare against the post-incremented value, otherwise we must compare
+ // against the preincremented value.
+ if (L->getExitingBlock() == L->getLoopLatch()) {
+ // Add one to the "backedge-taken" count to get the trip count.
+ // This addition may overflow, which is valid as long as the comparison is
+ // truncated to BackedgeTakenCount->getType().
+ IVCount = SE->getAddExpr(BackedgeTakenCount,
+ SE->getOne(BackedgeTakenCount->getType()));
+ // The BackedgeTaken expression contains the number of times that the
+ // backedge branches to the loop header. This is one less than the
+ // number of times the loop executes, so use the incremented indvar.
+ CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock());
+ }
+
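+ // For example (illustrative): with the latch as the exiting block, a
+ // unit-stride counter starting at zero, and a backedge taken 9 times, the
+ // body runs 10 times, so genLoopLimit produces 10 and the rewritten test
+ // compares the post-incremented IV against it.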
+ Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE);
+ assert(ExitCnt->getType()->isPointerTy() ==
+ IndVar->getType()->isPointerTy() &&
+ "genLoopLimit missed a cast");
+
+ // Insert a new icmp_ne or icmp_eq instruction before the branch.
+ BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+ ICmpInst::Predicate P;
+ if (L->contains(BI->getSuccessor(0)))
+ P = ICmpInst::ICMP_NE;
+ else
+ P = ICmpInst::ICMP_EQ;
+
+ DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
+ << " LHS:" << *CmpIndVar << '\n'
+ << " op:\t"
+ << (P == ICmpInst::ICMP_NE ? "!=" : "==") << "\n"
+ << " RHS:\t" << *ExitCnt << "\n"
+ << " IVCount:\t" << *IVCount << "\n");
+
+ IRBuilder<> Builder(BI);
+
+ // The new loop exit condition should reuse the debug location of the
+ // original loop exit condition.
+ if (auto *Cond = dyn_cast<Instruction>(BI->getCondition()))
+ Builder.SetCurrentDebugLocation(Cond->getDebugLoc());
+
+ // LFTR can ignore IV overflow and truncate to the width of
+ // BECount. This avoids materializing the add(zext(add)) expression.
+ unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType());
+ unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType());
+ if (CmpIndVarSize > ExitCntSize) {
+ const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
+ const SCEV *ARStart = AR->getStart();
+ const SCEV *ARStep = AR->getStepRecurrence(*SE);
+ // For constant IVCount, avoid truncation.
+ if (isa<SCEVConstant>(ARStart) && isa<SCEVConstant>(IVCount)) {
+ const APInt &Start = cast<SCEVConstant>(ARStart)->getAPInt();
+ APInt Count = cast<SCEVConstant>(IVCount)->getAPInt();
+ // Note that the post-inc value of BackedgeTakenCount may have overflowed
+ // above such that IVCount is now zero.
+ if (IVCount != BackedgeTakenCount && Count == 0) {
+ Count = APInt::getMaxValue(Count.getBitWidth()).zext(CmpIndVarSize);
+ ++Count;
+ }
+ else
+ Count = Count.zext(CmpIndVarSize);
+ APInt NewLimit;
+ if (cast<SCEVConstant>(ARStep)->getValue()->isNegative())
+ NewLimit = Start - Count;
+ else
+ NewLimit = Start + Count;
+ ExitCnt = ConstantInt::get(CmpIndVar->getType(), NewLimit);
+
+ DEBUG(dbgs() << " Widen RHS:\t" << *ExitCnt << "\n");
+ } else {
+ // We try to extend the trip count first. If that doesn't work, we truncate
+ // the IV.
+ // Zext(trunc(IV)) == IV implies equivalence of the following two:
+ // Trunc(IV) == ExitCnt and IV == zext(ExitCnt). Similarly for sext. If
+ // one of the two holds, extend the trip count, otherwise we truncate IV.
+ bool Extended = false;
+ const SCEV *IV = SE->getSCEV(CmpIndVar);
+ const SCEV *ZExtTrunc =
+ SE->getZeroExtendExpr(SE->getTruncateExpr(SE->getSCEV(CmpIndVar),
+ ExitCnt->getType()),
+ CmpIndVar->getType());
+
+ if (ZExtTrunc == IV) {
+ Extended = true;
+ ExitCnt = Builder.CreateZExt(ExitCnt, IndVar->getType(),
+ "wide.trip.count");
+ } else {
+ const SCEV *SExtTrunc =
+ SE->getSignExtendExpr(SE->getTruncateExpr(SE->getSCEV(CmpIndVar),
+ ExitCnt->getType()),
+ CmpIndVar->getType());
+ if (SExtTrunc == IV) {
+ Extended = true;
+ ExitCnt = Builder.CreateSExt(ExitCnt, IndVar->getType(),
+ "wide.trip.count");
+ }
+ }
+
+ if (!Extended)
+ CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
+ "lftr.wideiv");
+ }
+ }
+ Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond");
+ Value *OrigCond = BI->getCondition();
+ // It's tempting to use replaceAllUsesWith here to fully replace the old
+ // comparison, but that's not immediately safe, since users of the old
+ // comparison may not be dominated by the new comparison. Instead, just
+ // update the branch to use the new comparison; in the common case this
+ // will make the old comparison dead.
+ BI->setCondition(Cond);
+ DeadInsts.push_back(OrigCond);
+
+ ++NumLFTR;
+ Changed = true;
+ return Cond;
+}
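+
+// A minimal sketch of the overall LFTR rewrite (illustrative source-level
+// view; the names below are not produced by this code):
+//
+//   for (i = 0; i < n; ++i)          for (i = 0; i != n; ++i)
+//     body(i);                 --->    body(i);
+//
+// i.e. the original loop-varying comparison is replaced by an eq/ne test of
+// the (possibly post-incremented) IV against a loop-invariant limit.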
+
+//===----------------------------------------------------------------------===//
+// sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
+//===----------------------------------------------------------------------===//
+
+/// If there's a single exit block, sink any loop-invariant values that
+/// were defined in the preheader but not used inside the loop into the
+/// exit block to reduce register pressure in the loop.
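+///
+/// For example (illustrative): a preheader computation such as
+///   %inv = mul i32 %a, %b
+/// that is only used after the loop would be moved into the exit block, so it
+/// no longer ties up a register across the loop body.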
+void IndVarSimplify::sinkUnusedInvariants(Loop *L) {
+ BasicBlock *ExitBlock = L->getExitBlock();
+ if (!ExitBlock) return;
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) return;
+
+ BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt();
+ BasicBlock::iterator I(Preheader->getTerminator());
+ while (I != Preheader->begin()) {
+ --I;
+ // New instructions were inserted at the end of the preheader.
+ if (isa<PHINode>(I))
+ break;
+
+ // Don't move instructions which might have side effects, since the side
+ // effects need to complete before instructions inside the loop. Also don't
+ // move instructions which might read memory, since the loop may modify
+ // memory. Note that it's okay if the instruction might have undefined
+ // behavior: LoopSimplify guarantees that the preheader dominates the exit
+ // block.
+ if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ continue;
+
+ // Skip debug info intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ // Skip eh pad instructions.
+ if (I->isEHPad())
+ continue;
+
+ // Don't sink alloca: we never want to sink static alloca's out of the
+ // entry block, and correctly sinking dynamic alloca's requires
+ // checks for stacksave/stackrestore intrinsics.
+ // FIXME: Refactor this check somehow?
+ if (isa<AllocaInst>(I))
+ continue;
+
+ // Determine if there is a use in or before the loop (direct or
+ // otherwise).
+ bool UsedInLoop = false;
+ for (Use &U : I->uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ BasicBlock *UseBB = User->getParent();
+ if (PHINode *P = dyn_cast<PHINode>(User)) {
+ unsigned i =
+ PHINode::getIncomingValueNumForOperand(U.getOperandNo());
+ UseBB = P->getIncomingBlock(i);
+ }
+ if (UseBB == Preheader || L->contains(UseBB)) {
+ UsedInLoop = true;
+ break;
+ }
+ }
+
+ // If there is, the def must remain in the preheader.
+ if (UsedInLoop)
+ continue;
+
+ // Otherwise, sink it to the exit block.
+ Instruction *ToMove = &*I;
+ bool Done = false;
+
+ if (I != Preheader->begin()) {
+ // Skip debug info intrinsics.
+ do {
+ --I;
+ } while (isa<DbgInfoIntrinsic>(I) && I != Preheader->begin());
+
+ if (isa<DbgInfoIntrinsic>(I) && I == Preheader->begin())
+ Done = true;
+ } else {
+ Done = true;
+ }
+
+ ToMove->moveBefore(*ExitBlock, InsertPt);
+ if (Done) break;
+ InsertPt = ToMove->getIterator();
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// IndVarSimplify driver. Manage several subpasses of IV simplification.
+//===----------------------------------------------------------------------===//
+
+bool IndVarSimplify::run(Loop *L) {
+ // We need (and expect!) the incoming loop to be in LCSSA.
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "LCSSA required to run indvars!");
+
+ // If LoopSimplify form is not available, stay out of trouble. Some notes:
+ // - LSR currently only supports LoopSimplify-form loops. Indvars'
+ // canonicalization can be a pessimization without LSR to "clean up"
+ // afterwards.
+ // - We depend on having a preheader; in particular,
+ // Loop::getCanonicalInductionVariable only supports loops with preheaders,
+ // and we're in trouble if we can't find the induction variable even when
+ // we've manually inserted one.
+ // - LFTR relies on having a single backedge.
+ if (!L->isLoopSimplifyForm())
+ return false;
+
+ // If there are any floating-point recurrences, attempt to
+ // transform them to use integer recurrences.
+ rewriteNonIntegerIVs(L);
+
+ const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+
+ // Create a rewriter object which we'll use to transform the code with.
+ SCEVExpander Rewriter(*SE, DL, "indvars");
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+
+ // Eliminate redundant IV users.
+ //
+ // Simplification works best when run before other consumers of SCEV. We
+ // attempt to avoid evaluating SCEVs for sign/zero extend operations until
+ // other expressions involving loop IVs have been evaluated. This helps SCEV
+ // set no-wrap flags before normalizing sign/zero extension.
+ Rewriter.disableCanonicalMode();
+ simplifyAndExtend(L, Rewriter, LI);
+
+ // Check to see if this loop has a computable loop-invariant execution count.
+ // If so, this means that we can compute the final value of any expressions
+ // that are recurrent in the loop, and substitute the exit values from the
+ // loop into any instructions outside of the loop that use the final values of
+ // the current expressions.
+ //
+ if (ReplaceExitValue != NeverRepl &&
+ !isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ rewriteLoopExitValues(L, Rewriter);
+
+ // Eliminate redundant IV cycles.
+ NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts);
+
+ // If we have a trip count expression, rewrite the loop's exit condition
+ // using it. We can currently only handle loops with a single exit.
+ if (!DisableLFTR && canExpandBackedgeTakenCount(L, SE, Rewriter) &&
+ needsLFTR(L, DT)) {
+ PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT);
+ if (IndVar) {
+ // Check preconditions for proper SCEVExpander operation. SCEV does not
+ // express SCEVExpander's dependencies, such as LoopSimplify. Instead any
+ // pass that uses the SCEVExpander must do it. This does not work well for
+ // loop passes because SCEVExpander makes assumptions about all loops,
+ // while LoopPassManager only forces the current loop to be simplified.
+ //
+ // FIXME: SCEV expansion has no way to bail out, so the caller must
+ // explicitly check any assumptions made by SCEV. Brittle.
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount);
+ if (!AR || AR->getLoop()->getLoopPreheader())
+ (void)linearFunctionTestReplace(L, BackedgeTakenCount, IndVar,
+ Rewriter);
+ }
+ }
+ // Clear the rewriter cache, because values that are in the rewriter's cache
+ // can be deleted in the loop below, causing the AssertingVH in the cache to
+ // trigger.
+ Rewriter.clear();
+
+ // Now that we're done iterating through lists, clean up any instructions
+ // which are now dead.
+ while (!DeadInsts.empty())
+ if (Instruction *Inst =
+ dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val()))
+ RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
+
+ // The Rewriter may not be used from this point on.
+
+ // Loop-invariant instructions in the preheader that aren't used in the
+ // loop may be sunk below the loop to reduce register pressure.
+ sinkUnusedInvariants(L);
+
+ // rewriteFirstIterationLoopExitValues does not rely on the computation of
+ // trip count and therefore can further simplify exit values in addition to
+ // rewriteLoopExitValues.
+ rewriteFirstIterationLoopExitValues(L);
+
+ // Clean up dead instructions.
+ Changed |= DeleteDeadPHIs(L->getHeader(), TLI);
+
+ // Check a post-condition.
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Indvars did not preserve LCSSA!");
+
+ // Verify that LFTR, and any other change have not interfered with SCEV's
+ // ability to compute trip count.
+#ifndef NDEBUG
+ if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+ SE->forgetLoop(L);
+ const SCEV *NewBECount = SE->getBackedgeTakenCount(L);
+ if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) <
+ SE->getTypeSizeInBits(NewBECount->getType()))
+ NewBECount = SE->getTruncateOrNoop(NewBECount,
+ BackedgeTakenCount->getType());
+ else
+ BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount,
+ NewBECount->getType());
+ assert(BackedgeTakenCount == NewBECount && "indvars must preserve SCEV");
+ }
+#endif
+
+ return Changed;
+}
+
+PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ Function *F = L.getHeader()->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI);
+ if (!IVS.run(&L))
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+
+struct IndVarSimplifyLegacyPass : public LoopPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ IndVarSimplifyLegacyPass() : LoopPass(ID) {
+ initializeIndVarSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+ auto *TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ IndVarSimplify IVS(LI, SE, DT, DL, TLI, TTI);
+ return IVS.run(L);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char IndVarSimplifyLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(IndVarSimplifyLegacyPass, "indvars",
+ "Induction Variable Simplification", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(IndVarSimplifyLegacyPass, "indvars",
+ "Induction Variable Simplification", false, false)
+
+Pass *llvm::createIndVarSimplifyPass() {
+ return new IndVarSimplifyLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
new file mode 100644
index 000000000000..cf98088111be
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -0,0 +1,1856 @@
+//===- InductiveRangeCheckElimination.cpp - -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The InductiveRangeCheckElimination pass splits a loop's iteration space into
+// three disjoint ranges. It does that in such a way that the middle loop
+// provably does not need range checks. As an example, it will
+// convert
+//
+// len = < known positive >
+// for (i = 0; i < n; i++) {
+// if (0 <= i && i < len) {
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+//
+// to
+//
+// len = < known positive >
+// limit = smin(n, len)
+// // no first segment
+// for (i = 0; i < limit; i++) {
+// if (0 <= i && i < len) { // this check is fully redundant
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+// for (i = limit; i < n; i++) {
+// if (0 <= i && i < len) {
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden,
+ cl::init(64));
+
+static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> PrintRangeChecks("irce-print-range-checks", cl::Hidden,
+ cl::init(false));
+
+static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal",
+ cl::Hidden, cl::init(10));
+
+static cl::opt<bool> SkipProfitabilityChecks("irce-skip-profitability-checks",
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> AllowUnsignedLatchCondition("irce-allow-unsigned-latch",
+ cl::Hidden, cl::init(true));
+
+static const char *ClonedLoopTag = "irce.loop.clone";
+
+#define DEBUG_TYPE "irce"
+
+namespace {
+
+/// An inductive range check is a conditional branch in a loop with
+///
+/// 1. a very cold successor (i.e. the branch jumps to that successor very
+/// rarely)
+///
+/// and
+///
+/// 2. a condition that is provably true for some contiguous range of values
+/// taken by the containing loop's induction variable.
+///
+class InductiveRangeCheck {
+ // Classifies a range check
+ enum RangeCheckKind : unsigned {
+ // Range check of the form "0 <= I".
+ RANGE_CHECK_LOWER = 1,
+
+ // Range check of the form "I < L" where L is known positive.
+ RANGE_CHECK_UPPER = 2,
+
+ // The logical and of the RANGE_CHECK_LOWER and RANGE_CHECK_UPPER
+ // conditions.
+ RANGE_CHECK_BOTH = RANGE_CHECK_LOWER | RANGE_CHECK_UPPER,
+
+ // Unrecognized range check condition.
+ RANGE_CHECK_UNKNOWN = (unsigned)-1
+ };
+
+ static StringRef rangeCheckKindToStr(RangeCheckKind);
+
+ const SCEV *Begin = nullptr;
+ const SCEV *Step = nullptr;
+ const SCEV *End = nullptr;
+ Use *CheckUse = nullptr;
+ RangeCheckKind Kind = RANGE_CHECK_UNKNOWN;
+ bool IsSigned = true;
+
+ static RangeCheckKind parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
+ ScalarEvolution &SE, Value *&Index,
+ Value *&Length, bool &IsSigned);
+
+ static void
+ extractRangeChecksFromCond(Loop *L, ScalarEvolution &SE, Use &ConditionUse,
+ SmallVectorImpl<InductiveRangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited);
+
+public:
+ const SCEV *getBegin() const { return Begin; }
+ const SCEV *getStep() const { return Step; }
+ const SCEV *getEnd() const { return End; }
+ bool isSigned() const { return IsSigned; }
+
+ void print(raw_ostream &OS) const {
+ OS << "InductiveRangeCheck:\n";
+ OS << " Kind: " << rangeCheckKindToStr(Kind) << "\n";
+ OS << " Begin: ";
+ Begin->print(OS);
+ OS << " Step: ";
+ Step->print(OS);
+ OS << " End: ";
+ if (End)
+ End->print(OS);
+ else
+ OS << "(null)";
+ OS << "\n CheckUse: ";
+ getCheckUse()->getUser()->print(OS);
+ OS << " Operand: " << getCheckUse()->getOperandNo() << "\n";
+ }
+
+ LLVM_DUMP_METHOD
+ void dump() {
+ print(dbgs());
+ }
+
+ Use *getCheckUse() const { return CheckUse; }
+
+ /// Represents a signed integer range [Range.getBegin(), Range.getEnd()). If
+ /// R.getEnd() sle R.getBegin(), then R denotes the empty range.
+
+ class Range {
+ const SCEV *Begin;
+ const SCEV *End;
+
+ public:
+ Range(const SCEV *Begin, const SCEV *End) : Begin(Begin), End(End) {
+ assert(Begin->getType() == End->getType() && "ill-typed range!");
+ }
+
+ Type *getType() const { return Begin->getType(); }
+ const SCEV *getBegin() const { return Begin; }
+ const SCEV *getEnd() const { return End; }
+ bool isEmpty(ScalarEvolution &SE, bool IsSigned) const {
+ if (Begin == End)
+ return true;
+ if (IsSigned)
+ return SE.isKnownPredicate(ICmpInst::ICMP_SGE, Begin, End);
+ else
+ return SE.isKnownPredicate(ICmpInst::ICMP_UGE, Begin, End);
+ }
+ };
+
+ /// This is the value the condition of the branch needs to evaluate to for the
+ /// branch to take the hot successor (see (1) above).
+ bool getPassingDirection() { return true; }
+
+ /// Computes a range for the induction variable (IndVar) in which the range
+ /// check is redundant and can be constant-folded away. The induction
+ /// variable is not required to be the canonical {0,+,1} induction variable.
+ Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE,
+ const SCEVAddRecExpr *IndVar,
+ bool IsLatchSigned) const;
+
+ /// Parse out a set of inductive range checks from \p BI and append them to \p
+ /// Checks.
+ ///
+ /// NB! There may be conditions feeding into \p BI that aren't inductive range
+ /// checks, and hence don't end up in \p Checks.
+ static void
+ extractRangeChecksFromBranch(BranchInst *BI, Loop *L, ScalarEvolution &SE,
+ BranchProbabilityInfo &BPI,
+ SmallVectorImpl<InductiveRangeCheck> &Checks);
+};
+
+class InductiveRangeCheckElimination : public LoopPass {
+public:
+ static char ID;
+
+ InductiveRangeCheckElimination() : LoopPass(ID) {
+ initializeInductiveRangeCheckEliminationPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+} // end anonymous namespace
+
+char InductiveRangeCheckElimination::ID = 0;
+
+INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce",
+ "Inductive range check elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(InductiveRangeCheckElimination, "irce",
+ "Inductive range check elimination", false, false)
+
+StringRef InductiveRangeCheck::rangeCheckKindToStr(
+ InductiveRangeCheck::RangeCheckKind RCK) {
+ switch (RCK) {
+ case InductiveRangeCheck::RANGE_CHECK_UNKNOWN:
+ return "RANGE_CHECK_UNKNOWN";
+
+ case InductiveRangeCheck::RANGE_CHECK_UPPER:
+ return "RANGE_CHECK_UPPER";
+
+ case InductiveRangeCheck::RANGE_CHECK_LOWER:
+ return "RANGE_CHECK_LOWER";
+
+ case InductiveRangeCheck::RANGE_CHECK_BOTH:
+ return "RANGE_CHECK_BOTH";
+ }
+
+ llvm_unreachable("unknown range check type!");
+}
+
+/// Parse a single ICmp instruction, `ICI`, into a range check. If `ICI` cannot
+/// be interpreted as a range check, return `RANGE_CHECK_UNKNOWN` and set
+/// `Index` and `Length` to `nullptr`. Otherwise set `Index` to the value being
+/// range checked, and set `Length` to the upper limit `Index` is being range
+/// checked with if (and only if) the range check type is stronger or equal to
+/// RANGE_CHECK_UPPER.
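+///
+/// A few recognized shapes, for illustration (not an exhaustive list):
+///   icmp sge %i, 0      --> RANGE_CHECK_LOWER, Index = %i
+///   icmp slt %i, %len   --> RANGE_CHECK_UPPER, Index = %i, Length = %len
+///   icmp ult %i, %len   --> RANGE_CHECK_BOTH,  Index = %i, Length = %len
+/// where %len must be loop-invariant and known non-negative in the last two
+/// cases.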
+InductiveRangeCheck::RangeCheckKind
+InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
+ ScalarEvolution &SE, Value *&Index,
+ Value *&Length, bool &IsSigned) {
+ auto IsNonNegativeAndNotLoopVarying = [&SE, L](Value *V) {
+ const SCEV *S = SE.getSCEV(V);
+ if (isa<SCEVCouldNotCompute>(S))
+ return false;
+
+ return SE.getLoopDisposition(S, L) == ScalarEvolution::LoopInvariant &&
+ SE.isKnownNonNegative(S);
+ };
+
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *LHS = ICI->getOperand(0);
+ Value *RHS = ICI->getOperand(1);
+
+ switch (Pred) {
+ default:
+ return RANGE_CHECK_UNKNOWN;
+
+ case ICmpInst::ICMP_SLE:
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SGE:
+ IsSigned = true;
+ if (match(RHS, m_ConstantInt<0>())) {
+ Index = LHS;
+ return RANGE_CHECK_LOWER;
+ }
+ return RANGE_CHECK_UNKNOWN;
+
+ case ICmpInst::ICMP_SLT:
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SGT:
+ IsSigned = true;
+ if (match(RHS, m_ConstantInt<-1>())) {
+ Index = LHS;
+ return RANGE_CHECK_LOWER;
+ }
+
+ if (IsNonNegativeAndNotLoopVarying(LHS)) {
+ Index = RHS;
+ Length = LHS;
+ return RANGE_CHECK_UPPER;
+ }
+ return RANGE_CHECK_UNKNOWN;
+
+ case ICmpInst::ICMP_ULT:
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_UGT:
+ IsSigned = false;
+ if (IsNonNegativeAndNotLoopVarying(LHS)) {
+ Index = RHS;
+ Length = LHS;
+ return RANGE_CHECK_BOTH;
+ }
+ return RANGE_CHECK_UNKNOWN;
+ }
+
+ llvm_unreachable("default clause returns!");
+}
+
+void InductiveRangeCheck::extractRangeChecksFromCond(
+ Loop *L, ScalarEvolution &SE, Use &ConditionUse,
+ SmallVectorImpl<InductiveRangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited) {
+ Value *Condition = ConditionUse.get();
+ if (!Visited.insert(Condition).second)
+ return;
+
+ // TODO: Do the same for OR, XOR, NOT etc?
+ if (match(Condition, m_And(m_Value(), m_Value()))) {
+ extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(0),
+ Checks, Visited);
+ extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(1),
+ Checks, Visited);
+ return;
+ }
+
+ ICmpInst *ICI = dyn_cast<ICmpInst>(Condition);
+ if (!ICI)
+ return;
+
+ Value *Length = nullptr, *Index;
+ bool IsSigned;
+ auto RCKind = parseRangeCheckICmp(L, ICI, SE, Index, Length, IsSigned);
+ if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN)
+ return;
+
+ const auto *IndexAddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Index));
+ bool IsAffineIndex =
+ IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine();
+
+ if (!IsAffineIndex)
+ return;
+
+ InductiveRangeCheck IRC;
+ IRC.End = Length ? SE.getSCEV(Length) : nullptr;
+ IRC.Begin = IndexAddRec->getStart();
+ IRC.Step = IndexAddRec->getStepRecurrence(SE);
+ IRC.CheckUse = &ConditionUse;
+ IRC.Kind = RCKind;
+ IRC.IsSigned = IsSigned;
+ Checks.push_back(IRC);
+}
+
+void InductiveRangeCheck::extractRangeChecksFromBranch(
+ BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo &BPI,
+ SmallVectorImpl<InductiveRangeCheck> &Checks) {
+ if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
+ return;
+
+ BranchProbability LikelyTaken(15, 16);
+
+ if (!SkipProfitabilityChecks &&
+ BPI.getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken)
+ return;
+
+ SmallPtrSet<Value *, 8> Visited;
+ InductiveRangeCheck::extractRangeChecksFromCond(L, SE, BI->getOperandUse(0),
+ Checks, Visited);
+}
+
+// Add metadata to the loop L to disable loop optimizations. Callers need to
+// confirm that optimizing loop L is not beneficial.
+static void DisableAllLoopOptsOnLoop(Loop &L) {
+ // We do not care about any existing loopID related metadata for L, since we
+ // are setting all loop metadata to false.
+ LLVMContext &Context = L.getHeader()->getContext();
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDNode *Dummy = MDNode::get(Context, {});
+ MDNode *DisableUnroll = MDNode::get(
+ Context, {MDString::get(Context, "llvm.loop.unroll.disable")});
+ Metadata *FalseVal =
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt1Ty(Context), 0));
+ MDNode *DisableVectorize = MDNode::get(
+ Context,
+ {MDString::get(Context, "llvm.loop.vectorize.enable"), FalseVal});
+ MDNode *DisableLICMVersioning = MDNode::get(
+ Context, {MDString::get(Context, "llvm.loop.licm_versioning.disable")});
+ MDNode *DisableDistribution= MDNode::get(
+ Context,
+ {MDString::get(Context, "llvm.loop.distribute.enable"), FalseVal});
+ MDNode *NewLoopID =
+ MDNode::get(Context, {Dummy, DisableUnroll, DisableVectorize,
+ DisableLICMVersioning, DisableDistribution});
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L.setLoopID(NewLoopID);
+}
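+
+// For illustration, the loop ID built above looks roughly like the following
+// metadata (an approximate sketch, not verbatim compiler output):
+//   !0 = distinct !{!0, !{!"llvm.loop.unroll.disable"},
+//                   !{!"llvm.loop.vectorize.enable", i1 false},
+//                   !{!"llvm.loop.licm_versioning.disable"},
+//                   !{!"llvm.loop.distribute.enable", i1 false}}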
+
+namespace {
+
+// Keeps track of the structure of a loop. This is similar to llvm::Loop,
+// except that it is more lightweight and can track the state of a loop through
+// changing and potentially invalid IR. This structure also formalizes the
+// kinds of loops we can deal with -- ones that have a single latch that is also
+// an exiting block *and* have a canonical induction variable.
+struct LoopStructure {
+ const char *Tag = "";
+
+ BasicBlock *Header = nullptr;
+ BasicBlock *Latch = nullptr;
+
+ // `Latch's terminator instruction is `LatchBr', and its `LatchBrExitIdx'th
+ // successor is `LatchExit', the exit block of the loop.
+ BranchInst *LatchBr = nullptr;
+ BasicBlock *LatchExit = nullptr;
+ unsigned LatchBrExitIdx = std::numeric_limits<unsigned>::max();
+
+ // The loop represented by this instance of LoopStructure is semantically
+ // equivalent to:
+ //
+ // intN_ty inc = IndVarIncreasing ? 1 : -1;
+ // pred_ty predicate = IndVarIncreasing ? ICMP_SLT : ICMP_SGT;
+ //
+ // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarBase)
+ // ... body ...
+
+ Value *IndVarBase = nullptr;
+ Value *IndVarStart = nullptr;
+ Value *IndVarStep = nullptr;
+ Value *LoopExitAt = nullptr;
+ bool IndVarIncreasing = false;
+ bool IsSignedPredicate = true;
+
+ LoopStructure() = default;
+
+ template <typename M> LoopStructure map(M Map) const {
+ LoopStructure Result;
+ Result.Tag = Tag;
+ Result.Header = cast<BasicBlock>(Map(Header));
+ Result.Latch = cast<BasicBlock>(Map(Latch));
+ Result.LatchBr = cast<BranchInst>(Map(LatchBr));
+ Result.LatchExit = cast<BasicBlock>(Map(LatchExit));
+ Result.LatchBrExitIdx = LatchBrExitIdx;
+ Result.IndVarBase = Map(IndVarBase);
+ Result.IndVarStart = Map(IndVarStart);
+ Result.IndVarStep = Map(IndVarStep);
+ Result.LoopExitAt = Map(LoopExitAt);
+ Result.IndVarIncreasing = IndVarIncreasing;
+ Result.IsSignedPredicate = IsSignedPredicate;
+ return Result;
+ }
+
+ static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &,
+ BranchProbabilityInfo &BPI,
+ Loop &,
+ const char *&);
+};
+
+/// This class is used to constrain loops to run within a given iteration space.
+/// The algorithm this class implements is given a Loop and a range [Begin,
+/// End). The algorithm then tries to break a "main loop" out of the loop it
+/// is given, such that the "main loop" runs with the induction variable
+/// in a subset of [Begin, End). The algorithm emits appropriate pre and post
+/// loops to run any remaining iterations. The pre loop runs any iterations in
+/// which the induction variable is < Begin, and the post loop runs any
+/// iterations in which the induction variable is >= End.
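+///
+/// Schematically, for an increasing IV with original bounds [Start, Stop) and
+/// assuming Start <= Begin <= End <= Stop (an illustrative sketch only):
+///
+///   for (i = Start; i < Begin; i++) { ... }  // pre loop
+///   for (i = Begin; i < End;   i++) { ... }  // main loop
+///   for (i = End;   i < Stop;  i++) { ... }  // post loop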
+class LoopConstrainer {
+ // The representation of a clone of the original loop we started out with.
+ struct ClonedLoop {
+ // The cloned blocks
+ std::vector<BasicBlock *> Blocks;
+
+ // `Map` maps values in the clonee into values in the cloned version
+ ValueToValueMapTy Map;
+
+ // An instance of `LoopStructure` for the cloned loop
+ LoopStructure Structure;
+ };
+
+ // Result of rewriting the range of a loop. See changeIterationSpaceEnd for
+ // more details on what these fields mean.
+ struct RewrittenRangeInfo {
+ BasicBlock *PseudoExit = nullptr;
+ BasicBlock *ExitSelector = nullptr;
+ std::vector<PHINode *> PHIValuesAtPseudoExit;
+ PHINode *IndVarEnd = nullptr;
+
+ RewrittenRangeInfo() = default;
+ };
+
+ // Calculated subranges we restrict the iteration space of the main loop to.
+ // See the implementation of `calculateSubRanges' for more details on how
+ // these fields are computed. `LowLimit` is None if there is no restriction
+ // on the low end of the restricted iteration space of the main loop.
+ // `HighLimit` is None if there is no restriction on the high end of the
+ // restricted iteration space of the main loop.
+
+ struct SubRanges {
+ Optional<const SCEV *> LowLimit;
+ Optional<const SCEV *> HighLimit;
+ };
+
+ // A utility function that does a `replaceUsesOfWith' on the incoming block
+ // set of a `PHINode' -- replaces instances of `Block' in the `PHINode's
+ // incoming block list with `ReplaceBy'.
+ static void replacePHIBlock(PHINode *PN, BasicBlock *Block,
+ BasicBlock *ReplaceBy);
+
+ // Compute a safe set of limits for the main loop to run in -- effectively the
+ // intersection of `Range' and the iteration space of the original loop.
+ // Return None if unable to compute the set of subranges.
+ Optional<SubRanges> calculateSubRanges(bool IsSignedPredicate) const;
+
+ // Clone `OriginalLoop' and return the result in CLResult. The IR after
+ // running `cloneLoop' is well formed except for the PHI nodes in CLResult --
+ // the PHI nodes say that there is an incoming edge from `OriginalPreheader`
+ // but there is no such edge.
+ void cloneLoop(ClonedLoop &CLResult, const char *Tag) const;
+
+ // Create the appropriate loop structure needed to describe a cloned copy of
+ // `Original`. The clone is described by `VM`.
+ Loop *createClonedLoopStructure(Loop *Original, Loop *Parent,
+ ValueToValueMapTy &VM);
+
+ // Rewrite the iteration space of the loop denoted by (LS, Preheader). The
+ // iteration space of the rewritten loop ends at ExitLoopAt. The start of the
+ // iteration space is not changed. `ExitLoopAt' is assumed to be slt
+ // `OriginalHeaderCount'.
+ //
+ // If there are iterations left to execute, control is made to jump to
+ // `ContinuationBlock', otherwise they take the normal loop exit. The
+ // returned `RewrittenRangeInfo' object is populated as follows:
+ //
+ // .PseudoExit is a basic block that unconditionally branches to
+ // `ContinuationBlock'.
+ //
+ // .ExitSelector is a basic block that decides, on exit from the loop,
+ // whether to branch to the "true" exit or to `PseudoExit'.
+ //
+ // .PHIValuesAtPseudoExit are PHINodes in `PseudoExit' that compute the value
+ // for each PHINode in the loop header on taking the pseudo exit.
+ //
+ // After changeIterationSpaceEnd, `Preheader' is no longer a legitimate
+ // preheader because it is made to branch to the loop header only
+ // conditionally.
+ RewrittenRangeInfo
+ changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader,
+ Value *ExitLoopAt,
+ BasicBlock *ContinuationBlock) const;
+
+ // The loop denoted by `LS' has `OldPreheader' as its preheader. This
+ // function creates a new preheader for `LS' and returns it.
+ BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader,
+ const char *Tag) const;
+
+ // `ContinuationBlockAndPreheader' was the continuation block for some call to
+ // `changeIterationSpaceEnd' and is the preheader to the loop denoted by `LS'.
+ // This function rewrites the PHI nodes in `LS.Header' to start with the
+ // correct value.
+ void rewriteIncomingValuesForPHIs(
+ LoopStructure &LS, BasicBlock *ContinuationBlockAndPreheader,
+ const LoopConstrainer::RewrittenRangeInfo &RRI) const;
+
+ // Even though we do not preserve any passes at this time, we at least need to
+ // keep the parent loop structure consistent. The `LPPassManager' seems to
+ // verify this after running a loop pass. This function adds the list of
+ // blocks denoted by BBs to this loop's parent loop if required.
+ void addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs);
+
+ // Some global state.
+ Function &F;
+ LLVMContext &Ctx;
+ ScalarEvolution &SE;
+ DominatorTree &DT;
+ LPPassManager &LPM;
+ LoopInfo &LI;
+
+ // Information about the original loop we started out with.
+ Loop &OriginalLoop;
+
+ const SCEV *LatchTakenCount = nullptr;
+ BasicBlock *OriginalPreheader = nullptr;
+
+ // The preheader of the main loop. This may or may not be different from
+ // `OriginalPreheader'.
+ BasicBlock *MainLoopPreheader = nullptr;
+
+ // The range we need to run the main loop in.
+ InductiveRangeCheck::Range Range;
+
+ // The structure of the main loop (see comment at the beginning of this class
+ // for a definition)
+ LoopStructure MainLoopStructure;
+
+public:
+ LoopConstrainer(Loop &L, LoopInfo &LI, LPPassManager &LPM,
+ const LoopStructure &LS, ScalarEvolution &SE,
+ DominatorTree &DT, InductiveRangeCheck::Range R)
+ : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()),
+ SE(SE), DT(DT), LPM(LPM), LI(LI), OriginalLoop(L), Range(R),
+ MainLoopStructure(LS) {}
+
+ // Entry point for the algorithm. Returns true on success.
+ bool run();
+};
+
+} // end anonymous namespace
+
+void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block,
+ BasicBlock *ReplaceBy) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingBlock(i) == Block)
+ PN->setIncomingBlock(i, ReplaceBy);
+}
+
+static bool CanBeMax(ScalarEvolution &SE, const SCEV *S, bool Signed) {
+ APInt Max = Signed ?
+ APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth()) :
+ APInt::getMaxValue(cast<IntegerType>(S->getType())->getBitWidth());
+ return SE.getSignedRange(S).contains(Max) &&
+ SE.getUnsignedRange(S).contains(Max);
+}
+
+static bool SumCanReachMax(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2,
+ bool Signed) {
+ // S1 < INT_MAX - S2 ===> S1 + S2 < INT_MAX.
+ assert(SE.isKnownNonNegative(S2) &&
+ "We expected the 2nd arg to be non-negative!");
+ const SCEV *Max = SE.getConstant(
+ Signed ? APInt::getSignedMaxValue(
+ cast<IntegerType>(S1->getType())->getBitWidth())
+ : APInt::getMaxValue(
+ cast<IntegerType>(S1->getType())->getBitWidth()));
+ const SCEV *CapForS1 = SE.getMinusSCEV(Max, S2);
+ return !SE.isKnownPredicate(Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+ S1, CapForS1);
+}
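+
+// Worked example (illustrative, using i8 for brevity): with S2 == 3, the sum
+// is known to stay below the signed maximum only if SE can prove S1 < 124
+// (i.e. 127 - 3); otherwise SumCanReachMax conservatively returns true.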
+
+static bool CanBeMin(ScalarEvolution &SE, const SCEV *S, bool Signed) {
+ APInt Min = Signed ?
+ APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth()) :
+ APInt::getMinValue(cast<IntegerType>(S->getType())->getBitWidth());
+ return SE.getSignedRange(S).contains(Min) &&
+ SE.getUnsignedRange(S).contains(Min);
+}
+
+static bool SumCanReachMin(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2,
+ bool Signed) {
+ // S1 > INT_MIN - S2 ===> S1 + S2 > INT_MIN.
+ assert(SE.isKnownNonPositive(S2) &&
+ "We expected the 2nd arg to be non-positive!");
+ const SCEV *Max = SE.getConstant(
+ Signed ? APInt::getSignedMinValue(
+ cast<IntegerType>(S1->getType())->getBitWidth())
+ : APInt::getMinValue(
+ cast<IntegerType>(S1->getType())->getBitWidth()));
+ const SCEV *CapForS1 = SE.getMinusSCEV(Max, S2);
+ return !SE.isKnownPredicate(Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT,
+ S1, CapForS1);
+}
+
+Optional<LoopStructure>
+LoopStructure::parseLoopStructure(ScalarEvolution &SE,
+ BranchProbabilityInfo &BPI,
+ Loop &L, const char *&FailureReason) {
+ if (!L.isLoopSimplifyForm()) {
+ FailureReason = "loop not in LoopSimplify form";
+ return None;
+ }
+
+ BasicBlock *Latch = L.getLoopLatch();
+ assert(Latch && "Simplified loops only have one latch!");
+
+ if (Latch->getTerminator()->getMetadata(ClonedLoopTag)) {
+ FailureReason = "loop has already been cloned";
+ return None;
+ }
+
+ if (!L.isLoopExiting(Latch)) {
+ FailureReason = "no loop latch";
+ return None;
+ }
+
+ BasicBlock *Header = L.getHeader();
+ BasicBlock *Preheader = L.getLoopPreheader();
+ if (!Preheader) {
+ FailureReason = "no preheader";
+ return None;
+ }
+
+ BranchInst *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!LatchBr || LatchBr->isUnconditional()) {
+ FailureReason = "latch terminator not conditional branch";
+ return None;
+ }
+
+ unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0;
+
+ BranchProbability ExitProbability =
+ BPI.getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx);
+
+ if (!SkipProfitabilityChecks &&
+ ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) {
+ FailureReason = "short running loop, not profitable";
+ return None;
+ }
+
+ ICmpInst *ICI = dyn_cast<ICmpInst>(LatchBr->getCondition());
+ if (!ICI || !isa<IntegerType>(ICI->getOperand(0)->getType())) {
+ FailureReason = "latch terminator branch not conditional on integral icmp";
+ return None;
+ }
+
+ const SCEV *LatchCount = SE.getExitCount(&L, Latch);
+ if (isa<SCEVCouldNotCompute>(LatchCount)) {
+ FailureReason = "could not compute latch count";
+ return None;
+ }
+
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *LeftValue = ICI->getOperand(0);
+ const SCEV *LeftSCEV = SE.getSCEV(LeftValue);
+ IntegerType *IndVarTy = cast<IntegerType>(LeftValue->getType());
+
+ Value *RightValue = ICI->getOperand(1);
+ const SCEV *RightSCEV = SE.getSCEV(RightValue);
+
+ // We canonicalize `ICI` such that `LeftSCEV` is an add recurrence.
+ if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
+ if (isa<SCEVAddRecExpr>(RightSCEV)) {
+ std::swap(LeftSCEV, RightSCEV);
+ std::swap(LeftValue, RightValue);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ } else {
+ FailureReason = "no add recurrences in the icmp";
+ return None;
+ }
+ }
+
+ auto HasNoSignedWrap = [&](const SCEVAddRecExpr *AR) {
+ if (AR->getNoWrapFlags(SCEV::FlagNSW))
+ return true;
+
+ IntegerType *Ty = cast<IntegerType>(AR->getType());
+ IntegerType *WideTy =
+ IntegerType::get(Ty->getContext(), Ty->getBitWidth() * 2);
+
+ const SCEVAddRecExpr *ExtendAfterOp =
+ dyn_cast<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+ if (ExtendAfterOp) {
+ const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy);
+ const SCEV *ExtendedStep =
+ SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy);
+
+ bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart &&
+ ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep;
+
+ if (NoSignedWrap)
+ return true;
+ }
+
+ // We may have proved this when computing the sign extension above.
+ return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap;
+ };
+
+ // Here we check whether the suggested AddRec is an induction variable that
+ // can be handled (i.e. with known constant step), and if yes, calculate its
+ // step and identify whether it is increasing or decreasing.
+ auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing,
+ ConstantInt *&StepCI) {
+ if (!AR->isAffine())
+ return false;
+
+ // Currently we only work with induction variables that have been proved to
+ // not wrap. This restriction can potentially be lifted in the future.
+
+ if (!HasNoSignedWrap(AR))
+ return false;
+
+ if (const SCEVConstant *StepExpr =
+ dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) {
+ StepCI = StepExpr->getValue();
+ assert(!StepCI->isZero() && "Zero step?");
+ IsIncreasing = !StepCI->isNegative();
+ return true;
+ }
+
+ return false;
+ };
+
+ // `ICI` is interpreted as taking the backedge if the *next* value of the
+ // induction variable satisfies some constraint.
+
+ const SCEVAddRecExpr *IndVarBase = cast<SCEVAddRecExpr>(LeftSCEV);
+ bool IsIncreasing = false;
+ bool IsSignedPredicate = true;
+ ConstantInt *StepCI;
+ if (!IsInductionVar(IndVarBase, IsIncreasing, StepCI)) {
+ FailureReason = "LHS in icmp not induction variable";
+ return None;
+ }
+
+ const SCEV *StartNext = IndVarBase->getStart();
+ const SCEV *Addend = SE.getNegativeSCEV(IndVarBase->getStepRecurrence(SE));
+ const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
+ const SCEV *Step = SE.getSCEV(StepCI);
+
+ ConstantInt *One = ConstantInt::get(IndVarTy, 1);
+ if (IsIncreasing) {
+ bool DecreasedRightValueByOne = false;
+ if (StepCI->isOne()) {
+ // Try to turn eq/ne predicates to those we can work with.
+ if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+ // while (++i != len) { while (++i < len) {
+ // ... ---> ...
+ // } }
+ // If both parts are known non-negative, it is profitable to use
+ // unsigned comparison in increasing loop. This allows us to make the
+ // comparison check against "RightSCEV + 1" more optimistic.
+ if (SE.isKnownNonNegative(IndVarStart) &&
+ SE.isKnownNonNegative(RightSCEV))
+ Pred = ICmpInst::ICMP_ULT;
+ else
+ Pred = ICmpInst::ICMP_SLT;
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
+ !CanBeMin(SE, RightSCEV, /* IsSignedPredicate */ true)) {
+ // while (true) { while (true) {
+ // if (++i == len) ---> if (++i > len - 1)
+ // break; break;
+ // ... ...
+ // } }
+ // TODO: Insert ICMP_UGT if both are non-negative?
+ Pred = ICmpInst::ICMP_SGT;
+ RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
+ DecreasedRightValueByOne = true;
+ }
+ }
+
+ bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT);
+ bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT);
+ bool FoundExpectedPred =
+ (LTPred && LatchBrExitIdx == 1) || (GTPred && LatchBrExitIdx == 0);
+
+ if (!FoundExpectedPred) {
+ FailureReason = "expected icmp slt semantically, found something else";
+ return None;
+ }
+
+ IsSignedPredicate =
+ Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT;
+
+ if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
+ FailureReason = "unsigned latch conditions are explicitly prohibited";
+ return None;
+ }
+
+ // The predicate that we need to check that the induction variable lies
+ // within bounds.
+ ICmpInst::Predicate BoundPred =
+ IsSignedPredicate ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
+
+ if (LatchBrExitIdx == 0) {
+ const SCEV *StepMinusOne = SE.getMinusSCEV(Step,
+ SE.getOne(Step->getType()));
+ if (SumCanReachMax(SE, RightSCEV, StepMinusOne, IsSignedPredicate)) {
+ // TODO: this restriction is easily removable -- we just have to
+ // remember that the icmp was an slt and not an sle.
+ FailureReason = "limit may overflow when coercing le to lt";
+ return None;
+ }
+
+ if (!SE.isLoopEntryGuardedByCond(
+ &L, BoundPred, IndVarStart,
+ SE.getAddExpr(RightSCEV, Step))) {
+ FailureReason = "Induction variable start not bounded by upper limit";
+ return None;
+ }
+
+ // We need to increase the right value unless we have already decreased
+ // it virtually when we replaced EQ with SGT.
+ if (!DecreasedRightValueByOne) {
+ IRBuilder<> B(Preheader->getTerminator());
+ RightValue = B.CreateAdd(RightValue, One);
+ }
+ } else {
+ if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) {
+ FailureReason = "Induction variable start not bounded by upper limit";
+ return None;
+ }
+ assert(!DecreasedRightValueByOne &&
+ "Right value can be decreased only for LatchBrExitIdx == 0!");
+ }
+ } else {
+ bool IncreasedRightValueByOne = false;
+ if (StepCI->isMinusOne()) {
+ // Try to turn eq/ne predicates to those we can work with.
+ if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+ // while (--i != len) { while (--i > len) {
+ // ... ---> ...
+ // } }
+ // We intentionally don't turn the predicate into UGT even if we know
+ // that both operands are non-negative, because it will only pessimize
+ // our check against "RightSCEV - 1".
+ Pred = ICmpInst::ICMP_SGT;
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
+ !CanBeMax(SE, RightSCEV, /* IsSignedPredicate */ true)) {
+ // while (true) { while (true) {
+ // if (--i == len) ---> if (--i < len + 1)
+ // break; break;
+ // ... ...
+ // } }
+ // TODO: Insert ICMP_ULT if both are non-negative?
+ Pred = ICmpInst::ICMP_SLT;
+ RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ IncreasedRightValueByOne = true;
+ }
+ }
+
+ bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT);
+ bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT);
+
+ bool FoundExpectedPred =
+ (GTPred && LatchBrExitIdx == 1) || (LTPred && LatchBrExitIdx == 0);
+
+ if (!FoundExpectedPred) {
+ FailureReason = "expected icmp sgt semantically, found something else";
+ return None;
+ }
+
+ IsSignedPredicate =
+ Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT;
+
+ if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
+ FailureReason = "unsigned latch conditions are explicitly prohibited";
+ return None;
+ }
+
+ // The predicate that we need to check that the induction variable lies
+ // within bounds.
+ ICmpInst::Predicate BoundPred =
+ IsSignedPredicate ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
+
+ if (LatchBrExitIdx == 0) {
+ const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType()));
+ if (SumCanReachMin(SE, RightSCEV, StepPlusOne, IsSignedPredicate)) {
+ // TODO: this restriction is easily removable -- we just have to
+ // remember that the icmp was an sgt and not an sge.
+ FailureReason = "limit may overflow when coercing ge to gt";
+ return None;
+ }
+
+ if (!SE.isLoopEntryGuardedByCond(
+ &L, BoundPred, IndVarStart,
+ SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())))) {
+ FailureReason = "Induction variable start not bounded by lower limit";
+ return None;
+ }
+
+ // We need to decrease the right value unless we have already increased
+ // it virtually when we replaced EQ with SLT.
+ if (!IncreasedRightValueByOne) {
+ IRBuilder<> B(Preheader->getTerminator());
+ RightValue = B.CreateSub(RightValue, One);
+ }
+ } else {
+ if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) {
+ FailureReason = "Induction variable start not bounded by lower limit";
+ return None;
+ }
+ assert(!IncreasedRightValueByOne &&
+ "Right value can be increased only for LatchBrExitIdx == 0!");
+ }
+ }
+ BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
+
+ assert(SE.getLoopDisposition(LatchCount, &L) ==
+ ScalarEvolution::LoopInvariant &&
+ "loop variant exit count doesn't make sense!");
+
+ assert(!L.contains(LatchExit) && "expected an exit block!");
+ const DataLayout &DL = Preheader->getModule()->getDataLayout();
+ Value *IndVarStartV =
+ SCEVExpander(SE, DL, "irce")
+ .expandCodeFor(IndVarStart, IndVarTy, Preheader->getTerminator());
+ IndVarStartV->setName("indvar.start");
+
+ LoopStructure Result;
+
+ Result.Tag = "main";
+ Result.Header = Header;
+ Result.Latch = Latch;
+ Result.LatchBr = LatchBr;
+ Result.LatchExit = LatchExit;
+ Result.LatchBrExitIdx = LatchBrExitIdx;
+ Result.IndVarStart = IndVarStartV;
+ Result.IndVarStep = StepCI;
+ Result.IndVarBase = LeftValue;
+ Result.IndVarIncreasing = IsIncreasing;
+ Result.LoopExitAt = RightValue;
+ Result.IsSignedPredicate = IsSignedPredicate;
+
+ FailureReason = nullptr;
+
+ return Result;
+}
+
+Optional<LoopConstrainer::SubRanges>
+LoopConstrainer::calculateSubRanges(bool IsSignedPredicate) const {
+ IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType());
+
+ if (Range.getType() != Ty)
+ return None;
+
+ LoopConstrainer::SubRanges Result;
+
+ // I think we can be more aggressive here and make this nuw / nsw if the
+ // addition that feeds into the icmp for the latch's terminating branch is nuw
+ // / nsw. In any case, a wrapping 2's complement addition is safe.
+ const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart);
+ const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt);
+
+ bool Increasing = MainLoopStructure.IndVarIncreasing;
+
+ // We compute `Smallest` and `Greatest` such that [Smallest, Greatest), or
+ // [Smallest, GreatestSeen] is the range of values the induction variable
+ // takes.
+
+ const SCEV *Smallest = nullptr, *Greatest = nullptr, *GreatestSeen = nullptr;
+
+ const SCEV *One = SE.getOne(Ty);
+ if (Increasing) {
+ Smallest = Start;
+ Greatest = End;
+ // No overflow, because the range [Smallest, GreatestSeen] is not empty.
+ GreatestSeen = SE.getMinusSCEV(End, One);
+ } else {
+ // These two computations may sign-overflow. Here is why that is okay:
+ //
+ // We know that the induction variable does not sign-overflow on any
+ // iteration except the last one, and it starts at `Start` and ends at
+ // `End`, decrementing by one every time.
+ //
+ // * if `Smallest` sign-overflows we know `End` is `INT_SMAX`. Since the
+ // induction variable is decreasing, we know that the smallest value
+ // the loop body is actually executed with is `INT_SMIN` == `Smallest`.
+ //
+ // * if `Greatest` sign-overflows, we know it can only be `INT_SMIN`. In
+ // that case, `Clamp` will always return `Smallest` and
+ // [`Result.LowLimit`, `Result.HighLimit`) = [`Smallest`, `Smallest`)
+ // will be an empty range. Returning an empty range is always safe.
+
+ Smallest = SE.getAddExpr(End, One);
+ Greatest = SE.getAddExpr(Start, One);
+ GreatestSeen = Start;
+ }
+
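+  // Clamp pins a value into [Smallest, Greatest], using signed or unsigned
+  // min/max expressions to match the latch predicate. The clamped begin/end
+  // of the range-check interval become Result.LowLimit / Result.HighLimit
+  // below.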
+ auto Clamp = [this, Smallest, Greatest, IsSignedPredicate](const SCEV *S) {
+ return IsSignedPredicate
+ ? SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S))
+ : SE.getUMaxExpr(Smallest, SE.getUMinExpr(Greatest, S));
+ };
+
+ // In some cases we can prove that we don't need a pre or post loop.
+ ICmpInst::Predicate PredLE =
+ IsSignedPredicate ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
+ ICmpInst::Predicate PredLT =
+ IsSignedPredicate ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+
+ bool ProvablyNoPreloop =
+ SE.isKnownPredicate(PredLE, Range.getBegin(), Smallest);
+ if (!ProvablyNoPreloop)
+ Result.LowLimit = Clamp(Range.getBegin());
+
+ bool ProvablyNoPostLoop =
+ SE.isKnownPredicate(PredLT, GreatestSeen, Range.getEnd());
+ if (!ProvablyNoPostLoop)
+ Result.HighLimit = Clamp(Range.getEnd());
+
+ return Result;
+}
+
+void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result,
+ const char *Tag) const {
+ for (BasicBlock *BB : OriginalLoop.getBlocks()) {
+ BasicBlock *Clone = CloneBasicBlock(BB, Result.Map, Twine(".") + Tag, &F);
+ Result.Blocks.push_back(Clone);
+ Result.Map[BB] = Clone;
+ }
+
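+  // Values defined outside the loop are not in Result.Map; GetClonedValue
+  // returns them unchanged, so only loop-local values are remapped to their
+  // clones.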
+ auto GetClonedValue = [&Result](Value *V) {
+ assert(V && "null values not in domain!");
+ auto It = Result.Map.find(V);
+ if (It == Result.Map.end())
+ return V;
+ return static_cast<Value *>(It->second);
+ };
+
+ auto *ClonedLatch =
+ cast<BasicBlock>(GetClonedValue(OriginalLoop.getLoopLatch()));
+ ClonedLatch->getTerminator()->setMetadata(ClonedLoopTag,
+ MDNode::get(Ctx, {}));
+
+ Result.Structure = MainLoopStructure.map(GetClonedValue);
+ Result.Structure.Tag = Tag;
+
+ for (unsigned i = 0, e = Result.Blocks.size(); i != e; ++i) {
+ BasicBlock *ClonedBB = Result.Blocks[i];
+ BasicBlock *OriginalBB = OriginalLoop.getBlocks()[i];
+
+ assert(Result.Map[OriginalBB] == ClonedBB && "invariant!");
+
+ for (Instruction &I : *ClonedBB)
+ RemapInstruction(&I, Result.Map,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+ // Exit blocks will now have one more predecessor and their PHI nodes need
+ // to be edited to reflect that. No phi nodes need to be introduced because
+ // the loop is in LCSSA.
+
+ for (auto *SBB : successors(OriginalBB)) {
+ if (OriginalLoop.contains(SBB))
+ continue; // not an exit block
+
+ for (PHINode &PN : SBB->phis()) {
+ Value *OldIncoming = PN.getIncomingValueForBlock(OriginalBB);
+ PN.addIncoming(GetClonedValue(OldIncoming), ClonedBB);
+ }
+ }
+ }
+}
+
+LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
+ const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt,
+ BasicBlock *ContinuationBlock) const {
+ // We start with a loop with a single latch:
+ //
+ // +--------------------+
+ // | |
+ // | preheader |
+ // | |
+ // +--------+-----------+
+ // | ----------------\
+ // | / |
+ // +--------v----v------+ |
+ // | | |
+ // | header | |
+ // | | |
+ // +--------------------+ |
+ // |
+ // ..... |
+ // |
+ // +--------------------+ |
+ // | | |
+ // | latch >----------/
+ // | |
+ // +-------v------------+
+ // |
+ // |
+ // | +--------------------+
+ // | | |
+ // +---> original exit |
+ // | |
+ // +--------------------+
+ //
+ // We change the control flow to look like
+ //
+ //
+ // +--------------------+
+ // | |
+ // | preheader >-------------------------+
+ // | | |
+ // +--------v-----------+ |
+ // | /-------------+ |
+ // | / | |
+ // +--------v--v--------+ | |
+ // | | | |
+ // | header | | +--------+ |
+ // | | | | | |
+ // +--------------------+ | | +-----v-----v-----------+
+ // | | | |
+ // | | | .pseudo.exit |
+ // | | | |
+ // | | +-----------v-----------+
+ // | | |
+ // ..... | | |
+ // | | +--------v-------------+
+ // +--------------------+ | | | |
+ // | | | | | ContinuationBlock |
+ // | latch >------+ | | |
+ // | | | +----------------------+
+ // +---------v----------+ |
+ // | |
+ // | |
+ // | +---------------^-----+
+ // | | |
+ // +-----> .exit.selector |
+ // | |
+ // +----------v----------+
+ // |
+ // +--------------------+ |
+ // | | |
+ // | original exit <----+
+ // | |
+ // +--------------------+
+
+ RewrittenRangeInfo RRI;
+
+ BasicBlock *BBInsertLocation = LS.Latch->getNextNode();
+ RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector",
+ &F, BBInsertLocation);
+ RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F,
+ BBInsertLocation);
+
+ BranchInst *PreheaderJump = cast<BranchInst>(Preheader->getTerminator());
+ bool Increasing = LS.IndVarIncreasing;
+ bool IsSignedPredicate = LS.IsSignedPredicate;
+
+ IRBuilder<> B(PreheaderJump);
+
+ // EnterLoopCond - is it okay to start executing this `LS'?
+ Value *EnterLoopCond = nullptr;
+ if (Increasing)
+ EnterLoopCond = IsSignedPredicate
+ ? B.CreateICmpSLT(LS.IndVarStart, ExitSubloopAt)
+ : B.CreateICmpULT(LS.IndVarStart, ExitSubloopAt);
+ else
+ EnterLoopCond = IsSignedPredicate
+ ? B.CreateICmpSGT(LS.IndVarStart, ExitSubloopAt)
+ : B.CreateICmpUGT(LS.IndVarStart, ExitSubloopAt);
+
+ B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit);
+ PreheaderJump->eraseFromParent();
+
+ LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector);
+ B.SetInsertPoint(LS.LatchBr);
+ Value *TakeBackedgeLoopCond = nullptr;
+ if (Increasing)
+ TakeBackedgeLoopCond = IsSignedPredicate
+ ? B.CreateICmpSLT(LS.IndVarBase, ExitSubloopAt)
+ : B.CreateICmpULT(LS.IndVarBase, ExitSubloopAt);
+ else
+ TakeBackedgeLoopCond = IsSignedPredicate
+ ? B.CreateICmpSGT(LS.IndVarBase, ExitSubloopAt)
+ : B.CreateICmpUGT(LS.IndVarBase, ExitSubloopAt);
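+  // The backedge is the latch successor opposite LatchBrExitIdx, so when the
+  // exit sits at successor 0 the backedge is taken on a false condition and
+  // the computed condition must be negated.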
+ Value *CondForBranch = LS.LatchBrExitIdx == 1
+ ? TakeBackedgeLoopCond
+ : B.CreateNot(TakeBackedgeLoopCond);
+
+ LS.LatchBr->setCondition(CondForBranch);
+
+ B.SetInsertPoint(RRI.ExitSelector);
+
+ // IterationsLeft - are there any more iterations left, given the original
+ // upper bound on the induction variable? If not, we branch to the "real"
+ // exit.
+ Value *IterationsLeft = nullptr;
+ if (Increasing)
+ IterationsLeft = IsSignedPredicate
+ ? B.CreateICmpSLT(LS.IndVarBase, LS.LoopExitAt)
+ : B.CreateICmpULT(LS.IndVarBase, LS.LoopExitAt);
+ else
+ IterationsLeft = IsSignedPredicate
+ ? B.CreateICmpSGT(LS.IndVarBase, LS.LoopExitAt)
+ : B.CreateICmpUGT(LS.IndVarBase, LS.LoopExitAt);
+ B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit);
+
+ BranchInst *BranchToContinuation =
+ BranchInst::Create(ContinuationBlock, RRI.PseudoExit);
+
+ // We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of
+ // each of the PHI nodes in the loop header. This feeds into the initial
+ // value of the same PHI nodes if/when we continue execution.
+ for (PHINode &PN : LS.Header->phis()) {
+ PHINode *NewPHI = PHINode::Create(PN.getType(), 2, PN.getName() + ".copy",
+ BranchToContinuation);
+
+ NewPHI->addIncoming(PN.getIncomingValueForBlock(Preheader), Preheader);
+ NewPHI->addIncoming(PN.getIncomingValueForBlock(LS.Latch),
+ RRI.ExitSelector);
+ RRI.PHIValuesAtPseudoExit.push_back(NewPHI);
+ }
+
+ RRI.IndVarEnd = PHINode::Create(LS.IndVarBase->getType(), 2, "indvar.end",
+ BranchToContinuation);
+ RRI.IndVarEnd->addIncoming(LS.IndVarStart, Preheader);
+ RRI.IndVarEnd->addIncoming(LS.IndVarBase, RRI.ExitSelector);
+
+ // The latch exit now has a branch from `RRI.ExitSelector' instead of
+ // `LS.Latch'. The PHI nodes need to be updated to reflect that.
+ for (PHINode &PN : LS.LatchExit->phis())
+ replacePHIBlock(&PN, LS.Latch, RRI.ExitSelector);
+
+ return RRI;
+}
+
+void LoopConstrainer::rewriteIncomingValuesForPHIs(
+ LoopStructure &LS, BasicBlock *ContinuationBlock,
+ const LoopConstrainer::RewrittenRangeInfo &RRI) const {
+ unsigned PHIIndex = 0;
+ for (PHINode &PN : LS.Header->phis())
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i < e; ++i)
+ if (PN.getIncomingBlock(i) == ContinuationBlock)
+ PN.setIncomingValue(i, RRI.PHIValuesAtPseudoExit[PHIIndex++]);
+
+ LS.IndVarStart = RRI.IndVarEnd;
+}
+
+BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS,
+ BasicBlock *OldPreheader,
+ const char *Tag) const {
+ BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header);
+ BranchInst::Create(LS.Header, Preheader);
+
+ for (PHINode &PN : LS.Header->phis())
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i < e; ++i)
+ replacePHIBlock(&PN, OldPreheader, Preheader);
+
+ return Preheader;
+}
+
+void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
+ Loop *ParentLoop = OriginalLoop.getParentLoop();
+ if (!ParentLoop)
+ return;
+
+ for (BasicBlock *BB : BBs)
+ ParentLoop->addBasicBlockToLoop(BB, LI);
+}
+
+Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
+ ValueToValueMapTy &VM) {
+ Loop &New = *LI.AllocateLoop();
+ if (Parent)
+ Parent->addChildLoop(&New);
+ else
+ LI.addTopLevelLoop(&New);
+ LPM.addLoop(New);
+
+ // Add all of the blocks in Original to the new loop.
+ for (auto *BB : Original->blocks())
+ if (LI.getLoopFor(BB) == Original)
+ New.addBasicBlockToLoop(cast<BasicBlock>(VM[BB]), LI);
+
+ // Add all of the subloops to the new loop.
+ for (Loop *SubLoop : *Original)
+ createClonedLoopStructure(SubLoop, &New, VM);
+
+ return &New;
+}
+
+bool LoopConstrainer::run() {
+ BasicBlock *Preheader = nullptr;
+ LatchTakenCount = SE.getExitCount(&OriginalLoop, MainLoopStructure.Latch);
+ Preheader = OriginalLoop.getLoopPreheader();
+ assert(!isa<SCEVCouldNotCompute>(LatchTakenCount) && Preheader != nullptr &&
+ "preconditions!");
+
+ OriginalPreheader = Preheader;
+ MainLoopPreheader = Preheader;
+
+ bool IsSignedPredicate = MainLoopStructure.IsSignedPredicate;
+ Optional<SubRanges> MaybeSR = calculateSubRanges(IsSignedPredicate);
+ if (!MaybeSR.hasValue()) {
+ DEBUG(dbgs() << "irce: could not compute subranges\n");
+ return false;
+ }
+
+ SubRanges SR = MaybeSR.getValue();
+ bool Increasing = MainLoopStructure.IndVarIncreasing;
+ IntegerType *IVTy =
+ cast<IntegerType>(MainLoopStructure.IndVarBase->getType());
+
+ SCEVExpander Expander(SE, F.getParent()->getDataLayout(), "irce");
+ Instruction *InsertPt = OriginalPreheader->getTerminator();
+
+ // It would have been better to make `PreLoop' and `PostLoop'
+ // `Optional<ClonedLoop>'s, but `ValueToValueMapTy' does not have a copy
+ // constructor.
+ ClonedLoop PreLoop, PostLoop;
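+  // For an increasing IV the pre-loop covers the iterations below LowLimit
+  // and the post-loop covers the iterations from HighLimit onwards; for a
+  // decreasing IV the roles of the two limits are swapped.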
+ bool NeedsPreLoop =
+ Increasing ? SR.LowLimit.hasValue() : SR.HighLimit.hasValue();
+ bool NeedsPostLoop =
+ Increasing ? SR.HighLimit.hasValue() : SR.LowLimit.hasValue();
+
+ Value *ExitPreLoopAt = nullptr;
+ Value *ExitMainLoopAt = nullptr;
+ const SCEVConstant *MinusOneS =
+ cast<SCEVConstant>(SE.getConstant(IVTy, -1, true /* isSigned */));
+
+ if (NeedsPreLoop) {
+ const SCEV *ExitPreLoopAtSCEV = nullptr;
+
+ if (Increasing)
+ ExitPreLoopAtSCEV = *SR.LowLimit;
+ else {
+ if (CanBeMin(SE, *SR.HighLimit, IsSignedPredicate)) {
+ DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "preloop exit limit. HighLimit = " << *(*SR.HighLimit)
+ << "\n");
+ return false;
+ }
+ ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
+ }
+
+ if (!isSafeToExpandAt(ExitPreLoopAtSCEV, InsertPt, SE)) {
+ DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+ << " preloop exit limit " << *ExitPreLoopAtSCEV
+ << " at block " << InsertPt->getParent()->getName() << "\n");
+ return false;
+ }
+
+ ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt);
+ ExitPreLoopAt->setName("exit.preloop.at");
+ }
+
+ if (NeedsPostLoop) {
+ const SCEV *ExitMainLoopAtSCEV = nullptr;
+
+ if (Increasing)
+ ExitMainLoopAtSCEV = *SR.HighLimit;
+ else {
+ if (CanBeMin(SE, *SR.LowLimit, IsSignedPredicate)) {
+ DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "mainloop exit limit. LowLimit = " << *(*SR.LowLimit)
+ << "\n");
+ return false;
+ }
+ ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
+ }
+
+ if (!isSafeToExpandAt(ExitMainLoopAtSCEV, InsertPt, SE)) {
+ DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+ << " main loop exit limit " << *ExitMainLoopAtSCEV
+ << " at block " << InsertPt->getParent()->getName() << "\n");
+ return false;
+ }
+
+ ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt);
+ ExitMainLoopAt->setName("exit.mainloop.at");
+ }
+
+ // We clone these ahead of time so that we don't have to deal with changing
+ // and temporarily invalid IR as we transform the loops.
+ if (NeedsPreLoop)
+ cloneLoop(PreLoop, "preloop");
+ if (NeedsPostLoop)
+ cloneLoop(PostLoop, "postloop");
+
+ RewrittenRangeInfo PreLoopRRI;
+
+ if (NeedsPreLoop) {
+ Preheader->getTerminator()->replaceUsesOfWith(MainLoopStructure.Header,
+ PreLoop.Structure.Header);
+
+ MainLoopPreheader =
+ createPreheader(MainLoopStructure, Preheader, "mainloop");
+ PreLoopRRI = changeIterationSpaceEnd(PreLoop.Structure, Preheader,
+ ExitPreLoopAt, MainLoopPreheader);
+ rewriteIncomingValuesForPHIs(MainLoopStructure, MainLoopPreheader,
+ PreLoopRRI);
+ }
+
+ BasicBlock *PostLoopPreheader = nullptr;
+ RewrittenRangeInfo PostLoopRRI;
+
+ if (NeedsPostLoop) {
+ PostLoopPreheader =
+ createPreheader(PostLoop.Structure, Preheader, "postloop");
+ PostLoopRRI = changeIterationSpaceEnd(MainLoopStructure, MainLoopPreheader,
+ ExitMainLoopAt, PostLoopPreheader);
+ rewriteIncomingValuesForPHIs(PostLoop.Structure, PostLoopPreheader,
+ PostLoopRRI);
+ }
+
+ BasicBlock *NewMainLoopPreheader =
+ MainLoopPreheader != Preheader ? MainLoopPreheader : nullptr;
+ BasicBlock *NewBlocks[] = {PostLoopPreheader, PreLoopRRI.PseudoExit,
+ PreLoopRRI.ExitSelector, PostLoopRRI.PseudoExit,
+ PostLoopRRI.ExitSelector, NewMainLoopPreheader};
+
+ // Some of the above may be nullptr, filter them out before passing to
+ // addToParentLoopIfNeeded.
+ auto NewBlocksEnd =
+ std::remove(std::begin(NewBlocks), std::end(NewBlocks), nullptr);
+
+ addToParentLoopIfNeeded(makeArrayRef(std::begin(NewBlocks), NewBlocksEnd));
+
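+  // The control flow has changed substantially; recompute the dominator tree
+  // from scratch.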
+ DT.recalculate(F);
+
+ // We need to first add all the pre and post loop blocks into the loop
+ // structures (as part of createClonedLoopStructure), and then update the
+ // LCSSA form and LoopSimplifyForm. This is necessary for correctly updating
+ // LI when LoopSimplifyForm is generated.
+ Loop *PreL = nullptr, *PostL = nullptr;
+ if (!PreLoop.Blocks.empty()) {
+ PreL = createClonedLoopStructure(
+ &OriginalLoop, OriginalLoop.getParentLoop(), PreLoop.Map);
+ }
+
+ if (!PostLoop.Blocks.empty()) {
+ PostL = createClonedLoopStructure(
+ &OriginalLoop, OriginalLoop.getParentLoop(), PostLoop.Map);
+ }
+
+ // This function canonicalizes the loop into Loop-Simplify and LCSSA forms.
+ auto CanonicalizeLoop = [&] (Loop *L, bool IsOriginalLoop) {
+ formLCSSARecursively(*L, DT, &LI, &SE);
+ simplifyLoop(L, &DT, &LI, &SE, nullptr, true);
+ // Pre/post loops are slow paths, we do not need to perform any loop
+ // optimizations on them.
+ if (!IsOriginalLoop)
+ DisableAllLoopOptsOnLoop(*L);
+ };
+ if (PreL)
+ CanonicalizeLoop(PreL, false);
+ if (PostL)
+ CanonicalizeLoop(PostL, false);
+ CanonicalizeLoop(&OriginalLoop, true);
+
+ return true;
+}
+
+/// Computes and returns a range of values for the induction variable (IndVar)
+/// in which the range check can be safely elided. If it cannot compute such a
+/// range, returns None.
+Optional<InductiveRangeCheck::Range>
+InductiveRangeCheck::computeSafeIterationSpace(
+ ScalarEvolution &SE, const SCEVAddRecExpr *IndVar,
+ bool IsLatchSigned) const {
+ // IndVar is of the form "A + B * I" (where "I" is the canonical induction
+ // variable, that may or may not exist as a real llvm::Value in the loop) and
+ // this inductive range check is a range check on the "C + D * I" ("C" is
+ // getBegin() and "D" is getStep()). We rewrite the value being range
+ // checked to "M + N * IndVar" where "N" = "D * B^(-1)" and "M" = "C - NA".
+ //
+ // The actual inequalities we solve are of the form
+ //
+ // 0 <= M + 1 * IndVar < L given L >= 0 (i.e. N == 1)
+ //
+  // Here L stands for the upper limit of the safe iteration space.
+  // The inequality is satisfied by (0 - M) <= IndVar < (L - M). To avoid
+  // overflows when calculating (0 - M) and (L - M), we clamp the calculations
+  // to the borders of the IV's iteration space, which is signed or unsigned
+  // depending on the latch condition. For example, if IndVar is unsigned,
+  // (0 - M) overflows for any M > 0. If we can show that "anything greater
+  // than (-M) is safe", we strengthen this to "everything greater than 0 is
+  // safe", because values between -M and 0 simply do not exist in the
+  // unsigned iteration space, and we do not want to deal with overflowed
+  // values.
+
+ if (!IndVar->isAffine())
+ return None;
+
+ const SCEV *A = IndVar->getStart();
+ const SCEVConstant *B = dyn_cast<SCEVConstant>(IndVar->getStepRecurrence(SE));
+ if (!B)
+ return None;
+ assert(!B->isZero() && "Recurrence with zero step?");
+
+ const SCEV *C = getBegin();
+ const SCEVConstant *D = dyn_cast<SCEVConstant>(getStep());
+ if (D != B)
+ return None;
+
+ assert(!D->getValue()->isZero() && "Recurrence with zero step?");
+ unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth();
+ const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
+
+  // Subtract Y from X so that the result does not cross the border of the IV
+  // iteration space. Mathematically, it is equivalent to:
+  //
+  //    ClampedSubtract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX).  [1]
+  //
+  // In [1], 'X - Y' is a mathematical subtraction (the result is not bounded
+  // to any bit width). But after we take min/max, the result is guaranteed to
+  // be within [INT_MIN, INT_MAX].
+  //
+  // In [1], INT_MAX and INT_MIN denote the signed or unsigned max/min values,
+  // depending on the type of latch condition that defines the IV iteration
+  // space.
+  auto ClampedSubtract = [&](const SCEV *X, const SCEV *Y) {
+    assert(SE.isKnownNonNegative(X) &&
+           "We can only subtract from values in [0; SINT_MAX]!");
+    if (IsLatchSigned) {
+      // X is a number from the signed range, Y is interpreted as signed.
+      // Even if Y is SINT_MAX, (X - Y) does not reach SINT_MIN. So the only
+      // thing we should care about is that we do not cross SINT_MAX.
+      // If Y is positive, we subtract Y safely.
+      //   Rule 1: Y > 0 ---> Y.
+      // If 0 <= -Y <= (SINT_MAX - X), we subtract Y safely.
+      //   Rule 2: Y >=s (X - SINT_MAX) ---> Y.
+      // If 0 <= (SINT_MAX - X) < -Y, we can only subtract (X - SINT_MAX).
+      //   Rule 3: Y <s (X - SINT_MAX) ---> (X - SINT_MAX).
+      // This gives us smax(Y, X - SINT_MAX) to subtract in all cases.
+      const SCEV *XMinusSIntMax = SE.getMinusSCEV(X, SIntMax);
+      return SE.getMinusSCEV(X, SE.getSMaxExpr(Y, XMinusSIntMax),
+                             SCEV::FlagNSW);
+    } else
+      // X is a number from the unsigned range, Y is interpreted as signed.
+      // Even if Y is SINT_MIN, (X - Y) does not reach UINT_MAX. So the only
+      // thing we should care about is that we do not cross zero.
+      // If Y is negative, we subtract Y safely.
+      //   Rule 1: Y <s 0 ---> Y.
+      // If 0 <= Y <= X, we subtract Y safely.
+      //   Rule 2: Y <=s X ---> Y.
+      // If 0 <= X < Y, we should stop at 0 and can only subtract X.
+      //   Rule 3: Y >s X ---> X.
+      // This gives us smin(X, Y) to subtract in all cases.
+      return SE.getMinusSCEV(X, SE.getSMinExpr(X, Y), SCEV::FlagNUW);
+ };
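+  // Begin and End are (0 - M) and (L - M), computed with the clamped
+  // subtraction above so that both bounds stay inside the IV's iteration
+  // space.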
+ const SCEV *M = SE.getMinusSCEV(C, A);
+ const SCEV *Zero = SE.getZero(M->getType());
+  const SCEV *Begin = ClampedSubtract(Zero, M);
+ const SCEV *L = nullptr;
+
+ // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
+ // We can potentially do much better here.
+ if (const SCEV *EndLimit = getEnd())
+ L = EndLimit;
+ else {
+ assert(Kind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!");
+ L = SIntMax;
+ }
+  const SCEV *End = ClampedSubtract(L, M);
+ return InductiveRangeCheck::Range(Begin, End);
+}
+
+static Optional<InductiveRangeCheck::Range>
+IntersectSignedRange(ScalarEvolution &SE,
+ const Optional<InductiveRangeCheck::Range> &R1,
+ const InductiveRangeCheck::Range &R2) {
+ if (R2.isEmpty(SE, /* IsSigned */ true))
+ return None;
+ if (!R1.hasValue())
+ return R2;
+ auto &R1Value = R1.getValue();
+ // We never return empty ranges from this function, and R1 is supposed to be
+ // a result of intersection. Thus, R1 is never empty.
+ assert(!R1Value.isEmpty(SE, /* IsSigned */ true) &&
+ "We should never have empty R1!");
+
+ // TODO: we could widen the smaller range and have this work; but for now we
+ // bail out to keep things simple.
+ if (R1Value.getType() != R2.getType())
+ return None;
+
+ const SCEV *NewBegin = SE.getSMaxExpr(R1Value.getBegin(), R2.getBegin());
+ const SCEV *NewEnd = SE.getSMinExpr(R1Value.getEnd(), R2.getEnd());
+
+ // If the resulting range is empty, just return None.
+ auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd);
+ if (Ret.isEmpty(SE, /* IsSigned */ true))
+ return None;
+ return Ret;
+}
+
+static Optional<InductiveRangeCheck::Range>
+IntersectUnsignedRange(ScalarEvolution &SE,
+ const Optional<InductiveRangeCheck::Range> &R1,
+ const InductiveRangeCheck::Range &R2) {
+ if (R2.isEmpty(SE, /* IsSigned */ false))
+ return None;
+ if (!R1.hasValue())
+ return R2;
+ auto &R1Value = R1.getValue();
+ // We never return empty ranges from this function, and R1 is supposed to be
+ // a result of intersection. Thus, R1 is never empty.
+ assert(!R1Value.isEmpty(SE, /* IsSigned */ false) &&
+ "We should never have empty R1!");
+
+ // TODO: we could widen the smaller range and have this work; but for now we
+ // bail out to keep things simple.
+ if (R1Value.getType() != R2.getType())
+ return None;
+
+ const SCEV *NewBegin = SE.getUMaxExpr(R1Value.getBegin(), R2.getBegin());
+ const SCEV *NewEnd = SE.getUMinExpr(R1Value.getEnd(), R2.getEnd());
+
+ // If the resulting range is empty, just return None.
+ auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd);
+ if (Ret.isEmpty(SE, /* IsSigned */ false))
+ return None;
+ return Ret;
+}
+
+bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+
+ if (L->getBlocks().size() >= LoopSizeCutoff) {
+ DEBUG(dbgs() << "irce: giving up constraining loop, too large\n";);
+ return false;
+ }
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) {
+ DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
+ return false;
+ }
+
+ LLVMContext &Context = Preheader->getContext();
+ SmallVector<InductiveRangeCheck, 16> RangeChecks;
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ BranchProbabilityInfo &BPI =
+ getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+
+ for (auto BBI : L->getBlocks())
+ if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
+ InductiveRangeCheck::extractRangeChecksFromBranch(TBI, L, SE, BPI,
+ RangeChecks);
+
+ if (RangeChecks.empty())
+ return false;
+
+ auto PrintRecognizedRangeChecks = [&](raw_ostream &OS) {
+ OS << "irce: looking at loop "; L->print(OS);
+ OS << "irce: loop has " << RangeChecks.size()
+ << " inductive range checks: \n";
+ for (InductiveRangeCheck &IRC : RangeChecks)
+ IRC.print(OS);
+ };
+
+ DEBUG(PrintRecognizedRangeChecks(dbgs()));
+
+ if (PrintRangeChecks)
+ PrintRecognizedRangeChecks(errs());
+
+ const char *FailureReason = nullptr;
+ Optional<LoopStructure> MaybeLoopStructure =
+ LoopStructure::parseLoopStructure(SE, BPI, *L, FailureReason);
+ if (!MaybeLoopStructure.hasValue()) {
+ DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason
+ << "\n";);
+ return false;
+ }
+ LoopStructure LS = MaybeLoopStructure.getValue();
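+  // Build the add-recurrence for the induction variable by stepping
+  // IndVarBase (the value tested by the latch) back by one IndVarStep; this
+  // is the form computeSafeIterationSpace operates on.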
+  const SCEVAddRecExpr *IndVar = cast<SCEVAddRecExpr>(
+      SE.getMinusSCEV(SE.getSCEV(LS.IndVarBase), SE.getSCEV(LS.IndVarStep)));
+
+ Optional<InductiveRangeCheck::Range> SafeIterRange;
+ Instruction *ExprInsertPt = Preheader->getTerminator();
+
+ SmallVector<InductiveRangeCheck, 4> RangeChecksToEliminate;
+  // Based on the type of the latch predicate, we interpret the IV iteration
+  // range as a signed or unsigned range. We use different min/max functions
+  // (signed or unsigned) when intersecting this range with the safe iteration
+  // ranges implied by the range checks.
+ auto IntersectRange =
+ LS.IsSignedPredicate ? IntersectSignedRange : IntersectUnsignedRange;
+
+ IRBuilder<> B(ExprInsertPt);
+ for (InductiveRangeCheck &IRC : RangeChecks) {
+ auto Result = IRC.computeSafeIterationSpace(SE, IndVar,
+ LS.IsSignedPredicate);
+ if (Result.hasValue()) {
+ auto MaybeSafeIterRange =
+ IntersectRange(SE, SafeIterRange, Result.getValue());
+ if (MaybeSafeIterRange.hasValue()) {
+ assert(
+ !MaybeSafeIterRange.getValue().isEmpty(SE, LS.IsSignedPredicate) &&
+ "We should never return empty ranges!");
+ RangeChecksToEliminate.push_back(IRC);
+ SafeIterRange = MaybeSafeIterRange.getValue();
+ }
+ }
+ }
+
+ if (!SafeIterRange.hasValue())
+ return false;
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopConstrainer LC(*L, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), LPM,
+ LS, SE, DT, SafeIterRange.getValue());
+ bool Changed = LC.run();
+
+ if (Changed) {
+ auto PrintConstrainedLoopInfo = [L]() {
+ dbgs() << "irce: in function ";
+ dbgs() << L->getHeader()->getParent()->getName() << ": ";
+ dbgs() << "constrained ";
+ L->print(dbgs());
+ };
+
+ DEBUG(PrintConstrainedLoopInfo());
+
+ if (PrintChangedLoops)
+ PrintConstrainedLoopInfo();
+
+ // Optimize away the now-redundant range checks.
+
+ for (InductiveRangeCheck &IRC : RangeChecksToEliminate) {
+ ConstantInt *FoldedRangeCheck = IRC.getPassingDirection()
+ ? ConstantInt::getTrue(Context)
+ : ConstantInt::getFalse(Context);
+ IRC.getCheckUse()->set(FoldedRangeCheck);
+ }
+ }
+
+ return Changed;
+}
+
+Pass *llvm::createInductiveRangeCheckEliminationPass() {
+ return new InductiveRangeCheckElimination;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
new file mode 100644
index 000000000000..7d66c0f73821
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -0,0 +1,1012 @@
+//===- InferAddressSpaces.cpp - Infer address spaces ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// CUDA C/C++ includes memory space designation as variable type qualifiers (such
+// as __global__ and __shared__). Knowing the space of a memory access allows
+// CUDA compilers to emit faster PTX loads and stores. For example, a load from
+// shared memory can be translated to `ld.shared` which is roughly 10% faster
+// than a generic `ld` on an NVIDIA Tesla K40c.
+//
+// Unfortunately, type qualifiers only apply to variable declarations, so CUDA
+// compilers must infer the memory space of an address expression from
+// type-qualified variables.
+//
+// LLVM IR uses non-zero, so-called specific, address spaces to represent
+// memory spaces (e.g. addrspace(3) means shared memory). The Clang frontend
+// places only type-qualified variables in specific address spaces, and then
+// conservatively `addrspacecast`s each type-qualified variable to addrspace(0)
+// (the so-called generic address space) for other instructions to use.
+//
+// For example, Clang translates the following CUDA code
+// __shared__ float a[10];
+// float v = a[i];
+// to
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i
+// %v = load float, float* %1 ; emits ld.f32
+// @a is in addrspace(3) since it's type-qualified, but its use from %1 is
+// redirected to %0 (the generic version of @a).
+//
+// The optimization implemented in this file propagates specific address spaces
+// from type-qualified variable declarations to their users. For example, it
+// optimizes the above IR to
+// %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %v = load float addrspace(3)* %1 ; emits ld.shared.f32
+// propagating the addrspace(3) from @a to %1. As a result, the NVPTX
+// codegen is able to emit ld.shared.f32 for %v.
+//
+// Address space inference works in two steps. First, it uses a data-flow
+// analysis to infer as many generic pointers as possible to point to only one
+// specific address space. In the above example, it can prove that %1 only
+// points to addrspace(3). This algorithm was published in
+// CUDA: Compiling and optimizing for a GPU platform
+// Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang
+// ICCS 2012
+//
+// Then, address space inference replaces all refinable generic pointers with
+// equivalent specific pointers.
+//
+// The major challenge of implementing this optimization is handling PHINodes,
+// which may create loops in the data flow graph. This brings two complications.
+//
+// First, the data flow analysis in Step 1 needs to be circular. For example,
+// %generic.input = addrspacecast float addrspace(3)* %input to float*
+// loop:
+// %y = phi [ %generic.input, %y2 ]
+// %y2 = getelementptr %y, 1
+// %v = load %y2
+// br ..., label %loop, ...
+// proving %y specific requires proving both %generic.input and %y2 specific,
+// but proving %y2 specific circles back to %y. To address this complication,
+// the data flow analysis operates on a lattice:
+// uninitialized > specific address spaces > generic.
+// All address expressions (our implementation only considers phi, bitcast,
+// addrspacecast, getelementptr, and select) start with the uninitialized
+// address space.
+// The monotone transfer function moves the address space of a pointer down a
+// lattice path from uninitialized to specific and then to generic. A join
+// operation of two different specific address spaces pushes the expression down
+// to the generic address space. The analysis completes once it reaches a fixed
+// point.
+//
+// Second, IR rewriting in Step 2 also needs to be circular. For example,
+// converting %y to addrspace(3) requires the compiler to know the converted
+// %y2, but converting %y2 needs the converted %y. To address this complication,
+// we break these cycles using "undef" placeholders. When converting an
+// instruction `I` to a new address space, if its operand `Op` is not converted
+// yet, we let `I` temporarily use `undef` and fix all the uses of undef later.
+// For instance, our algorithm first converts %y to
+// %y' = phi float addrspace(3)* [ %input, undef ]
+// Then, it converts %y2 to
+// %y2' = getelementptr %y', 1
+// Finally, it fixes the undef in %y' so that
+// %y' = phi float addrspace(3)* [ %input, %y2' ]
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
+#define DEBUG_TYPE "infer-address-spaces"
+
+using namespace llvm;
+
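+// Sentinel for an address space that has not been inferred yet. It is the top
+// element of the lattice described in the file header and the identity
+// element of joinAddressSpaces below.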
+static const unsigned UninitializedAddressSpace =
+ std::numeric_limits<unsigned>::max();
+
+namespace {
+
+using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
+
+/// \brief InferAddressSpaces
+class InferAddressSpaces : public FunctionPass {
+ /// Target specific address space which uses of should be replaced if
+ /// possible.
+ unsigned FlatAddrSpace;
+
+public:
+ static char ID;
+
+ InferAddressSpaces() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // Returns the new address space of V if updated; otherwise, returns None.
+ Optional<unsigned>
+ updateAddressSpace(const Value &V,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace) const;
+
+ // Tries to infer the specific address space of each address expression in
+ // Postorder.
+ void inferAddressSpaces(ArrayRef<WeakTrackingVH> Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) const;
+
+ bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const;
+
+ // Changes the flat address expressions in function F to point to specific
+ // address spaces if InferredAddrSpace says so. Postorder is the postorder of
+ // all flat expressions in the use-def graph of function F.
+ bool rewriteWithNewAddressSpaces(
+ const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const;
+
+ void appendsFlatAddressExpressionToPostorderStack(
+ Value *V, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const;
+
+ bool rewriteIntrinsicOperands(IntrinsicInst *II,
+ Value *OldV, Value *NewV) const;
+ void collectRewritableIntrinsicOperands(
+ IntrinsicInst *II,
+ std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const;
+
+ std::vector<WeakTrackingVH> collectFlatAddressExpressions(Function &F) const;
+
+ Value *cloneValueWithNewAddressSpace(
+ Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const;
+ unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const;
+};
+
+} // end anonymous namespace
+
+char InferAddressSpaces::ID = 0;
+
+namespace llvm {
+
+void initializeInferAddressSpacesPass(PassRegistry &);
+
+} // end namespace llvm
+
+INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
+ false, false)
+
+// Returns true if V is an address expression.
+// TODO: Currently, we only consider phi, bitcast, addrspacecast,
+// getelementptr, and select operators.
+static bool isAddressExpression(const Value &V) {
+ if (!isa<Operator>(V))
+ return false;
+
+ switch (cast<Operator>(V).getOpcode()) {
+ case Instruction::PHI:
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ case Instruction::Select:
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Returns the pointer operands of V.
+//
+// Precondition: V is an address expression.
+static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
+ const Operator &Op = cast<Operator>(V);
+ switch (Op.getOpcode()) {
+ case Instruction::PHI: {
+ auto IncomingValues = cast<PHINode>(Op).incoming_values();
+ return SmallVector<Value *, 2>(IncomingValues.begin(),
+ IncomingValues.end());
+ }
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ return {Op.getOperand(0)};
+ case Instruction::Select:
+ return {Op.getOperand(1), Op.getOperand(2)};
+ default:
+ llvm_unreachable("Unexpected instruction type.");
+ }
+}
+
+// TODO: Move logic to TTI?
+bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
+ Value *OldV,
+ Value *NewV) const {
+ Module *M = II->getParent()->getParent()->getParent();
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:{
+ const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
+ if (!IsVolatile || !IsVolatile->isZero())
+ return false;
+
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::objectsize: {
+ Type *DestTy = II->getType();
+ Type *SrcTy = NewV->getType();
+ Function *NewDecl =
+ Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
+ II->setArgOperand(0, NewV);
+ II->setCalledFunction(NewDecl);
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+// TODO: Move logic to TTI?
+void InferAddressSpaces::collectRewritableIntrinsicOperands(
+ IntrinsicInst *II, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::objectsize:
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
+ PostorderStack, Visited);
+ break;
+ default:
+ break;
+ }
+}
+
+// If V is an unvisited flat address expression, appends V to PostorderStack
+// and marks it as visited.
+void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack(
+ Value *V, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const {
+ assert(V->getType()->isPointerTy());
+
+ // Generic addressing expressions may be hidden in nested constant
+ // expressions.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ // TODO: Look in non-address parts, like icmp operands.
+ if (isAddressExpression(*CE) && Visited.insert(CE).second)
+ PostorderStack.push_back(std::make_pair(CE, false));
+
+ return;
+ }
+
+ if (isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() == FlatAddrSpace) {
+ if (Visited.insert(V).second) {
+ PostorderStack.push_back(std::make_pair(V, false));
+
+ Operator *Op = cast<Operator>(V);
+ for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op->getOperand(I))) {
+ if (isAddressExpression(*CE) && Visited.insert(CE).second)
+ PostorderStack.emplace_back(CE, false);
+ }
+ }
+ }
+ }
+}
+
+// Returns all flat address expressions in function F. The elements are
+// ordered in postorder.
+std::vector<WeakTrackingVH>
+InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
+ // This function implements a non-recursive postorder traversal of a partial
+ // use-def graph of function F.
+ std::vector<std::pair<Value *, bool>> PostorderStack;
+ // The set of visited expressions.
+ DenseSet<Value *> Visited;
+
+ auto PushPtrOperand = [&](Value *Ptr) {
+ appendsFlatAddressExpressionToPostorderStack(Ptr, PostorderStack,
+ Visited);
+ };
+
+  // Look at operations that may be interesting to accelerate by moving to a
+  // known address space. We mainly aim at loads and stores, but pure
+  // addressing calculations may also become faster.
+ for (Instruction &I : instructions(F)) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (!GEP->getType()->isVectorTy())
+ PushPtrOperand(GEP->getPointerOperand());
+ } else if (auto *LI = dyn_cast<LoadInst>(&I))
+ PushPtrOperand(LI->getPointerOperand());
+ else if (auto *SI = dyn_cast<StoreInst>(&I))
+ PushPtrOperand(SI->getPointerOperand());
+ else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
+ PushPtrOperand(RMW->getPointerOperand());
+ else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
+ PushPtrOperand(CmpX->getPointerOperand());
+ else if (auto *MI = dyn_cast<MemIntrinsic>(&I)) {
+ // For memset/memcpy/memmove, any pointer operand can be replaced.
+ PushPtrOperand(MI->getRawDest());
+
+ // Handle 2nd operand for memcpy/memmove.
+ if (auto *MTI = dyn_cast<MemTransferInst>(MI))
+ PushPtrOperand(MTI->getRawSource());
+ } else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ collectRewritableIntrinsicOperands(II, PostorderStack, Visited);
+ else if (ICmpInst *Cmp = dyn_cast<ICmpInst>(&I)) {
+ // FIXME: Handle vectors of pointers
+ if (Cmp->getOperand(0)->getType()->isPointerTy()) {
+ PushPtrOperand(Cmp->getOperand(0));
+ PushPtrOperand(Cmp->getOperand(1));
+ }
+ } else if (auto *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
+ if (!ASC->getType()->isVectorTy())
+ PushPtrOperand(ASC->getPointerOperand());
+ }
+ }
+
+ std::vector<WeakTrackingVH> Postorder; // The resultant postorder.
+ while (!PostorderStack.empty()) {
+ Value *TopVal = PostorderStack.back().first;
+ // If the operands of the expression on the top are already explored,
+ // adds that expression to the resultant postorder.
+ if (PostorderStack.back().second) {
+ if (TopVal->getType()->getPointerAddressSpace() == FlatAddrSpace)
+ Postorder.push_back(TopVal);
+ PostorderStack.pop_back();
+ continue;
+ }
+ // Otherwise, adds its operands to the stack and explores them.
+ PostorderStack.back().second = true;
+ for (Value *PtrOperand : getPointerOperands(*TopVal)) {
+ appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
+ Visited);
+ }
+ }
+ return Postorder;
+}
+
+// A helper function for cloneInstructionWithNewAddressSpace. Returns the clone
+// of OperandUse.get() in the new address space. If the clone is not ready yet,
+// returns an undef in the new address space as a placeholder.
+static Value *operandWithNewAddressSpaceOrCreateUndef(
+ const Use &OperandUse, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Value *Operand = OperandUse.get();
+
+ Type *NewPtrTy =
+ Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (Constant *C = dyn_cast<Constant>(Operand))
+ return ConstantExpr::getAddrSpaceCast(C, NewPtrTy);
+
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
+ return NewOperand;
+
+ UndefUsesToFix->push_back(&OperandUse);
+ return UndefValue::get(NewPtrTy);
+}
+
+// Returns a clone of `I` with its operands converted to those specified in
+// ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an
+// operand whose address space needs to be modified might not exist in
+// ValueWithNewAddrSpace. In that case, uses undef as a placeholder operand and
+// adds that operand use to UndefUsesToFix so that caller can fix them later.
+//
+// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
+// from a pointer whose type already matches. Therefore, this function returns a
+// Value* instead of an Instruction*.
+static Value *cloneInstructionWithNewAddressSpace(
+ Instruction *I, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Type *NewPtrType =
+ I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (I->getOpcode() == Instruction::AddrSpaceCast) {
+ Value *Src = I->getOperand(0);
+ // Because `I` is flat, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space, according
+ // to our algorithm.
+ assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+ if (Src->getType() != NewPtrType)
+ return new BitCastInst(Src, NewPtrType);
+ return Src;
+ }
+
+ // Computes the converted pointer operands.
+ SmallVector<Value *, 4> NewPointerOperands;
+ for (const Use &OperandUse : I->operands()) {
+ if (!OperandUse.get()->getType()->isPointerTy())
+ NewPointerOperands.push_back(nullptr);
+ else
+ NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef(
+ OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
+ }
+
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ return new BitCastInst(NewPointerOperands[0], NewPtrType);
+ case Instruction::PHI: {
+ assert(I->getType()->isPointerTy());
+ PHINode *PHI = cast<PHINode>(I);
+ PHINode *NewPHI = PHINode::Create(NewPtrType, PHI->getNumIncomingValues());
+ for (unsigned Index = 0; Index < PHI->getNumIncomingValues(); ++Index) {
+ unsigned OperandNo = PHINode::getOperandNumForIncomingValue(Index);
+ NewPHI->addIncoming(NewPointerOperands[OperandNo],
+ PHI->getIncomingBlock(Index));
+ }
+ return NewPHI;
+ }
+ case Instruction::GetElementPtr: {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+ GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+ GEP->getSourceElementType(), NewPointerOperands[0],
+ SmallVector<Value *, 4>(GEP->idx_begin(), GEP->idx_end()));
+ NewGEP->setIsInBounds(GEP->isInBounds());
+ return NewGEP;
+ }
+ case Instruction::Select:
+ assert(I->getType()->isPointerTy());
+ return SelectInst::Create(I->getOperand(0), NewPointerOperands[1],
+ NewPointerOperands[2], "", nullptr, I);
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+}
+
+// Similar to cloneInstructionWithNewAddressSpace, returns a clone of the
+// constant expression `CE` with its operands replaced as specified in
+// ValueWithNewAddrSpace.
+static Value *cloneConstantExprWithNewAddressSpace(
+ ConstantExpr *CE, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace) {
+ Type *TargetType =
+ CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+ // Because CE is flat, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space according
+ // to our algorithm.
+ assert(CE->getOperand(0)->getType()->getPointerAddressSpace() ==
+ NewAddrSpace);
+ return ConstantExpr::getBitCast(CE->getOperand(0), TargetType);
+ }
+
+ if (CE->getOpcode() == Instruction::BitCast) {
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(CE->getOperand(0)))
+ return ConstantExpr::getBitCast(cast<Constant>(NewOperand), TargetType);
+ return ConstantExpr::getAddrSpaceCast(CE, TargetType);
+ }
+
+ if (CE->getOpcode() == Instruction::Select) {
+ Constant *Src0 = CE->getOperand(1);
+ Constant *Src1 = CE->getOperand(2);
+ if (Src0->getType()->getPointerAddressSpace() ==
+ Src1->getType()->getPointerAddressSpace()) {
+
+ return ConstantExpr::getSelect(
+ CE->getOperand(0), ConstantExpr::getAddrSpaceCast(Src0, TargetType),
+ ConstantExpr::getAddrSpaceCast(Src1, TargetType));
+ }
+ }
+
+ // Computes the operands of the new constant expression.
+ bool IsNew = false;
+ SmallVector<Constant *, 4> NewOperands;
+ for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
+ Constant *Operand = CE->getOperand(Index);
+ // If the address space of `Operand` needs to be modified, the new operand
+ // with the new address space should already be in ValueWithNewAddrSpace
+ // because (1) the constant expressions we consider (i.e. addrspacecast,
+ // bitcast, and getelementptr) do not incur cycles in the data flow graph
+ // and (2) this function is called on constant expressions in postorder.
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) {
+ IsNew = true;
+ NewOperands.push_back(cast<Constant>(NewOperand));
+ } else {
+ // Otherwise, reuses the old operand.
+ NewOperands.push_back(Operand);
+ }
+ }
+
+  // If !IsNew, we would replace the Value with itself. However, replaced
+  // values are assumed to be wrapped in an addrspacecast later, so drop it
+  // now.
+ if (!IsNew)
+ return nullptr;
+
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ // Needs to specify the source type while constructing a getelementptr
+ // constant expression.
+ return CE->getWithOperands(
+ NewOperands, TargetType, /*OnlyIfReduced=*/false,
+ NewOperands[0]->getType()->getPointerElementType());
+ }
+
+ return CE->getWithOperands(NewOperands, TargetType);
+}
+
+// Returns a clone of the value `V`, with its operands replaced as specified in
+// ValueWithNewAddrSpace. This function is called on every flat address
+// expression whose address space needs to be modified, in postorder.
+//
+// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix.
+Value *InferAddressSpaces::cloneValueWithNewAddressSpace(
+ Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const {
+ // All values in Postorder are flat address expressions.
+ assert(isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() == FlatAddrSpace);
+
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ Value *NewV = cloneInstructionWithNewAddressSpace(
+ I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
+ if (Instruction *NewI = dyn_cast<Instruction>(NewV)) {
+ if (NewI->getParent() == nullptr) {
+ NewI->insertBefore(I);
+ NewI->takeName(I);
+ }
+ }
+ return NewV;
+ }
+
+ return cloneConstantExprWithNewAddressSpace(
+ cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace);
+}
+
+// Defines the join operation on the address space lattice (see the file header
+// comments).
+unsigned InferAddressSpaces::joinAddressSpaces(unsigned AS1,
+ unsigned AS2) const {
+ if (AS1 == FlatAddrSpace || AS2 == FlatAddrSpace)
+ return FlatAddrSpace;
+
+ if (AS1 == UninitializedAddressSpace)
+ return AS2;
+ if (AS2 == UninitializedAddressSpace)
+ return AS1;
+
+ // The join of two different specific address spaces is flat.
+ return (AS1 == AS2) ? AS1 : FlatAddrSpace;
+}
+
+bool InferAddressSpaces::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ FlatAddrSpace = TTI.getFlatAddressSpace();
+ if (FlatAddrSpace == UninitializedAddressSpace)
+ return false;
+
+ // Collects all flat address expressions in postorder.
+ std::vector<WeakTrackingVH> Postorder = collectFlatAddressExpressions(F);
+
+ // Runs a data-flow analysis to refine the address spaces of every expression
+ // in Postorder.
+ ValueToAddrSpaceMapTy InferredAddrSpace;
+ inferAddressSpaces(Postorder, &InferredAddrSpace);
+
+  // Changes the address spaces of the flat address expressions that are
+  // inferred to point to a specific address space.
+ return rewriteWithNewAddressSpaces(TTI, Postorder, InferredAddrSpace, &F);
+}
+
+// Constants need to be tracked through RAUW to handle cases with nested
+// constant expressions, so wrap values in WeakTrackingVH.
+void InferAddressSpaces::inferAddressSpaces(
+ ArrayRef<WeakTrackingVH> Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) const {
+ SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
+ // Initially, all expressions are in the uninitialized address space.
+ for (Value *V : Postorder)
+ (*InferredAddrSpace)[V] = UninitializedAddressSpace;
+
+ while (!Worklist.empty()) {
+ Value *V = Worklist.pop_back_val();
+
+    // Tries to update the address space of V according to the address spaces
+    // of its pointer operands.
+ DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
+ Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
+ if (!NewAS.hasValue())
+ continue;
+    // If any updates are made, adds the users of V to the worklist because
+    // their address spaces may need to be updated as well.
+ DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
+ (*InferredAddrSpace)[V] = NewAS.getValue();
+
+ for (Value *User : V->users()) {
+ // Skip if User is already in the worklist.
+ if (Worklist.count(User))
+ continue;
+
+ auto Pos = InferredAddrSpace->find(User);
+ // Our algorithm only updates the address spaces of flat address
+ // expressions, which are those in InferredAddrSpace.
+ if (Pos == InferredAddrSpace->end())
+ continue;
+
+ // Function updateAddressSpace moves the address space down a lattice
+ // path. Therefore, nothing to do if User is already inferred as flat (the
+ // bottom element in the lattice).
+ if (Pos->second == FlatAddrSpace)
+ continue;
+
+ Worklist.insert(User);
+ }
+ }
+}
+
+Optional<unsigned> InferAddressSpaces::updateAddressSpace(
+ const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const {
+ assert(InferredAddrSpace.count(&V));
+
+ // The new inferred address space equals the join of the address spaces
+ // of all its pointer operands.
+ unsigned NewAS = UninitializedAddressSpace;
+
+ const Operator &Op = cast<Operator>(V);
+ if (Op.getOpcode() == Instruction::Select) {
+ Value *Src0 = Op.getOperand(1);
+ Value *Src1 = Op.getOperand(2);
+
+ auto I = InferredAddrSpace.find(Src0);
+ unsigned Src0AS = (I != InferredAddrSpace.end()) ?
+ I->second : Src0->getType()->getPointerAddressSpace();
+
+ auto J = InferredAddrSpace.find(Src1);
+ unsigned Src1AS = (J != InferredAddrSpace.end()) ?
+ J->second : Src1->getType()->getPointerAddressSpace();
+
+ auto *C0 = dyn_cast<Constant>(Src0);
+ auto *C1 = dyn_cast<Constant>(Src1);
+
+ // If one of the inputs is a constant, we may be able to do a constant
+ // addrspacecast of it. Defer inferring the address space until the input
+ // address space is known.
+ if ((C1 && Src0AS == UninitializedAddressSpace) ||
+ (C0 && Src1AS == UninitializedAddressSpace))
+ return None;
+
+ if (C0 && isSafeToCastConstAddrSpace(C0, Src1AS))
+ NewAS = Src1AS;
+ else if (C1 && isSafeToCastConstAddrSpace(C1, Src0AS))
+ NewAS = Src0AS;
+ else
+ NewAS = joinAddressSpaces(Src0AS, Src1AS);
+ } else {
+ for (Value *PtrOperand : getPointerOperands(V)) {
+ auto I = InferredAddrSpace.find(PtrOperand);
+ unsigned OperandAS = I != InferredAddrSpace.end() ?
+ I->second : PtrOperand->getType()->getPointerAddressSpace();
+
+ // join(flat, *) = flat. So we can break if NewAS is already flat.
+ NewAS = joinAddressSpaces(NewAS, OperandAS);
+ if (NewAS == FlatAddrSpace)
+ break;
+ }
+ }
+
+ unsigned OldAS = InferredAddrSpace.lookup(&V);
+ assert(OldAS != FlatAddrSpace);
+ if (OldAS == NewAS)
+ return None;
+ return NewAS;
+}
+
+/// Returns true if \p U is the pointer operand of a memory instruction with
+/// a single pointer operand that can have its address space changed by simply
+/// mutating the use to a new value. If the memory instruction is volatile,
+/// return true only if the target allows the memory instruction to be volatile
+/// in the new address space.
+static bool isSimplePointerUseValidToReplace(const TargetTransformInfo &TTI,
+ Use &U, unsigned AddrSpace) {
+ User *Inst = U.getUser();
+ unsigned OpNo = U.getOperandNo();
+ bool VolatileIsAllowed = false;
+ if (auto *I = dyn_cast<Instruction>(Inst))
+ VolatileIsAllowed = TTI.hasVolatileVariant(I, AddrSpace);
+
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return OpNo == LoadInst::getPointerOperandIndex() &&
+ (VolatileIsAllowed || !LI->isVolatile());
+
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
+ return OpNo == StoreInst::getPointerOperandIndex() &&
+ (VolatileIsAllowed || !SI->isVolatile());
+
+ if (auto *RMW = dyn_cast<AtomicRMWInst>(Inst))
+ return OpNo == AtomicRMWInst::getPointerOperandIndex() &&
+ (VolatileIsAllowed || !RMW->isVolatile());
+
+ if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst))
+ return OpNo == AtomicCmpXchgInst::getPointerOperandIndex() &&
+ (VolatileIsAllowed || !CmpX->isVolatile());
+
+ return false;
+}
+
+/// Update memory intrinsic uses that require more complex processing than
+/// simple memory instructions. These require re-mangling and may have multiple
+/// pointer operands.
+static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
+ Value *NewV) {
+ IRBuilder<> B(MI);
+ MDNode *TBAA = MI->getMetadata(LLVMContext::MD_tbaa);
+ MDNode *ScopeMD = MI->getMetadata(LLVMContext::MD_alias_scope);
+ MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias);
+
+ if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
+ B.CreateMemSet(NewV, MSI->getValue(),
+ MSI->getLength(), MSI->getAlignment(),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
+ Value *Src = MTI->getRawSource();
+ Value *Dest = MTI->getRawDest();
+
+ // Be careful in case this is a self-to-self copy.
+ if (Src == OldV)
+ Src = NewV;
+
+ if (Dest == OldV)
+ Dest = NewV;
+
+ if (isa<MemCpyInst>(MTI)) {
+ MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
+ B.CreateMemCpy(Dest, Src, MTI->getLength(),
+ MTI->getAlignment(),
+ false, // isVolatile
+ TBAA, TBAAStruct, ScopeMD, NoAliasMD);
+ } else {
+ assert(isa<MemMoveInst>(MTI));
+ B.CreateMemMove(Dest, Src, MTI->getLength(),
+ MTI->getAlignment(),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ }
+ } else
+ llvm_unreachable("unhandled MemIntrinsic");
+
+ MI->eraseFromParent();
+ return true;
+}
+
+// Returns true if it is OK to change the address space of constant \p C with
+// a ConstantExpr addrspacecast.
+bool InferAddressSpaces::isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const {
+ assert(NewAS != UninitializedAddressSpace);
+
+ unsigned SrcAS = C->getType()->getPointerAddressSpace();
+ if (SrcAS == NewAS || isa<UndefValue>(C))
+ return true;
+
+ // Prevent illegal casts between different non-flat address spaces.
+ if (SrcAS != FlatAddrSpace && NewAS != FlatAddrSpace)
+ return false;
+
+ if (isa<ConstantPointerNull>(C))
+ return true;
+
+ if (auto *Op = dyn_cast<Operator>(C)) {
+ // If we already have a constant addrspacecast, it should be safe to look
+ // through it and check its source constant instead.
+ if (Op->getOpcode() == Instruction::AddrSpaceCast)
+ return isSafeToCastConstAddrSpace(cast<Constant>(Op->getOperand(0)), NewAS);
+
+ if (Op->getOpcode() == Instruction::IntToPtr &&
+ Op->getType()->getPointerAddressSpace() == FlatAddrSpace)
+ return true;
+ }
+
+ return false;
+}
+
+static Value::use_iterator skipToNextUser(Value::use_iterator I,
+ Value::use_iterator End) {
+ User *CurUser = I->getUser();
+ ++I;
+
+ while (I != End && I->getUser() == CurUser)
+ ++I;
+
+ return I;
+}
+
+bool InferAddressSpaces::rewriteWithNewAddressSpaces(
+ const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
+ // For each address expression to be modified, creates a clone of it with its
+ // pointer operands converted to the new address space. Since the pointer
+ // operands are converted, the clone is naturally in the new address space by
+ // construction.
+ ValueToValueMapTy ValueWithNewAddrSpace;
+ SmallVector<const Use *, 32> UndefUsesToFix;
+ for (Value* V : Postorder) {
+ unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
+ if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
+ ValueWithNewAddrSpace[V] = cloneValueWithNewAddressSpace(
+ V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+ }
+ }
+
+ if (ValueWithNewAddrSpace.empty())
+ return false;
+
+ // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
+ for (const Use *UndefUse : UndefUsesToFix) {
+ User *V = UndefUse->getUser();
+ User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V));
+ unsigned OperandNo = UndefUse->getOperandNo();
+ assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
+ NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get()));
+ }
+
+ SmallVector<Instruction *, 16> DeadInstructions;
+
+ // Replaces the uses of the old address expressions with the new ones.
+ for (const WeakTrackingVH &WVH : Postorder) {
+ assert(WVH && "value was unexpectedly deleted");
+ Value *V = WVH;
+ Value *NewV = ValueWithNewAddrSpace.lookup(V);
+ if (NewV == nullptr)
+ continue;
+
+ DEBUG(dbgs() << "Replacing the uses of " << *V
+ << "\n with\n " << *NewV << '\n');
+
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ Constant *Replace = ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ C->getType());
+ if (C != Replace) {
+ DEBUG(dbgs() << "Inserting replacement const cast: "
+ << Replace << ": " << *Replace << '\n');
+ C->replaceAllUsesWith(Replace);
+ V = Replace;
+ }
+ }
+
+ Value::use_iterator I, E, Next;
+ for (I = V->use_begin(), E = V->use_end(); I != E; ) {
+ Use &U = *I;
+
+ // Some users may see the same pointer operand in multiple operands. Skip
+ // to the next instruction.
+ I = skipToNextUser(I, E);
+
+ if (isSimplePointerUseValidToReplace(
+ TTI, U, V->getType()->getPointerAddressSpace())) {
+ // If V is used as the pointer operand of a compatible memory operation,
+ // sets the pointer operand to NewV. This replacement does not change
+ // the element type, so the resultant load/store is still valid.
+ U.set(NewV);
+ continue;
+ }
+
+ User *CurUser = U.getUser();
+ // Handle more complex cases like intrinsic that need to be remangled.
+ if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
+ if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))
+ continue;
+ }
+
+ if (auto *II = dyn_cast<IntrinsicInst>(CurUser)) {
+ if (rewriteIntrinsicOperands(II, V, NewV))
+ continue;
+ }
+
+ if (isa<Instruction>(CurUser)) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(CurUser)) {
+ // If we can infer that both pointers are in the same addrspace,
+ // transform e.g.
+ // %cmp = icmp eq float* %p, %q
+ // into
+ // %cmp = icmp eq float addrspace(3)* %new_p, %new_q
+
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ int SrcIdx = U.getOperandNo();
+ int OtherIdx = (SrcIdx == 0) ? 1 : 0;
+ Value *OtherSrc = Cmp->getOperand(OtherIdx);
+
+ if (Value *OtherNewV = ValueWithNewAddrSpace.lookup(OtherSrc)) {
+ if (OtherNewV->getType()->getPointerAddressSpace() == NewAS) {
+ Cmp->setOperand(OtherIdx, OtherNewV);
+ Cmp->setOperand(SrcIdx, NewV);
+ continue;
+ }
+ }
+
+ // Even if the type mismatches, we can cast the constant.
+ if (auto *KOtherSrc = dyn_cast<Constant>(OtherSrc)) {
+ if (isSafeToCastConstAddrSpace(KOtherSrc, NewAS)) {
+ Cmp->setOperand(SrcIdx, NewV);
+ Cmp->setOperand(OtherIdx,
+ ConstantExpr::getAddrSpaceCast(KOtherSrc, NewV->getType()));
+ continue;
+ }
+ }
+ }
+
+ if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(CurUser)) {
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ if (ASC->getDestAddressSpace() == NewAS) {
+ if (ASC->getType()->getPointerElementType() !=
+ NewV->getType()->getPointerElementType()) {
+ NewV = CastInst::Create(Instruction::BitCast, NewV,
+ ASC->getType(), "", ASC);
+ }
+ ASC->replaceAllUsesWith(NewV);
+ DeadInstructions.push_back(ASC);
+ continue;
+ }
+ }
+
+ // Otherwise, replaces the use with flat(NewV).
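+ // For example (illustrative only), a use that cannot be rewritten in place
+ // is given a cast back to the original flat pointer type, such as
+ //   %flat = addrspacecast i32 addrspace(3)* %new.v to i32*
+ // inserted right after V's definition (or folded into a constant expression
+ // when V is a constant).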
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ BasicBlock::iterator InsertPos = std::next(I->getIterator());
+ while (isa<PHINode>(InsertPos))
+ ++InsertPos;
+ U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+ } else {
+ U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ V->getType()));
+ }
+ }
+ }
+
+ if (V->use_empty()) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ DeadInstructions.push_back(I);
+ }
+ }
+
+ for (Instruction *I : DeadInstructions)
+ RecursivelyDeleteTriviallyDeadInstructions(I);
+
+ return true;
+}
+
+FunctionPass *llvm::createInferAddressSpacesPass() {
+ return new InferAddressSpaces();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
new file mode 100644
index 000000000000..141c9938bf8b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -0,0 +1,2545 @@
+//===- JumpThreading.cpp - Thread control through conditional blocks ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Jump Threading pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/JumpThreading.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <utility>
+
+using namespace llvm;
+using namespace jumpthreading;
+
+#define DEBUG_TYPE "jump-threading"
+
+STATISTIC(NumThreads, "Number of jumps threaded");
+STATISTIC(NumFolds, "Number of terminators folded");
+STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi");
+
+static cl::opt<unsigned>
+BBDuplicateThreshold("jump-threading-threshold",
+ cl::desc("Max block size to duplicate for jump threading"),
+ cl::init(6), cl::Hidden);
+
+static cl::opt<unsigned>
+ImplicationSearchThreshold(
+ "jump-threading-implication-search-threshold",
+ cl::desc("The number of predecessors to search for a stronger "
+ "condition to use to thread over a weaker condition"),
+ cl::init(3), cl::Hidden);
+
+static cl::opt<bool> PrintLVIAfterJumpThreading(
+ "print-lvi-after-jump-threading",
+ cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false),
+ cl::Hidden);
+
+namespace {
+
+ /// This pass performs 'jump threading', which looks at blocks that have
+ /// multiple predecessors and multiple successors. If one or more of the
+ /// predecessors of the block can be proven to always jump to one of the
+ /// successors, we forward the edge from the predecessor to the successor by
+ /// duplicating the contents of this block.
+ ///
+ /// An example of when this can occur is code like this:
+ ///
+ /// if () { ...
+ /// X = 4;
+ /// }
+ /// if (X < 3) {
+ ///
+ /// In this case, the unconditional branch at the end of the first if can be
+ /// revectored to the false side of the second if.
+ class JumpThreading : public FunctionPass {
+ JumpThreadingPass Impl;
+
+ public:
+ static char ID; // Pass identification
+
+ JumpThreading(int T = -1) : FunctionPass(ID), Impl(T) {
+ initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ if (PrintLVIAfterJumpThreading)
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<LazyValueInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ void releaseMemory() override { Impl.releaseMemory(); }
+ };
+
+} // end anonymous namespace
+
+char JumpThreading::ID = 0;
+
+INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
+ "Jump Threading", false, false)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(JumpThreading, "jump-threading",
+ "Jump Threading", false, false)
+
+// Public interface to the Jump Threading pass
+FunctionPass *llvm::createJumpThreadingPass(int Threshold) {
+ return new JumpThreading(Threshold);
+}
+
+JumpThreadingPass::JumpThreadingPass(int T) {
+ BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
+}
+
+// Update branch probability information according to conditional
+// branch probability. This is usually made possible for cloned branches
+// in inline instances by the context specific profile in the caller.
+// For instance,
+//
+// [Block PredBB]
+// [Branch PredBr]
+// if (t) {
+// Block A;
+// } else {
+// Block B;
+// }
+//
+// [Block BB]
+// cond = PN([true, %A], [..., %B]); // PHI node
+// [Branch CondBr]
+// if (cond) {
+// ... // P(cond == true) = 1%
+// }
+//
+// Here we know that when block A is taken, cond must be true, which means
+// P(cond == true | A) = 1
+//
+// Given that P(cond == true) = P(cond == true | A) * P(A) +
+// P(cond == true | B) * P(B)
+// we get:
+// P(cond == true ) = P(A) + P(cond == true | B) * P(B)
+//
+// which gives us:
+// P(A) is at most P(cond == true), i.e.
+// P(t == true) <= P(cond == true)
+//
+// In other words, if we know P(cond == true) is unlikely, we know
+// that P(t == true) is also unlikely.
+//
+static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!CondBr)
+ return;
+
+ BranchProbability BP;
+ uint64_t TrueWeight, FalseWeight;
+ if (!CondBr->extractProfMetadata(TrueWeight, FalseWeight))
+ return;
+
+ // Returns the outgoing edge of the dominating predecessor block
+ // that leads to the PhiNode's incoming block:
+ auto GetPredOutEdge =
+ [](BasicBlock *IncomingBB,
+ BasicBlock *PhiBB) -> std::pair<BasicBlock *, BasicBlock *> {
+ auto *PredBB = IncomingBB;
+ auto *SuccBB = PhiBB;
+ while (true) {
+ BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
+ if (PredBr && PredBr->isConditional())
+ return {PredBB, SuccBB};
+ auto *SinglePredBB = PredBB->getSinglePredecessor();
+ if (!SinglePredBB)
+ return {nullptr, nullptr};
+ SuccBB = PredBB;
+ PredBB = SinglePredBB;
+ }
+ };
+
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *PhiOpnd = PN->getIncomingValue(i);
+ ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
+
+ if (!CI || !CI->getType()->isIntegerTy(1))
+ continue;
+
+ BP = (CI->isOne() ? BranchProbability::getBranchProbability(
+ TrueWeight, TrueWeight + FalseWeight)
+ : BranchProbability::getBranchProbability(
+ FalseWeight, TrueWeight + FalseWeight));
+
+ auto PredOutEdge = GetPredOutEdge(PN->getIncomingBlock(i), BB);
+ if (!PredOutEdge.first)
+ return;
+
+ BasicBlock *PredBB = PredOutEdge.first;
+ BranchInst *PredBr = cast<BranchInst>(PredBB->getTerminator());
+
+ uint64_t PredTrueWeight, PredFalseWeight;
+ // FIXME: We currently only set the profile data when it is missing.
+ // With PGO, this can be used to refine even existing profile data with
+ // context information. This needs to be done after more performance
+ // testing.
+ if (PredBr->extractProfMetadata(PredTrueWeight, PredFalseWeight))
+ continue;
+
+ // We cannot infer anything useful when BP >= 50%, because BP is the
+ // upper bound probability value.
+ if (BP >= BranchProbability(50, 100))
+ continue;
+
+ SmallVector<uint32_t, 2> Weights;
+ if (PredBr->getSuccessor(0) == PredOutEdge.second) {
+ Weights.push_back(BP.getNumerator());
+ Weights.push_back(BP.getCompl().getNumerator());
+ } else {
+ Weights.push_back(BP.getCompl().getNumerator());
+ Weights.push_back(BP.getNumerator());
+ }
+ PredBr->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(PredBr->getParent()->getContext())
+ .createBranchWeights(Weights));
+ }
+}
+
+/// runOnFunction - Toplevel algorithm.
+bool JumpThreading::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ bool HasProfileData = F.hasProfileData();
+ if (HasProfileData) {
+ LoopInfo LI{DominatorTree(F)};
+ BPI.reset(new BranchProbabilityInfo(F, LI, TLI));
+ BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
+ }
+
+ bool Changed = Impl.runImpl(F, TLI, LVI, AA, HasProfileData, std::move(BFI),
+ std::move(BPI));
+ if (PrintLVIAfterJumpThreading) {
+ dbgs() << "LVI for function '" << F.getName() << "':\n";
+ LVI->printLVI(F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ dbgs());
+ }
+ return Changed;
+}
+
+PreservedAnalyses JumpThreadingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &LVI = AM.getResult<LazyValueAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ if (F.hasProfileData()) {
+ LoopInfo LI{DominatorTree(F)};
+ BPI.reset(new BranchProbabilityInfo(F, LI, &TLI));
+ BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
+ }
+
+ bool Changed = runImpl(F, &TLI, &LVI, &AA, HasProfileData, std::move(BFI),
+ std::move(BPI));
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
+ LazyValueInfo *LVI_, AliasAnalysis *AA_,
+ bool HasProfileData_,
+ std::unique_ptr<BlockFrequencyInfo> BFI_,
+ std::unique_ptr<BranchProbabilityInfo> BPI_) {
+ DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
+ TLI = TLI_;
+ LVI = LVI_;
+ AA = AA_;
+ BFI.reset();
+ BPI.reset();
+ // When profile data is available, we need to update edge weights after
+ // successful jump threading, which requires both BPI and BFI being available.
+ HasProfileData = HasProfileData_;
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ HasGuards = GuardDecl && !GuardDecl->use_empty();
+ if (HasProfileData) {
+ BPI = std::move(BPI_);
+ BFI = std::move(BFI_);
+ }
+
+ // Remove unreachable blocks from the function, as they may result in an
+ // infinite loop. We thread as long as we find something profitable, but jump
+ // threading one branch can create further opportunities. If these
+ // opportunities form a cycle, i.e. if any jump threading undoes previous
+ // threading along the path, we would loop forever. We avoid this by not jump
+ // threading across back edges. That works for normal cases, but not for
+ // unreachable blocks, which may contain cycles with no back edge.
+ bool EverChanged = false;
+ EverChanged |= removeUnreachableBlocks(F, LVI);
+
+ FindLoopHeaders(F);
+
+ bool Changed;
+ do {
+ Changed = false;
+ for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
+ BasicBlock *BB = &*I;
+ // Thread all of the branches we can over this block.
+ while (ProcessBlock(BB))
+ Changed = true;
+
+ ++I;
+
+ // If the block is trivially dead, zap it. This eliminates the successor
+ // edges which simplifies the CFG.
+ if (pred_empty(BB) &&
+ BB != &BB->getParent()->getEntryBlock()) {
+ DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName()
+ << "' with terminator: " << *BB->getTerminator() << '\n');
+ LoopHeaders.erase(BB);
+ LVI->eraseBlock(BB);
+ DeleteDeadBlock(BB);
+ Changed = true;
+ continue;
+ }
+
+ BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+
+ // Can't thread an unconditional jump, but if the block is "almost
+ // empty", we can replace uses of it with uses of the successor and make
+ // this dead.
+ // We should not eliminate the loop header or latch either, because
+ // eliminating a loop header or latch might later prevent LoopSimplify
+ // from transforming nested loops into simplified form. We will rely on
+ // later passes in backend to clean up empty blocks.
+ if (BI && BI->isUnconditional() &&
+ BB != &BB->getParent()->getEntryBlock() &&
+ // If the terminator is the only non-phi instruction, try to nuke it.
+ BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB) &&
+ !LoopHeaders.count(BI->getSuccessor(0))) {
+ // FIXME: It is always conservatively correct to drop the info
+ // for a block even if it doesn't get erased. This isn't totally
+ // awesome, but it allows us to use AssertingVH to prevent nasty
+ // dangling pointer issues within LazyValueInfo.
+ LVI->eraseBlock(BB);
+ if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
+ Changed = true;
+ }
+ }
+ EverChanged |= Changed;
+ } while (Changed);
+
+ LoopHeaders.clear();
+ return EverChanged;
+}
+
+// Replace uses of Cond with ToVal when safe to do so. If all uses are
+// replaced, we can remove Cond. We cannot blindly replace all uses of Cond
+// because we may incorrectly replace uses when guards/assumes are themselves
+// uses of `Cond` and we used those guards/assumes to reason about the `Cond`
+// value at the end of the block. RAUW unconditionally replaces all uses
+// including the guards/assumes themselves and the uses before the
+// guard/assume.
+static void ReplaceFoldableUses(Instruction *Cond, Value *ToVal) {
+ assert(Cond->getType() == ToVal->getType());
+ auto *BB = Cond->getParent();
+ // We can unconditionally replace all uses in non-local blocks (i.e. uses
+ // strictly dominated by BB), since LVI information is true from the
+ // terminator of BB.
+ replaceNonLocalUsesWith(Cond, ToVal);
+ for (Instruction &I : reverse(*BB)) {
+ // Reached the Cond whose uses we are trying to replace, so there are no
+ // more uses.
+ if (&I == Cond)
+ break;
+ // We only replace uses in instructions that are guaranteed to reach the end
+ // of BB, where we know Cond is ToVal.
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+ break;
+ I.replaceUsesOfWith(Cond, ToVal);
+ }
+ if (Cond->use_empty() && !Cond->mayHaveSideEffects())
+ Cond->eraseFromParent();
+}
+
+/// Return the cost of duplicating a piece of this block, from the first
+/// non-PHI instruction up to (but not including) the StopAt instruction, in
+/// order to thread across it. Stop scanning the block
+/// when exceeding the threshold. If duplication is impossible, returns ~0U.
+static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
+ Instruction *StopAt,
+ unsigned Threshold) {
+ assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
+ /// Ignore PHI nodes, these will be flattened when duplication happens.
+ BasicBlock::const_iterator I(BB->getFirstNonPHI());
+
+ // FIXME: THREADING will delete values that are just used to compute the
+ // branch, so they shouldn't count against the duplication cost.
+
+ unsigned Bonus = 0;
+ if (BB->getTerminator() == StopAt) {
+ // Threading through a switch statement is particularly profitable. If this
+ // block ends in a switch, decrease its cost to make it more likely to
+ // happen.
+ if (isa<SwitchInst>(StopAt))
+ Bonus = 6;
+
+ // The same holds for indirect branches, but slightly more so.
+ if (isa<IndirectBrInst>(StopAt))
+ Bonus = 8;
+ }
+
+ // Bump the threshold up so the early exit from the loop doesn't skip the
+ // terminator-based Size adjustment at the end.
+ Threshold += Bonus;
+
+ // Sum up the cost of each instruction until we get to the terminator. Don't
+ // include the terminator because the copy won't include it.
+ unsigned Size = 0;
+ for (; &*I != StopAt; ++I) {
+
+ // Stop scanning the block if we've reached the threshold.
+ if (Size > Threshold)
+ return Size;
+
+ // Debugger intrinsics don't incur code size.
+ if (isa<DbgInfoIntrinsic>(I)) continue;
+
+ // If this is a pointer->pointer bitcast, it is free.
+ if (isa<BitCastInst>(I) && I->getType()->isPointerTy())
+ continue;
+
+ // Bail out if this instruction gives back a token type, it is not possible
+ // to duplicate it if it is used outside this BB.
+ if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB))
+ return ~0U;
+
+ // All other instructions count for at least one unit.
+ ++Size;
+
+ // Calls are more expensive. We model a non-intrinsic call as having a cost
+ // of 4, a non-vector intrinsic as having a total cost of 2, and a vector
+ // intrinsic as having a cost of 1.
+ if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+ if (CI->cannotDuplicate() || CI->isConvergent())
+ // Blocks with NoDuplicate are modelled as having infinite cost, so they
+ // are never duplicated.
+ return ~0U;
+ else if (!isa<IntrinsicInst>(CI))
+ Size += 3;
+ else if (!CI->getType()->isVectorTy())
+ Size += 1;
+ }
+ }
+
+ return Size > Bonus ? Size - Bonus : 0;
+}
+
+/// FindLoopHeaders - We do not want jump threading to turn proper loop
+/// structures into irreducible loops. Doing this breaks up the loop nesting
+/// hierarchy and pessimizes later transformations. To prevent this from
+/// happening, we first have to find the loop headers. Here we approximate this
+/// by finding targets of backedges in the CFG.
+///
+/// Note that there definitely are cases when we want to allow threading of
+/// edges across a loop header. For example, threading a jump from outside the
+/// loop (the preheader) to an exit block of the loop is definitely profitable.
+/// It is also almost always profitable to thread backedges from within the loop
+/// to exit blocks, and is often profitable to thread backedges to other blocks
+/// within the loop (forming a nested loop). This simple analysis is not rich
+/// enough to track all of these properties and keep it up-to-date as the CFG
+/// mutates, so we don't allow any of these transformations.
+void JumpThreadingPass::FindLoopHeaders(Function &F) {
+ SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
+ FindFunctionBackedges(F, Edges);
+
+ for (const auto &Edge : Edges)
+ LoopHeaders.insert(Edge.second);
+}
+
+/// getKnownConstant - Helper method to determine if we can thread over a
+/// terminator with the given value as its condition, and if so what value to
+/// use for that. What kind of value this is depends on whether we want an
+/// integer or a block address, but an undef is always accepted.
+/// Returns null if Val is null or not an appropriate constant.
+static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
+ if (!Val)
+ return nullptr;
+
+ // Undef is "known" enough.
+ if (UndefValue *U = dyn_cast<UndefValue>(Val))
+ return U;
+
+ if (Preference == WantBlockAddress)
+ return dyn_cast<BlockAddress>(Val->stripPointerCasts());
+
+ return dyn_cast<ConstantInt>(Val);
+}
+
+/// ComputeValueKnownInPredecessors - Given a basic block BB and a value V, see
+/// if we can infer that the value is a known ConstantInt/BlockAddress or undef
+/// in any of our predecessors. If so, return the known list of value and pred
+/// BB in the result vector.
+///
+/// This returns true if there were any known values.
+bool JumpThreadingPass::ComputeValueKnownInPredecessors(
+ Value *V, BasicBlock *BB, PredValueInfo &Result,
+ ConstantPreference Preference, Instruction *CxtI) {
+ // This method walks up use-def chains recursively. Because of this, we could
+ // get into an infinite loop going around loops in the use-def chain. To
+ // prevent this, keep track of what (value, block) pairs we've already visited
+ // and terminate the search if we loop back to them
+ if (!RecursionSet.insert(std::make_pair(V, BB)).second)
+ return false;
+
+ // An RAII helper to remove this pair from the recursion set once the recursion
+ // stack pops back out again.
+ RecursionSetRemover remover(RecursionSet, std::make_pair(V, BB));
+
+ // If V is a constant, then it is known in all predecessors.
+ if (Constant *KC = getKnownConstant(V, Preference)) {
+ for (BasicBlock *Pred : predecessors(BB))
+ Result.push_back(std::make_pair(KC, Pred));
+
+ return !Result.empty();
+ }
+
+ // If V is a non-instruction value, or an instruction in a different block,
+ // then it can't be derived from a PHI.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || I->getParent() != BB) {
+
+ // Okay, if this is a live-in value, see if it has a known value at the end
+ // of any of our predecessors.
+ //
+ // FIXME: This should be an edge property, not a block end property.
+ /// TODO: Per PR2563, we could infer value range information about a
+ /// predecessor based on its terminator.
+ //
+ // FIXME: change this to use the more-rich 'getPredicateOnEdge' method if
+ // "I" is a non-local compare-with-a-constant instruction. This would be
+ // able to handle value inequalities better, for example if the compare is
+ // "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
+ // Perhaps getConstantOnEdge should be smart enough to do this?
+
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a constant in a
+ // predecessor, use that information to try to thread this block.
+ Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI);
+ if (Constant *KC = getKnownConstant(PredCst, Preference))
+ Result.push_back(std::make_pair(KC, P));
+ }
+
+ return !Result.empty();
+ }
+
+ /// If I is a PHI node, then we know the incoming values for any constants.
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *InVal = PN->getIncomingValue(i);
+ if (Constant *KC = getKnownConstant(InVal, Preference)) {
+ Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
+ } else {
+ Constant *CI = LVI->getConstantOnEdge(InVal,
+ PN->getIncomingBlock(i),
+ BB, CxtI);
+ if (Constant *KC = getKnownConstant(CI, Preference))
+ Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
+ }
+ }
+
+ return !Result.empty();
+ }
+
+ // Handle Cast instructions. Only see through Cast when the source operand is
+ // a PHI or Cmp and the source type is i1, to save compilation time.
+ if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Value *Source = CI->getOperand(0);
+ if (!Source->getType()->isIntegerTy(1))
+ return false;
+ if (!isa<PHINode>(Source) && !isa<CmpInst>(Source))
+ return false;
+ ComputeValueKnownInPredecessors(Source, BB, Result, Preference, CxtI);
+ if (Result.empty())
+ return false;
+
+ // Convert the known values.
+ for (auto &R : Result)
+ R.first = ConstantExpr::getCast(CI->getOpcode(), R.first, CI->getType());
+
+ return true;
+ }
+
+ // Handle some boolean conditions.
+ if (I->getType()->getPrimitiveSizeInBits() == 1) {
+ assert(Preference == WantInteger && "One-bit non-integer type?");
+ // X | true -> true
+ // X & false -> false
+ if (I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::And) {
+ PredValueInfoTy LHSVals, RHSVals;
+
+ ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
+ WantInteger, CxtI);
+ ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals,
+ WantInteger, CxtI);
+
+ if (LHSVals.empty() && RHSVals.empty())
+ return false;
+
+ ConstantInt *InterestingVal;
+ if (I->getOpcode() == Instruction::Or)
+ InterestingVal = ConstantInt::getTrue(I->getContext());
+ else
+ InterestingVal = ConstantInt::getFalse(I->getContext());
+
+ SmallPtrSet<BasicBlock*, 4> LHSKnownBBs;
+
+ // Scan for the sentinel. If we find an undef, force it to the
+ // interesting value: x|undef -> true and x&undef -> false.
+ for (const auto &LHSVal : LHSVals)
+ if (LHSVal.first == InterestingVal || isa<UndefValue>(LHSVal.first)) {
+ Result.emplace_back(InterestingVal, LHSVal.second);
+ LHSKnownBBs.insert(LHSVal.second);
+ }
+ for (const auto &RHSVal : RHSVals)
+ if (RHSVal.first == InterestingVal || isa<UndefValue>(RHSVal.first)) {
+ // If we already inferred a value for this block on the LHS, don't
+ // re-add it.
+ if (!LHSKnownBBs.count(RHSVal.second))
+ Result.emplace_back(InterestingVal, RHSVal.second);
+ }
+
+ return !Result.empty();
+ }
+
+ // Handle the NOT form of XOR.
+ if (I->getOpcode() == Instruction::Xor &&
+ isa<ConstantInt>(I->getOperand(1)) &&
+ cast<ConstantInt>(I->getOperand(1))->isOne()) {
+ ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result,
+ WantInteger, CxtI);
+ if (Result.empty())
+ return false;
+
+ // Invert the known values.
+ for (auto &R : Result)
+ R.first = ConstantExpr::getNot(R.first);
+
+ return true;
+ }
+
+ // Try to simplify some other binary operator values.
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+ assert(Preference != WantBlockAddress
+ && "A binary operator creating a block address?");
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+ PredValueInfoTy LHSVals;
+ ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals,
+ WantInteger, CxtI);
+
+ // Try to use constant folding to simplify the binary operator.
+ for (const auto &LHSVal : LHSVals) {
+ Constant *V = LHSVal.first;
+ Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI);
+
+ if (Constant *KC = getKnownConstant(Folded, WantInteger))
+ Result.push_back(std::make_pair(KC, LHSVal.second));
+ }
+ }
+
+ return !Result.empty();
+ }
+
+ // Handle compare with phi operand, where the PHI is defined in this block.
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+ assert(Preference == WantInteger && "Compares only produce integers");
+ Type *CmpType = Cmp->getType();
+ Value *CmpLHS = Cmp->getOperand(0);
+ Value *CmpRHS = Cmp->getOperand(1);
+ CmpInst::Predicate Pred = Cmp->getPredicate();
+
+ PHINode *PN = dyn_cast<PHINode>(CmpLHS);
+ if (PN && PN->getParent() == BB) {
+ const DataLayout &DL = PN->getModule()->getDataLayout();
+ // We can do this simplification if any comparisons fold to true or false.
+ // See if any do.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *PredBB = PN->getIncomingBlock(i);
+ Value *LHS = PN->getIncomingValue(i);
+ Value *RHS = CmpRHS->DoPHITranslation(BB, PredBB);
+
+ Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL});
+ if (!Res) {
+ if (!isa<Constant>(RHS))
+ continue;
+
+ LazyValueInfo::Tristate
+ ResT = LVI->getPredicateOnEdge(Pred, LHS,
+ cast<Constant>(RHS), PredBB, BB,
+ CxtI ? CxtI : Cmp);
+ if (ResT == LazyValueInfo::Unknown)
+ continue;
+ Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
+ }
+
+ if (Constant *KC = getKnownConstant(Res, WantInteger))
+ Result.push_back(std::make_pair(KC, PredBB));
+ }
+
+ return !Result.empty();
+ }
+
+ // If comparing a live-in value against a constant, see if we know the
+ // live-in value on any predecessors.
+ if (isa<Constant>(CmpRHS) && !CmpType->isVectorTy()) {
+ Constant *CmpConst = cast<Constant>(CmpRHS);
+
+ if (!isa<Instruction>(CmpLHS) ||
+ cast<Instruction>(CmpLHS)->getParent() != BB) {
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a constant in a
+ // predecessor, use that information to try to thread this block.
+ LazyValueInfo::Tristate Res =
+ LVI->getPredicateOnEdge(Pred, CmpLHS,
+ CmpConst, P, BB, CxtI ? CxtI : Cmp);
+ if (Res == LazyValueInfo::Unknown)
+ continue;
+
+ Constant *ResC = ConstantInt::get(CmpType, Res);
+ Result.push_back(std::make_pair(ResC, P));
+ }
+
+ return !Result.empty();
+ }
+
+ // InstCombine can fold some forms of constant range checks into
+ // (icmp (add (x, C1)), C2). See if we have such a thing with
+ // x as a live-in.
+ {
+ using namespace PatternMatch;
+
+ Value *AddLHS;
+ ConstantInt *AddConst;
+ if (isa<ConstantInt>(CmpConst) &&
+ match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) {
+ if (!isa<Instruction>(AddLHS) ||
+ cast<Instruction>(AddLHS)->getParent() != BB) {
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a ConstantRange in
+ // a predecessor, use that information to try to thread this
+ // block.
+ ConstantRange CR = LVI->getConstantRangeOnEdge(
+ AddLHS, P, BB, CxtI ? CxtI : cast<Instruction>(CmpLHS));
+ // Propagate the range through the addition.
+ CR = CR.add(AddConst->getValue());
+
+ // Get the range where the compare returns true.
+ ConstantRange CmpRange = ConstantRange::makeExactICmpRegion(
+ Pred, cast<ConstantInt>(CmpConst)->getValue());
+
+ Constant *ResC;
+ if (CmpRange.contains(CR))
+ ResC = ConstantInt::getTrue(CmpType);
+ else if (CmpRange.inverse().contains(CR))
+ ResC = ConstantInt::getFalse(CmpType);
+ else
+ continue;
+
+ Result.push_back(std::make_pair(ResC, P));
+ }
+
+ return !Result.empty();
+ }
+ }
+ }
+
+ // Try to find a constant value for the LHS of a comparison,
+ // and evaluate it statically if we can.
+ PredValueInfoTy LHSVals;
+ ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
+ WantInteger, CxtI);
+
+ for (const auto &LHSVal : LHSVals) {
+ Constant *V = LHSVal.first;
+ Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst);
+ if (Constant *KC = getKnownConstant(Folded, WantInteger))
+ Result.push_back(std::make_pair(KC, LHSVal.second));
+ }
+
+ return !Result.empty();
+ }
+ }
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ // Handle select instructions where at least one operand is a known constant
+ // and we can figure out the condition value for any predecessor block.
+ Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference);
+ Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference);
+ PredValueInfoTy Conds;
+ if ((TrueVal || FalseVal) &&
+ ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds,
+ WantInteger, CxtI)) {
+ for (auto &C : Conds) {
+ Constant *Cond = C.first;
+
+ // Figure out what value to use for the condition.
+ bool KnownCond;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Cond)) {
+ // A known boolean.
+ KnownCond = CI->isOne();
+ } else {
+ assert(isa<UndefValue>(Cond) && "Unexpected condition value");
+ // Either operand will do, so be sure to pick the one that's a known
+ // constant.
+ // FIXME: Do this more cleverly if both values are known constants?
+ KnownCond = (TrueVal != nullptr);
+ }
+
+ // See if the select has a known constant value for this predecessor.
+ if (Constant *Val = KnownCond ? TrueVal : FalseVal)
+ Result.push_back(std::make_pair(Val, C.second));
+ }
+
+ return !Result.empty();
+ }
+ }
+
+ // If all else fails, see if LVI can figure out a constant value for us.
+ Constant *CI = LVI->getConstant(V, BB, CxtI);
+ if (Constant *KC = getKnownConstant(CI, Preference)) {
+ for (BasicBlock *Pred : predecessors(BB))
+ Result.push_back(std::make_pair(KC, Pred));
+ }
+
+ return !Result.empty();
+}
+
+/// GetBestDestForJumpOnUndef - If we determine that the specified block ends
+/// in an undefined jump, decide which block is best to revector to.
+///
+/// Since we can pick an arbitrary destination, we pick the successor with the
+/// fewest predecessors. This should reduce the in-degree of the others.
+static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
+ TerminatorInst *BBTerm = BB->getTerminator();
+ unsigned MinSucc = 0;
+ BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
+ // Compute the successor with the minimum number of predecessors.
+ unsigned MinNumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+ for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+ TestBB = BBTerm->getSuccessor(i);
+ unsigned NumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+ if (NumPreds < MinNumPreds) {
+ MinSucc = i;
+ MinNumPreds = NumPreds;
+ }
+ }
+
+ return MinSucc;
+}
+
+static bool hasAddressTakenAndUsed(BasicBlock *BB) {
+ if (!BB->hasAddressTaken()) return false;
+
+ // If the block has its address taken, it may be a tree of dead constants
+ // hanging off of it. These shouldn't keep the block alive.
+ BlockAddress *BA = BlockAddress::get(BB);
+ BA->removeDeadConstantUsers();
+ return !BA->use_empty();
+}
+
+/// ProcessBlock - If there are any predecessors whose control can be threaded
+/// through to a successor, transform them now.
+bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
+ // If the block is trivially dead, just return and let the caller nuke it.
+ // This simplifies other transformations.
+ if (pred_empty(BB) &&
+ BB != &BB->getParent()->getEntryBlock())
+ return false;
+
+ // If this block has a single predecessor, and if that pred has a single
+ // successor, merge the blocks. This encourages recursive jump threading
+ // because now the condition in this block can be threaded through
+ // predecessors of our predecessor block.
+ if (BasicBlock *SinglePred = BB->getSinglePredecessor()) {
+ const TerminatorInst *TI = SinglePred->getTerminator();
+ if (!TI->isExceptional() && TI->getNumSuccessors() == 1 &&
+ SinglePred != BB && !hasAddressTakenAndUsed(BB)) {
+ // If SinglePred was a loop header, BB becomes one.
+ if (LoopHeaders.erase(SinglePred))
+ LoopHeaders.insert(BB);
+
+ LVI->eraseBlock(SinglePred);
+ MergeBasicBlockIntoOnlyPred(BB);
+
+ // Now that BB is merged into SinglePred (i.e. SinglePred Code followed by
+ // BB code within one basic block `BB`), we need to invalidate the LVI
+ // information associated with BB, because the LVI information need not be
+ // true for all of BB after the merge. For example,
+ // Before the merge, LVI info and code is as follows:
+ // SinglePred: <LVI info1 for %p val>
+ // %y = use of %p
+ // call @exit() // need not transfer execution to successor.
+ // assume(%p) // from this point on %p is true
+ // br label %BB
+ // BB: <LVI info2 for %p val, i.e. %p is true>
+ // %x = use of %p
+ // br label exit
+ //
+ // Note that this LVI info for blocks BB and SinglePred is correct for %p
+ // (info2 and info1 respectively). After the merge and the deletion of the
+ // LVI info1 for SinglePred, we have the following code:
+ // BB: <LVI info2 for %p val>
+ // %y = use of %p
+ // call @exit()
+ // assume(%p)
+ // %x = use of %p <-- LVI info2 is correct from here onwards.
+ // br label exit
+ // LVI info2 for BB is incorrect at the beginning of BB.
+
+ // Invalidate LVI information for BB if the LVI is not provably true for
+ // all of BB.
+ if (any_of(*BB, [](Instruction &I) {
+ return !isGuaranteedToTransferExecutionToSuccessor(&I);
+ }))
+ LVI->eraseBlock(BB);
+ return true;
+ }
+ }
+
+ if (TryToUnfoldSelectInCurrBB(BB))
+ return true;
+
+ // Look if we can propagate guards to predecessors.
+ if (HasGuards && ProcessGuards(BB))
+ return true;
+
+ // What kind of constant we're looking for.
+ ConstantPreference Preference = WantInteger;
+
+ // Look to see if the terminator is a conditional branch, switch or indirect
+ // branch, if not we can't thread it.
+ Value *Condition;
+ Instruction *Terminator = BB->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) {
+ // Can't thread an unconditional jump.
+ if (BI->isUnconditional()) return false;
+ Condition = BI->getCondition();
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) {
+ Condition = SI->getCondition();
+ } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) {
+ // Can't thread indirect branch with no successors.
+ if (IB->getNumSuccessors() == 0) return false;
+ Condition = IB->getAddress()->stripPointerCasts();
+ Preference = WantBlockAddress;
+ } else {
+ return false; // Must be an invoke.
+ }
+
+ // Run constant folding to see if we can reduce the condition to a simple
+ // constant.
+ if (Instruction *I = dyn_cast<Instruction>(Condition)) {
+ Value *SimpleVal =
+ ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI);
+ if (SimpleVal) {
+ I->replaceAllUsesWith(SimpleVal);
+ if (isInstructionTriviallyDead(I, TLI))
+ I->eraseFromParent();
+ Condition = SimpleVal;
+ }
+ }
+
+ // If the terminator is branching on an undef, we can pick any of the
+ // successors to branch to. Let GetBestDestForJumpOnUndef decide.
+ if (isa<UndefValue>(Condition)) {
+ unsigned BestSucc = GetBestDestForJumpOnUndef(BB);
+
+ // Fold the branch/switch.
+ TerminatorInst *BBTerm = BB->getTerminator();
+ for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+ if (i == BestSucc) continue;
+ BBTerm->getSuccessor(i)->removePredecessor(BB, true);
+ }
+
+ DEBUG(dbgs() << " In block '" << BB->getName()
+ << "' folding undef terminator: " << *BBTerm << '\n');
+ BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
+ BBTerm->eraseFromParent();
+ return true;
+ }
+
+ // If the terminator of this block is branching on a constant, simplify the
+ // terminator to an unconditional branch. This can occur due to threading in
+ // other blocks.
+ if (getKnownConstant(Condition, Preference)) {
+ DEBUG(dbgs() << " In block '" << BB->getName()
+ << "' folding terminator: " << *BB->getTerminator() << '\n');
+ ++NumFolds;
+ ConstantFoldTerminator(BB, true);
+ return true;
+ }
+
+ Instruction *CondInst = dyn_cast<Instruction>(Condition);
+
+ // All the rest of our checks depend on the condition being an instruction.
+ if (!CondInst) {
+ // FIXME: Unify this with code below.
+ if (ProcessThreadableEdges(Condition, BB, Preference, Terminator))
+ return true;
+ return false;
+ }
+
+ if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
+ // If we're branching on a comparison, LVI might be able to determine
+ // its value at the branch instruction. We only handle comparisons
+ // against a constant at this time.
+ // TODO: This should be extended to handle switches as well.
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
+ if (CondBr && CondConst) {
+ // We should have returned as soon as we turned a conditional branch into
+ // an unconditional one, because it's no longer interesting as far as jump
+ // threading is concerned.
+ assert(CondBr->isConditional() && "Threading on unconditional terminator");
+
+ LazyValueInfo::Tristate Ret =
+ LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
+ CondConst, CondBr);
+ if (Ret != LazyValueInfo::Unknown) {
+ unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
+ unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
+ CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true);
+ BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
+ CondBr->eraseFromParent();
+ if (CondCmp->use_empty())
+ CondCmp->eraseFromParent();
+ // We can safely replace *some* uses of the CondInst if it has
+ // exactly one value as returned by LVI. RAUW is incorrect in the
+ // presence of guards and assumes that have `Cond` as a use. This
+ // is because we use the guards/assumes to reason about the `Cond` value
+ // at the end of block, but RAUW unconditionally replaces all uses
+ // including the guards/assumes themselves and the uses before the
+ // guard/assume.
+ else if (CondCmp->getParent() == BB) {
+ auto *CI = Ret == LazyValueInfo::True ?
+ ConstantInt::getTrue(CondCmp->getType()) :
+ ConstantInt::getFalse(CondCmp->getType());
+ ReplaceFoldableUses(CondCmp, CI);
+ }
+ return true;
+ }
+
+ // We did not manage to simplify this branch; try to see whether
+ // CondCmp depends on a known phi-select pattern.
+ if (TryToUnfoldSelect(CondCmp, BB))
+ return true;
+ }
+ }
+
+ // Check for some cases that are worth simplifying. Right now we want to look
+ // for loads that are used by a switch or by the condition for the branch. If
+ // we see one, check to see if it's partially redundant. If so, insert a PHI
+ // which can then be used to thread the values.
+ Value *SimplifyValue = CondInst;
+ if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
+ if (isa<Constant>(CondCmp->getOperand(1)))
+ SimplifyValue = CondCmp->getOperand(0);
+
+ // TODO: There are other places where load PRE would be profitable, such as
+ // more complex comparisons.
+ if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
+ if (SimplifyPartiallyRedundantLoad(LI))
+ return true;
+
+ // Before threading, try to propagate profile data backwards:
+ if (PHINode *PN = dyn_cast<PHINode>(CondInst))
+ if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+ updatePredecessorProfileMetadata(PN, BB);
+
+ // Handle a variety of cases where we are branching on something derived from
+ // a PHI node in the current block. If we can prove that any predecessors
+ // compute a predictable value based on a PHI node, thread those predecessors.
+ if (ProcessThreadableEdges(CondInst, BB, Preference, Terminator))
+ return true;
+
+ // If this is an otherwise-unfoldable branch on a phi node in the current
+ // block, see if we can simplify.
+ if (PHINode *PN = dyn_cast<PHINode>(CondInst))
+ if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+ return ProcessBranchOnPHI(PN);
+
+ // If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
+ if (CondInst->getOpcode() == Instruction::Xor &&
+ CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+ return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst));
+
+ // Search for a stronger dominating condition that can be used to simplify a
+ // conditional branch leaving BB.
+ if (ProcessImpliedCondition(BB))
+ return true;
+
+ return false;
+}
+
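+// Fold a conditional branch in BB when a chain of single predecessors already
+// branches on a condition that implies BB's own branch condition. For example
+// (illustrative only), if the only way into BB is the true edge of a branch on
+// "x < 3" and BB itself branches on "x < 4", the condition is known true and
+// the branch in BB can be folded to an unconditional branch.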
+bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
+ return false;
+
+ Value *Cond = BI->getCondition();
+ BasicBlock *CurrentBB = BB;
+ BasicBlock *CurrentPred = BB->getSinglePredecessor();
+ unsigned Iter = 0;
+
+ auto &DL = BB->getModule()->getDataLayout();
+
+ while (CurrentPred && Iter++ < ImplicationSearchThreshold) {
+ auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator());
+ if (!PBI || !PBI->isConditional())
+ return false;
+ if (PBI->getSuccessor(0) != CurrentBB && PBI->getSuccessor(1) != CurrentBB)
+ return false;
+
+ bool CondIsTrue = PBI->getSuccessor(0) == CurrentBB;
+ Optional<bool> Implication =
+ isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue);
+ if (Implication) {
+ BI->getSuccessor(*Implication ? 1 : 0)->removePredecessor(BB);
+ BranchInst::Create(BI->getSuccessor(*Implication ? 0 : 1), BI);
+ BI->eraseFromParent();
+ return true;
+ }
+ CurrentBB = CurrentPred;
+ CurrentPred = CurrentBB->getSinglePredecessor();
+ }
+
+ return false;
+}
+
+/// Return true if Op is an instruction defined in the given block.
+static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
+ if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+ if (OpInst->getParent() == BB)
+ return true;
+ return false;
+}
+
+/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
+/// load instruction, eliminate it by replacing it with a PHI node. This is an
+/// important optimization that encourages jump threading, and needs to be run
+/// interlaced with other jump threading tasks.
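+///
+/// For example (illustrative only), given a load whose value is available in
+/// one predecessor but not the other:
+///
+///   pred1:
+///     store i32 0, i32* %p
+///     br label %bb
+///   pred2:
+///     br label %bb
+///   bb:
+///     %v = load i32, i32* %p
+///
+/// the load can be re-inserted at the end of %pred2 and %v replaced by a PHI
+/// of the stored value from %pred1 and the new load from %pred2.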
+bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
+ // Don't hack volatile and ordered loads.
+ if (!LI->isUnordered()) return false;
+
+ // If the load is defined in a block with exactly one predecessor, it can't be
+ // partially redundant.
+ BasicBlock *LoadBB = LI->getParent();
+ if (LoadBB->getSinglePredecessor())
+ return false;
+
+ // If the load is defined in an EH pad, it can't be partially redundant,
+ // because the edges between the invoke and the EH pad cannot have other
+ // instructions between them.
+ if (LoadBB->isEHPad())
+ return false;
+
+ Value *LoadedPtr = LI->getOperand(0);
+
+ // If the loaded operand is defined in the LoadBB and it's not a PHI,
+ // it can't be available in predecessors.
+ if (isOpDefinedInBlock(LoadedPtr, LoadBB) && !isa<PHINode>(LoadedPtr))
+ return false;
+
+ // Scan a few instructions up from the load, to see if it is obviously live at
+ // the entry to its block.
+ BasicBlock::iterator BBIt(LI);
+ bool IsLoadCSE;
+ if (Value *AvailableVal = FindAvailableLoadedValue(
+ LI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
+ // If the value of the load is locally available within the block, just use
+ // it. This frequently occurs for reg2mem'd allocas.
+
+ if (IsLoadCSE) {
+ LoadInst *NLI = cast<LoadInst>(AvailableVal);
+ combineMetadataForCSE(NLI, LI);
+ }
+
+ // If the returned value is the load itself, replace with an undef. This can
+ // only happen in dead loops.
+ if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
+ if (AvailableVal->getType() != LI->getType())
+ AvailableVal =
+ CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI);
+ LI->replaceAllUsesWith(AvailableVal);
+ LI->eraseFromParent();
+ return true;
+ }
+
+ // Otherwise, if we scanned the whole block and got to the top of the block,
+ // we know the block is locally transparent to the load. If not, something
+ // might clobber its value.
+ if (BBIt != LoadBB->begin())
+ return false;
+
+ // If all of the loads and stores that feed the value have the same AA tags,
+ // then we can propagate them onto any newly inserted loads.
+ AAMDNodes AATags;
+ LI->getAAMetadata(AATags);
+
+ SmallPtrSet<BasicBlock*, 8> PredsScanned;
+
+ using AvailablePredsTy = SmallVector<std::pair<BasicBlock *, Value *>, 8>;
+
+ AvailablePredsTy AvailablePreds;
+ BasicBlock *OneUnavailablePred = nullptr;
+ SmallVector<LoadInst*, 8> CSELoads;
+
+ // If we got here, the loaded value is transparent through to the start of the
+ // block. Check to see if it is available in any of the predecessor blocks.
+ for (BasicBlock *PredBB : predecessors(LoadBB)) {
+ // If we already scanned this predecessor, skip it.
+ if (!PredsScanned.insert(PredBB).second)
+ continue;
+
+ BBIt = PredBB->end();
+ unsigned NumScanedInst = 0;
+ Value *PredAvailable = nullptr;
+ // NOTE: We don't CSE loads that are volatile or anything stronger than
+ // unordered; that should have been checked when we entered the function.
+ assert(LI->isUnordered() && "Attempting to CSE volatile or atomic loads");
+ // If this is a load on a phi pointer, phi-translate it and search
+ // for available load/store to the pointer in predecessors.
+ Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LI->getType(), LI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan,
+ AA, &IsLoadCSE, &NumScanedInst);
+
+ // If PredBB has a single predecessor, continue scanning through the
+ // single predecessor.
+ BasicBlock *SinglePredBB = PredBB;
+ while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
+ NumScanedInst < DefMaxInstsToScan) {
+ SinglePredBB = SinglePredBB->getSinglePredecessor();
+ if (SinglePredBB) {
+ BBIt = SinglePredBB->end();
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LI->getType(), LI->isAtomic(), SinglePredBB, BBIt,
+ (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
+ &NumScanedInst);
+ }
+ }
+
+ if (!PredAvailable) {
+ OneUnavailablePred = PredBB;
+ continue;
+ }
+
+ if (IsLoadCSE)
+ CSELoads.push_back(cast<LoadInst>(PredAvailable));
+
+ // If so, this load is partially redundant. Remember this info so that we
+ // can create a PHI node.
+ AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable));
+ }
+
+ // If the loaded value isn't available in any predecessor, it isn't partially
+ // redundant.
+ if (AvailablePreds.empty()) return false;
+
+ // Okay, the loaded value is available in at least one (and maybe all!)
+ // predecessors. If the value is unavailable in more than one unique
+ // predecessor, we want to insert a merge block for those common predecessors.
+ // This ensures that we only have to insert one reload, thus not increasing
+ // code size.
+ BasicBlock *UnavailablePred = nullptr;
+
+ // If the value is unavailable in some of the predecessors, we will end up
+ // inserting a new instruction into them. That is only valid if all the
+ // instructions before LI are guaranteed to transfer execution to their
+ // successor, or if LI is safe to speculate.
+ // TODO: If this logic becomes more complex and we perform PRE insertion
+ // farther than a predecessor, we need to reuse the code from GVN's PRE.
+ // That requires dominator tree analysis, so for this simple case it is
+ // overkill.
+ if (PredsScanned.size() != AvailablePreds.size() &&
+ !isSafeToSpeculativelyExecute(LI))
+ for (auto I = LoadBB->begin(); &*I != LI; ++I)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*I))
+ return false;
+
+ // If there is exactly one predecessor where the value is unavailable, the
+ // already computed 'OneUnavailablePred' block is it. If it ends in an
+ // unconditional branch, we know that it isn't a critical edge.
+ if (PredsScanned.size() == AvailablePreds.size()+1 &&
+ OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
+ UnavailablePred = OneUnavailablePred;
+ } else if (PredsScanned.size() != AvailablePreds.size()) {
+ // Otherwise, we had multiple unavailable predecessors or we had a critical
+ // edge from the one unavailable predecessor.
+ SmallVector<BasicBlock*, 8> PredsToSplit;
+ SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
+
+ for (const auto &AvailablePred : AvailablePreds)
+ AvailablePredSet.insert(AvailablePred.first);
+
+ // Add all the unavailable predecessors to the PredsToSplit list.
+ for (BasicBlock *P : predecessors(LoadBB)) {
+ // If the predecessor is an indirect goto, we can't split the edge.
+ if (isa<IndirectBrInst>(P->getTerminator()))
+ return false;
+
+ if (!AvailablePredSet.count(P))
+ PredsToSplit.push_back(P);
+ }
+
+ // Split them out to their own block.
+ UnavailablePred = SplitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split");
+ }
+
+ // If the value isn't available in all predecessors, then there will be
+ // exactly one where it isn't available. Insert a load on that edge and add
+ // it to the AvailablePreds list.
+ if (UnavailablePred) {
+ assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
+ "Can't handle critical edge here!");
+ LoadInst *NewVal = new LoadInst(
+ LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
+ LI->getName() + ".pr", false, LI->getAlignment(), LI->getOrdering(),
+ LI->getSyncScopeID(), UnavailablePred->getTerminator());
+ NewVal->setDebugLoc(LI->getDebugLoc());
+ if (AATags)
+ NewVal->setAAMetadata(AATags);
+
+ AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
+ }
+
+  // Now we know that each predecessor of this block has a value in
+  // AvailablePreds; sort them for efficient access as we walk the preds.
+ array_pod_sort(AvailablePreds.begin(), AvailablePreds.end());
+
+ // Create a PHI node at the start of the block for the PRE'd load value.
+ pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
+ PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "",
+ &LoadBB->front());
+ PN->takeName(LI);
+ PN->setDebugLoc(LI->getDebugLoc());
+
+ // Insert new entries into the PHI for each predecessor. A single block may
+ // have multiple entries here.
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ AvailablePredsTy::iterator I =
+ std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(),
+ std::make_pair(P, (Value*)nullptr));
+
+ assert(I != AvailablePreds.end() && I->first == P &&
+ "Didn't find entry for predecessor!");
+
+ // If we have an available predecessor but it requires casting, insert the
+ // cast in the predecessor and use the cast. Note that we have to update the
+ // AvailablePreds vector as we go so that all of the PHI entries for this
+ // predecessor use the same bitcast.
+ Value *&PredV = I->second;
+ if (PredV->getType() != LI->getType())
+ PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "",
+ P->getTerminator());
+
+ PN->addIncoming(PredV, I->first);
+ }
+
+ for (LoadInst *PredLI : CSELoads) {
+ combineMetadataForCSE(PredLI, LI);
+ }
+
+ LI->replaceAllUsesWith(PN);
+ LI->eraseFromParent();
+
+ return true;
+}
+
+/// FindMostPopularDest - The specified list contains multiple possible
+/// threadable destinations. Pick the one that occurs the most frequently in
+/// the list.
+static BasicBlock *
+FindMostPopularDest(BasicBlock *BB,
+ const SmallVectorImpl<std::pair<BasicBlock *,
+ BasicBlock *>> &PredToDestList) {
+ assert(!PredToDestList.empty());
+
+ // Determine popularity. If there are multiple possible destinations, we
+ // explicitly choose to ignore 'undef' destinations. We prefer to thread
+ // blocks with known and real destinations to threading undef. We'll handle
+ // them later if interesting.
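+  // For example (illustrative): with PredToDestList = {(P1, D1), (P2, D1),
+  // (P3, D2), (P4, null)}, the undef entry (P4, null) is ignored and D1 is
+  // picked with a popularity of 2.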
+ DenseMap<BasicBlock*, unsigned> DestPopularity;
+ for (const auto &PredToDest : PredToDestList)
+ if (PredToDest.second)
+ DestPopularity[PredToDest.second]++;
+
+ // Find the most popular dest.
+ DenseMap<BasicBlock*, unsigned>::iterator DPI = DestPopularity.begin();
+ BasicBlock *MostPopularDest = DPI->first;
+ unsigned Popularity = DPI->second;
+ SmallVector<BasicBlock*, 4> SamePopularity;
+
+ for (++DPI; DPI != DestPopularity.end(); ++DPI) {
+ // If the popularity of this entry isn't higher than the popularity we've
+ // seen so far, ignore it.
+ if (DPI->second < Popularity)
+ ; // ignore.
+ else if (DPI->second == Popularity) {
+ // If it is the same as what we've seen so far, keep track of it.
+ SamePopularity.push_back(DPI->first);
+ } else {
+ // If it is more popular, remember it.
+ SamePopularity.clear();
+ MostPopularDest = DPI->first;
+ Popularity = DPI->second;
+ }
+ }
+
+  // Okay, now we know the most popular destination. If several destinations
+  // tie for the highest popularity, we need to pick one. The choice is
+  // arbitrary, but it must be deterministic, so pick the first one that
+  // appears in the successor list.
+ if (!SamePopularity.empty()) {
+ SamePopularity.push_back(MostPopularDest);
+ TerminatorInst *TI = BB->getTerminator();
+ for (unsigned i = 0; ; ++i) {
+ assert(i != TI->getNumSuccessors() && "Didn't find any successor!");
+
+ if (!is_contained(SamePopularity, TI->getSuccessor(i)))
+ continue;
+
+ MostPopularDest = TI->getSuccessor(i);
+ break;
+ }
+ }
+
+ // Okay, we have finally picked the most popular destination.
+ return MostPopularDest;
+}
+
+bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
+ ConstantPreference Preference,
+ Instruction *CxtI) {
+ // If threading this would thread across a loop header, don't even try to
+ // thread the edge.
+ if (LoopHeaders.count(BB))
+ return false;
+
+ PredValueInfoTy PredValues;
+ if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference, CxtI))
+ return false;
+
+ assert(!PredValues.empty() &&
+ "ComputeValueKnownInPredecessors returned true with no values");
+
+ DEBUG(dbgs() << "IN BB: " << *BB;
+ for (const auto &PredValue : PredValues) {
+ dbgs() << " BB '" << BB->getName() << "': FOUND condition = "
+ << *PredValue.first
+ << " for pred '" << PredValue.second->getName() << "'.\n";
+ });
+
+ // Decide what we want to thread through. Convert our list of known values to
+ // a list of known destinations for each pred. This also discards duplicate
+ // predecessors and keeps track of the undefined inputs (which are represented
+ // as a null dest in the PredToDestList).
+ SmallPtrSet<BasicBlock*, 16> SeenPreds;
+ SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList;
+
+ BasicBlock *OnlyDest = nullptr;
+ BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
+ Constant *OnlyVal = nullptr;
+ Constant *MultipleVal = (Constant *)(intptr_t)~0ULL;
+
+ unsigned PredWithKnownDest = 0;
+ for (const auto &PredValue : PredValues) {
+ BasicBlock *Pred = PredValue.second;
+ if (!SeenPreds.insert(Pred).second)
+ continue; // Duplicate predecessor entry.
+
+ Constant *Val = PredValue.first;
+
+ BasicBlock *DestBB;
+ if (isa<UndefValue>(Val))
+ DestBB = nullptr;
+ else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+ assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
+ DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
+ DestBB = SI->findCaseValue(cast<ConstantInt>(Val))->getCaseSuccessor();
+ } else {
+ assert(isa<IndirectBrInst>(BB->getTerminator())
+ && "Unexpected terminator");
+ assert(isa<BlockAddress>(Val) && "Expecting a constant blockaddress");
+ DestBB = cast<BlockAddress>(Val)->getBasicBlock();
+ }
+
+ // If we have exactly one destination, remember it for efficiency below.
+ if (PredToDestList.empty()) {
+ OnlyDest = DestBB;
+ OnlyVal = Val;
+ } else {
+ if (OnlyDest != DestBB)
+ OnlyDest = MultipleDestSentinel;
+      // It is possible we have the same destination but a different value,
+      // e.g. the default case of a SwitchInst.
+ if (Val != OnlyVal)
+ OnlyVal = MultipleVal;
+ }
+
+ // We know where this predecessor is going.
+ ++PredWithKnownDest;
+
+ // If the predecessor ends with an indirect goto, we can't change its
+ // destination.
+ if (isa<IndirectBrInst>(Pred->getTerminator()))
+ continue;
+
+ PredToDestList.push_back(std::make_pair(Pred, DestBB));
+ }
+
+ // If all edges were unthreadable, we fail.
+ if (PredToDestList.empty())
+ return false;
+
+  // If all the predecessors go to a single known successor, we want to fold,
+  // not thread. By doing so, we do not need to duplicate the current block and
+  // we do not miss potential opportunities in case we don't/can't duplicate.
+ if (OnlyDest && OnlyDest != MultipleDestSentinel) {
+ if (PredWithKnownDest ==
+ (size_t)std::distance(pred_begin(BB), pred_end(BB))) {
+ bool SeenFirstBranchToOnlyDest = false;
+ for (BasicBlock *SuccBB : successors(BB)) {
+ if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest)
+ SeenFirstBranchToOnlyDest = true; // Don't modify the first branch.
+ else
+ SuccBB->removePredecessor(BB, true); // This is unreachable successor.
+ }
+
+ // Finally update the terminator.
+ TerminatorInst *Term = BB->getTerminator();
+ BranchInst::Create(OnlyDest, Term);
+ Term->eraseFromParent();
+
+ // If the condition is now dead due to the removal of the old terminator,
+ // erase it.
+ if (auto *CondInst = dyn_cast<Instruction>(Cond)) {
+ if (CondInst->use_empty() && !CondInst->mayHaveSideEffects())
+ CondInst->eraseFromParent();
+        // We can safely replace *some* uses of the CondInst if it has
+        // exactly one value as returned by LVI. RAUW is incorrect in the
+        // presence of guards and assumes that have `Cond` as a use. This is
+        // because we use the guards/assumes to reason about the `Cond` value
+        // at the end of the block, but RAUW unconditionally replaces all uses,
+        // including the guards/assumes themselves and the uses before the
+        // guard/assume.
+ else if (OnlyVal && OnlyVal != MultipleVal &&
+ CondInst->getParent() == BB)
+ ReplaceFoldableUses(CondInst, OnlyVal);
+ }
+ return true;
+ }
+ }
+
+ // Determine which is the most common successor. If we have many inputs and
+ // this block is a switch, we want to start by threading the batch that goes
+ // to the most popular destination first. If we only know about one
+ // threadable destination (the common case) we can avoid this.
+ BasicBlock *MostPopularDest = OnlyDest;
+
+ if (MostPopularDest == MultipleDestSentinel)
+ MostPopularDest = FindMostPopularDest(BB, PredToDestList);
+
+ // Now that we know what the most popular destination is, factor all
+ // predecessors that will jump to it into a single predecessor.
+ SmallVector<BasicBlock*, 16> PredsToFactor;
+ for (const auto &PredToDest : PredToDestList)
+ if (PredToDest.second == MostPopularDest) {
+ BasicBlock *Pred = PredToDest.first;
+
+ // This predecessor may be a switch or something else that has multiple
+ // edges to the block. Factor each of these edges by listing them
+ // according to # occurrences in PredsToFactor.
+ for (BasicBlock *Succ : successors(Pred))
+ if (Succ == BB)
+ PredsToFactor.push_back(Pred);
+ }
+
+ // If the threadable edges are branching on an undefined value, we get to pick
+ // the destination that these predecessors should get to.
+ if (!MostPopularDest)
+ MostPopularDest = BB->getTerminator()->
+ getSuccessor(GetBestDestForJumpOnUndef(BB));
+
+ // Ok, try to thread it!
+ return ThreadEdge(BB, PredsToFactor, MostPopularDest);
+}
+
+/// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on
+/// a PHI node in the current block. See if there are any simplifications we
+/// can do based on inputs to the phi node.
+bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) {
+ BasicBlock *BB = PN->getParent();
+
+ // TODO: We could make use of this to do it once for blocks with common PHI
+ // values.
+ SmallVector<BasicBlock*, 1> PredBBs;
+ PredBBs.resize(1);
+
+ // If any of the predecessor blocks end in an unconditional branch, we can
+ // *duplicate* the conditional branch into that block in order to further
+ // encourage jump threading and to eliminate cases where we have branch on a
+ // phi of an icmp (branch on icmp is much better).
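+  // For example (hypothetical IR): if BB is
+  //   %p = phi i1 [ true, %pred1 ], [ %x, %pred2 ]
+  //   br i1 %p, label %T, label %F
+  // and %pred1 ends in an unconditional branch, duplicating BB into %pred1
+  // lets the cloned branch fold on the constant true, so %pred1 effectively
+  // jumps straight to %T.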
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *PredBB = PN->getIncomingBlock(i);
+ if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()))
+ if (PredBr->isUnconditional()) {
+ PredBBs[0] = PredBB;
+ // Try to duplicate BB into PredBB.
+ if (DuplicateCondBranchOnPHIIntoPred(BB, PredBBs))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// ProcessBranchOnXOR - We have an otherwise unthreadable conditional branch on
+/// a xor instruction in the current block. See if there are any
+/// simplifications we can do based on inputs to the xor.
+bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) {
+ BasicBlock *BB = BO->getParent();
+
+ // If either the LHS or RHS of the xor is a constant, don't do this
+ // optimization.
+ if (isa<ConstantInt>(BO->getOperand(0)) ||
+ isa<ConstantInt>(BO->getOperand(1)))
+ return false;
+
+ // If the first instruction in BB isn't a phi, we won't be able to infer
+ // anything special about any particular predecessor.
+ if (!isa<PHINode>(BB->front()))
+ return false;
+
+ // If this BB is a landing pad, we won't be able to split the edge into it.
+ if (BB->isEHPad())
+ return false;
+
+ // If we have a xor as the branch input to this block, and we know that the
+ // LHS or RHS of the xor in any predecessor is true/false, then we can clone
+ // the condition into the predecessor and fix that value to true, saving some
+ // logical ops on that path and encouraging other paths to simplify.
+ //
+ // This copies something like this:
+ //
+ // BB:
+ // %X = phi i1 [1], [%X']
+ // %Y = icmp eq i32 %A, %B
+ // %Z = xor i1 %X, %Y
+ // br i1 %Z, ...
+ //
+ // Into:
+ // BB':
+ // %Y = icmp ne i32 %A, %B
+ // br i1 %Y, ...
+
+ PredValueInfoTy XorOpValues;
+ bool isLHS = true;
+ if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues,
+ WantInteger, BO)) {
+ assert(XorOpValues.empty());
+ if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues,
+ WantInteger, BO))
+ return false;
+ isLHS = false;
+ }
+
+ assert(!XorOpValues.empty() &&
+ "ComputeValueKnownInPredecessors returned true with no values");
+
+ // Scan the information to see which is most popular: true or false. The
+ // predecessors can be of the set true, false, or undef.
+ unsigned NumTrue = 0, NumFalse = 0;
+ for (const auto &XorOpValue : XorOpValues) {
+ if (isa<UndefValue>(XorOpValue.first))
+ // Ignore undefs for the count.
+ continue;
+ if (cast<ConstantInt>(XorOpValue.first)->isZero())
+ ++NumFalse;
+ else
+ ++NumTrue;
+ }
+
+ // Determine which value to split on, true, false, or undef if neither.
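+  // For example (illustrative): if the predecessors provide {1, 1, 0, undef},
+  // then NumTrue = 2 and NumFalse = 1, so we split on true.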
+ ConstantInt *SplitVal = nullptr;
+ if (NumTrue > NumFalse)
+ SplitVal = ConstantInt::getTrue(BB->getContext());
+ else if (NumTrue != 0 || NumFalse != 0)
+ SplitVal = ConstantInt::getFalse(BB->getContext());
+
+ // Collect all of the blocks that this can be folded into so that we can
+ // factor this once and clone it once.
+ SmallVector<BasicBlock*, 8> BlocksToFoldInto;
+ for (const auto &XorOpValue : XorOpValues) {
+ if (XorOpValue.first != SplitVal && !isa<UndefValue>(XorOpValue.first))
+ continue;
+
+ BlocksToFoldInto.push_back(XorOpValue.second);
+ }
+
+ // If we inferred a value for all of the predecessors, then duplication won't
+ // help us. However, we can just replace the LHS or RHS with the constant.
+ if (BlocksToFoldInto.size() ==
+ cast<PHINode>(BB->front()).getNumIncomingValues()) {
+ if (!SplitVal) {
+ // If all preds provide undef, just nuke the xor, because it is undef too.
+ BO->replaceAllUsesWith(UndefValue::get(BO->getType()));
+ BO->eraseFromParent();
+ } else if (SplitVal->isZero()) {
+ // If all preds provide 0, replace the xor with the other input.
+ BO->replaceAllUsesWith(BO->getOperand(isLHS));
+ BO->eraseFromParent();
+ } else {
+ // If all preds provide 1, set the computed value to 1.
+ BO->setOperand(!isLHS, SplitVal);
+ }
+
+ return true;
+ }
+
+ // Try to duplicate BB into PredBB.
+ return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);
+}
+
+/// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new
+/// predecessor to the PHIBB block. If it has PHI nodes, add entries for
+/// NewPred using the entries from OldPred (suitably mapped).
+static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
+ BasicBlock *OldPred,
+ BasicBlock *NewPred,
+ DenseMap<Instruction*, Value*> &ValueMap) {
+ for (PHINode &PN : PHIBB->phis()) {
+ // Ok, we have a PHI node. Figure out what the incoming value was for the
+ // DestBlock.
+ Value *IV = PN.getIncomingValueForBlock(OldPred);
+
+ // Remap the value if necessary.
+ if (Instruction *Inst = dyn_cast<Instruction>(IV)) {
+ DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst);
+ if (I != ValueMap.end())
+ IV = I->second;
+ }
+
+ PN.addIncoming(IV, NewPred);
+ }
+}
+
+/// ThreadEdge - We have decided that it is safe and profitable to factor the
+/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
+/// across BB. Transform the IR to reflect this change.
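+///
+/// Schematically (illustrative): the blocks in PredBBs are first merged into a
+/// single predecessor (if there is more than one), BB is cloned into a new
+/// block whose terminator is an unconditional branch to SuccBB, and the merged
+/// predecessor's terminator is rewired to jump to the clone instead of BB. The
+/// original BB is kept for the remaining predecessors.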
+bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
+ const SmallVectorImpl<BasicBlock *> &PredBBs,
+ BasicBlock *SuccBB) {
+ // If threading to the same block as we come from, we would infinite loop.
+ if (SuccBB == BB) {
+ DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
+ << "' - would thread to self!\n");
+ return false;
+ }
+
+ // If threading this would thread across a loop header, don't thread the edge.
+ // See the comments above FindLoopHeaders for justifications and caveats.
+ if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
+ DEBUG({
+ bool BBIsHeader = LoopHeaders.count(BB);
+ bool SuccIsHeader = LoopHeaders.count(SuccBB);
+ dbgs() << " Not threading across "
+ << (BBIsHeader ? "loop header BB '" : "block BB '") << BB->getName()
+ << "' to dest " << (SuccIsHeader ? "loop header BB '" : "block BB '")
+ << SuccBB->getName() << "' - it might create an irreducible loop!\n";
+ });
+ return false;
+ }
+
+ unsigned JumpThreadCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
+ if (JumpThreadCost > BBDupThreshold) {
+ DEBUG(dbgs() << " Not threading BB '" << BB->getName()
+ << "' - Cost is too high: " << JumpThreadCost << "\n");
+ return false;
+ }
+
+ // And finally, do it! Start by factoring the predecessors if needed.
+ BasicBlock *PredBB;
+ if (PredBBs.size() == 1)
+ PredBB = PredBBs[0];
+ else {
+ DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
+ PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
+ }
+
+ // And finally, do it!
+ DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '"
+ << SuccBB->getName() << "' with cost: " << JumpThreadCost
+ << ", across block:\n "
+ << *BB << "\n");
+
+ LVI->threadEdge(PredBB, BB, SuccBB);
+
+  // We are going to have to map operands from the original block BB to the new
+  // copy of the block, 'NewBB'. If there are PHI nodes in BB, evaluate them to
+  // account for entry from PredBB.
+ DenseMap<Instruction*, Value*> ValueMapping;
+
+ BasicBlock *NewBB = BasicBlock::Create(BB->getContext(),
+ BB->getName()+".thread",
+ BB->getParent(), BB);
+ NewBB->moveAfter(PredBB);
+
+ // Set the block frequency of NewBB.
+ if (HasProfileData) {
+ auto NewBBFreq =
+ BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB);
+ BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+ }
+
+ BasicBlock::iterator BI = BB->begin();
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+ // Clone the non-phi instructions of BB into NewBB, keeping track of the
+ // mapping and using it to remap operands in the cloned instructions.
+ for (; !isa<TerminatorInst>(BI); ++BI) {
+ Instruction *New = BI->clone();
+ New->setName(BI->getName());
+ NewBB->getInstList().push_back(New);
+ ValueMapping[&*BI] = New;
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+ DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+ if (I != ValueMapping.end())
+ New->setOperand(i, I->second);
+ }
+ }
+
+ // We didn't copy the terminator from BB over to NewBB, because there is now
+ // an unconditional jump to SuccBB. Insert the unconditional jump.
+ BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB);
+ NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());
+
+ // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
+ // PHI nodes for NewBB now.
+ AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping);
+
+  // If there were values defined in BB that are used outside the block, then
+  // we now have to update all uses of the value to use either the original
+  // value, the cloned value, or some PHI derived value. This can require
+  // arbitrary PHI insertion, which we are prepared to do; clean these up now.
+ SSAUpdater SSAUpdate;
+ SmallVector<Use*, 16> UsesToRename;
+ for (Instruction &I : *BB) {
+ // Scan all uses of this instruction to see if it is used outside of its
+ // block, and if so, record them in UsesToRename.
+ for (Use &U : I.uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+ if (UserPN->getIncomingBlock(U) == BB)
+ continue;
+ } else if (User->getParent() == BB)
+ continue;
+
+ UsesToRename.push_back(&U);
+ }
+
+ // If there are no uses outside the block, we're done with this instruction.
+ if (UsesToRename.empty())
+ continue;
+
+ DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+
+    // We found a use of I outside of BB. Rename all uses of I that are outside
+    // its block to be uses of the appropriate PHI node etc. Seed the
+    // SSAUpdater with the two values we know: the original I in BB and its
+    // clone in NewBB.
+ SSAUpdate.Initialize(I.getType(), I.getName());
+ SSAUpdate.AddAvailableValue(BB, &I);
+ SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]);
+
+ while (!UsesToRename.empty())
+ SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
+ DEBUG(dbgs() << "\n");
+ }
+
+ // Ok, NewBB is good to go. Update the terminator of PredBB to jump to
+ // NewBB instead of BB. This eliminates predecessors from BB, which requires
+ // us to simplify any PHI nodes in BB.
+ TerminatorInst *PredTerm = PredBB->getTerminator();
+ for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
+ if (PredTerm->getSuccessor(i) == BB) {
+ BB->removePredecessor(PredBB, true);
+ PredTerm->setSuccessor(i, NewBB);
+ }
+
+ // At this point, the IR is fully up to date and consistent. Do a quick scan
+ // over the new instructions and zap any that are constants or dead. This
+ // frequently happens because of phi translation.
+ SimplifyInstructionsInBlock(NewBB, TLI);
+
+ // Update the edge weight from BB to SuccBB, which should be less than before.
+ UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB);
+
+ // Threaded an edge!
+ ++NumThreads;
+ return true;
+}
+
+/// Create a new basic block that will be the predecessor of BB and successor of
+/// all blocks in Preds. When profile data is available, update the frequency of
+/// this new block.
+BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB,
+ ArrayRef<BasicBlock *> Preds,
+ const char *Suffix) {
+ // Collect the frequencies of all predecessors of BB, which will be used to
+ // update the edge weight on BB->SuccBB.
+ BlockFrequency PredBBFreq(0);
+ if (HasProfileData)
+ for (auto Pred : Preds)
+ PredBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB);
+
+ BasicBlock *PredBB = SplitBlockPredecessors(BB, Preds, Suffix);
+
+ // Set the block frequency of the newly created PredBB, which is the sum of
+ // frequencies of Preds.
+ if (HasProfileData)
+ BFI->setBlockFreq(PredBB, PredBBFreq.getFrequency());
+ return PredBB;
+}
+
+bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
+ const TerminatorInst *TI = BB->getTerminator();
+ assert(TI->getNumSuccessors() > 1 && "not a split");
+
+ MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
+ if (!WeightsNode)
+ return false;
+
+ MDString *MDName = cast<MDString>(WeightsNode->getOperand(0));
+ if (MDName->getString() != "branch_weights")
+ return false;
+
+ // Ensure there are weights for all of the successors. Note that the first
+ // operand to the metadata node is a name, not a weight.
+ return WeightsNode->getNumOperands() == TI->getNumSuccessors() + 1;
+}
+
+/// Update the block frequency of BB and branch weight and the metadata on the
+/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 -
+/// Freq(PredBB->BB) / Freq(BB->SuccBB).
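+/// For example (illustrative numbers): if Freq(BB) = 100, Freq(PredBB->BB) =
+/// 30 and Freq(BB->SuccBB) = 60, then after threading Freq(NewBB) = 30, BB's
+/// frequency drops to 100 - 30 = 70, and the BB->SuccBB edge frequency becomes
+/// 60 - 30 = 30 before the successor probabilities are re-normalized.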
+void JumpThreadingPass::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
+ BasicBlock *BB,
+ BasicBlock *NewBB,
+ BasicBlock *SuccBB) {
+ if (!HasProfileData)
+ return;
+
+ assert(BFI && BPI && "BFI & BPI should have been created here");
+
+ // As the edge from PredBB to BB is deleted, we have to update the block
+ // frequency of BB.
+ auto BBOrigFreq = BFI->getBlockFreq(BB);
+ auto NewBBFreq = BFI->getBlockFreq(NewBB);
+ auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB);
+ auto BBNewFreq = BBOrigFreq - NewBBFreq;
+ BFI->setBlockFreq(BB, BBNewFreq.getFrequency());
+
+ // Collect updated outgoing edges' frequencies from BB and use them to update
+ // edge probabilities.
+ SmallVector<uint64_t, 4> BBSuccFreq;
+ for (BasicBlock *Succ : successors(BB)) {
+ auto SuccFreq = (Succ == SuccBB)
+ ? BB2SuccBBFreq - NewBBFreq
+ : BBOrigFreq * BPI->getEdgeProbability(BB, Succ);
+ BBSuccFreq.push_back(SuccFreq.getFrequency());
+ }
+
+ uint64_t MaxBBSuccFreq =
+ *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end());
+
+ SmallVector<BranchProbability, 4> BBSuccProbs;
+ if (MaxBBSuccFreq == 0)
+ BBSuccProbs.assign(BBSuccFreq.size(),
+ {1, static_cast<unsigned>(BBSuccFreq.size())});
+ else {
+ for (uint64_t Freq : BBSuccFreq)
+ BBSuccProbs.push_back(
+ BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq));
+ // Normalize edge probabilities so that they sum up to one.
+ BranchProbability::normalizeProbabilities(BBSuccProbs.begin(),
+ BBSuccProbs.end());
+ }
+
+ // Update edge probabilities in BPI.
+ for (int I = 0, E = BBSuccProbs.size(); I < E; I++)
+ BPI->setEdgeProbability(BB, I, BBSuccProbs[I]);
+
+ // Update the profile metadata as well.
+ //
+ // Don't do this if the profile of the transformed blocks was statically
+ // estimated. (This could occur despite the function having an entry
+ // frequency in completely cold parts of the CFG.)
+ //
+ // In this case we don't want to suggest to subsequent passes that the
+ // calculated weights are fully consistent. Consider this graph:
+ //
+ // check_1
+ // 50% / |
+ // eq_1 | 50%
+ // \ |
+ // check_2
+ // 50% / |
+ // eq_2 | 50%
+ // \ |
+ // check_3
+ // 50% / |
+ // eq_3 | 50%
+ // \ |
+ //
+ // Assuming the blocks check_* all compare the same value against 1, 2 and 3,
+ // the overall probabilities are inconsistent; the total probability that the
+ // value is either 1, 2 or 3 is 150%.
+ //
+ // As a consequence if we thread eq_1 -> check_2 to check_3, check_2->check_3
+ // becomes 0%. This is even worse if the edge whose probability becomes 0% is
+ // the loop exit edge. Then based solely on static estimation we would assume
+ // the loop was extremely hot.
+ //
+  // FIXME: Fix this locally as well so that BPI and BFI stay consistent. We
+  // shouldn't make edges extremely likely or unlikely based solely on static
+  // estimation.
+ if (BBSuccProbs.size() >= 2 && doesBlockHaveProfileData(BB)) {
+ SmallVector<uint32_t, 4> Weights;
+ for (auto Prob : BBSuccProbs)
+ Weights.push_back(Prob.getNumerator());
+
+ auto TI = BB->getTerminator();
+ TI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights));
+ }
+}
+
+/// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch
+/// to BB which contains an i1 PHI node and a conditional branch on that PHI.
+/// If we can duplicate the contents of BB up into PredBB, do so now. This
+/// improves the odds that the branch will be on an analyzable instruction like
+/// a compare.
+bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
+ BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs) {
+ assert(!PredBBs.empty() && "Can't handle an empty set");
+
+ // If BB is a loop header, then duplicating this block outside the loop would
+ // cause us to transform this into an irreducible loop, don't do this.
+ // See the comments above FindLoopHeaders for justifications and caveats.
+ if (LoopHeaders.count(BB)) {
+ DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
+ << "' into predecessor block '" << PredBBs[0]->getName()
+ << "' - it might create an irreducible loop!\n");
+ return false;
+ }
+
+ unsigned DuplicationCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
+ if (DuplicationCost > BBDupThreshold) {
+ DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
+ << "' - Cost is too high: " << DuplicationCost << "\n");
+ return false;
+ }
+
+ // And finally, do it! Start by factoring the predecessors if needed.
+ BasicBlock *PredBB;
+ if (PredBBs.size() == 1)
+ PredBB = PredBBs[0];
+ else {
+ DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
+ PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
+ }
+
+ // Okay, we decided to do this! Clone all the instructions in BB onto the end
+ // of PredBB.
+ DEBUG(dbgs() << " Duplicating block '" << BB->getName() << "' into end of '"
+ << PredBB->getName() << "' to eliminate branch on phi. Cost: "
+ << DuplicationCost << " block is:" << *BB << "\n");
+
+ // Unless PredBB ends with an unconditional branch, split the edge so that we
+ // can just clone the bits from BB into the end of the new PredBB.
+ BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
+
+ if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
+ PredBB = SplitEdge(PredBB, BB);
+ OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
+ }
+
+  // We are going to have to map operands from the original block BB into the
+  // PredBB block. Evaluate PHI nodes in BB.
+ DenseMap<Instruction*, Value*> ValueMapping;
+
+ BasicBlock::iterator BI = BB->begin();
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+ // Clone the non-phi instructions of BB into PredBB, keeping track of the
+ // mapping and using it to remap operands in the cloned instructions.
+ for (; BI != BB->end(); ++BI) {
+ Instruction *New = BI->clone();
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+ DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+ if (I != ValueMapping.end())
+ New->setOperand(i, I->second);
+ }
+
+ // If this instruction can be simplified after the operands are updated,
+ // just use the simplified value instead. This frequently happens due to
+ // phi translation.
+ if (Value *IV = SimplifyInstruction(
+ New,
+ {BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) {
+ ValueMapping[&*BI] = IV;
+ if (!New->mayHaveSideEffects()) {
+ New->deleteValue();
+ New = nullptr;
+ }
+ } else {
+ ValueMapping[&*BI] = New;
+ }
+ if (New) {
+ // Otherwise, insert the new instruction into the block.
+ New->setName(BI->getName());
+ PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
+ }
+ }
+
+ // Check to see if the targets of the branch had PHI nodes. If so, we need to
+  // add entries to the PHI nodes for the branch from PredBB now.
+ BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator());
+ AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB,
+ ValueMapping);
+ AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB,
+ ValueMapping);
+
+  // If there were values defined in BB that are used outside the block, then
+  // we now have to update all uses of the value to use either the original
+  // value, the cloned value, or some PHI derived value. This can require
+  // arbitrary PHI insertion, which we are prepared to do; clean these up now.
+ SSAUpdater SSAUpdate;
+ SmallVector<Use*, 16> UsesToRename;
+ for (Instruction &I : *BB) {
+ // Scan all uses of this instruction to see if it is used outside of its
+ // block, and if so, record them in UsesToRename.
+ for (Use &U : I.uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+ if (UserPN->getIncomingBlock(U) == BB)
+ continue;
+ } else if (User->getParent() == BB)
+ continue;
+
+ UsesToRename.push_back(&U);
+ }
+
+ // If there are no uses outside the block, we're done with this instruction.
+ if (UsesToRename.empty())
+ continue;
+
+ DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+
+    // We found a use of I outside of BB. Rename all uses of I that are outside
+    // its block to be uses of the appropriate PHI node etc. Seed the
+    // SSAUpdater with the two values we know: the original I in BB and its
+    // clone in PredBB.
+ SSAUpdate.Initialize(I.getType(), I.getName());
+ SSAUpdate.AddAvailableValue(BB, &I);
+ SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&I]);
+
+ while (!UsesToRename.empty())
+ SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
+ DEBUG(dbgs() << "\n");
+ }
+
+  // PredBB no longer jumps to BB, so remove entries in the PHI nodes for the
+  // edge that we nuked.
+ BB->removePredecessor(PredBB, true);
+
+ // Remove the unconditional branch at the end of the PredBB block.
+ OldPredBranch->eraseFromParent();
+
+ ++NumDupes;
+ return true;
+}
+
+/// TryToUnfoldSelect - Look for blocks of the form
+/// bb1:
+/// %a = select
+/// br bb2
+///
+/// bb2:
+/// %p = phi [%a, %bb1] ...
+/// %c = icmp %p
+/// br i1 %c
+///
+/// And expand the select into a branch structure if one of its arms allows %c
+/// to be folded. This later enables threading from bb1 over bb2.
+bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0));
+ Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1));
+
+ if (!CondBr || !CondBr->isConditional() || !CondLHS ||
+ CondLHS->getParent() != BB)
+ return false;
+
+ for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); I != E; ++I) {
+ BasicBlock *Pred = CondLHS->getIncomingBlock(I);
+ SelectInst *SI = dyn_cast<SelectInst>(CondLHS->getIncomingValue(I));
+
+    // Check whether one of the incoming values is a select in the
+    // corresponding predecessor.
+ if (!SI || SI->getParent() != Pred || !SI->hasOneUse())
+ continue;
+
+ BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
+ if (!PredTerm || !PredTerm->isUnconditional())
+ continue;
+
+    // Now check if one of the select values would allow us to constant fold
+    // the terminator in BB. We don't do the transform if both sides fold;
+    // those cases will be threaded in any case.
+ LazyValueInfo::Tristate LHSFolds =
+ LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
+ CondRHS, Pred, BB, CondCmp);
+ LazyValueInfo::Tristate RHSFolds =
+ LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2),
+ CondRHS, Pred, BB, CondCmp);
+ if ((LHSFolds != LazyValueInfo::Unknown ||
+ RHSFolds != LazyValueInfo::Unknown) &&
+ LHSFolds != RHSFolds) {
+ // Expand the select.
+ //
+ // Pred --
+ // | v
+ // | NewBB
+ // | |
+ // |-----
+ // v
+ // BB
+ BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold",
+ BB->getParent(), BB);
+ // Move the unconditional branch to NewBB.
+ PredTerm->removeFromParent();
+ NewBB->getInstList().insert(NewBB->end(), PredTerm);
+ // Create a conditional branch and update PHI nodes.
+ BranchInst::Create(NewBB, BB, SI->getCondition(), Pred);
+ CondLHS->setIncomingValue(I, SI->getFalseValue());
+ CondLHS->addIncoming(SI->getTrueValue(), NewBB);
+ // The select is now dead.
+ SI->eraseFromParent();
+
+ // Update any other PHI nodes in BB.
+ for (BasicBlock::iterator BI = BB->begin();
+ PHINode *Phi = dyn_cast<PHINode>(BI); ++BI)
+ if (Phi != CondLHS)
+ Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB);
+ return true;
+ }
+ }
+ return false;
+}
+
+/// TryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the
+/// same BB in the form
+/// bb:
+/// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ...
+/// %s = select %p, trueval, falseval
+///
+/// or
+///
+/// bb:
+/// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ...
+/// %c = cmp %p, 0
+/// %s = select %c, trueval, falseval
+///
+/// And expand the select into a branch structure. This later enables
+/// jump-threading over bb in this pass.
+///
+/// Using an approach similar to SimplifyCFG::FoldCondBranchOnPHI(), unfold the
+/// select if the associated PHI has at least one constant. If the unfolded
+/// select is not jump-threaded, it will be folded again in later
+/// optimizations.
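+///
+/// After unfolding (illustrative), the select is replaced by a conditional
+/// branch on its condition and a two-entry PHI merging trueval and falseval.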
+bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
+ // If threading this would thread across a loop header, don't thread the edge.
+ // See the comments above FindLoopHeaders for justifications and caveats.
+ if (LoopHeaders.count(BB))
+ return false;
+
+ for (BasicBlock::iterator BI = BB->begin();
+ PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
+ // Look for a Phi having at least one constant incoming value.
+ if (llvm::all_of(PN->incoming_values(),
+ [](Value *V) { return !isa<ConstantInt>(V); }))
+ continue;
+
+ auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) {
+      // Check if SI is in BB and uses V as its condition.
+ if (SI->getParent() != BB)
+ return false;
+ Value *Cond = SI->getCondition();
+ return (Cond && Cond == V && Cond->getType()->isIntegerTy(1));
+ };
+
+ SelectInst *SI = nullptr;
+ for (Use &U : PN->uses()) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
+        // Look for an ICmp in BB that compares PN with a constant and is the
+        // condition of a Select.
+ if (Cmp->getParent() == BB && Cmp->hasOneUse() &&
+ isa<ConstantInt>(Cmp->getOperand(1 - U.getOperandNo())))
+ if (SelectInst *SelectI = dyn_cast<SelectInst>(Cmp->user_back()))
+ if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) {
+ SI = SelectI;
+ break;
+ }
+ } else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) {
+        // Look for a Select in BB that uses PN as its condition.
+ if (isUnfoldCandidate(SelectI, U.get())) {
+ SI = SelectI;
+ break;
+ }
+ }
+ }
+
+ if (!SI)
+ continue;
+ // Expand the select.
+ TerminatorInst *Term =
+ SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
+ PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
+ NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
+ NewPN->addIncoming(SI->getFalseValue(), BB);
+ SI->replaceAllUsesWith(NewPN);
+ SI->eraseFromParent();
+ return true;
+ }
+ return false;
+}
+
+/// Try to propagate a guard from the current BB into one of its predecessors
+/// in case another branch of execution implies that the condition of this
+/// guard is always true. Currently we only process the simplest case that
+/// looks like:
+///
+/// Start:
+/// %cond = ...
+/// br i1 %cond, label %T1, label %F1
+/// T1:
+/// br label %Merge
+/// F1:
+/// br label %Merge
+/// Merge:
+/// %condGuard = ...
+/// call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+///
+/// And cond either implies condGuard or !condGuard. In this case all the
+/// instructions before the guard can be duplicated in both branches, and the
+/// guard is then threaded to one of them.
+bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
+ using namespace PatternMatch;
+
+ // We only want to deal with two predecessors.
+ BasicBlock *Pred1, *Pred2;
+ auto PI = pred_begin(BB), PE = pred_end(BB);
+ if (PI == PE)
+ return false;
+ Pred1 = *PI++;
+ if (PI == PE)
+ return false;
+ Pred2 = *PI++;
+ if (PI != PE)
+ return false;
+ if (Pred1 == Pred2)
+ return false;
+
+ // Try to thread one of the guards of the block.
+ // TODO: Look up deeper than to immediate predecessor?
+ auto *Parent = Pred1->getSinglePredecessor();
+ if (!Parent || Parent != Pred2->getSinglePredecessor())
+ return false;
+
+ if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
+ for (auto &I : *BB)
+ if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>()))
+ if (ThreadGuard(BB, cast<IntrinsicInst>(&I), BI))
+ return true;
+
+ return false;
+}
+
+/// Try to propagate the guard from BB, which is the lower block of a diamond,
+/// to one of its branches, in case the diamond's condition implies the guard's
+/// condition.
+bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
+ BranchInst *BI) {
+ assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
+ assert(BI->isConditional() && "Unconditional branch has 2 successors?");
+ Value *GuardCond = Guard->getArgOperand(0);
+ Value *BranchCond = BI->getCondition();
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *FalseDest = BI->getSuccessor(1);
+
+ auto &DL = BB->getModule()->getDataLayout();
+ bool TrueDestIsSafe = false;
+ bool FalseDestIsSafe = false;
+
+ // True dest is safe if BranchCond => GuardCond.
+ auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
+ if (Impl && *Impl)
+ TrueDestIsSafe = true;
+ else {
+ // False dest is safe if !BranchCond => GuardCond.
+ Impl = isImpliedCondition(BranchCond, GuardCond, DL, /* LHSIsTrue */ false);
+ if (Impl && *Impl)
+ FalseDestIsSafe = true;
+ }
+
+ if (!TrueDestIsSafe && !FalseDestIsSafe)
+ return false;
+
+ BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
+ BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
+
+ ValueToValueMapTy UnguardedMapping, GuardedMapping;
+ Instruction *AfterGuard = Guard->getNextNode();
+ unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
+ if (Cost > BBDupThreshold)
+ return false;
+ // Duplicate all instructions before the guard and the guard itself to the
+ // branch where implication is not proved.
+ GuardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, GuardedBlock, AfterGuard, GuardedMapping);
+ assert(GuardedBlock && "Could not create the guarded block?");
+ // Duplicate all instructions before the guard in the unguarded branch.
+ // Since we have successfully duplicated the guarded block and this block
+ // has fewer instructions, we expect it to succeed.
+ UnguardedBlock = DuplicateInstructionsInSplitBetween(BB, UnguardedBlock,
+ Guard, UnguardedMapping);
+ assert(UnguardedBlock && "Could not create the unguarded block?");
+ DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
+ << GuardedBlock->getName() << "\n");
+
+ // Some instructions before the guard may still have uses. For them, we need
+ // to create Phi nodes merging their copies in both guarded and unguarded
+ // branches. Those instructions that have no uses can be just removed.
+ SmallVector<Instruction *, 4> ToRemove;
+ for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI)
+ if (!isa<PHINode>(&*BI))
+ ToRemove.push_back(&*BI);
+
+ Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
+ assert(InsertionPoint && "Empty block?");
+ // Substitute with Phis & remove.
+ for (auto *Inst : reverse(ToRemove)) {
+ if (!Inst->use_empty()) {
+ PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
+ NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
+ NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
+ NewPN->insertBefore(InsertionPoint);
+ Inst->replaceAllUsesWith(NewPN);
+ }
+ Inst->eraseFromParent();
+ }
+ return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
new file mode 100644
index 000000000000..946474fef062
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -0,0 +1,1576 @@
+//===-- LICM.cpp - Loop Invariant Code Motion Pass ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion, attempting to remove as much
+// code from the body of a loop as possible. It does this by either hoisting
+// code into the preheader block, or by sinking code to the exit blocks if it is
+// safe. This pass also promotes must-aliased memory locations in the loop to
+// live in registers, thus hoisting and sinking "invariant" loads and stores.
+//
+// This pass uses alias analysis for two purposes:
+//
+// 1. Moving loop invariant loads and calls out of loops. If we can determine
+// that a load or call inside of a loop never aliases anything stored to,
+// we can hoist it or sink it like any other instruction.
+// 2. Scalar Promotion of Memory - If there is a store instruction inside of
+// the loop, we try to move the store to happen AFTER the loop instead of
+// inside of the loop. This can only happen if a few conditions are true:
+// A. The pointer stored through is loop invariant
+// B. There are no stores or loads in the loop which _may_ alias the
+// pointer. There are no calls in the loop which mod/ref the pointer.
+// If these conditions are true, we can promote the loads and stores in the
+// loop of the pointer to use a temporary alloca'd variable. We then use
+// the SSAUpdater to construct the appropriate SSA form for the value.
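+//
+// For illustration (hypothetical IR), a loop of the form
+//
+//   loop:
+//     %v = load i32, i32* %p
+//     %v2 = add i32 %v, 1
+//     store i32 %v2, i32* %p
+//     br i1 %cond, label %loop, label %exit
+//
+// where %p is loop invariant and not otherwise aliased can be promoted so that
+// the value is kept in a register inside the loop and a single store to %p is
+// emitted after the loop.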
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LICM.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "licm"
+
+STATISTIC(NumSunk, "Number of instructions sunk out of loop");
+STATISTIC(NumHoisted, "Number of instructions hoisted out of loop");
+STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
+STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
+STATISTIC(NumPromoted, "Number of memory locations promoted to registers");
+
+/// Memory promotion is enabled by default.
+static cl::opt<bool>
+ DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
+ cl::desc("Disable memory promotion in LICM pass"));
+
+static cl::opt<uint32_t> MaxNumUsesTraversed(
+ "licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
+ cl::desc("Max num uses visited for identifying load "
+ "invariance in loop using invariant start (default = 8)"));
+
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ TargetTransformInfo *TTI, bool &FreeInLoop);
+static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE);
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+ const Loop *CurLoop, LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE, bool FreeInLoop);
+static bool isSafeToExecuteUnconditionally(Instruction &Inst,
+ const DominatorTree *DT,
+ const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE,
+ const Instruction *CtxI = nullptr);
+static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
+ const AAMDNodes &AAInfo,
+ AliasSetTracker *CurAST);
+static Instruction *
+CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
+ const LoopInfo *LI,
+ const LoopSafetyInfo *SafetyInfo);
+
+namespace {
+struct LoopInvariantCodeMotion {
+ bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
+ TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
+ ScalarEvolution *SE, MemorySSA *MSSA,
+ OptimizationRemarkEmitter *ORE, bool DeleteAST);
+
+ DenseMap<Loop *, AliasSetTracker *> &getLoopToAliasSetMap() {
+ return LoopToAliasSetMap;
+ }
+
+private:
+ DenseMap<Loop *, AliasSetTracker *> LoopToAliasSetMap;
+
+ AliasSetTracker *collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
+ AliasAnalysis *AA);
+};
+
+struct LegacyLICMPass : public LoopPass {
+ static char ID; // Pass identification, replacement for typeid
+ LegacyLICMPass() : LoopPass(ID) {
+ initializeLegacyLICMPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L)) {
+ // If we have run LICM on a previous loop but now we are skipping
+ // (because we've hit the opt-bisect limit), we need to clear the
+ // loop alias information.
+ for (auto &LTAS : LICM.getLoopToAliasSetMap())
+ delete LTAS.second;
+ LICM.getLoopToAliasSetMap().clear();
+ return false;
+ }
+
+ auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ MemorySSA *MSSA = EnableMSSALoopDependency
+ ? (&getAnalysis<MemorySSAWrapperPass>().getMSSA())
+ : nullptr;
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
+ return LICM.runOnLoop(L,
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
+ &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent()),
+ SE ? &SE->getSE() : nullptr, MSSA, &ORE, false);
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ ///
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ if (EnableMSSALoopDependency)
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ using llvm::Pass::doFinalization;
+
+ bool doFinalization() override {
+ assert(LICM.getLoopToAliasSetMap().empty() &&
+ "Didn't free loop alias sets");
+ return false;
+ }
+
+private:
+ LoopInvariantCodeMotion LICM;
+
+ /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
+ void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To,
+ Loop *L) override;
+
+ /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
+ /// set.
+ void deleteAnalysisValue(Value *V, Loop *L) override;
+
+ /// Simple Analysis hook. Delete loop L from alias set map.
+ void deleteAnalysisLoop(Loop *L) override;
+};
+} // namespace
+
+PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &) {
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+ Function *F = L.getHeader()->getParent();
+
+ auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
+ // FIXME: This should probably be optional rather than required.
+ if (!ORE)
+ report_fatal_error("LICM: OptimizationRemarkEmitterAnalysis not "
+ "cached at a higher level");
+
+ LoopInvariantCodeMotion LICM;
+ if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.TTI, &AR.SE,
+ AR.MSSA, ORE, true))
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+char LegacyLICMPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false,
+ false)
+
+Pass *llvm::createLICMPass() { return new LegacyLICMPass(); }
+
+/// Hoist expressions out of the specified loop. Note that alias info for the
+/// inner loop is not preserved, so it is not a good idea to run LICM multiple
+/// times on one loop.
+/// We should delete the AST for inner loops in the new pass manager to avoid a
+/// memory leak.
+///
+bool LoopInvariantCodeMotion::runOnLoop(
+ Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
+ TargetLibraryInfo *TLI, TargetTransformInfo *TTI, ScalarEvolution *SE,
+ MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool DeleteAST) {
+ bool Changed = false;
+
+ assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
+
+ AliasSetTracker *CurAST = collectAliasInfoForLoop(L, LI, AA);
+
+ // Get the preheader block to move instructions into...
+ BasicBlock *Preheader = L->getLoopPreheader();
+
+ // Compute loop safety information.
+ LoopSafetyInfo SafetyInfo;
+ computeLoopSafetyInfo(&SafetyInfo, L);
+
+  // We want to visit all of the instructions in this loop that are not part
+ // of our subloops (they have already had their invariants hoisted out of
+ // their loop, into this loop, so there is no need to process the BODIES of
+ // the subloops).
+ //
+ // Traverse the body of the loop in depth first order on the dominator tree so
+ // that we are guaranteed to see definitions before we see uses. This allows
+ // us to sink instructions in one pass, without iteration. After sinking
+ // instructions, we perform another pass to hoist them out of the loop.
+ //
+ if (L->hasDedicatedExits())
+ Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,
+ CurAST, &SafetyInfo, ORE);
+ if (Preheader)
+ Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
+ CurAST, &SafetyInfo, ORE);
+
+ // Now that all loop invariants have been removed from the loop, promote any
+ // memory references to scalars that we can.
+  // Don't sink stores from loops without dedicated block exits. Exits
+  // containing indirect branches are not transformed by loop simplify, so
+  // make sure we catch that. An additional load may be generated in the
+  // preheader for the SSA updater, so also avoid sinking when no preheader
+  // is available.
+ if (!DisablePromotion && Preheader && L->hasDedicatedExits()) {
+ // Figure out the loop exits and their insertion points
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // We can't insert into a catchswitch.
+ bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) {
+ return isa<CatchSwitchInst>(Exit->getTerminator());
+ });
+
+ if (!HasCatchSwitch) {
+ SmallVector<Instruction *, 8> InsertPts;
+ InsertPts.reserve(ExitBlocks.size());
+ for (BasicBlock *ExitBlock : ExitBlocks)
+ InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+
+ PredIteratorCache PIC;
+
+ bool Promoted = false;
+
+ // Loop over all of the alias sets in the tracker object.
+ for (AliasSet &AS : *CurAST) {
+ // We can promote this alias set if it has a store, if it is a "Must"
+ // alias set, if the pointer is loop invariant, and if we are not
+ // eliminating any volatile loads or stores.
+ if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
+ AS.isVolatile() || !L->isLoopInvariant(AS.begin()->getValue()))
+ continue;
+
+ assert(
+ !AS.empty() &&
+ "Must alias set should have at least one pointer element in it!");
+
+ SmallSetVector<Value *, 8> PointerMustAliases;
+ for (const auto &ASI : AS)
+ PointerMustAliases.insert(ASI.getValue());
+
+ Promoted |= promoteLoopAccessesToScalars(PointerMustAliases, ExitBlocks,
+ InsertPts, PIC, LI, DT, TLI, L,
+ CurAST, &SafetyInfo, ORE);
+ }
+
+ // Once we have promoted values across the loop body we have to
+ // recursively reform LCSSA as any nested loop may now have values defined
+ // within the loop used in the outer loop.
+ // FIXME: This is really heavy handed. It would be a bit better to use an
+ // SSAUpdater strategy during promotion that was LCSSA aware and reformed
+ // it as it went.
+ if (Promoted)
+ formLCSSARecursively(*L, *DT, LI, SE);
+
+ Changed |= Promoted;
+ }
+ }
+
+  // Check that neither this loop nor its parent has had LCSSA broken. LICM is
+ // specifically moving instructions across the loop boundary and so it is
+ // especially in need of sanity checking here.
+ assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!");
+ assert((!L->getParentLoop() || L->getParentLoop()->isLCSSAForm(*DT)) &&
+ "Parent loop not left in LCSSA form after LICM!");
+
+ // If this loop is nested inside of another one, save the alias information
+ // for when we process the outer loop.
+ if (L->getParentLoop() && !DeleteAST)
+ LoopToAliasSetMap[L] = CurAST;
+ else
+ delete CurAST;
+
+ if (Changed && SE)
+ SE->forgetLoopDispositions(L);
+ return Changed;
+}
+
+/// Walk the specified region of the CFG (defined by all blocks dominated by
+/// the specified block, and that are in the current loop) in reverse depth
+/// first order w.r.t the DominatorTree. This allows us to visit uses before
+/// definitions, allowing us to sink a loop body in one pass without iteration.
+///
+bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
+ DominatorTree *DT, TargetLibraryInfo *TLI,
+ TargetTransformInfo *TTI, Loop *CurLoop,
+ AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
+
+ // Verify inputs.
+ assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
+ CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr &&
+ "Unexpected input to sinkRegion");
+
+  // We want to visit children before parents. We will enqueue all the parents
+ // before their children in the worklist and process the worklist in reverse
+ // order.
+ SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop);
+
+ bool Changed = false;
+ for (DomTreeNode *DTN : reverse(Worklist)) {
+ BasicBlock *BB = DTN->getBlock();
+ // Only need to process the contents of this block if it is not part of a
+ // subloop (which would already have been processed).
+ if (inSubLoop(BB, CurLoop, LI))
+ continue;
+
+ for (BasicBlock::iterator II = BB->end(); II != BB->begin();) {
+ Instruction &I = *--II;
+
+      // If the instruction is dead, we would try to sink it because it isn't
+      // used in the loop; instead, just delete it.
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
+ ++II;
+ CurAST->deleteValue(&I);
+ I.eraseFromParent();
+ Changed = true;
+ continue;
+ }
+
+ // Check to see if we can sink this instruction to the exit blocks
+      // of the loop. We can do this if all of the users of the instruction are
+ // outside of the loop. In this case, it doesn't even matter if the
+ // operands of the instruction are loop invariant.
+ //
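+      // FreeInLoop is set when the instruction is free (e.g. a GEP that folds
+      // into its users' addressing modes) but still has users inside the loop;
+      // in that case we clone it into the exit blocks and leave the original
+      // in place for the in-loop users.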
+ bool FreeInLoop = false;
+ if (isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) {
+ if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE, FreeInLoop)) {
+ if (!FreeInLoop) {
+ ++II;
+ CurAST->deleteValue(&I);
+ I.eraseFromParent();
+ }
+ Changed = true;
+ }
+ }
+ }
+ }
+ return Changed;
+}
+
+/// Walk the specified region of the CFG (defined by all blocks dominated by
+/// the specified block, and that are in the current loop) in depth first
+/// order w.r.t the DominatorTree. This allows us to visit definitions before
+/// uses, allowing us to hoist a loop body in one pass without iteration.
+///
+bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
+ DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+ AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
+ // Verify inputs.
+ assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
+ CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr &&
+ "Unexpected input to hoistRegion");
+
+  // We want to visit parents before children. We will enqueue all the parents
+ // before their children in the worklist and process the worklist in order.
+ SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop);
+
+ bool Changed = false;
+ for (DomTreeNode *DTN : Worklist) {
+ BasicBlock *BB = DTN->getBlock();
+ // Only need to process the contents of this block if it is not part of a
+ // subloop (which would already have been processed).
+ if (!inSubLoop(BB, CurLoop, LI))
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
+ Instruction &I = *II++;
+ // Try constant folding this instruction. If all the operands are
+ // constants, it is technically hoistable, but it would be better to
+ // just fold it.
+ if (Constant *C = ConstantFoldInstruction(
+ &I, I.getModule()->getDataLayout(), TLI)) {
+ DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n');
+ CurAST->copyValue(&I, C);
+ I.replaceAllUsesWith(C);
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ CurAST->deleteValue(&I);
+ I.eraseFromParent();
+ }
+ Changed = true;
+ continue;
+ }
+
+ // Attempt to remove floating point division out of the loop by
+ // converting it to a reciprocal multiplication.
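+        // Only the loop-invariant reciprocal (1.0 / divisor) gets hoisted; the
+        // multiply that replaces the division stays in the loop body.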
+ if (I.getOpcode() == Instruction::FDiv &&
+ CurLoop->isLoopInvariant(I.getOperand(1)) &&
+ I.hasAllowReciprocal()) {
+ auto Divisor = I.getOperand(1);
+ auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
+ auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
+ ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+ ReciprocalDivisor->insertBefore(&I);
+
+ auto Product =
+ BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
+ Product->setFastMathFlags(I.getFastMathFlags());
+ Product->insertAfter(&I);
+ I.replaceAllUsesWith(Product);
+ I.eraseFromParent();
+
+ hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
+ Changed = true;
+ continue;
+ }
+
+ // Try hoisting the instruction out to the preheader. We can only do
+ // this if all of the operands of the instruction are loop invariant and
+ // if it is safe to hoist the instruction.
+ //
+ if (CurLoop->hasLoopInvariantOperands(&I) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) &&
+ isSafeToExecuteUnconditionally(
+ I, DT, CurLoop, SafetyInfo, ORE,
+ CurLoop->getLoopPreheader()->getTerminator()))
+ Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE);
+ }
+ }
+
+ return Changed;
+}
+
+/// Computes loop safety information: checks the loop body and header
+/// for instructions that may throw an exception.
+///
+void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
+  assert(CurLoop != nullptr && "CurLoop can't be null");
+ BasicBlock *Header = CurLoop->getHeader();
+ // Setting default safety values.
+ SafetyInfo->MayThrow = false;
+ SafetyInfo->HeaderMayThrow = false;
+ // Iterate over header and compute safety info.
+ for (BasicBlock::iterator I = Header->begin(), E = Header->end();
+ (I != E) && !SafetyInfo->HeaderMayThrow; ++I)
+ SafetyInfo->HeaderMayThrow |=
+ !isGuaranteedToTransferExecutionToSuccessor(&*I);
+
+ SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
+ // Iterate over loop instructions and compute safety info.
+ // Skip header as it has been computed and stored in HeaderMayThrow.
+ // The first block in loopinfo.Blocks is guaranteed to be the header.
+ assert(Header == *CurLoop->getBlocks().begin() &&
+ "First block must be header");
+ for (Loop::block_iterator BB = std::next(CurLoop->block_begin()),
+ BBE = CurLoop->block_end();
+ (BB != BBE) && !SafetyInfo->MayThrow; ++BB)
+ for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
+ (I != E) && !SafetyInfo->MayThrow; ++I)
+ SafetyInfo->MayThrow |= !isGuaranteedToTransferExecutionToSuccessor(&*I);
+
+ // Compute funclet colors if we might sink/hoist in a function with a funclet
+ // personality routine.
+ Function *Fn = CurLoop->getHeader()->getParent();
+ if (Fn->hasPersonalityFn())
+ if (Constant *PersonalityFn = Fn->getPersonalityFn())
+ if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
+ SafetyInfo->BlockColors = colorEHFunclets(*Fn);
+}
+
+// Return true if LI is invariant within the scope of the loop. LI is
+// invariant if CurLoop is dominated by an invariant.start representing the
+// same memory location and size as the memory location LI loads from, and if
+// the invariant.start has no uses.
+static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
+ Loop *CurLoop) {
+ Value *Addr = LI->getOperand(0);
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ const uint32_t LocSizeInBits = DL.getTypeSizeInBits(
+ cast<PointerType>(Addr->getType())->getElementType());
+
+  // If the type is i8 addrspace(x)*, we know this is the type of the
+  // llvm.invariant.start operand.
+ auto *PtrInt8Ty = PointerType::get(Type::getInt8Ty(LI->getContext()),
+ LI->getPointerAddressSpace());
+ unsigned BitcastsVisited = 0;
+ // Look through bitcasts until we reach the i8* type (this is invariant.start
+ // operand type).
+ while (Addr->getType() != PtrInt8Ty) {
+ auto *BC = dyn_cast<BitCastInst>(Addr);
+    // Avoid traversing a high number of bitcast uses.
+ if (++BitcastsVisited > MaxNumUsesTraversed || !BC)
+ return false;
+ Addr = BC->getOperand(0);
+ }
+
+ unsigned UsesVisited = 0;
+ // Traverse all uses of the load operand value, to see if invariant.start is
+ // one of the uses, and whether it dominates the load instruction.
+ for (auto *U : Addr->users()) {
+    // Avoid traversing load operands with a high number of users.
+ if (++UsesVisited > MaxNumUsesTraversed)
+ return false;
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+    // If there are escaping uses of the invariant.start instruction, the load
+    // may be non-invariant.
+ if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
+ !II->use_empty())
+ continue;
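+    // The first argument of llvm.invariant.start is the size of the region in
+    // bytes; convert it to bits for the comparison below.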
+ unsigned InvariantSizeInBits =
+ cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8;
+ // Confirm the invariant.start location size contains the load operand size
+ // in bits. Also, the invariant.start should dominate the load, and we
+ // should not hoist the load out of a loop that contains this dominating
+ // invariant.start.
+ if (LocSizeInBits <= InvariantSizeInBits &&
+ DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
+ return true;
+ }
+
+ return false;
+}
+
+bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
+ Loop *CurLoop, AliasSetTracker *CurAST,
+ LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
+ // SafetyInfo is nullptr if we are checking for sinking from preheader to
+ // loop body.
+ const bool SinkingToLoopBody = !SafetyInfo;
+ // Loads have extra constraints we have to verify before we can hoist them.
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (!LI->isUnordered())
+ return false; // Don't sink/hoist volatile or ordered atomic loads!
+
+ // Loads from constant memory are always safe to move, even if they end up
+ // in the same alias set as something that ends up being modified.
+ if (AA->pointsToConstantMemory(LI->getOperand(0)))
+ return true;
+ if (LI->getMetadata(LLVMContext::MD_invariant_load))
+ return true;
+
+ if (LI->isAtomic() && SinkingToLoopBody)
+ return false; // Don't sink unordered atomic loads to loop body.
+
+ // This checks for an invariant.start dominating the load.
+ if (isLoadInvariantInLoop(LI, DT, CurLoop))
+ return true;
+
+    // Don't hoist loads which have may-aliased stores in the loop.
+ uint64_t Size = 0;
+ if (LI->getType()->isSized())
+ Size = I.getModule()->getDataLayout().getTypeStoreSize(LI->getType());
+
+ AAMDNodes AAInfo;
+ LI->getAAMetadata(AAInfo);
+
+ bool Invalidated =
+ pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo, CurAST);
+ // Check loop-invariant address because this may also be a sinkable load
+ // whose address is not necessarily loop-invariant.
+ if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand()))
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "LoadWithLoopInvariantAddressInvalidated", LI)
+ << "failed to move load with loop-invariant address "
+ "because the loop may invalidate its value";
+ });
+
+ return !Invalidated;
+ } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ // Don't sink or hoist dbg info; it's legal, but not useful.
+ if (isa<DbgInfoIntrinsic>(I))
+ return false;
+
+ // Don't sink calls which can throw.
+ if (CI->mayThrow())
+ return false;
+
+ // Handle simple cases by querying alias analysis.
+ FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI);
+ if (Behavior == FMRB_DoesNotAccessMemory)
+ return true;
+ if (AliasAnalysis::onlyReadsMemory(Behavior)) {
+ // A readonly argmemonly function only reads from memory pointed to by
+      // its arguments with arbitrary offsets. If we can prove there are no
+ // writes to this memory in the loop, we can hoist or sink.
+ if (AliasAnalysis::onlyAccessesArgPointees(Behavior)) {
+ for (Value *Op : CI->arg_operands())
+ if (Op->getType()->isPointerTy() &&
+ pointerInvalidatedByLoop(Op, MemoryLocation::UnknownSize,
+ AAMDNodes(), CurAST))
+ return false;
+ return true;
+ }
+ // If this call only reads from memory and there are no writes to memory
+ // in the loop, we can hoist or sink the call as appropriate.
+ bool FoundMod = false;
+ for (AliasSet &AS : *CurAST) {
+ if (!AS.isForwardingAliasSet() && AS.isMod()) {
+ FoundMod = true;
+ break;
+ }
+ }
+ if (!FoundMod)
+ return true;
+ }
+
+ // FIXME: This should use mod/ref information to see if we can hoist or
+ // sink the call.
+
+ return false;
+ }
+
+ // Only these instructions are hoistable/sinkable.
+ if (!isa<BinaryOperator>(I) && !isa<CastInst>(I) && !isa<SelectInst>(I) &&
+ !isa<GetElementPtrInst>(I) && !isa<CmpInst>(I) &&
+ !isa<InsertElementInst>(I) && !isa<ExtractElementInst>(I) &&
+ !isa<ShuffleVectorInst>(I) && !isa<ExtractValueInst>(I) &&
+ !isa<InsertValueInst>(I))
+ return false;
+
+  // If we are checking for sinking from the preheader to the loop body, it
+  // will always be safe as there is no speculative execution.
+ if (SinkingToLoopBody)
+ return true;
+
+ // TODO: Plumb the context instruction through to make hoisting and sinking
+ // more powerful. Hoisting of loads already works due to the special casing
+ // above.
+ return isSafeToExecuteUnconditionally(I, DT, CurLoop, SafetyInfo, nullptr);
+}
+
+/// Returns true if a PHINode is trivially replaceable with an
+/// Instruction.
+/// This is true when all incoming values are that instruction.
+/// This pattern occurs most often with LCSSA PHI nodes.
+///
+static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) {
+ for (const Value *IncValue : PN.incoming_values())
+ if (IncValue != &I)
+ return false;
+
+ return true;
+}
+
+/// Return true if the instruction is free in the loop.
+static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
+ const TargetTransformInfo *TTI) {
+
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (TTI->getUserCost(GEP) != TargetTransformInfo::TCC_Free)
+ return false;
+ // For a GEP, we cannot simply use getUserCost because currently it
+    // optimistically assumes that a GEP will fold into the addressing mode
+ // regardless of its users.
+ const BasicBlock *BB = GEP->getParent();
+ for (const User *U : GEP->users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if (CurLoop->contains(UI) &&
+ (BB != UI->getParent() ||
+ (!isa<StoreInst>(UI) && !isa<LoadInst>(UI))))
+ return false;
+ }
+ return true;
+ } else
+ return TTI->getUserCost(&I) == TargetTransformInfo::TCC_Free;
+}
+
+/// Return true if the only users of this instruction are outside of
+/// the loop. If this is true, we can sink the instruction to the exit
+/// blocks of the loop.
+///
+/// We also return true if the instruction could be folded away in lowering.
+/// (e.g., a GEP can be folded into a load as an addressing mode in the loop).
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ TargetTransformInfo *TTI, bool &FreeInLoop) {
+ const auto &BlockColors = SafetyInfo->BlockColors;
+ bool IsFree = isFreeInLoop(I, CurLoop, TTI);
+ for (const User *U : I.users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
+ const BasicBlock *BB = PN->getParent();
+ // We cannot sink uses in catchswitches.
+ if (isa<CatchSwitchInst>(BB->getTerminator()))
+ return false;
+
+      // A call-site must be sunk into a unique funclet. Avoid sinking if the
+      // PHI's block is not colored by exactly one funclet.
+ if (isa<CallInst>(I))
+ if (!BlockColors.empty() &&
+ BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
+ return false;
+ }
+
+ if (CurLoop->contains(UI)) {
+ if (IsFree) {
+ FreeInLoop = true;
+ continue;
+ }
+ return false;
+ }
+ }
+ return true;
+}
+
+static Instruction *
+CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
+ const LoopInfo *LI,
+ const LoopSafetyInfo *SafetyInfo) {
+ Instruction *New;
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ const auto &BlockColors = SafetyInfo->BlockColors;
+
+    // Sinking call-sites needs to be handled differently from sinking other
+    // instructions. The cloned call-site needs a funclet bundle operand
+    // appropriate for its location in the CFG.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ for (unsigned BundleIdx = 0, BundleEnd = CI->getNumOperandBundles();
+ BundleIdx != BundleEnd; ++BundleIdx) {
+ OperandBundleUse Bundle = CI->getOperandBundleAt(BundleIdx);
+ if (Bundle.getTagID() == LLVMContext::OB_funclet)
+ continue;
+
+ OpBundles.emplace_back(Bundle);
+ }
+
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(&ExitBlock)->second;
+ assert(CV.size() == 1 && "non-unique color for exit block!");
+ BasicBlock *BBColor = CV.front();
+ Instruction *EHPad = BBColor->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+
+ New = CallInst::Create(CI, OpBundles);
+ } else {
+ New = I.clone();
+ }
+
+ ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
+ if (!I.getName().empty())
+ New->setName(I.getName() + ".le");
+
+ // Build LCSSA PHI nodes for any in-loop operands. Note that this is
+ // particularly cheap because we can rip off the PHI node that we're
+ // replacing for the number and blocks of the predecessors.
+ // OPT: If this shows up in a profile, we can instead finish sinking all
+ // invariant instructions, and then walk their operands to re-establish
+ // LCSSA. That will eliminate creating PHI nodes just to nuke them when
+ // sinking bottom-up.
+ for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE;
+ ++OI)
+ if (Instruction *OInst = dyn_cast<Instruction>(*OI))
+ if (Loop *OLoop = LI->getLoopFor(OInst->getParent()))
+ if (!OLoop->contains(&PN)) {
+ PHINode *OpPN =
+ PHINode::Create(OInst->getType(), PN.getNumIncomingValues(),
+ OInst->getName() + ".lcssa", &ExitBlock.front());
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+ OpPN->addIncoming(OInst, PN.getIncomingBlock(i));
+ *OI = OpPN;
+ }
+ return New;
+}
+
+static Instruction *sinkThroughTriviallyReplacablePHI(
+ PHINode *TPN, Instruction *I, LoopInfo *LI,
+ SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
+ const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop) {
+ assert(isTriviallyReplacablePHI(*TPN, *I) &&
+ "Expect only trivially replacalbe PHI");
+ BasicBlock *ExitBlock = TPN->getParent();
+ Instruction *New;
+ auto It = SunkCopies.find(ExitBlock);
+ if (It != SunkCopies.end())
+ New = It->second;
+ else
+ New = SunkCopies[ExitBlock] =
+ CloneInstructionInExitBlock(*I, *ExitBlock, *TPN, LI, SafetyInfo);
+ return New;
+}
+
+static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) {
+ BasicBlock *BB = PN->getParent();
+ if (!BB->canSplitPredecessors())
+ return false;
+  // It's not impossible to split EHPad blocks, but if BlockColors already
+  // exists it would require updating BlockColors for all offspring blocks
+  // accordingly. By skipping such corner cases, we keep the BlockColors update
+  // after splitting a predecessor fairly simple.
+ if (!SafetyInfo->BlockColors.empty() && BB->getFirstNonPHI()->isEHPad())
+ return false;
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *BBPred = *PI;
+ if (isa<IndirectBrInst>(BBPred->getTerminator()))
+ return false;
+ }
+ return true;
+}
+
+static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
+ LoopInfo *LI, const Loop *CurLoop,
+ LoopSafetyInfo *SafetyInfo) {
+#ifndef NDEBUG
+ SmallVector<BasicBlock *, 32> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+#endif
+ BasicBlock *ExitBB = PN->getParent();
+ assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block.");
+
+  // Split predecessors of the loop exit so that instructions in the loop are
+  // exposed to exit blocks through trivially replaceable PHIs while keeping the
+  // loop in the canonical form where each predecessor of each exit block is
+  // contained within the loop. For example, this will convert the loop below
+ // from
+ //
+ // LB1:
+ // %v1 =
+ // br %LE, %LB2
+ // LB2:
+ // %v2 =
+ // br %LE, %LB1
+ // LE:
+  //   %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replaceable
+ //
+ // to
+ //
+ // LB1:
+ // %v1 =
+ // br %LE.split, %LB2
+ // LB2:
+ // %v2 =
+ // br %LE.split2, %LB1
+ // LE.split:
+  //   %p1 = phi [%v1, %LB1] <-- trivially replaceable
+ // br %LE
+ // LE.split2:
+  //   %p2 = phi [%v2, %LB2] <-- trivially replaceable
+ // br %LE
+ // LE:
+ // %p = phi [%p1, %LE.split], [%p2, %LE.split2]
+ //
+ auto &BlockColors = SafetyInfo->BlockColors;
+ SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB));
+ while (!PredBBs.empty()) {
+ BasicBlock *PredBB = *PredBBs.begin();
+ assert(CurLoop->contains(PredBB) &&
+ "Expect all predecessors are in the loop");
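+    // Only split predecessors that actually feed a value into this PHI.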
+ if (PN->getBasicBlockIndex(PredBB) >= 0) {
+ BasicBlock *NewPred = SplitBlockPredecessors(
+ ExitBB, PredBB, ".split.loop.exit", DT, LI, true);
+      // Since we do not allow splitting EH-pad blocks with BlockColors in
+      // canSplitPredecessors(), we can simply assign the predecessor's color to
+      // the new block.
+ if (!BlockColors.empty())
+ BlockColors[NewPred] = BlockColors[PredBB];
+ }
+ PredBBs.remove(PredBB);
+ }
+}
+
+/// When an instruction is found to only be used outside of the loop, this
+/// function clones it into the exit blocks and patches up SSA form as needed.
+/// The original instruction is left in place; the caller erases it afterwards
+/// unless free users of it remain inside the loop.
+///
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+ const Loop *CurLoop, LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE, bool FreeInLoop) {
+ DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
+ << "sinking " << ore::NV("Inst", &I);
+ });
+ bool Changed = false;
+ if (isa<LoadInst>(I))
+ ++NumMovedLoads;
+ else if (isa<CallInst>(I))
+ ++NumMovedCalls;
+ ++NumSunk;
+
+  // Iterate over users to be ready for actual sinking. Replace uses reached
+  // via unreachable blocks with undef and make all user PHIs trivially
+  // replaceable.
+ SmallPtrSet<Instruction *, 8> VisitedUsers;
+ for (Value::user_iterator UI = I.user_begin(), UE = I.user_end(); UI != UE;) {
+ auto *User = cast<Instruction>(*UI);
+ Use &U = UI.getUse();
+ ++UI;
+
+ if (VisitedUsers.count(User) || CurLoop->contains(User))
+ continue;
+
+ if (!DT->isReachableFromEntry(User->getParent())) {
+ U = UndefValue::get(I.getType());
+ Changed = true;
+ continue;
+ }
+
+ // The user must be a PHI node.
+ PHINode *PN = cast<PHINode>(User);
+
+ // Surprisingly, instructions can be used outside of loops without any
+ // exits. This can only happen in PHI nodes if the incoming block is
+ // unreachable.
+ BasicBlock *BB = PN->getIncomingBlock(U);
+ if (!DT->isReachableFromEntry(BB)) {
+ U = UndefValue::get(I.getType());
+ Changed = true;
+ continue;
+ }
+
+ VisitedUsers.insert(PN);
+ if (isTriviallyReplacablePHI(*PN, I))
+ continue;
+
+ if (!canSplitPredecessors(PN, SafetyInfo))
+ return Changed;
+
+ // Split predecessors of the PHI so that we can make users trivially
+    // replaceable.
+ splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop, SafetyInfo);
+
+ // Should rebuild the iterators, as they may be invalidated by
+ // splitPredecessorsOfLoopExit().
+ UI = I.user_begin();
+ UE = I.user_end();
+ }
+
+ if (VisitedUsers.empty())
+ return Changed;
+
+#ifndef NDEBUG
+ SmallVector<BasicBlock *, 32> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+#endif
+
+ // Clones of this instruction. Don't create more than one per exit block!
+ SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+
+ // If this instruction is only used outside of the loop, then all users are
+ // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
+ // the instruction.
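+  // Take a snapshot of the users first: the LCSSA PHIs erased below are
+  // themselves users of I, so iterating I.users() directly would be unsafe.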
+ SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end());
+ for (auto *UI : Users) {
+ auto *User = cast<Instruction>(UI);
+
+ if (CurLoop->contains(User))
+ continue;
+
+ PHINode *PN = cast<PHINode>(User);
+ assert(ExitBlockSet.count(PN->getParent()) &&
+ "The LCSSA PHI is not in an exit block!");
+    // The PHI must be trivially replaceable.
+ Instruction *New = sinkThroughTriviallyReplacablePHI(PN, &I, LI, SunkCopies,
+ SafetyInfo, CurLoop);
+ PN->replaceAllUsesWith(New);
+ PN->eraseFromParent();
+ Changed = true;
+ }
+ return Changed;
+}
+
+/// When an instruction is found to use only loop-invariant operands and is
+/// safe to hoist, this function is called to do the dirty work.
+///
+static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
+ auto *Preheader = CurLoop->getLoopPreheader();
+ DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
+ << "\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) << "hoisting "
+ << ore::NV("Inst", &I);
+ });
+
+ // Metadata can be dependent on conditions we are hoisting above.
+ // Conservatively strip all metadata on the instruction unless we were
+ // guaranteed to execute I if we entered the loop, in which case the metadata
+ // is valid in the loop preheader.
+ if (I.hasMetadataOtherThanDebugLoc() &&
+ // The check on hasMetadataOtherThanDebugLoc is to prevent us from burning
+ // time in isGuaranteedToExecute if we don't actually have anything to
+ // drop. It is a compile time optimization, not required for correctness.
+ !isGuaranteedToExecute(I, DT, CurLoop, SafetyInfo))
+ I.dropUnknownNonDebugMetadata();
+
+ // Move the new node to the Preheader, before its terminator.
+ I.moveBefore(Preheader->getTerminator());
+
+ // Do not retain debug locations when we are moving instructions to different
+ // basic blocks, because we want to avoid jumpy line tables. Calls, however,
+ // need to retain their debug locs because they may be inlined.
+ // FIXME: How do we retain source locations without causing poor debugging
+ // behavior?
+ if (!isa<CallInst>(I))
+ I.setDebugLoc(DebugLoc());
+
+ if (isa<LoadInst>(I))
+ ++NumMovedLoads;
+ else if (isa<CallInst>(I))
+ ++NumMovedCalls;
+ ++NumHoisted;
+ return true;
+}
+
+/// Only sink or hoist an instruction if it is not a trapping instruction, if
+/// it is known not to trap when moved to the preheader, or if it is a
+/// trapping instruction that is guaranteed to execute.
+static bool isSafeToExecuteUnconditionally(Instruction &Inst,
+ const DominatorTree *DT,
+ const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE,
+ const Instruction *CtxI) {
+ if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT))
+ return true;
+
+ bool GuaranteedToExecute =
+ isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo);
+
+ if (!GuaranteedToExecute) {
+ auto *LI = dyn_cast<LoadInst>(&Inst);
+ if (LI && CurLoop->isLoopInvariant(LI->getPointerOperand()))
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "LoadWithLoopInvariantAddressCondExecuted", LI)
+ << "failed to hoist load with loop-invariant address "
+ "because load is conditionally executed";
+ });
+ }
+
+ return GuaranteedToExecute;
+}
+
+namespace {
+class LoopPromoter : public LoadAndStorePromoter {
+ Value *SomePtr; // Designated pointer to store to.
+ const SmallSetVector<Value *, 8> &PointerMustAliases;
+ SmallVectorImpl<BasicBlock *> &LoopExitBlocks;
+ SmallVectorImpl<Instruction *> &LoopInsertPts;
+ PredIteratorCache &PredCache;
+ AliasSetTracker &AST;
+ LoopInfo &LI;
+ DebugLoc DL;
+ int Alignment;
+ bool UnorderedAtomic;
+ AAMDNodes AATags;
+
+ Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (Loop *L = LI.getLoopFor(I->getParent()))
+ if (!L->contains(BB)) {
+ // We need to create an LCSSA PHI node for the incoming value and
+ // store that.
+ PHINode *PN = PHINode::Create(I->getType(), PredCache.size(BB),
+ I->getName() + ".lcssa", &BB->front());
+ for (BasicBlock *Pred : PredCache.get(BB))
+ PN->addIncoming(I, Pred);
+ return PN;
+ }
+ return V;
+ }
+
+public:
+ LoopPromoter(Value *SP, ArrayRef<const Instruction *> Insts, SSAUpdater &S,
+ const SmallSetVector<Value *, 8> &PMA,
+ SmallVectorImpl<BasicBlock *> &LEB,
+ SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
+ AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
+ bool UnorderedAtomic, const AAMDNodes &AATags)
+ : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
+ LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
+ LI(li), DL(std::move(dl)), Alignment(alignment),
+ UnorderedAtomic(UnorderedAtomic), AATags(AATags) {}
+
+ bool isInstInList(Instruction *I,
+ const SmallVectorImpl<Instruction *> &) const override {
+ Value *Ptr;
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ Ptr = LI->getOperand(0);
+ else
+ Ptr = cast<StoreInst>(I)->getPointerOperand();
+ return PointerMustAliases.count(Ptr);
+ }
+
+ void doExtraRewritesBeforeFinalDeletion() const override {
+    // Insert stores in the loop exit blocks. Each exit block gets a
+ // store of the live-out values that feed them. Since we've already told
+ // the SSA updater about the defs in the loop and the preheader
+ // definition, it is all set and we can start using it.
+ for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitBlock = LoopExitBlocks[i];
+ Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
+ LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock);
+ Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
+ Instruction *InsertPos = LoopInsertPts[i];
+ StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
+ if (UnorderedAtomic)
+ NewSI->setOrdering(AtomicOrdering::Unordered);
+ NewSI->setAlignment(Alignment);
+ NewSI->setDebugLoc(DL);
+ if (AATags)
+ NewSI->setAAMetadata(AATags);
+ }
+ }
+
+ void replaceLoadWithValue(LoadInst *LI, Value *V) const override {
+ // Update alias analysis.
+ AST.copyValue(LI, V);
+ }
+ void instructionDeleted(Instruction *I) const override { AST.deleteValue(I); }
+};
+
+
+/// Return true iff we can prove that a caller of this function cannot inspect
+/// the contents of the provided object in a well-defined program.
+bool isKnownNonEscaping(Value *Object, const TargetLibraryInfo *TLI) {
+ if (isa<AllocaInst>(Object))
+ // Since the alloca goes out of scope, we know the caller can't retain a
+ // reference to it and be well defined. Thus, we don't need to check for
+ // capture.
+ return true;
+
+ // For all other objects we need to know that the caller can't possibly
+ // have gotten a reference to the object. There are two components of
+ // that:
+ // 1) Object can't be escaped by this function. This is what
+ // PointerMayBeCaptured checks.
+ // 2) Object can't have been captured at definition site. For this, we
+ // need to know the return value is noalias. At the moment, we use a
+ // weaker condition and handle only AllocLikeFunctions (which are
+ // known to be noalias). TODO
+ return isAllocLikeFn(Object, TLI) &&
+ !PointerMayBeCaptured(Object, true, true);
+}
+
+} // namespace
+
+/// Try to promote memory values to scalars by sinking stores out of the
+/// loop and moving loads to before the loop. We do this by looping over
+/// the stores in the loop, looking for stores to Must pointers which are
+/// loop invariant.
+///
+bool llvm::promoteLoopAccessesToScalars(
+ const SmallSetVector<Value *, 8> &PointerMustAliases,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ SmallVectorImpl<Instruction *> &InsertPts, PredIteratorCache &PIC,
+ LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
+ Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
+ // Verify inputs.
+ assert(LI != nullptr && DT != nullptr && CurLoop != nullptr &&
+ CurAST != nullptr && SafetyInfo != nullptr &&
+ "Unexpected Input to promoteLoopAccessesToScalars");
+
+ Value *SomePtr = *PointerMustAliases.begin();
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+
+ // It isn't safe to promote a load/store from the loop if the load/store is
+ // conditional. For example, turning:
+ //
+ // for () { if (c) *P += 1; }
+ //
+ // into:
+ //
+ // tmp = *P; for () { if (c) tmp +=1; } *P = tmp;
+ //
+ // is not safe, because *P may only be valid to access if 'c' is true.
+ //
+ // The safety property divides into two parts:
+ // p1) The memory may not be dereferenceable on entry to the loop. In this
+ // case, we can't insert the required load in the preheader.
+ // p2) The memory model does not allow us to insert a store along any dynamic
+ // path which did not originally have one.
+ //
+ // If at least one store is guaranteed to execute, both properties are
+ // satisfied, and promotion is legal.
+ //
+ // This, however, is not a necessary condition. Even if no store/load is
+ // guaranteed to execute, we can still establish these properties.
+ // We can establish (p1) by proving that hoisting the load into the preheader
+ // is safe (i.e. proving dereferenceability on all paths through the loop). We
+ // can use any access within the alias set to prove dereferenceability,
+ // since they're all must alias.
+ //
+  // There are two ways to establish (p2):
+ // a) Prove the location is thread-local. In this case the memory model
+ // requirement does not apply, and stores are safe to insert.
+ // b) Prove a store dominates every exit block. In this case, if an exit
+  // block is reached, the original dynamic path would have taken us through
+ // the store, so inserting a store into the exit block is safe. Note that this
+ // is different from the store being guaranteed to execute. For instance,
+ // if an exception is thrown on the first iteration of the loop, the original
+ // store is never executed, but the exit blocks are not executed either.
+
+ bool DereferenceableInPH = false;
+ bool SafeToInsertStore = false;
+
+ SmallVector<Instruction *, 64> LoopUses;
+
+ // We start with an alignment of one and try to find instructions that allow
+ // us to prove better alignment.
+ unsigned Alignment = 1;
+ // Keep track of which types of access we see
+ bool SawUnorderedAtomic = false;
+ bool SawNotAtomic = false;
+ AAMDNodes AATags;
+
+ const DataLayout &MDL = Preheader->getModule()->getDataLayout();
+
+ bool IsKnownThreadLocalObject = false;
+ if (SafetyInfo->MayThrow) {
+ // If a loop can throw, we have to insert a store along each unwind edge.
+ // That said, we can't actually make the unwind edge explicit. Therefore,
+ // we have to prove that the store is dead along the unwind edge. We do
+ // this by proving that the caller can't have a reference to the object
+ // after return and thus can't possibly load from the object.
+ Value *Object = GetUnderlyingObject(SomePtr, MDL);
+ if (!isKnownNonEscaping(Object, TLI))
+ return false;
+    // Subtlety: Allocas aren't visible to callers, but *are* potentially
+ // visible to other threads if captured and used during their lifetimes.
+ IsKnownThreadLocalObject = !isa<AllocaInst>(Object);
+ }
+
+ // Check that all of the pointers in the alias set have the same type. We
+ // cannot (yet) promote a memory location that is loaded and stored in
+ // different sizes. While we are at it, collect alignment and AA info.
+ for (Value *ASIV : PointerMustAliases) {
+ if (SomePtr->getType() != ASIV->getType())
+ return false;
+
+ for (User *U : ASIV->users()) {
+ // Ignore instructions that are outside the loop.
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || !CurLoop->contains(UI))
+ continue;
+
+      // If there is a non-load/store instruction in the loop, we can't promote
+ // it.
+ if (LoadInst *Load = dyn_cast<LoadInst>(UI)) {
+ assert(!Load->isVolatile() && "AST broken");
+ if (!Load->isUnordered())
+ return false;
+
+ SawUnorderedAtomic |= Load->isAtomic();
+ SawNotAtomic |= !Load->isAtomic();
+
+ if (!DereferenceableInPH)
+ DereferenceableInPH = isSafeToExecuteUnconditionally(
+ *Load, DT, CurLoop, SafetyInfo, ORE, Preheader->getTerminator());
+ } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) {
+ // Stores *of* the pointer are not interesting, only stores *to* the
+ // pointer.
+ if (UI->getOperand(1) != ASIV)
+ continue;
+ assert(!Store->isVolatile() && "AST broken");
+ if (!Store->isUnordered())
+ return false;
+
+ SawUnorderedAtomic |= Store->isAtomic();
+ SawNotAtomic |= !Store->isAtomic();
+
+ // If the store is guaranteed to execute, both properties are satisfied.
+ // We may want to check if a store is guaranteed to execute even if we
+ // already know that promotion is safe, since it may have higher
+ // alignment than any other guaranteed stores, in which case we can
+ // raise the alignment on the promoted store.
+ unsigned InstAlignment = Store->getAlignment();
+ if (!InstAlignment)
+ InstAlignment =
+ MDL.getABITypeAlignment(Store->getValueOperand()->getType());
+
+ if (!DereferenceableInPH || !SafeToInsertStore ||
+ (InstAlignment > Alignment)) {
+ if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) {
+ DereferenceableInPH = true;
+ SafeToInsertStore = true;
+ Alignment = std::max(Alignment, InstAlignment);
+ }
+ }
+
+ // If a store dominates all exit blocks, it is safe to sink.
+ // As explained above, if an exit block was executed, a dominating
+      // store must have been executed at least once, so we are not
+ // introducing stores on paths that did not have them.
+ // Note that this only looks at explicit exit blocks. If we ever
+ // start sinking stores into unwind edges (see above), this will break.
+ if (!SafeToInsertStore)
+ SafeToInsertStore = llvm::all_of(ExitBlocks, [&](BasicBlock *Exit) {
+ return DT->dominates(Store->getParent(), Exit);
+ });
+
+ // If the store is not guaranteed to execute, we may still get
+ // deref info through it.
+ if (!DereferenceableInPH) {
+ DereferenceableInPH = isDereferenceableAndAlignedPointer(
+ Store->getPointerOperand(), Store->getAlignment(), MDL,
+ Preheader->getTerminator(), DT);
+ }
+ } else
+ return false; // Not a load or store.
+
+ // Merge the AA tags.
+ if (LoopUses.empty()) {
+ // On the first load/store, just take its AA tags.
+ UI->getAAMetadata(AATags);
+ } else if (AATags) {
+ UI->getAAMetadata(AATags, /* Merge = */ true);
+ }
+
+ LoopUses.push_back(UI);
+ }
+ }
+
+ // If we found both an unordered atomic instruction and a non-atomic memory
+ // access, bail. We can't blindly promote non-atomic to atomic since we
+ // might not be able to lower the result. We can't downgrade since that
+ // would violate memory model. Also, align 0 is an error for atomics.
+ if (SawUnorderedAtomic && SawNotAtomic)
+ return false;
+
+ // If we couldn't prove we can hoist the load, bail.
+ if (!DereferenceableInPH)
+ return false;
+
+ // We know we can hoist the load, but don't have a guaranteed store.
+ // Check whether the location is thread-local. If it is, then we can insert
+ // stores along paths which originally didn't have them without violating the
+ // memory model.
+ if (!SafeToInsertStore) {
+ if (IsKnownThreadLocalObject)
+ SafeToInsertStore = true;
+ else {
+ Value *Object = GetUnderlyingObject(SomePtr, MDL);
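+      // A local, never-captured allocation (an alloca or a noalias allocation
+      // function) cannot be observed by the caller or by other threads, so the
+      // stores we insert cannot be seen elsewhere.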
+ SafeToInsertStore =
+ (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
+ !PointerMayBeCaptured(Object, true, true);
+ }
+ }
+
+ // If we've still failed to prove we can sink the store, give up.
+ if (!SafeToInsertStore)
+ return false;
+
+  // Otherwise, this is safe to promote; let's do it!
+ DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
+ << '\n');
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar",
+ LoopUses[0])
+ << "Moving accesses to memory location out of the loop";
+ });
+ ++NumPromoted;
+
+ // Grab a debug location for the inserted loads/stores; given that the
+ // inserted loads/stores have little relation to the original loads/stores,
+ // this code just arbitrarily picks a location from one, since any debug
+ // location is better than none.
+ DebugLoc DL = LoopUses[0]->getDebugLoc();
+
+ // We use the SSAUpdater interface to insert phi nodes as required.
+ SmallVector<PHINode *, 16> NewPHIs;
+ SSAUpdater SSA(&NewPHIs);
+ LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
+ InsertPts, PIC, *CurAST, *LI, DL, Alignment,
+ SawUnorderedAtomic, AATags);
+
+  // Set up the preheader to have a definition of the value. It is the value
+  // live out of the preheader that uses inside the loop will see.
+ LoadInst *PreheaderLoad = new LoadInst(
+ SomePtr, SomePtr->getName() + ".promoted", Preheader->getTerminator());
+ if (SawUnorderedAtomic)
+ PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
+ PreheaderLoad->setAlignment(Alignment);
+ PreheaderLoad->setDebugLoc(DL);
+ if (AATags)
+ PreheaderLoad->setAAMetadata(AATags);
+ SSA.AddAvailableValue(Preheader, PreheaderLoad);
+
+ // Rewrite all the loads in the loop and remember all the definitions from
+ // stores in the loop.
+ Promoter.run(LoopUses);
+
+ // If the SSAUpdater didn't use the load in the preheader, just zap it now.
+ if (PreheaderLoad->use_empty())
+ PreheaderLoad->eraseFromParent();
+
+ return true;
+}
+
+/// Returns an owning pointer to an alias set which incorporates aliasing info
+/// from L and all subloops of L.
+/// FIXME: In new pass manager, there is no helper function to handle loop
+/// analysis such as cloneBasicBlockAnalysis, so the AST needs to be recomputed
+/// from scratch for every loop. Hook up with the helper functions when
+/// available in the new pass manager to avoid redundant computation.
+AliasSetTracker *
+LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
+ AliasAnalysis *AA) {
+ AliasSetTracker *CurAST = nullptr;
+ SmallVector<Loop *, 4> RecomputeLoops;
+ for (Loop *InnerL : L->getSubLoops()) {
+ auto MapI = LoopToAliasSetMap.find(InnerL);
+ // If the AST for this inner loop is missing it may have been merged into
+ // some other loop's AST and then that loop unrolled, and so we need to
+ // recompute it.
+ if (MapI == LoopToAliasSetMap.end()) {
+ RecomputeLoops.push_back(InnerL);
+ continue;
+ }
+ AliasSetTracker *InnerAST = MapI->second;
+
+ if (CurAST != nullptr) {
+      // What if InnerLoop was modified by other passes?
+ CurAST->add(*InnerAST);
+
+ // Once we've incorporated the inner loop's AST into ours, we don't need
+ // the subloop's anymore.
+ delete InnerAST;
+ } else {
+ CurAST = InnerAST;
+ }
+ LoopToAliasSetMap.erase(MapI);
+ }
+ if (CurAST == nullptr)
+ CurAST = new AliasSetTracker(*AA);
+
+ auto mergeLoop = [&](Loop *L) {
+ // Loop over the body of this loop, looking for calls, invokes, and stores.
+ for (BasicBlock *BB : L->blocks())
+ CurAST->add(*BB); // Incorporate the specified basic block
+ };
+
+ // Add everything from the sub loops that are no longer directly available.
+ for (Loop *InnerL : RecomputeLoops)
+ mergeLoop(InnerL);
+
+ // And merge in this loop.
+ mergeLoop(L);
+
+ return CurAST;
+}
+
+/// Simple analysis hook. Clone alias set info.
+///
+void LegacyLICMPass::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To,
+ Loop *L) {
+ AliasSetTracker *AST = LICM.getLoopToAliasSetMap().lookup(L);
+ if (!AST)
+ return;
+
+ AST->copyValue(From, To);
+}
+
+/// Simple Analysis hook. Delete value V from alias set
+///
+void LegacyLICMPass::deleteAnalysisValue(Value *V, Loop *L) {
+ AliasSetTracker *AST = LICM.getLoopToAliasSetMap().lookup(L);
+ if (!AST)
+ return;
+
+ AST->deleteValue(V);
+}
+
+/// Simple Analysis hook. Delete value L from alias set map.
+///
+void LegacyLICMPass::deleteAnalysisLoop(Loop *L) {
+ AliasSetTracker *AST = LICM.getLoopToAliasSetMap().lookup(L);
+ if (!AST)
+ return;
+
+ delete AST;
+ LICM.getLoopToAliasSetMap().erase(L);
+}
+
+/// Return true if the body of this loop may store into the memory
+/// location pointed to by V.
+///
+static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
+ const AAMDNodes &AAInfo,
+ AliasSetTracker *CurAST) {
+ // Check to see if any of the basic blocks in CurLoop invalidate *V.
+ return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod();
+}
+
+/// Little predicate that returns true if the specified basic block is in
+/// a subloop of the current one, not the current one itself.
+///
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI) {
+ assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
+ return LI->getLoopFor(BB) != CurLoop;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
new file mode 100644
index 000000000000..a64c99117d64
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
@@ -0,0 +1,25 @@
+//===- LoopAccessAnalysisPrinter.cpp - Loop Access Analysis Printer --------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-accesses"
+
+PreservedAnalyses
+LoopAccessInfoPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &) {
+ Function &F = *L.getHeader()->getParent();
+ auto &LAI = AM.getResult<LoopAccessAnalysis>(L, AR);
+ OS << "Loop access info in function '" << F.getName() << "':\n";
+ OS.indent(2) << L.getHeader()->getName() << ":\n";
+ LAI.print(OS, 4);
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
new file mode 100644
index 000000000000..7f7c6de76450
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -0,0 +1,336 @@
+//===-------- LoopDataPrefetch.cpp - Loop Data Prefetching Pass -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a Loop Data Prefetching Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
+
+#define DEBUG_TYPE "loop-data-prefetch"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+// By default, we limit this to creating 16 PHIs (which is a little over half
+// of the allocatable register set).
+static cl::opt<bool>
+PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false),
+ cl::desc("Prefetch write addresses"));
+
+static cl::opt<unsigned>
+ PrefetchDistance("prefetch-distance",
+ cl::desc("Number of instructions to prefetch ahead"),
+ cl::Hidden);
+
+static cl::opt<unsigned>
+ MinPrefetchStride("min-prefetch-stride",
+ cl::desc("Min stride to add prefetches"), cl::Hidden);
+
+static cl::opt<unsigned> MaxPrefetchIterationsAhead(
+ "max-prefetch-iters-ahead",
+ cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden);
+
+STATISTIC(NumPrefetches, "Number of prefetches inserted");
+
+namespace {
+
+/// Loop prefetch implementation class.
+class LoopDataPrefetch {
+public:
+ LoopDataPrefetch(AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE,
+ const TargetTransformInfo *TTI,
+ OptimizationRemarkEmitter *ORE)
+ : AC(AC), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
+
+ bool run();
+
+private:
+ bool runOnLoop(Loop *L);
+
+  /// \brief Check if the stride of the accesses is large enough to
+ /// warrant a prefetch.
+ bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
+
+ unsigned getMinPrefetchStride() {
+ if (MinPrefetchStride.getNumOccurrences() > 0)
+ return MinPrefetchStride;
+ return TTI->getMinPrefetchStride();
+ }
+
+ unsigned getPrefetchDistance() {
+ if (PrefetchDistance.getNumOccurrences() > 0)
+ return PrefetchDistance;
+ return TTI->getPrefetchDistance();
+ }
+
+ unsigned getMaxPrefetchIterationsAhead() {
+ if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)
+ return MaxPrefetchIterationsAhead;
+ return TTI->getMaxPrefetchIterationsAhead();
+ }
+
+ AssumptionCache *AC;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ const TargetTransformInfo *TTI;
+ OptimizationRemarkEmitter *ORE;
+};
+
+/// Legacy class for inserting loop data prefetches.
+class LoopDataPrefetchLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopDataPrefetchLegacyPass() : FunctionPass(ID) {
+ initializeLoopDataPrefetchLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+}
+
+char LoopDataPrefetchLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
+ "Loop Data Prefetch", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
+ "Loop Data Prefetch", false, false)
+
+FunctionPass *llvm::createLoopDataPrefetchPass() {
+ return new LoopDataPrefetchLegacyPass();
+}
+
+bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
+ unsigned TargetMinStride = getMinPrefetchStride();
+  // If the target accepts any stride, there is no need to check.
+ if (TargetMinStride <= 1)
+ return true;
+
+ const auto *ConstStride = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+ // If MinStride is set, don't prefetch unless we can ensure that stride is
+ // larger.
+ if (!ConstStride)
+ return false;
+
+ unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());
+ return TargetMinStride <= AbsStride;
+}
+
+PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
+ ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
+ OptimizationRemarkEmitter *ORE =
+ &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+ bool Changed = LDP.run();
+
+ if (Changed) {
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+ }
+
+ return PreservedAnalyses::all();
+}
+
+bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ OptimizationRemarkEmitter *ORE =
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+ return LDP.run();
+}
+
+bool LoopDataPrefetch::run() {
+ // If PrefetchDistance is not set, don't run the pass. This gives an
+ // opportunity for targets to run this pass for selected subtargets only
+ // (whose TTI sets PrefetchDistance).
+ if (getPrefetchDistance() == 0)
+ return false;
+ assert(TTI->getCacheLineSize() && "Cache line size is not set for target");
+
+ bool MadeChange = false;
+
+ for (Loop *I : *LI)
+ for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
+ MadeChange |= runOnLoop(*L);
+
+ return MadeChange;
+}
+
+bool LoopDataPrefetch::runOnLoop(Loop *L) {
+ bool MadeChange = false;
+
+ // Only prefetch in the inner-most loop
+ if (!L->empty())
+ return MadeChange;
+
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ // Calculate the number of iterations ahead to prefetch
+ CodeMetrics Metrics;
+ for (const auto BB : L->blocks()) {
+ // If the loop already has prefetches, then assume that the user knows
+ // what they are doing and don't add any more.
+ for (auto &I : *BB)
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ if (Function *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::prefetch)
+ return MadeChange;
+
+ Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+ }
+ unsigned LoopSize = Metrics.NumInsts;
+ if (!LoopSize)
+ LoopSize = 1;
+
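+  // ItersAhead approximates how many loop iterations it takes to execute the
+  // target's prefetch distance worth of instructions.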
+ unsigned ItersAhead = getPrefetchDistance() / LoopSize;
+ if (!ItersAhead)
+ ItersAhead = 1;
+
+ if (ItersAhead > getMaxPrefetchIterationsAhead())
+ return MadeChange;
+
+ DEBUG(dbgs() << "Prefetching " << ItersAhead
+ << " iterations ahead (loop size: " << LoopSize << ") in "
+ << L->getHeader()->getParent()->getName() << ": " << *L);
+
+ SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads;
+ for (const auto BB : L->blocks()) {
+ for (auto &I : *BB) {
+ Value *PtrValue;
+ Instruction *MemI;
+
+ if (LoadInst *LMemI = dyn_cast<LoadInst>(&I)) {
+ MemI = LMemI;
+ PtrValue = LMemI->getPointerOperand();
+ } else if (StoreInst *SMemI = dyn_cast<StoreInst>(&I)) {
+ if (!PrefetchWrites) continue;
+ MemI = SMemI;
+ PtrValue = SMemI->getPointerOperand();
+ } else continue;
+
+ unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
+ if (PtrAddrSpace)
+ continue;
+
+ if (L->isLoopInvariant(PtrValue))
+ continue;
+
+ const SCEV *LSCEV = SE->getSCEV(PtrValue);
+ const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+ if (!LSCEVAddRec)
+ continue;
+
+      // Check if the stride of the accesses is large enough to warrant a
+ // prefetch.
+ if (!isStrideLargeEnough(LSCEVAddRec))
+ continue;
+
+ // We don't want to double prefetch individual cache lines. If this load
+ // is known to be within one cache line of some other load that has
+ // already been prefetched, then don't prefetch this one as well.
+ bool DupPref = false;
+ for (const auto &PrefLoad : PrefLoads) {
+ const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, PrefLoad.second);
+ if (const SCEVConstant *ConstPtrDiff =
+ dyn_cast<SCEVConstant>(PtrDiff)) {
+ int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
+ if (PD < (int64_t) TTI->getCacheLineSize()) {
+ DupPref = true;
+ break;
+ }
+ }
+ }
+ if (DupPref)
+ continue;
+
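+      // Compute the address this access will touch ItersAhead iterations from
+      // now: the current AddRec plus ItersAhead times its step recurrence.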
+ const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr(
+ SE->getConstant(LSCEVAddRec->getType(), ItersAhead),
+ LSCEVAddRec->getStepRecurrence(*SE)));
+ if (!isSafeToExpand(NextLSCEV, *SE))
+ continue;
+
+ PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec));
+
+ Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), PtrAddrSpace);
+ SCEVExpander SCEVE(*SE, I.getModule()->getDataLayout(), "prefaddr");
+ Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI);
+
+ IRBuilder<> Builder(MemI);
+ Module *M = BB->getParent()->getParent();
+ Type *I32 = Type::getInt32Ty(BB->getContext());
+ Value *PrefetchFunc = Intrinsic::getDeclaration(M, Intrinsic::prefetch);
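+      // The llvm.prefetch operands below are: the address, rw (0 = read,
+      // 1 = write), locality (on a 0-3 scale, 3 = high temporal locality),
+      // and cache type (1 = data cache).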
+ Builder.CreateCall(
+ PrefetchFunc,
+ {PrefPtrValue,
+ ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
+ ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+ ++NumPrefetches;
+ DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
+ << "\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI)
+ << "prefetched memory access";
+ });
+
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
new file mode 100644
index 000000000000..15cd1086f209
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -0,0 +1,267 @@
+//===- LoopDeletion.cpp - Dead Loop Deletion Pass ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Dead Loop Deletion Pass. This pass is responsible
+// for eliminating loops with non-infinite computable trip counts that have no
+// side effects or volatile instructions, and do not contribute to the
+// computation of the function's return value.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopDeletion.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-delete"
+
+STATISTIC(NumDeleted, "Number of loops deleted");
+
+enum class LoopDeletionResult {
+ Unmodified,
+ Modified,
+ Deleted,
+};
+
+/// Determines if a loop is dead.
+///
+/// This assumes that we've already checked for unique exit and exiting blocks,
+/// and that the code is in LCSSA form.
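+///
+/// A sketch (illustrative, not taken from the source) of a loop this accepts:
+/// the only value live after the loop is loop invariant, nothing in the body
+/// may write memory or throw, and every exiting block hands the exit PHI the
+/// same value:
+///
+///   for (int i = 0; i != n; ++i)
+///     x = 42;        // the exit-block LCSSA PHI's incoming value is 42
+///   use(x);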
+static bool isLoopDead(Loop *L, ScalarEvolution &SE,
+ SmallVectorImpl<BasicBlock *> &ExitingBlocks,
+ BasicBlock *ExitBlock, bool &Changed,
+ BasicBlock *Preheader) {
+ // Make sure that all PHI entries coming from the loop are loop invariant.
+ // Because the code is in LCSSA form, any values used outside of the loop
+ // must pass through a PHI in the exit block, meaning that this check is
+ // sufficient to guarantee that no loop-variant values are used outside
+ // of the loop.
+ bool AllEntriesInvariant = true;
+ bool AllOutgoingValuesSame = true;
+ for (PHINode &P : ExitBlock->phis()) {
+ Value *incoming = P.getIncomingValueForBlock(ExitingBlocks[0]);
+
+ // Make sure all exiting blocks produce the same incoming value for the exit
+ // block. If there are different incoming values for different exiting
+ // blocks, then it is impossible to statically determine which value should
+ // be used.
+ AllOutgoingValuesSame =
+ all_of(makeArrayRef(ExitingBlocks).slice(1), [&](BasicBlock *BB) {
+ return incoming == P.getIncomingValueForBlock(BB);
+ });
+
+ if (!AllOutgoingValuesSame)
+ break;
+
+ if (Instruction *I = dyn_cast<Instruction>(incoming))
+ if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator())) {
+ AllEntriesInvariant = false;
+ break;
+ }
+ }
+
+ if (Changed)
+ SE.forgetLoopDispositions(L);
+
+ if (!AllEntriesInvariant || !AllOutgoingValuesSame)
+ return false;
+
+  // Make sure that no instructions in the loop have potential side effects.
+ // This includes instructions that could write to memory, and loads that are
+ // marked volatile.
+ for (auto &I : L->blocks())
+ if (any_of(*I, [](Instruction &I) { return I.mayHaveSideEffects(); }))
+ return false;
+ return true;
+}
+
+/// This function returns true if there is no viable path from the
+/// entry block to the header of \p L. Right now, it only does
+/// a local search to save compile time.
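+///
+/// Sketch of the shape this matches (illustrative IR): every predecessor of
+/// the preheader branches away from it on a constant condition, so the loop
+/// header can never be reached:
+///
+///   entry:
+///     br i1 false, label %ph, label %exit   ; %ph is the not-taken target
+///   ph:
+///     br label %loop.header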
+static bool isLoopNeverExecuted(Loop *L) {
+ using namespace PatternMatch;
+
+ auto *Preheader = L->getLoopPreheader();
+ // TODO: We can relax this constraint, since we just need a loop
+ // predecessor.
+ assert(Preheader && "Needs preheader!");
+
+ if (Preheader == &Preheader->getParent()->getEntryBlock())
+ return false;
+ // All predecessors of the preheader should have a constant conditional
+ // branch, with the loop's preheader as not-taken.
+ for (auto *Pred: predecessors(Preheader)) {
+ BasicBlock *Taken, *NotTaken;
+ ConstantInt *Cond;
+ if (!match(Pred->getTerminator(),
+ m_Br(m_ConstantInt(Cond), Taken, NotTaken)))
+ return false;
+ if (!Cond->getZExtValue())
+ std::swap(Taken, NotTaken);
+ if (Taken == Preheader)
+ return false;
+ }
+ assert(!pred_empty(Preheader) &&
+ "Preheader should have predecessors at this point!");
+ // All the predecessors have the loop preheader as not-taken target.
+ return true;
+}
+
+/// Remove a loop if it is dead.
+///
+/// A loop is considered dead if it does not impact the observable behavior of
+/// the program other than finite running time. This never removes a loop that
+/// might be infinite (unless it is never executed), as doing so could change
+/// the halting/non-halting nature of a program.
+///
+/// This entire process relies pretty heavily on LoopSimplify form and LCSSA in
+/// order to make various safety checks work.
+///
+/// \returns true if any changes were made. This may mutate the loop even if it
+/// is unable to delete it due to hoisting trivially loop invariant
+/// instructions out of the loop.
+static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
+ ScalarEvolution &SE, LoopInfo &LI) {
+ assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
+
+ // We can only remove the loop if there is a preheader that we can branch from
+ // after removing it. Also, if LoopSimplify form is not available, stay out
+ // of trouble.
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader || !L->hasDedicatedExits()) {
+ DEBUG(dbgs()
+ << "Deletion requires Loop with preheader and dedicated exits.\n");
+ return LoopDeletionResult::Unmodified;
+ }
+ // We can't remove loops that contain subloops. If the subloops were dead,
+ // they would already have been removed in earlier executions of this pass.
+ if (L->begin() != L->end()) {
+ DEBUG(dbgs() << "Loop contains subloops.\n");
+ return LoopDeletionResult::Unmodified;
+ }
+
+ BasicBlock *ExitBlock = L->getUniqueExitBlock();
+
+ if (ExitBlock && isLoopNeverExecuted(L)) {
+ DEBUG(dbgs() << "Loop is proven to never execute, delete it!");
+ // Set incoming value to undef for phi nodes in the exit block.
+ for (PHINode &P : ExitBlock->phis()) {
+ std::fill(P.incoming_values().begin(), P.incoming_values().end(),
+ UndefValue::get(P.getType()));
+ }
+ deleteDeadLoop(L, &DT, &SE, &LI);
+ ++NumDeleted;
+ return LoopDeletionResult::Deleted;
+ }
+
+ // The remaining checks below are for a loop being dead because all statements
+ // in the loop are invariant.
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ // We require that the loop only have a single exit block. Otherwise, we'd
+ // be in the situation of needing to be able to solve statically which exit
+ // block will be branched to, or trying to preserve the branching logic in
+ // a loop invariant manner.
+ if (!ExitBlock) {
+ DEBUG(dbgs() << "Deletion requires single exit block\n");
+ return LoopDeletionResult::Unmodified;
+ }
+ // Finally, we have to check that the loop really is dead.
+ bool Changed = false;
+ if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) {
+ DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n");
+ return Changed ? LoopDeletionResult::Modified
+ : LoopDeletionResult::Unmodified;
+ }
+
+ // Don't remove loops for which we can't solve the trip count.
+ // They could be infinite, in which case we'd be changing program behavior.
+ const SCEV *S = SE.getMaxBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(S)) {
+ DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n");
+ return Changed ? LoopDeletionResult::Modified
+ : LoopDeletionResult::Unmodified;
+ }
+
+ DEBUG(dbgs() << "Loop is invariant, delete it!");
+ deleteDeadLoop(L, &DT, &SE, &LI);
+ ++NumDeleted;
+
+ return LoopDeletionResult::Deleted;
+}
+
+PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &Updater) {
+
+ DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ DEBUG(L.dump());
+ std::string LoopName = L.getName();
+ auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI);
+ if (Result == LoopDeletionResult::Unmodified)
+ return PreservedAnalyses::all();
+
+ if (Result == LoopDeletionResult::Deleted)
+ Updater.markLoopAsDeleted(L, LoopName);
+
+ return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+class LoopDeletionLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopDeletionLegacyPass() : LoopPass(ID) {
+ initializeLoopDeletionLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // Possibly eliminate loop L if it is dead.
+ bool runOnLoop(Loop *L, LPPassManager &) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ getLoopAnalysisUsage(AU);
+ }
+};
+}
+
+char LoopDeletionLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDeletionLegacyPass, "loop-deletion",
+ "Delete dead loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopDeletionLegacyPass, "loop-deletion",
+ "Delete dead loops", false, false)
+
+Pass *llvm::createLoopDeletionPass() { return new LoopDeletionLegacyPass(); }
+
+bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+ DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ DEBUG(L->dump());
+
+ LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI);
+
+ if (Result == LoopDeletionResult::Deleted)
+ LPM.markLoopAsDeleted(*L);
+
+ return Result != LoopDeletionResult::Unmodified;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
new file mode 100644
index 000000000000..0d7e3db901cb
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -0,0 +1,1025 @@
+//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Loop Distribution Pass. Its main focus is to
+// distribute loops that cannot be vectorized due to dependence cycles. It
+// tries to isolate the offending dependences into a new loop allowing
+// vectorization of the remaining parts.
+//
+// For dependence analysis, the pass uses the LoopVectorizer's
+// LoopAccessAnalysis. Because this analysis presumes no change in the order of
+// memory operations, special care is taken to preserve the lexical order of
+// these operations.
+//
+// Like the Vectorizer, the pass also supports loop versioning to disambiguate
+// potentially overlapping arrays at run time.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopDistribute.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <cassert>
+#include <functional>
+#include <list>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define LDIST_NAME "loop-distribute"
+#define DEBUG_TYPE LDIST_NAME
+
+static cl::opt<bool>
+ LDistVerify("loop-distribute-verify", cl::Hidden,
+ cl::desc("Turn on DominatorTree and LoopInfo verification "
+ "after Loop Distribution"),
+ cl::init(false));
+
+static cl::opt<bool> DistributeNonIfConvertible(
+ "loop-distribute-non-if-convertible", cl::Hidden,
+ cl::desc("Whether to distribute into a loop that may not be "
+ "if-convertible by the loop vectorizer"),
+ cl::init(false));
+
+static cl::opt<unsigned> DistributeSCEVCheckThreshold(
+ "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed for Loop "
+ "Distribution"));
+
+static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
+ "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
+ cl::Hidden,
+ cl::desc(
+ "The maximum number of SCEV checks allowed for Loop "
+ "Distribution for loop marked with #pragma loop distribute(enable)"));
+
+static cl::opt<bool> EnableLoopDistribute(
+ "enable-loop-distribute", cl::Hidden,
+ cl::desc("Enable the new, experimental LoopDistribution Pass"),
+ cl::init(false));
+
+STATISTIC(NumLoopsDistributed, "Number of loops distributed");
+
+namespace {
+
+/// \brief Maintains the set of instructions of the loop for a partition before
+/// cloning. After cloning, it hosts the new loop.
+class InstPartition {
+ using InstructionSet = SmallPtrSet<Instruction *, 8>;
+
+public:
+ InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
+ : DepCycle(DepCycle), OrigLoop(L) {
+ Set.insert(I);
+ }
+
+ /// \brief Returns whether this partition contains a dependence cycle.
+ bool hasDepCycle() const { return DepCycle; }
+
+ /// \brief Adds an instruction to this partition.
+ void add(Instruction *I) { Set.insert(I); }
+
+ /// \brief Collection accessors.
+ InstructionSet::iterator begin() { return Set.begin(); }
+ InstructionSet::iterator end() { return Set.end(); }
+ InstructionSet::const_iterator begin() const { return Set.begin(); }
+ InstructionSet::const_iterator end() const { return Set.end(); }
+ bool empty() const { return Set.empty(); }
+
+ /// \brief Moves this partition into \p Other. This partition becomes empty
+ /// after this.
+ void moveTo(InstPartition &Other) {
+ Other.Set.insert(Set.begin(), Set.end());
+ Set.clear();
+ Other.DepCycle |= DepCycle;
+ }
+
+ /// \brief Populates the partition with a transitive closure of all the
+  /// instructions that the seeded instructions depend on.
+ void populateUsedSet() {
+ // FIXME: We currently don't use control-dependence but simply include all
+ // blocks (possibly empty at the end) and let simplifycfg mostly clean this
+ // up.
+ for (auto *B : OrigLoop->getBlocks())
+ Set.insert(B->getTerminator());
+
+ // Follow the use-def chains to form a transitive closure of all the
+ // instructions that the originally seeded instructions depend on.
+ SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ // Insert instructions from the loop that we depend on.
+ for (Value *V : I->operand_values()) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
+ Worklist.push_back(I);
+ }
+ }
+ }
+
+ /// \brief Clones the original loop.
+ ///
+ /// Updates LoopInfo and DominatorTree using the information that block \p
+ /// LoopDomBB dominates the loop.
+ Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
+ unsigned Index, LoopInfo *LI,
+ DominatorTree *DT) {
+ ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
+ VMap, Twine(".ldist") + Twine(Index),
+ LI, DT, ClonedLoopBlocks);
+ return ClonedLoop;
+ }
+
+ /// \brief The cloned loop. If this partition is mapped to the original loop,
+ /// this is null.
+ const Loop *getClonedLoop() const { return ClonedLoop; }
+
+ /// \brief Returns the loop where this partition ends up after distribution.
+  /// If this partition is mapped to the original loop then this returns the
+  /// original loop.
+ const Loop *getDistributedLoop() const {
+ return ClonedLoop ? ClonedLoop : OrigLoop;
+ }
+
+ /// \brief The VMap that is populated by cloning and then used in
+ /// remapinstruction to remap the cloned instructions.
+ ValueToValueMapTy &getVMap() { return VMap; }
+
+ /// \brief Remaps the cloned instructions using VMap.
+ void remapInstructions() {
+ remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
+ }
+
+ /// \brief Based on the set of instructions selected for this partition,
+ /// removes the unnecessary ones.
+ void removeUnusedInsts() {
+ SmallVector<Instruction *, 8> Unused;
+
+ for (auto *Block : OrigLoop->getBlocks())
+ for (auto &Inst : *Block)
+ if (!Set.count(&Inst)) {
+ Instruction *NewInst = &Inst;
+ if (!VMap.empty())
+ NewInst = cast<Instruction>(VMap[NewInst]);
+
+ assert(!isa<BranchInst>(NewInst) &&
+ "Branches are marked used early on");
+ Unused.push_back(NewInst);
+ }
+
+    // Delete the instructions backwards; this reduces the number of def-use
+    // and use-def chain updates that need to be made.
+ for (auto *Inst : reverse(Unused)) {
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+ Inst->eraseFromParent();
+ }
+ }
+
+ void print() const {
+ if (DepCycle)
+ dbgs() << " (cycle)\n";
+ for (auto *I : Set)
+ // Prefix with the block name.
+ dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n";
+ }
+
+ void printBlocks() const {
+ for (auto *BB : getDistributedLoop()->getBlocks())
+ dbgs() << *BB;
+ }
+
+private:
+ /// \brief Instructions from OrigLoop selected for this partition.
+ InstructionSet Set;
+
+ /// \brief Whether this partition contains a dependence cycle.
+ bool DepCycle;
+
+ /// \brief The original loop.
+ Loop *OrigLoop;
+
+ /// \brief The cloned loop. If this partition is mapped to the original loop,
+ /// this is null.
+ Loop *ClonedLoop = nullptr;
+
+ /// \brief The blocks of ClonedLoop including the preheader. If this
+ /// partition is mapped to the original loop, this is empty.
+ SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
+
+  /// \brief This gets populated once the set of instructions has been
+  /// finalized. If this partition is mapped to the original loop, it is not
+  /// set.
+ ValueToValueMapTy VMap;
+};
+
+/// \brief Holds the set of Partitions. It populates them, merges them and then
+/// clones the loops.
+class InstPartitionContainer {
+ using InstToPartitionIdT = DenseMap<Instruction *, int>;
+
+public:
+ InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
+ : L(L), LI(LI), DT(DT) {}
+
+ /// \brief Returns the number of partitions.
+ unsigned getSize() const { return PartitionContainer.size(); }
+
+ /// \brief Adds \p Inst into the current partition if that is marked to
+ /// contain cycles. Otherwise start a new partition for it.
+ void addToCyclicPartition(Instruction *Inst) {
+    // If the current partition is non-cyclic, start a new one.
+ if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
+ PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
+ else
+ PartitionContainer.back().add(Inst);
+ }
+
+ /// \brief Adds \p Inst into a partition that is not marked to contain
+ /// dependence cycles.
+ ///
+ // Initially we isolate memory instructions into as many partitions as
+ // possible, then later we may merge them back together.
+ void addToNewNonCyclicPartition(Instruction *Inst) {
+ PartitionContainer.emplace_back(Inst, L);
+ }
+
+ /// \brief Merges adjacent non-cyclic partitions.
+ ///
+ /// The idea is that we currently only want to isolate the non-vectorizable
+  /// partition. We could later allow more distribution among these partitions
+  /// too.
+ void mergeAdjacentNonCyclic() {
+ mergeAdjacentPartitionsIf(
+ [](const InstPartition *P) { return !P->hasDepCycle(); });
+ }
+
+ /// \brief If a partition contains only conditional stores, we won't vectorize
+ /// it. Try to merge it with a previous cyclic partition.
+ void mergeNonIfConvertible() {
+ mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
+ if (Partition->hasDepCycle())
+ return true;
+
+ // Now, check if all stores are conditional in this partition.
+ bool seenStore = false;
+
+ for (auto *Inst : *Partition)
+ if (isa<StoreInst>(Inst)) {
+ seenStore = true;
+ if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
+ return false;
+ }
+ return seenStore;
+ });
+ }
+
+ /// \brief Merges the partitions according to various heuristics.
+ void mergeBeforePopulating() {
+ mergeAdjacentNonCyclic();
+ if (!DistributeNonIfConvertible)
+ mergeNonIfConvertible();
+ }
+
+ /// \brief Merges partitions in order to ensure that no loads are duplicated.
+ ///
+ /// We can't duplicate loads because that could potentially reorder them.
+ /// LoopAccessAnalysis provides dependency information with the context that
+  /// the order of memory operations is preserved.
+  ///
+  /// Returns true if any partitions were merged.
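+  ///
+  /// For example (illustrative), if the same load is seeded into partitions
+  /// P1 and P4, then P1 through P4 are merged into a single partition;
+  /// duplicating the load into both P1 and P4 could otherwise let it move
+  /// relative to the stores in the partitions in between.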
+ bool mergeToAvoidDuplicatedLoads() {
+ using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>;
+ using ToBeMergedT = EquivalenceClasses<InstPartition *>;
+
+ LoadToPartitionT LoadToPartition;
+ ToBeMergedT ToBeMerged;
+
+ // Step through the partitions and create equivalence between partitions
+ // that contain the same load. Also put partitions in between them in the
+ // same equivalence class to avoid reordering of memory operations.
+ for (PartitionContainerT::iterator I = PartitionContainer.begin(),
+ E = PartitionContainer.end();
+ I != E; ++I) {
+ auto *PartI = &*I;
+
+ // If a load occurs in two partitions PartI and PartJ, merge all
+ // partitions (PartI, PartJ] into PartI.
+ for (Instruction *Inst : *PartI)
+ if (isa<LoadInst>(Inst)) {
+ bool NewElt;
+ LoadToPartitionT::iterator LoadToPart;
+
+ std::tie(LoadToPart, NewElt) =
+ LoadToPartition.insert(std::make_pair(Inst, PartI));
+ if (!NewElt) {
+ DEBUG(dbgs() << "Merging partitions due to this load in multiple "
+ << "partitions: " << PartI << ", "
+ << LoadToPart->second << "\n" << *Inst << "\n");
+
+ auto PartJ = I;
+ do {
+ --PartJ;
+ ToBeMerged.unionSets(PartI, &*PartJ);
+ } while (&*PartJ != LoadToPart->second);
+ }
+ }
+ }
+ if (ToBeMerged.empty())
+ return false;
+
+ // Merge the member of an equivalence class into its class leader. This
+ // makes the members empty.
+ for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
+ I != E; ++I) {
+ if (!I->isLeader())
+ continue;
+
+ auto PartI = I->getData();
+ for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
+ ToBeMerged.member_end())) {
+ PartJ->moveTo(*PartI);
+ }
+ }
+
+ // Remove the empty partitions.
+ PartitionContainer.remove_if(
+ [](const InstPartition &P) { return P.empty(); });
+
+ return true;
+ }
+
+  /// \brief Sets up the mapping from instructions to partitions. If the
+ /// instruction is duplicated across multiple partitions, set the entry to -1.
+ void setupPartitionIdOnInstructions() {
+ int PartitionID = 0;
+ for (const auto &Partition : PartitionContainer) {
+ for (Instruction *Inst : Partition) {
+ bool NewElt;
+ InstToPartitionIdT::iterator Iter;
+
+ std::tie(Iter, NewElt) =
+ InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
+ if (!NewElt)
+ Iter->second = -1;
+ }
+ ++PartitionID;
+ }
+ }
+
+ /// \brief Populates the partition with everything that the seeding
+ /// instructions require.
+ void populateUsedSet() {
+ for (auto &P : PartitionContainer)
+ P.populateUsedSet();
+ }
+
+ /// \brief This performs the main chunk of the work of cloning the loops for
+ /// the partitions.
+ void cloneLoops() {
+ BasicBlock *OrigPH = L->getLoopPreheader();
+ // At this point the predecessor of the preheader is either the memcheck
+ // block or the top part of the original preheader.
+ BasicBlock *Pred = OrigPH->getSinglePredecessor();
+ assert(Pred && "Preheader does not have a single predecessor");
+ BasicBlock *ExitBlock = L->getExitBlock();
+ assert(ExitBlock && "No single exit block");
+ Loop *NewLoop;
+
+ assert(!PartitionContainer.empty() && "at least two partitions expected");
+ // We're cloning the preheader along with the loop so we already made sure
+ // it was empty.
+ assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
+ "preheader not empty");
+
+ // Create a loop for each partition except the last. Clone the original
+ // loop before PH along with adding a preheader for the cloned loop. Then
+ // update PH to point to the newly added preheader.
+ BasicBlock *TopPH = OrigPH;
+ unsigned Index = getSize() - 1;
+ for (auto I = std::next(PartitionContainer.rbegin()),
+ E = PartitionContainer.rend();
+ I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
+ auto *Part = &*I;
+
+ NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);
+
+ Part->getVMap()[ExitBlock] = TopPH;
+ Part->remapInstructions();
+ }
+ Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);
+
+ // Now go in forward order and update the immediate dominator for the
+ // preheaders with the exiting block of the previous loop. Dominance
+ // within the loop is updated in cloneLoopWithPreheader.
+ for (auto Curr = PartitionContainer.cbegin(),
+ Next = std::next(PartitionContainer.cbegin()),
+ E = PartitionContainer.cend();
+ Next != E; ++Curr, ++Next)
+ DT->changeImmediateDominator(
+ Next->getDistributedLoop()->getLoopPreheader(),
+ Curr->getDistributedLoop()->getExitingBlock());
+ }
+
+ /// \brief Removes the dead instructions from the cloned loops.
+ void removeUnusedInsts() {
+ for (auto &Partition : PartitionContainer)
+ Partition.removeUnusedInsts();
+ }
+
+ /// \brief For each memory pointer, it computes the partitionId the pointer is
+ /// used in.
+ ///
+  /// This returns an array of int where the I-th entry corresponds to the
+  /// I-th entry in LAI.getRuntimePointerChecking(). If the pointer is used in
+  /// multiple partitions, its entry is set to -1.
+ SmallVector<int, 8>
+ computePartitionSetForPointers(const LoopAccessInfo &LAI) {
+ const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();
+
+ unsigned N = RtPtrCheck->Pointers.size();
+ SmallVector<int, 8> PtrToPartitions(N);
+ for (unsigned I = 0; I < N; ++I) {
+ Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
+ auto Instructions =
+ LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);
+
+ int &Partition = PtrToPartitions[I];
+ // First set it to uninitialized.
+ Partition = -2;
+ for (Instruction *Inst : Instructions) {
+ // Note that this could be -1 if Inst is duplicated across multiple
+ // partitions.
+ int ThisPartition = this->InstToPartitionId[Inst];
+ if (Partition == -2)
+ Partition = ThisPartition;
+ // -1 means belonging to multiple partitions.
+ else if (Partition == -1)
+ break;
+ else if (Partition != (int)ThisPartition)
+ Partition = -1;
+ }
+ assert(Partition != -2 && "Pointer not belonging to any partition");
+ }
+
+ return PtrToPartitions;
+ }
+
+ void print(raw_ostream &OS) const {
+ unsigned Index = 0;
+ for (const auto &P : PartitionContainer) {
+ OS << "Partition " << Index++ << " (" << &P << "):\n";
+ P.print();
+ }
+ }
+
+ void dump() const { print(dbgs()); }
+
+#ifndef NDEBUG
+ friend raw_ostream &operator<<(raw_ostream &OS,
+ const InstPartitionContainer &Partitions) {
+ Partitions.print(OS);
+ return OS;
+ }
+#endif
+
+ void printBlocks() const {
+ unsigned Index = 0;
+ for (const auto &P : PartitionContainer) {
+ dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
+ P.printBlocks();
+ }
+ }
+
+private:
+ using PartitionContainerT = std::list<InstPartition>;
+
+ /// \brief List of partitions.
+ PartitionContainerT PartitionContainer;
+
+ /// \brief Mapping from Instruction to partition Id. If the instruction
+ /// belongs to multiple partitions the entry contains -1.
+ InstToPartitionIdT InstToPartitionId;
+
+ Loop *L;
+ LoopInfo *LI;
+ DominatorTree *DT;
+
+ /// \brief The control structure to merge adjacent partitions if both satisfy
+ /// the \p Predicate.
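+  ///
+  /// For example (illustrative), given partitions P0 P1 P2 P3 where only P0,
+  /// P1 and P3 satisfy the predicate, P1 is folded into P0, while P3 is left
+  /// alone because the non-matching P2 in between resets the merge chain.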
+ template <class UnaryPredicate>
+ void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
+ InstPartition *PrevMatch = nullptr;
+ for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
+ auto DoesMatch = Predicate(&*I);
+ if (PrevMatch == nullptr && DoesMatch) {
+ PrevMatch = &*I;
+ ++I;
+ } else if (PrevMatch != nullptr && DoesMatch) {
+ I->moveTo(*PrevMatch);
+ I = PartitionContainer.erase(I);
+ } else {
+ PrevMatch = nullptr;
+ ++I;
+ }
+ }
+ }
+};
+
+/// \brief For each memory instruction, this class maintains the difference
+/// between the number of unsafe dependences that start out from this
+/// instruction and the number that end here.
+///
+/// By traversing the memory instructions in program order and accumulating this
+/// number, we know whether any unsafe dependence crosses over a program point.
+class MemoryInstructionDependences {
+ using Dependence = MemoryDepChecker::Dependence;
+
+public:
+ struct Entry {
+ Instruction *Inst;
+ unsigned NumUnsafeDependencesStartOrEnd = 0;
+
+ Entry(Instruction *Inst) : Inst(Inst) {}
+ };
+
+ using AccessesType = SmallVector<Entry, 8>;
+
+ AccessesType::const_iterator begin() const { return Accesses.begin(); }
+ AccessesType::const_iterator end() const { return Accesses.end(); }
+
+ MemoryInstructionDependences(
+ const SmallVectorImpl<Instruction *> &Instructions,
+ const SmallVectorImpl<Dependence> &Dependences) {
+ Accesses.append(Instructions.begin(), Instructions.end());
+
+ DEBUG(dbgs() << "Backward dependences:\n");
+ for (auto &Dep : Dependences)
+ if (Dep.isPossiblyBackward()) {
+ // Note that the designations source and destination follow the program
+ // order, i.e. source is always first. (The direction is given by the
+ // DepType.)
+ ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
+ --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
+
+ DEBUG(Dep.print(dbgs(), 2, Instructions));
+ }
+ }
+
+private:
+ AccessesType Accesses;
+};
+
+/// \brief The actual class performing the per-loop work.
+class LoopDistributeForLoop {
+public:
+ LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
+ ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
+ : L(L), F(F), LI(LI), DT(DT), SE(SE), ORE(ORE) {
+ setForced();
+ }
+
+ /// \brief Try to distribute an inner-most loop.
+ bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
+ assert(L->empty() && "Only process inner loops.");
+
+ DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
+ << "\" checking " << *L << "\n");
+
+ if (!L->getExitBlock())
+ return fail("MultipleExitBlocks", "multiple exit blocks");
+ if (!L->isLoopSimplifyForm())
+ return fail("NotLoopSimplifyForm",
+ "loop is not in loop-simplify form");
+
+ BasicBlock *PH = L->getLoopPreheader();
+
+ // LAA will check that we only have a single exiting block.
+ LAI = &GetLAA(*L);
+
+ // Currently, we only distribute to isolate the part of the loop with
+ // dependence cycles to enable partial vectorization.
+ if (LAI->canVectorizeMemory())
+ return fail("MemOpsCanBeVectorized",
+ "memory operations are safe for vectorization");
+
+ auto *Dependences = LAI->getDepChecker().getDependences();
+ if (!Dependences || Dependences->empty())
+ return fail("NoUnsafeDeps", "no unsafe dependences to isolate");
+
+ InstPartitionContainer Partitions(L, LI, DT);
+
+ // First, go through each memory operation and assign them to consecutive
+ // partitions (the order of partitions follows program order). Put those
+ // with unsafe dependences into "cyclic" partition otherwise put each store
+ // in its own "non-cyclic" partition (we'll merge these later).
+ //
+ // Note that a memory operation (e.g. Load2 below) at a program point that
+ // has an unsafe dependence (Store3->Load1) spanning over it must be
+ // included in the same cyclic partition as the dependent operations. This
+ // is to preserve the original program order after distribution. E.g.:
+ //
+ // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive
+ // Load1 -. 1 0->1
+ // Load2 | /Unsafe/ 0 1
+ // Store3 -' -1 1->0
+ // Load4 0 0
+ //
+ // NumUnsafeDependencesActive > 0 indicates this situation and in this case
+ // we just keep assigning to the same cyclic partition until
+ // NumUnsafeDependencesActive reaches 0.
+ const MemoryDepChecker &DepChecker = LAI->getDepChecker();
+ MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
+ *Dependences);
+
+ int NumUnsafeDependencesActive = 0;
+ for (auto &InstDep : MID) {
+ Instruction *I = InstDep.Inst;
+ // We update NumUnsafeDependencesActive post-instruction, catch the
+ // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
+ if (NumUnsafeDependencesActive ||
+ InstDep.NumUnsafeDependencesStartOrEnd > 0)
+ Partitions.addToCyclicPartition(I);
+ else
+ Partitions.addToNewNonCyclicPartition(I);
+ NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
+ assert(NumUnsafeDependencesActive >= 0 &&
+ "Negative number of dependences active");
+ }
+
+ // Add partitions for values used outside. These partitions can be out of
+ // order from the original program order. This is OK because if the
+ // partition uses a load we will merge this partition with the original
+ // partition of the load that we set up in the previous loop (see
+ // mergeToAvoidDuplicatedLoads).
+ auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
+ for (auto *Inst : DefsUsedOutside)
+ Partitions.addToNewNonCyclicPartition(Inst);
+
+ DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
+ if (Partitions.getSize() < 2)
+ return fail("CantIsolateUnsafeDeps",
+ "cannot isolate unsafe dependencies");
+
+ // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
+ // should be able to vectorize these together.
+ Partitions.mergeBeforePopulating();
+ DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
+ if (Partitions.getSize() < 2)
+ return fail("CantIsolateUnsafeDeps",
+ "cannot isolate unsafe dependencies");
+
+ // Now, populate the partitions with non-memory operations.
+ Partitions.populateUsedSet();
+ DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
+
+ // In order to preserve original lexical order for loads, keep them in the
+ // partition that we set up in the MemoryInstructionDependences loop.
+ if (Partitions.mergeToAvoidDuplicatedLoads()) {
+ DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
+ << Partitions);
+ if (Partitions.getSize() < 2)
+ return fail("CantIsolateUnsafeDeps",
+ "cannot isolate unsafe dependencies");
+ }
+
+ // Don't distribute the loop if we need too many SCEV run-time checks.
+ const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
+ if (Pred.getComplexity() > (IsForced.getValueOr(false)
+ ? PragmaDistributeSCEVCheckThreshold
+ : DistributeSCEVCheckThreshold))
+ return fail("TooManySCEVRuntimeChecks",
+ "too many SCEV run-time checks needed.\n");
+
+ DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
+    // We're done forming the partitions; set up the reverse mapping from
+ // instructions to partitions.
+ Partitions.setupPartitionIdOnInstructions();
+
+ // To keep things simple have an empty preheader before we version or clone
+ // the loop. (Also split if this has no predecessor, i.e. entry, because we
+ // rely on PH having a predecessor.)
+ if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
+ SplitBlock(PH, PH->getTerminator(), DT, LI);
+
+ // If we need run-time checks, version the loop now.
+ auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
+ const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
+ const auto &AllChecks = RtPtrChecking->getChecks();
+ auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
+ RtPtrChecking);
+
+ if (!Pred.isAlwaysTrue() || !Checks.empty()) {
+ DEBUG(dbgs() << "\nPointers:\n");
+ DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+ LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
+ LVer.setAliasChecks(std::move(Checks));
+ LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
+ LVer.versionLoop(DefsUsedOutside);
+ LVer.annotateLoopWithNoAlias();
+ }
+
+ // Create identical copies of the original loop for each partition and hook
+ // them up sequentially.
+ Partitions.cloneLoops();
+
+    // Now, remove the instructions from each loop that don't belong to that
+ // partition.
+ Partitions.removeUnusedInsts();
+ DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
+ DEBUG(Partitions.printBlocks());
+
+ if (LDistVerify) {
+ LI->verify(*DT);
+ DT->verifyDomTree();
+ }
+
+ ++NumLoopsDistributed;
+ // Report the success.
+ ORE->emit([&]() {
+ return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
+ L->getHeader())
+ << "distributed loop";
+ });
+ return true;
+ }
+
+  /// \brief Emit diagnostics explaining why distribution was not performed,
+  /// then return false.
+ bool fail(StringRef RemarkName, StringRef Message) {
+ LLVMContext &Ctx = F->getContext();
+ bool Forced = isForced().getValueOr(false);
+
+ DEBUG(dbgs() << "Skipping; " << Message << "\n");
+
+ // With Rpass-missed report that distribution failed.
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
+ L->getStartLoc(), L->getHeader())
+ << "loop not distributed: use -Rpass-analysis=loop-distribute for "
+ "more "
+ "info";
+ });
+
+ // With Rpass-analysis report why. This is on by default if distribution
+ // was requested explicitly.
+ ORE->emit(OptimizationRemarkAnalysis(
+ Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
+ RemarkName, L->getStartLoc(), L->getHeader())
+ << "loop not distributed: " << Message);
+
+ // Also issue a warning if distribution was requested explicitly but it
+ // failed.
+ if (Forced)
+ Ctx.diagnose(DiagnosticInfoOptimizationFailure(
+ *F, L->getStartLoc(), "loop not distributed: failed "
+ "explicitly specified loop distribution"));
+
+ return false;
+ }
+
+  /// \brief Returns whether distribution is forced to be enabled/disabled for
+  /// the loop.
+ ///
+ /// If the optional has a value, it indicates whether distribution was forced
+ /// to be enabled (true) or disabled (false). If the optional has no value
+ /// distribution was not forced either way.
+ const Optional<bool> &isForced() const { return IsForced; }
+
+private:
+ /// \brief Filter out checks between pointers from the same partition.
+ ///
+ /// \p PtrToPartition contains the partition number for pointers. Partition
+ /// number -1 means that the pointer is used in multiple partitions. In this
+ /// case we can't safely omit the check.
+ SmallVector<RuntimePointerChecking::PointerCheck, 4>
+ includeOnlyCrossPartitionChecks(
+ const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
+ const SmallVectorImpl<int> &PtrToPartition,
+ const RuntimePointerChecking *RtPtrChecking) {
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
+
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ for (unsigned PtrIdx1 : Check.first->Members)
+ for (unsigned PtrIdx2 : Check.second->Members)
+ // Only include this check if there is a pair of pointers
+ // that require checking and the pointers fall into
+ // separate partitions.
+ //
+ // (Note that we already know at this point that the two
+ // pointer groups need checking but it doesn't follow
+ // that each pair of pointers within the two groups need
+ // checking as well.
+ //
+ // In other words we don't want to include a check just
+ // because there is a pair of pointers between the two
+ // pointer groups that require checks and a different
+ // pair whose pointers fall into different partitions.)
+ if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
+ !RuntimePointerChecking::arePointersInSamePartition(
+ PtrToPartition, PtrIdx1, PtrIdx2))
+ return true;
+ return false;
+ });
+
+ return Checks;
+ }
+
+ /// \brief Check whether the loop metadata is forcing distribution to be
+ /// enabled/disabled.
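+  ///
+  /// For reference (an assumption about the frontend, not taken from this
+  /// file), this is roughly the metadata emitted for
+  /// "#pragma clang loop distribute(enable)":
+  ///
+  ///   br label %loop.header, !llvm.loop !0
+  ///   !0 = distinct !{!0, !1}
+  ///   !1 = !{!"llvm.loop.distribute.enable", i1 true}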
+ void setForced() {
+ Optional<const MDOperand *> Value =
+ findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
+ if (!Value)
+ return;
+
+ const MDOperand *Op = *Value;
+ assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
+ IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
+ }
+
+ Loop *L;
+ Function *F;
+
+ // Analyses used.
+ LoopInfo *LI;
+ const LoopAccessInfo *LAI = nullptr;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ OptimizationRemarkEmitter *ORE;
+
+ /// \brief Indicates whether distribution is forced to be enabled/disabled for
+ /// the loop.
+ ///
+ /// If the optional has a value, it indicates whether distribution was forced
+ /// to be enabled (true) or disabled (false). If the optional has no value
+ /// distribution was not forced either way.
+ Optional<bool> IsForced;
+};
+
+} // end anonymous namespace
+
+/// Shared implementation between new and old PMs.
+static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
+ ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
+ std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
+  // Build up a worklist of inner-most loops to distribute. This is necessary
+  // as the act of distributing a loop creates new loops and can invalidate
+  // iterators across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : *LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
+ if (L->empty())
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);
+
+ // If distribution was forced for the specific loop to be
+ // enabled/disabled, follow that. Otherwise use the global flag.
+ if (LDL.isForced().getValueOr(EnableLoopDistribute))
+ Changed |= LDL.processLoop(GetLAA);
+ }
+
+ // Process each loop nest in the function.
+ return Changed;
+}
+
+namespace {
+
+/// \brief The pass class.
+class LoopDistributeLegacy : public FunctionPass {
+public:
+ static char ID;
+
+ LoopDistributeLegacy() : FunctionPass(ID) {
+ // The default is set by the caller.
+ initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
+
+ return runImpl(F, LI, DT, SE, ORE, GetLAA);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses LoopDistributePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ // We don't directly need these analyses but they're required for loop
+ // analyses so provide them below.
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & {
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr};
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ };
+
+ bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char LoopDistributeLegacy::ID;
+
+static const char ldist_name[] = "Loop Distribution";
+
+INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)
+
+FunctionPass *llvm::createLoopDistributePass() { return new LoopDistributeLegacy(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
new file mode 100644
index 000000000000..21551f0a0825
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -0,0 +1,1728 @@
+//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements an idiom recognizer that transforms simple loops into a
+// non-loop form. In cases that this kicks in, it can be a significant
+// performance win.
+//
+// If compiling for code size, we avoid idiom recognition if the resulting
+// code could be larger than the code for the original loop. One way this could
+// happen is if the loop is not removable after idiom recognition due to the
+// presence of non-idiom instructions. The initial implementation of the
+// heuristics applies to idioms in multi-block loops.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO List:
+//
+// Future loop memory idioms to recognize:
+// memcmp, memmove, strlen, etc.
+// Future floating point idioms to recognize in -ffast-math mode:
+// fpowi
+// Future integer operation idioms to recognize:
+// ctpop, ctlz, cttz
+//
+// Beware that isel's default lowering for ctpop is highly inefficient for
+// i64 and larger types when i64 is legal and the value has few bits set. It
+// would be good to enhance isel to emit a loop for ctpop in this case.
+//
+// This could recognize common matrix multiplies and dot product idioms and
+// replace them with calls to BLAS (if linked in??).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-idiom"
+
+STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
+STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
+
+static cl::opt<bool> UseLIRCodeSizeHeurs(
+ "use-lir-code-size-heurs",
+ cl::desc("Use loop idiom recognition code size heuristics when compiling"
+ "with -Os/-Oz"),
+ cl::init(true), cl::Hidden);
+
+namespace {
+
+class LoopIdiomRecognize {
+ Loop *CurLoop = nullptr;
+ AliasAnalysis *AA;
+ DominatorTree *DT;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ TargetLibraryInfo *TLI;
+ const TargetTransformInfo *TTI;
+ const DataLayout *DL;
+ bool ApplyCodeSizeHeuristics;
+
+public:
+ explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
+ LoopInfo *LI, ScalarEvolution *SE,
+ TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+ : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL) {}
+
+ bool runOnLoop(Loop *L);
+
+private:
+ using StoreList = SmallVector<StoreInst *, 8>;
+ using StoreListMap = MapVector<Value *, StoreList>;
+
+ StoreListMap StoreRefsForMemset;
+ StoreListMap StoreRefsForMemsetPattern;
+ StoreList StoreRefsForMemcpy;
+ bool HasMemset;
+ bool HasMemsetPattern;
+ bool HasMemcpy;
+
+ /// Return code for isLegalStore()
+ enum LegalStoreKind {
+ None = 0,
+ Memset,
+ MemsetPattern,
+ Memcpy,
+ UnorderedAtomicMemcpy,
+ DontUse // Dummy retval never to be used. Allows catching errors in retval
+ // handling.
+ };
+
+ /// \name Countable Loop Idiom Handling
+ /// @{
+
+ bool runOnCountableLoop();
+ bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+ void collectStores(BasicBlock *BB);
+ LegalStoreKind isLegalStore(StoreInst *SI);
+ bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
+ bool ForMemset);
+ bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
+
+ bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
+ unsigned StoreAlignment, Value *StoredVal,
+ Instruction *TheStore,
+ SmallPtrSetImpl<Instruction *> &Stores,
+ const SCEVAddRecExpr *Ev, const SCEV *BECount,
+ bool NegStride, bool IsLoopMemset = false);
+ bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
+ bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
+ bool IsLoopMemset = false);
+
+ /// @}
+ /// \name Noncountable Loop Idiom Handling
+ /// @{
+
+ bool runOnNoncountableLoop();
+
+ bool recognizePopcount();
+ void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
+ PHINode *CntPhi, Value *Var);
+ bool recognizeAndInsertCTLZ();
+ void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
+ PHINode *CntPhi, Value *Var, const DebugLoc DL,
+ bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
+
+ /// @}
+};
+
+class LoopIdiomRecognizeLegacyPass : public LoopPass {
+public:
+ static char ID;
+
+ explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
+ initializeLoopIdiomRecognizeLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent());
+ const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
+
+ LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL);
+ return LIR.runOnLoop(L);
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char LoopIdiomRecognizeLegacyPass::ID = 0;
+
+PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ const auto *DL = &L.getHeader()->getModule()->getDataLayout();
+
+ LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL);
+ if (!LIR.runOnLoop(&L))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
+ "Recognize loop idioms", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
+ "Recognize loop idioms", false, false)
+
+Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
+
+static void deleteDeadInstruction(Instruction *I) {
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ I->eraseFromParent();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of LoopIdiomRecognize
+//
+//===----------------------------------------------------------------------===//
+
+bool LoopIdiomRecognize::runOnLoop(Loop *L) {
+ CurLoop = L;
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it, just give up.
+ if (!L->getLoopPreheader())
+ return false;
+
+  // Disable loop idiom recognition if the function's name is a common idiom,
+  // to avoid turning e.g. memset's own loop into a recursive call to memset.
+ StringRef Name = L->getHeader()->getParent()->getName();
+ if (Name == "memset" || Name == "memcpy")
+ return false;
+
+ // Determine if code size heuristics need to be applied.
+ ApplyCodeSizeHeuristics =
+ L->getHeader()->getParent()->optForSize() && UseLIRCodeSizeHeurs;
+
+ HasMemset = TLI->has(LibFunc_memset);
+ HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
+ HasMemcpy = TLI->has(LibFunc_memcpy);
+
+ if (HasMemset || HasMemsetPattern || HasMemcpy)
+ if (SE->hasLoopInvariantBackedgeTakenCount(L))
+ return runOnCountableLoop();
+
+ return runOnNoncountableLoop();
+}
+
+bool LoopIdiomRecognize::runOnCountableLoop() {
+ const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
+  assert(!isa<SCEVCouldNotCompute>(BECount) &&
+         "runOnCountableLoop() called on a loop without a predictable "
+         "backedge-taken count");
+
+ // If this loop executes exactly one time, then it should be peeled, not
+ // optimized by this pass.
+ if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+ if (BECst->getAPInt() == 0)
+ return false;
+
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+
+ DEBUG(dbgs() << "loop-idiom Scanning: F["
+ << CurLoop->getHeader()->getParent()->getName() << "] Loop %"
+ << CurLoop->getHeader()->getName() << "\n");
+
+ bool MadeChange = false;
+
+ // The following transforms hoist stores/memsets into the loop pre-header.
+  // Give up if the loop has instructions that may throw.
+ LoopSafetyInfo SafetyInfo;
+ computeLoopSafetyInfo(&SafetyInfo, CurLoop);
+ if (SafetyInfo.MayThrow)
+ return MadeChange;
+
+ // Scan all the blocks in the loop that are not in subloops.
+ for (auto *BB : CurLoop->getBlocks()) {
+ // Ignore blocks in subloops.
+ if (LI->getLoopFor(BB) != CurLoop)
+ continue;
+
+ MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
+ }
+ return MadeChange;
+}
+
+static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
+ const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
+ return ConstStride->getAPInt();
+}
+
+/// getMemSetPatternValue - If a strided store of the specified value is safe to
+/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
+/// be passed in. Otherwise, return null.
+///
+/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
+/// just replicate their input array and then pass on to memset_pattern16.
+static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
+ // If the value isn't a constant, we can't promote it to being in a constant
+ // array. We could theoretically do a store to an alloca or something, but
+ // that doesn't seem worthwhile.
+ Constant *C = dyn_cast<Constant>(V);
+ if (!C)
+ return nullptr;
+
+ // Only handle simple values that are a power of two bytes in size.
+ uint64_t Size = DL->getTypeSizeInBits(V->getType());
+ if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
+ return nullptr;
+
+ // Don't care enough about darwin/ppc to implement this.
+ if (DL->isBigEndian())
+ return nullptr;
+
+ // Convert to size in bytes.
+ Size /= 8;
+
+  // TODO: If C is larger than 16 bytes, we can try slicing it in half to see
+ // if the top and bottom are the same (e.g. for vectors and large integers).
+ if (Size > 16)
+ return nullptr;
+
+ // If the constant is exactly 16 bytes, just use it.
+ if (Size == 16)
+ return C;
+
+ // Otherwise, we'll use an array of the constants.
+ unsigned ArraySize = 16 / Size;
+ ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
+ return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
+}
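+
+// Illustrative sketch (assumed example, not part of the original source): on a
+// little-endian target, a strided store of the non-bytewise constant
+// 0x01020304 cannot become a plain memset, but getMemSetPatternValue above
+// returns the 4-byte value replicated into a 16-byte pattern:
+//
+//   for (unsigned i = 0; i != n; ++i)
+//     A[i] = 0x01020304;            // i32 store, Size == 4
+//   // pattern = { 0x01020304, 0x01020304, 0x01020304, 0x01020304 }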
+
+LoopIdiomRecognize::LegalStoreKind
+LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
+ // Don't touch volatile stores.
+ if (SI->isVolatile())
+ return LegalStoreKind::None;
+ // We only want simple or unordered-atomic stores.
+ if (!SI->isUnordered())
+ return LegalStoreKind::None;
+
+  // Don't convert stores of non-integral pointer types to memsets (which store
+  // integers).
+ if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
+ return LegalStoreKind::None;
+
+ // Avoid merging nontemporal stores.
+ if (SI->getMetadata(LLVMContext::MD_nontemporal))
+ return LegalStoreKind::None;
+
+ Value *StoredVal = SI->getValueOperand();
+ Value *StorePtr = SI->getPointerOperand();
+
+ // Reject stores that are so large that they overflow an unsigned.
+ uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
+ if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
+ return LegalStoreKind::None;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided store. If we have something else, it's a
+ // random store we can't handle.
+ const SCEVAddRecExpr *StoreEv =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
+ return LegalStoreKind::None;
+
+ // Check to see if we have a constant stride.
+ if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
+ return LegalStoreKind::None;
+
+ // See if the store can be turned into a memset.
+
+ // If the stored value is a byte-wise value (like i32 -1), then it may be
+ // turned into a memset of i8 -1, assuming that all the consecutive bytes
+ // are stored. A store of i32 0x01020304 can never be turned into a memset,
+ // but it can be turned into memset_pattern if the target supports it.
+ Value *SplatValue = isBytewiseValue(StoredVal);
+ Constant *PatternValue = nullptr;
+
+  // Note: memset and memset_pattern on unordered-atomic stores are not yet
+  // supported.
+ bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
+
+ // If we're allowed to form a memset, and the stored value would be
+ // acceptable for memset, use it.
+ if (!UnorderedAtomic && HasMemset && SplatValue &&
+ // Verify that the stored value is loop invariant. If not, we can't
+ // promote the memset.
+ CurLoop->isLoopInvariant(SplatValue)) {
+ // It looks like we can use SplatValue.
+ return LegalStoreKind::Memset;
+ } else if (!UnorderedAtomic && HasMemsetPattern &&
+ // Don't create memset_pattern16s with address spaces.
+ StorePtr->getType()->getPointerAddressSpace() == 0 &&
+ (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
+ // It looks like we can use PatternValue!
+ return LegalStoreKind::MemsetPattern;
+ }
+
+ // Otherwise, see if the store can be turned into a memcpy.
+ if (HasMemcpy) {
+ // Check to see if the stride matches the size of the store. If so, then we
+ // know that every byte is touched in the loop.
+ APInt Stride = getStoreStride(StoreEv);
+ unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
+ if (StoreSize != Stride && StoreSize != -Stride)
+ return LegalStoreKind::None;
+
+ // The store must be feeding a non-volatile load.
+ LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
+
+ // Only allow non-volatile loads
+ if (!LI || LI->isVolatile())
+ return LegalStoreKind::None;
+ // Only allow simple or unordered-atomic loads
+ if (!LI->isUnordered())
+ return LegalStoreKind::None;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided load. If we have something else, it's a
+ // random load we can't handle.
+ const SCEVAddRecExpr *LoadEv =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+ if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
+ return LegalStoreKind::None;
+
+ // The store and load must share the same stride.
+ if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
+ return LegalStoreKind::None;
+
+ // Success. This store can be converted into a memcpy.
+ UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
+ return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
+ : LegalStoreKind::Memcpy;
+ }
+ // This store can't be transformed into a memset/memcpy.
+ return LegalStoreKind::None;
+}
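+
+// Illustrative sketch (assumed examples): how typical stores would be
+// classified by isLegalStore above, assuming the corresponding library
+// functions are available on the target:
+//
+//   for (i) A[i] = 0;          // bytewise splat value       -> Memset
+//   for (i) A[i] = 0x01020304; // constant, not a byte splat -> MemsetPattern
+//   for (i) A[i] = B[i];       // fed by a same-stride load  -> Memcpy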
+
+void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
+ StoreRefsForMemset.clear();
+ StoreRefsForMemsetPattern.clear();
+ StoreRefsForMemcpy.clear();
+ for (Instruction &I : *BB) {
+ StoreInst *SI = dyn_cast<StoreInst>(&I);
+ if (!SI)
+ continue;
+
+ // Make sure this is a strided store with a constant stride.
+ switch (isLegalStore(SI)) {
+ case LegalStoreKind::None:
+ // Nothing to do
+ break;
+ case LegalStoreKind::Memset: {
+ // Find the base pointer.
+ Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
+ StoreRefsForMemset[Ptr].push_back(SI);
+ } break;
+ case LegalStoreKind::MemsetPattern: {
+ // Find the base pointer.
+ Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
+ StoreRefsForMemsetPattern[Ptr].push_back(SI);
+ } break;
+ case LegalStoreKind::Memcpy:
+ case LegalStoreKind::UnorderedAtomicMemcpy:
+ StoreRefsForMemcpy.push_back(SI);
+ break;
+ default:
+ assert(false && "unhandled return value");
+ break;
+ }
+ }
+}
+
+/// runOnLoopBlock - Process the specified block, which lives in a counted loop
+/// with the specified backedge count. This block is known to be in the current
+/// loop and not in any subloops.
+bool LoopIdiomRecognize::runOnLoopBlock(
+ BasicBlock *BB, const SCEV *BECount,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks) {
+ // We can only promote stores in this block if they are unconditionally
+ // executed in the loop. For a block to be unconditionally executed, it has
+ // to dominate all the exit blocks of the loop. Verify this now.
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+ if (!DT->dominates(BB, ExitBlocks[i]))
+ return false;
+
+ bool MadeChange = false;
+ // Look for store instructions, which may be optimized to memset/memcpy.
+ collectStores(BB);
+
+ // Look for a single store or sets of stores with a common base, which can be
+ // optimized into a memset (memset_pattern). The latter most commonly happens
+  // with structs and hand-unrolled loops.
+ for (auto &SL : StoreRefsForMemset)
+ MadeChange |= processLoopStores(SL.second, BECount, true);
+
+ for (auto &SL : StoreRefsForMemsetPattern)
+ MadeChange |= processLoopStores(SL.second, BECount, false);
+
+  // Optimize the store into a memcpy, if it feeds a similarly strided load.
+ for (auto &SI : StoreRefsForMemcpy)
+ MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
+
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *Inst = &*I++;
+ // Look for memset instructions, which may be optimized to a larger memset.
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
+ WeakTrackingVH InstPtr(&*I);
+ if (!processLoopMemSet(MSI, BECount))
+ continue;
+ MadeChange = true;
+
+ // If processing the memset invalidated our iterator, start over from the
+ // top of the block.
+ if (!InstPtr)
+ I = BB->begin();
+ continue;
+ }
+ }
+
+ return MadeChange;
+}
+
+/// processLoopStores - See if the given store(s) can be promoted to a memset.
+bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
+ const SCEV *BECount,
+ bool ForMemset) {
+ // Try to find consecutive stores that can be transformed into memsets.
+ SetVector<StoreInst *> Heads, Tails;
+ SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
+
+ // Do a quadratic search on all of the given stores and find
+ // all of the pairs of stores that follow each other.
+ SmallVector<unsigned, 16> IndexQueue;
+ for (unsigned i = 0, e = SL.size(); i < e; ++i) {
+ assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
+
+ Value *FirstStoredVal = SL[i]->getValueOperand();
+ Value *FirstStorePtr = SL[i]->getPointerOperand();
+ const SCEVAddRecExpr *FirstStoreEv =
+ cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
+ APInt FirstStride = getStoreStride(FirstStoreEv);
+ unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
+
+ // See if we can optimize just this store in isolation.
+ if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
+ Heads.insert(SL[i]);
+ continue;
+ }
+
+ Value *FirstSplatValue = nullptr;
+ Constant *FirstPatternValue = nullptr;
+
+ if (ForMemset)
+ FirstSplatValue = isBytewiseValue(FirstStoredVal);
+ else
+ FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
+
+ assert((FirstSplatValue || FirstPatternValue) &&
+ "Expected either splat value or pattern value.");
+
+ IndexQueue.clear();
+    // If a store has multiple consecutive store candidates, search the Stores
+    // array according to the sequence: from i+1 to e, then from i-1 to 0.
+    // This is because pairing with the immediately succeeding or preceding
+    // candidate usually creates the best chance of finding a memset
+    // opportunity.
+ unsigned j = 0;
+ for (j = i + 1; j < e; ++j)
+ IndexQueue.push_back(j);
+ for (j = i; j > 0; --j)
+ IndexQueue.push_back(j - 1);
+
+ for (auto &k : IndexQueue) {
+ assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
+ Value *SecondStorePtr = SL[k]->getPointerOperand();
+ const SCEVAddRecExpr *SecondStoreEv =
+ cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
+ APInt SecondStride = getStoreStride(SecondStoreEv);
+
+ if (FirstStride != SecondStride)
+ continue;
+
+ Value *SecondStoredVal = SL[k]->getValueOperand();
+ Value *SecondSplatValue = nullptr;
+ Constant *SecondPatternValue = nullptr;
+
+ if (ForMemset)
+ SecondSplatValue = isBytewiseValue(SecondStoredVal);
+ else
+ SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
+
+ assert((SecondSplatValue || SecondPatternValue) &&
+ "Expected either splat value or pattern value.");
+
+ if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
+ if (ForMemset) {
+ if (FirstSplatValue != SecondSplatValue)
+ continue;
+ } else {
+ if (FirstPatternValue != SecondPatternValue)
+ continue;
+ }
+ Tails.insert(SL[k]);
+ Heads.insert(SL[i]);
+ ConsecutiveChain[SL[i]] = SL[k];
+ break;
+ }
+ }
+ }
+
+ // We may run into multiple chains that merge into a single chain. We mark the
+ // stores that we transformed so that we don't visit the same store twice.
+ SmallPtrSet<Value *, 16> TransformedStores;
+ bool Changed = false;
+
+ // For stores that start but don't end a link in the chain:
+ for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
+ it != e; ++it) {
+ if (Tails.count(*it))
+ continue;
+
+ // We found a store instr that starts a chain. Now follow the chain and try
+ // to transform it.
+ SmallPtrSet<Instruction *, 8> AdjacentStores;
+ StoreInst *I = *it;
+
+ StoreInst *HeadStore = I;
+ unsigned StoreSize = 0;
+
+ // Collect the chain into a list.
+ while (Tails.count(I) || Heads.count(I)) {
+ if (TransformedStores.count(I))
+ break;
+ AdjacentStores.insert(I);
+
+ StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
+ // Move to the next value in the chain.
+ I = ConsecutiveChain[I];
+ }
+
+ Value *StoredVal = HeadStore->getValueOperand();
+ Value *StorePtr = HeadStore->getPointerOperand();
+ const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ APInt Stride = getStoreStride(StoreEv);
+
+ // Check to see if the stride matches the size of the stores. If so, then
+ // we know that every byte is touched in the loop.
+ if (StoreSize != Stride && StoreSize != -Stride)
+ continue;
+
+ bool NegStride = StoreSize == -Stride;
+
+ if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(),
+ StoredVal, HeadStore, AdjacentStores, StoreEv,
+ BECount, NegStride)) {
+ TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
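+
+// Illustrative sketch (assumed example): a hand-unrolled loop whose two
+// adjacent i32 stores are each too narrow for the 8-byte stride on their own,
+// but together form a consecutive chain whose combined size matches the
+// stride, so the chain logic above can turn them into a single memset:
+//
+//   for (i = 0; i != n; i += 2) {
+//     A[i]     = 0;   // head of the chain (StoreSize 4, stride 8)
+//     A[i + 1] = 0;   // consecutive tail; combined size 8 == stride
+//   }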
+
+/// processLoopMemSet - See if this memset can be promoted to a large memset.
+bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
+ const SCEV *BECount) {
+ // We can only handle non-volatile memsets with a constant size.
+ if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
+ return false;
+
+ // If we're not allowed to hack on memset, we fail.
+ if (!HasMemset)
+ return false;
+
+ Value *Pointer = MSI->getDest();
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided store. If we have something else, it's a
+ // random store we can't handle.
+ const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
+ if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
+ return false;
+
+ // Reject memsets that are so large that they overflow an unsigned.
+ uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+ if ((SizeInBytes >> 32) != 0)
+ return false;
+
+ // Check to see if the stride matches the size of the memset. If so, then we
+ // know that every byte is touched in the loop.
+ const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
+ if (!ConstStride)
+ return false;
+
+ APInt Stride = ConstStride->getAPInt();
+ if (SizeInBytes != Stride && SizeInBytes != -Stride)
+ return false;
+
+ // Verify that the memset value is loop invariant. If not, we can't promote
+ // the memset.
+ Value *SplatValue = MSI->getValue();
+ if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
+ return false;
+
+ SmallPtrSet<Instruction *, 1> MSIs;
+ MSIs.insert(MSI);
+ bool NegStride = SizeInBytes == -Stride;
+ return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
+ MSI->getAlignment(), SplatValue, MSI, MSIs, Ev,
+ BECount, NegStride, /*IsLoopMemset=*/true);
+}
+
+/// mayLoopAccessLocation - Return true if the specified loop might access the
+/// specified pointer location, which is a loop-strided access. The 'Access'
+/// argument specifies what the verboten forms of access are (read or write).
+static bool
+mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
+ const SCEV *BECount, unsigned StoreSize,
+ AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &IgnoredStores) {
+ // Get the location that may be stored across the loop. Since the access is
+ // strided positively through memory, we say that the modified location starts
+ // at the pointer and has infinite size.
+ uint64_t AccessSize = MemoryLocation::UnknownSize;
+
+ // If the loop iterates a fixed number of times, we can refine the access size
+ // to be exactly the size of the memset, which is (BECount+1)*StoreSize
+ if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+ AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;
+
+ // TODO: For this to be really effective, we have to dive into the pointer
+  // operand in the store.  A store to &A[i] of 100 will always return MayAlias
+  // with a store of &A[100]; we need StoreLoc to be "A" with a size of 100,
+ // which will then no-alias a store to &A[100].
+ MemoryLocation StoreLoc(Ptr, AccessSize);
+
+ for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
+ ++BI)
+ for (Instruction &I : **BI)
+ if (IgnoredStores.count(&I) == 0 &&
+ isModOrRefSet(
+ intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
+ return true;
+
+ return false;
+}
+
+// If we have a negative stride, Start refers to the end of the memory location
+// we're trying to memset. Therefore, we need to recompute the base pointer,
+// which is just Start - BECount*Size.
+static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
+ Type *IntPtr, unsigned StoreSize,
+ ScalarEvolution *SE) {
+ const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+ if (StoreSize != 1)
+ Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
+ SCEV::FlagNUW);
+ return SE->getMinusSCEV(Start, Index);
+}
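+
+// Illustrative sketch (assumed numbers): for a backwards loop such as
+//   for (i = n - 1; i >= 0; --i) A[i] = 0;   // i32 elements
+// the addrec start is &A[n-1], BECount is n-1 and StoreSize is 4, so the
+// recomputed base is &A[n-1] - (n-1)*4 == &A[0], the lowest address written
+// and therefore the correct memset destination.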
+
+/// Compute the number of bytes as a SCEV from the backedge taken count.
+///
+/// This also maps the SCEV into the provided type and tries to handle the
+/// computation in a way that will fold cleanly.
+static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
+ unsigned StoreSize, Loop *CurLoop,
+ const DataLayout *DL, ScalarEvolution *SE) {
+ const SCEV *NumBytesS;
+ // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
+ // pointer size if it isn't already.
+ //
+ // If we're going to need to zero extend the BE count, check if we can add
+ // one to it prior to zero extending without overflow. Provided this is safe,
+ // it allows better simplification of the +1.
+ if (DL->getTypeSizeInBits(BECount->getType()) <
+ DL->getTypeSizeInBits(IntPtr) &&
+ SE->isLoopEntryGuardedByCond(
+ CurLoop, ICmpInst::ICMP_NE, BECount,
+ SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
+ NumBytesS = SE->getZeroExtendExpr(
+ SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
+ IntPtr);
+ } else {
+ NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
+ SE->getOne(IntPtr), SCEV::FlagNUW);
+ }
+
+ // And scale it based on the store size.
+ if (StoreSize != 1) {
+ NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+ SCEV::FlagNUW);
+ }
+ return NumBytesS;
+}
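+
+// Illustrative sketch (assumed numbers): for a loop whose backedge-taken count
+// is n-1 and whose store size is 4 bytes, the expression built above is
+//   NumBytes = (zext(n-1) + 1) * 4 == n * 4,
+// i.e. the total number of bytes written over all n iterations.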
+
+/// processLoopStridedStore - We see a strided store of some value. If we can
+/// transform this into a memset or memset_pattern in the loop preheader, do so.
+bool LoopIdiomRecognize::processLoopStridedStore(
+ Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
+ Value *StoredVal, Instruction *TheStore,
+ SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
+ const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
+ Value *SplatValue = isBytewiseValue(StoredVal);
+ Constant *PatternValue = nullptr;
+
+ if (!SplatValue)
+ PatternValue = getMemSetPatternValue(StoredVal, DL);
+
+ assert((SplatValue || PatternValue) &&
+ "Expected either splat value or pattern value.");
+
+  // The trip count of the loop and the base pointer of the addrec SCEV are
+  // guaranteed to be loop invariant, which means that they should dominate the
+  // header. This allows us to insert code for them in the preheader.
+ unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ IRBuilder<> Builder(Preheader->getTerminator());
+ SCEVExpander Expander(*SE, *DL, "loop-idiom");
+
+ Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
+ Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS);
+
+ const SCEV *Start = Ev->getStart();
+ // Handle negative strided loops.
+ if (NegStride)
+ Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE);
+
+ // TODO: ideally we should still be able to generate memset if SCEV expander
+ // is taught to generate the dependencies at the latest point.
+ if (!isSafeToExpand(Start, *SE))
+ return false;
+
+ // Okay, we have a strided store "p[i]" of a splattable value. We can turn
+ // this into a memset in the loop preheader now if we want. However, this
+ // would be unsafe to do if there is anything else in the loop that may read
+ // or write to the aliased location. Check for any overlap by generating the
+ // base pointer and checking the region.
+ Value *BasePtr =
+ Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
+ if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
+ StoreSize, *AA, Stores)) {
+ Expander.clear();
+ // If we generated new code for the base pointer, clean up.
+ RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
+ return false;
+ }
+
+ if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
+ return false;
+
+ // Okay, everything looks good, insert the memset.
+
+ const SCEV *NumBytesS =
+ getNumBytes(BECount, IntPtr, StoreSize, CurLoop, DL, SE);
+
+ // TODO: ideally we should still be able to generate memset if SCEV expander
+ // is taught to generate the dependencies at the latest point.
+ if (!isSafeToExpand(NumBytesS, *SE))
+ return false;
+
+ Value *NumBytes =
+ Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
+
+ CallInst *NewCall;
+ if (SplatValue) {
+ NewCall =
+ Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment);
+ } else {
+ // Everything is emitted in default address space
+ Type *Int8PtrTy = DestInt8PtrTy;
+
+ Module *M = TheStore->getModule();
+ Value *MSP =
+ M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
+ Int8PtrTy, Int8PtrTy, IntPtr);
+ inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI);
+
+ // Otherwise we should form a memset_pattern16. PatternValue is known to be
+    // a constant array of 16 bytes.  Plop the value into a mergeable global.
+ GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
+ GlobalValue::PrivateLinkage,
+ PatternValue, ".memset_pattern");
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
+ GV->setAlignment(16);
+ Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
+ NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
+ }
+
+ DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
+ << " from store to: " << *Ev << " at: " << *TheStore << "\n");
+ NewCall->setDebugLoc(TheStore->getDebugLoc());
+
+ // Okay, the memset has been formed. Zap the original store and anything that
+ // feeds into it.
+ for (auto *I : Stores)
+ deleteDeadInstruction(I);
+ ++NumMemSet;
+ return true;
+}
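+
+// Illustrative sketch (assumed example): when processLoopStridedStore succeeds
+// on "for (i = 0; i != n; ++i) A[i] = 0;" with i32 elements, the preheader
+// ends up with the equivalent of
+//   memset(A, 0, n * 4);
+// and the original stores inside the loop body are deleted as dead.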
+
+/// If the stored value is a strided load in the same loop with the same stride
+/// this may be transformable into a memcpy. This kicks in for stuff like
+/// for (i) A[i] = B[i];
+bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
+ const SCEV *BECount) {
+ assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
+
+ Value *StorePtr = SI->getPointerOperand();
+ const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ APInt Stride = getStoreStride(StoreEv);
+ unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
+ bool NegStride = StoreSize == -Stride;
+
+ // The store must be feeding a non-volatile load.
+ LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
+ assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided load. If we have something else, it's a
+ // random load we can't handle.
+ const SCEVAddRecExpr *LoadEv =
+ cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+
+  // The trip count of the loop and the base pointer of the addrec SCEV are
+  // guaranteed to be loop invariant, which means that they should dominate the
+  // header. This allows us to insert code for them in the preheader.
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ IRBuilder<> Builder(Preheader->getTerminator());
+ SCEVExpander Expander(*SE, *DL, "loop-idiom");
+
+ const SCEV *StrStart = StoreEv->getStart();
+ unsigned StrAS = SI->getPointerAddressSpace();
+ Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS);
+
+ // Handle negative strided loops.
+ if (NegStride)
+ StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE);
+
+ // Okay, we have a strided store "p[i]" of a loaded value. We can turn
+ // this into a memcpy in the loop preheader now if we want. However, this
+ // would be unsafe to do if there is anything else in the loop that may read
+ // or write the memory region we're storing to. This includes the load that
+ // feeds the stores. Check for an alias by generating the base address and
+ // checking everything.
+ Value *StoreBasePtr = Expander.expandCodeFor(
+ StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
+
+ SmallPtrSet<Instruction *, 1> Stores;
+ Stores.insert(SI);
+ if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
+ StoreSize, *AA, Stores)) {
+ Expander.clear();
+ // If we generated new code for the base pointer, clean up.
+ RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
+ return false;
+ }
+
+ const SCEV *LdStart = LoadEv->getStart();
+ unsigned LdAS = LI->getPointerAddressSpace();
+
+ // Handle negative strided loops.
+ if (NegStride)
+ LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE);
+
+ // For a memcpy, we have to make sure that the input array is not being
+ // mutated by the loop.
+ Value *LoadBasePtr = Expander.expandCodeFor(
+ LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
+
+ if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
+ StoreSize, *AA, Stores)) {
+ Expander.clear();
+ // If we generated new code for the base pointer, clean up.
+ RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
+ RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
+ return false;
+ }
+
+ if (avoidLIRForMultiBlockLoop())
+ return false;
+
+ // Okay, everything is safe, we can transform this!
+
+ const SCEV *NumBytesS =
+ getNumBytes(BECount, IntPtrTy, StoreSize, CurLoop, DL, SE);
+
+ Value *NumBytes =
+ Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
+
+ unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
+ CallInst *NewCall = nullptr;
+ // Check whether to generate an unordered atomic memcpy:
+  //  If the load or store is atomic, then it must necessarily be unordered
+  //  by the previous checks.
+ if (!SI->isAtomic() && !LI->isAtomic())
+ NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align);
+ else {
+ // We cannot allow unaligned ops for unordered load/store, so reject
+ // anything where the alignment isn't at least the element size.
+ if (Align < StoreSize)
+ return false;
+
+ // If the element.atomic memcpy is not lowered into explicit
+ // loads/stores later, then it will be lowered into an element-size
+ // specific lib call. If the lib call doesn't exist for our store size, then
+ // we shouldn't generate the memcpy.
+ if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
+ return false;
+
+ // Create the call.
+ // Note that unordered atomic loads/stores are *required* by the spec to
+ // have an alignment but non-atomic loads/stores may not.
+ NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
+ StoreBasePtr, SI->getAlignment(), LoadBasePtr, LI->getAlignment(),
+ NumBytes, StoreSize);
+ }
+ NewCall->setDebugLoc(SI->getDebugLoc());
+
+ DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
+ << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+ << " from store ptr=" << *StoreEv << " at: " << *SI << "\n");
+
+ // Okay, the memcpy has been formed. Zap the original store and anything that
+ // feeds into it.
+ deleteDeadInstruction(SI);
+ ++NumMemCpy;
+ return true;
+}
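+
+// Illustrative sketch (assumed example): for the copy loop
+//   for (i = 0; i != n; ++i) A[i] = B[i];   // 4-byte elements
+// the code above emits the equivalent of "memcpy(A, B, n * 4);" in the
+// preheader, provided nothing else in the loop writes B or reads/writes the
+// destination region A.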
+
+// When compiling for code size we avoid idiom recognition for a multi-block
+// loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested
+// loop.
+// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
+//
+bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
+ bool IsLoopMemset) {
+ if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
+ if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
+ DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
+ << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
+ << " avoided: multi-block top-level loop\n");
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool LoopIdiomRecognize::runOnNoncountableLoop() {
+ return recognizePopcount() || recognizeAndInsertCTLZ();
+}
+
+/// Check if the given conditional branch is based on a comparison between
+/// a variable and zero, and if the variable is non-zero, control branches to
+/// the loop entry. If the branch matches this behavior, the variable involved
+/// in the comparison is returned. This function is called to see if the
+/// precondition and postcondition of the loop are in a desirable form.
+static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) {
+ if (!BI || !BI->isConditional())
+ return nullptr;
+
+ ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!Cond)
+ return nullptr;
+
+ ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
+ if (!CmpZero || !CmpZero->isZero())
+ return nullptr;
+
+ ICmpInst::Predicate Pred = Cond->getPredicate();
+ if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) ||
+ (Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry))
+ return Cond->getOperand(0);
+
+ return nullptr;
+}
+
+// Check if the recurrence variable `VarX` is in the right form to create
+// the idiom. Returns the value coerced to a PHINode if so.
+static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
+ BasicBlock *LoopEntry) {
+ auto *PhiX = dyn_cast<PHINode>(VarX);
+ if (PhiX && PhiX->getParent() == LoopEntry &&
+ (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
+ return PhiX;
+ return nullptr;
+}
+
+/// Return true iff the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction counting the population bit.
+/// 2) \p CntPhi is set to the corresponding phi node.
+/// 3) \p Var is set to the value whose population bits are being counted.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+/// if (x0 != 0)
+/// goto loop-exit // the precondition of the loop
+/// cnt0 = init-val;
+/// do {
+/// x1 = phi (x0, x2);
+/// cnt1 = phi(cnt0, cnt2);
+///
+/// cnt2 = cnt1 + 1;
+/// ...
+/// x2 = x1 & (x1 - 1);
+/// ...
+/// } while(x != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
+ Instruction *&CntInst, PHINode *&CntPhi,
+ Value *&Var) {
+  // step 1: Check to see if the loop-back branch matches this pattern:
+ // "if (a!=0) goto loop-entry".
+ BasicBlock *LoopEntry;
+ Instruction *DefX2, *CountInst;
+ Value *VarX1, *VarX0;
+ PHINode *PhiX, *CountPhi;
+
+ DefX2 = CountInst = nullptr;
+ VarX1 = VarX0 = nullptr;
+ PhiX = CountPhi = nullptr;
+ LoopEntry = *(CurLoop->block_begin());
+
+ // step 1: Check if the loop-back branch is in desirable form.
+ {
+ if (Value *T = matchCondition(
+ dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+ DefX2 = dyn_cast<Instruction>(T);
+ else
+ return false;
+ }
+
+ // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
+ {
+ if (!DefX2 || DefX2->getOpcode() != Instruction::And)
+ return false;
+
+ BinaryOperator *SubOneOp;
+
+ if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
+ VarX1 = DefX2->getOperand(1);
+ else {
+ VarX1 = DefX2->getOperand(0);
+ SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
+ }
+ if (!SubOneOp)
+ return false;
+
+ Instruction *SubInst = cast<Instruction>(SubOneOp);
+ ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
+ if (!Dec ||
+ !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
+ (SubInst->getOpcode() == Instruction::Add &&
+ Dec->isMinusOne()))) {
+ return false;
+ }
+ }
+
+ // step 3: Check the recurrence of variable X
+ PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
+ if (!PhiX)
+ return false;
+
+  // step 4: Find the instruction that counts the population: cnt2 = cnt1 + 1
+ {
+ CountInst = nullptr;
+ for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+ IterE = LoopEntry->end();
+ Iter != IterE; Iter++) {
+ Instruction *Inst = &*Iter;
+ if (Inst->getOpcode() != Instruction::Add)
+ continue;
+
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ if (!Inc || !Inc->isOne())
+ continue;
+
+ PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+ if (!Phi)
+ continue;
+
+      // Check if the result of the instruction is live out of the loop.
+ bool LiveOutLoop = false;
+ for (User *U : Inst->users()) {
+ if ((cast<Instruction>(U))->getParent() != LoopEntry) {
+ LiveOutLoop = true;
+ break;
+ }
+ }
+
+ if (LiveOutLoop) {
+ CountInst = Inst;
+ CountPhi = Phi;
+ break;
+ }
+ }
+
+ if (!CountInst)
+ return false;
+ }
+
+ // step 5: check if the precondition is in this form:
+ // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
+ {
+ auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
+ if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
+ return false;
+
+ CntInst = CountInst;
+ CntPhi = CountPhi;
+ Var = T;
+ }
+
+ return true;
+}
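+
+// Illustrative sketch (assumed example): the classic bit-clearing count loop
+//   if (x) {
+//     do {
+//       cnt++;
+//       x &= x - 1;     // clears the lowest set bit each iteration
+//     } while (x);
+//   }
+// matches this idiom; it iterates exactly popcount(x0) times, so the count can
+// be computed with a single @llvm.ctpop call, as transformLoopToPopcount does.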
+
+/// Return true if the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ),
+///    or nullptr if there is no such instruction.
+/// 2) \p CntPhi is set to the corresponding phi node,
+///    or nullptr if there is no such phi node.
+/// 3) \p Var is set to the value whose CTLZ could be used.
+/// 4) \p DefX is set to the instruction calculating Loop exit condition.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+/// if (x0 == 0)
+/// goto loop-exit // the precondition of the loop
+/// cnt0 = init-val;
+/// do {
+/// x = phi (x0, x.next); //PhiX
+/// cnt = phi(cnt0, cnt.next);
+///
+/// cnt.next = cnt + 1;
+/// ...
+/// x.next = x >> 1; // DefX
+/// ...
+/// } while(x.next != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
+ Instruction *&CntInst, PHINode *&CntPhi,
+ Instruction *&DefX) {
+ BasicBlock *LoopEntry;
+ Value *VarX = nullptr;
+
+ DefX = nullptr;
+ PhiX = nullptr;
+ CntInst = nullptr;
+ CntPhi = nullptr;
+ LoopEntry = *(CurLoop->block_begin());
+
+ // step 1: Check if the loop-back branch is in desirable form.
+ if (Value *T = matchCondition(
+ dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+ DefX = dyn_cast<Instruction>(T);
+ else
+ return false;
+
+ // step 2: detect instructions corresponding to "x.next = x >> 1"
+ if (!DefX || DefX->getOpcode() != Instruction::AShr)
+ return false;
+ ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+ if (!Shft || !Shft->isOne())
+ return false;
+ VarX = DefX->getOperand(0);
+
+ // step 3: Check the recurrence of variable X
+ PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
+ if (!PhiX)
+ return false;
+
+  // step 4: Find the instruction that counts the CTLZ: cnt.next = cnt + 1
+  // TODO: We could skip this step. If the loop trip count is known (via CTLZ),
+  // then all uses of "cnt.next" could be optimized to the trip count
+  // plus "cnt0". Currently it is not optimized.
+ // This step could be used to detect POPCNT instruction:
+ // cnt.next = cnt + (x.next & 1)
+ for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+ IterE = LoopEntry->end();
+ Iter != IterE; Iter++) {
+ Instruction *Inst = &*Iter;
+ if (Inst->getOpcode() != Instruction::Add)
+ continue;
+
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ if (!Inc || !Inc->isOne())
+ continue;
+
+ PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+ if (!Phi)
+ continue;
+
+ CntInst = Inst;
+ CntPhi = Phi;
+ break;
+ }
+ if (!CntInst)
+ return false;
+
+ return true;
+}
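+
+// Illustrative sketch (assumed example, x0 positive): the loop
+//   int cnt = 0;
+//   do {
+//     cnt++;
+//     x >>= 1;          // arithmetic shift, matching the AShr check above
+//   } while (x != 0);
+// fits this idiom and runs BitWidth(x0) - CTLZ(x0) times, which is the Count
+// that transformLoopToCountable later materializes in the preheader.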
+
+/// Recognize the CTLZ idiom in a non-countable loop and convert the loop
+/// to a countable one (with a CTLZ-based trip count).
+/// Returns true if CTLZ is inserted as a new trip count; otherwise, returns
+/// false.
+bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
+ // Give up if the loop has multiple blocks or multiple backedges.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+ return false;
+
+ Instruction *CntInst, *DefX;
+ PHINode *CntPhi, *PhiX;
+ if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
+ return false;
+
+ bool IsCntPhiUsedOutsideLoop = false;
+ for (User *U : CntPhi->users())
+ if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ IsCntPhiUsedOutsideLoop = true;
+ break;
+ }
+ bool IsCntInstUsedOutsideLoop = false;
+ for (User *U : CntInst->users())
+ if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ IsCntInstUsedOutsideLoop = true;
+ break;
+ }
+ // If both CntInst and CntPhi are used outside the loop the profitability
+ // is questionable.
+ if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
+ return false;
+
+  // For some CPUs the result of the CTLZ(X) intrinsic is undefined
+  // when X is 0. If we cannot guarantee X != 0, we need to check this
+  // when we expand the intrinsic.
+  bool ZeroCheck = false;
+  // It is safe to assume the preheader exists, as it was checked in the
+  // parent function runOnLoop.
+ BasicBlock *PH = CurLoop->getLoopPreheader();
+ Value *InitX = PhiX->getIncomingValueForBlock(PH);
+ // If we check X != 0 before entering the loop we don't need a zero
+ // check in CTLZ intrinsic, but only if Cnt Phi is not used outside of the
+ // loop (if it is used we count CTLZ(X >> 1)).
+ if (!IsCntPhiUsedOutsideLoop)
+ if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
+ if (BranchInst *PreCondBr =
+ dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
+ if (matchCondition(PreCondBr, PH) == InitX)
+ ZeroCheck = true;
+ }
+
+ // Check if CTLZ intrinsic is profitable. Assume it is always profitable
+ // if we delete the loop (the loop has only 6 instructions):
+ // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+ // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+ // %shr = ashr %n.addr.0, 1
+ // %tobool = icmp eq %shr, 0
+ // %inc = add nsw %i.0, 1
+ // br i1 %tobool
+
+ IRBuilder<> Builder(PH->getTerminator());
+ SmallVector<const Value *, 2> Ops =
+ {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
+ ArrayRef<const Value *> Args(Ops);
+ if (CurLoop->getHeader()->size() != 6 &&
+ TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
+ TargetTransformInfo::TCC_Basic)
+ return false;
+
+ const DebugLoc DL = DefX->getDebugLoc();
+ transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
+ IsCntPhiUsedOutsideLoop);
+ return true;
+}
+
+/// Recognizes a population count idiom in a non-countable loop.
+///
+/// If detected, transforms the relevant code to issue the popcount intrinsic
+/// function call, and returns true; otherwise, returns false.
+bool LoopIdiomRecognize::recognizePopcount() {
+ if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
+ return false;
+
+  // Population counting is usually done with only a few arithmetic
+  // instructions. Such instructions can be easily "absorbed" by vacant slots
+  // in a non-compact loop. Therefore, recognizing the popcount idiom only
+  // makes sense in a compact loop.
+
+ // Give up if the loop has multiple blocks or multiple backedges.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+ return false;
+
+ BasicBlock *LoopBody = *(CurLoop->block_begin());
+ if (LoopBody->size() >= 20) {
+ // The loop is too big, bail out.
+ return false;
+ }
+
+ // It should have a preheader containing nothing but an unconditional branch.
+ BasicBlock *PH = CurLoop->getLoopPreheader();
+ if (!PH || &PH->front() != PH->getTerminator())
+ return false;
+ auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
+ if (!EntryBI || EntryBI->isConditional())
+ return false;
+
+  // It should have a precondition block where the generated popcount intrinsic
+ // function can be inserted.
+ auto *PreCondBB = PH->getSinglePredecessor();
+ if (!PreCondBB)
+ return false;
+ auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ if (!PreCondBI || PreCondBI->isUnconditional())
+ return false;
+
+ Instruction *CntInst;
+ PHINode *CntPhi;
+ Value *Val;
+ if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
+ return false;
+
+ transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
+ return true;
+}
+
+static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+ const DebugLoc &DL) {
+ Value *Ops[] = {Val};
+ Type *Tys[] = {Val->getType()};
+
+ Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+ Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
+ CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+ CI->setDebugLoc(DL);
+
+ return CI;
+}
+
+static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+ const DebugLoc &DL, bool ZeroCheck) {
+ Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
+ Type *Tys[] = {Val->getType()};
+
+ Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+ Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
+ CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+ CI->setDebugLoc(DL);
+
+ return CI;
+}
+
+/// Transform the following loop:
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+/// LOOP_BODY
+/// Br: loop if (DefX != 0)
+/// Use(CntPhi) or Use(CntInst)
+///
+/// Into:
+/// If CntPhi used outside the loop:
+/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
+/// Count = CountPrev + 1
+/// else
+/// Count = BitWidth(InitX) - CTLZ(InitX)
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// PhiCount = PHI [Count, Dec]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+/// Dec = PhiCount - 1
+/// LOOP_BODY
+/// Br: loop if (Dec != 0)
+/// Use(CountPrev + Cnt0) // Use(CntPhi)
+/// or
+/// Use(Count + Cnt0) // Use(CntInst)
+///
+/// If LOOP_BODY is empty the loop will be deleted.
+/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
+void LoopIdiomRecognize::transformLoopToCountable(
+ BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
+ const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
+ BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator());
+
+ // Step 1: Insert the CTLZ instruction at the end of the preheader block
+ // Count = BitWidth - CTLZ(InitX);
+ // If there are uses of CntPhi create:
+ // CountPrev = BitWidth - CTLZ(InitX >> 1);
+ IRBuilder<> Builder(PreheaderBr);
+ Builder.SetCurrentDebugLocation(DL);
+ Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
+
+ if (IsCntPhiUsedOutsideLoop)
+ InitXNext = Builder.CreateAShr(InitX,
+ ConstantInt::get(InitX->getType(), 1));
+ else
+ InitXNext = InitX;
+ CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
+ Count = Builder.CreateSub(
+ ConstantInt::get(CTLZ->getType(),
+ CTLZ->getType()->getIntegerBitWidth()),
+ CTLZ);
+ if (IsCntPhiUsedOutsideLoop) {
+ CountPrev = Count;
+ Count = Builder.CreateAdd(
+ CountPrev,
+ ConstantInt::get(CountPrev->getType(), 1));
+ }
+ if (IsCntPhiUsedOutsideLoop)
+ NewCount = Builder.CreateZExtOrTrunc(CountPrev,
+ cast<IntegerType>(CntInst->getType()));
+ else
+ NewCount = Builder.CreateZExtOrTrunc(Count,
+ cast<IntegerType>(CntInst->getType()));
+
+ // If the CTLZ counter's initial value is not zero, insert Add Inst.
+ Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
+ ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+ if (!InitConst || !InitConst->isZero())
+ NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+
+ // Step 2: Insert new IV and loop condition:
+ // loop:
+ // ...
+ // PhiCount = PHI [Count, Dec]
+ // ...
+ // Dec = PhiCount - 1
+ // ...
+ // Br: loop if (Dec != 0)
+ BasicBlock *Body = *(CurLoop->block_begin());
+ auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+ Type *Ty = Count->getType();
+
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+ Builder.SetInsertPoint(LbCond);
+ Instruction *TcDec = cast<Instruction>(
+ Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+ "tcdec", false, true));
+
+ TcPhi->addIncoming(Count, Preheader);
+ TcPhi->addIncoming(TcDec, Body);
+
+ CmpInst::Predicate Pred =
+ (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
+ LbCond->setPredicate(Pred);
+ LbCond->setOperand(0, TcDec);
+ LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+
+ // Step 3: All the references to the original counter outside
+ // the loop are replaced with the NewCount -- the value returned from
+ // __builtin_ctlz(x).
+ if (IsCntPhiUsedOutsideLoop)
+ CntPhi->replaceUsesOutsideBlock(NewCount, Body);
+ else
+ CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+ // step 4: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+ SE->forgetLoop(CurLoop);
+}
+
+void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
+ Instruction *CntInst,
+ PHINode *CntPhi, Value *Var) {
+ BasicBlock *PreHead = CurLoop->getLoopPreheader();
+ auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ const DebugLoc DL = CntInst->getDebugLoc();
+
+ // Assuming before transformation, the loop is following:
+ // if (x) // the precondition
+ // do { cnt++; x &= x - 1; } while(x);
+
+ // Step 1: Insert the ctpop instruction at the end of the precondition block
+ IRBuilder<> Builder(PreCondBr);
+ Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
+ {
+ PopCnt = createPopcntIntrinsic(Builder, Var, DL);
+ NewCount = PopCntZext =
+ Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
+
+ if (NewCount != PopCnt)
+ (cast<Instruction>(NewCount))->setDebugLoc(DL);
+
+ // TripCnt is exactly the number of iterations the loop has
+ TripCnt = NewCount;
+
+ // If the population counter's initial value is not zero, insert Add Inst.
+ Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
+ ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+ if (!InitConst || !InitConst->isZero()) {
+ NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+ (cast<Instruction>(NewCount))->setDebugLoc(DL);
+ }
+ }
+
+  // Step 2: Replace the precondition "if (x == 0) goto loop-exit" with
+  //   "if (NewCount == 0) goto loop-exit". Without this change, the intrinsic
+  //   call would be partially dead code, and downstream passes would drag
+  //   it back from the precondition block to the preheader.
+ {
+ ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
+
+ Value *Opnd0 = PopCntZext;
+ Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
+ if (PreCond->getOperand(0) != Var)
+ std::swap(Opnd0, Opnd1);
+
+ ICmpInst *NewPreCond = cast<ICmpInst>(
+ Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
+ PreCondBr->setCondition(NewPreCond);
+
+ RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
+ }
+
+ // Step 3: Note that the population count is exactly the trip count of the
+  // loop in question, which enables us to convert the loop from a noncountable
+  // loop into a countable one. The benefit is twofold:
+ //
+ // - If the loop only counts population, the entire loop becomes dead after
+ // the transformation. It is a lot easier to prove a countable loop dead
+ // than to prove a noncountable one. (In some C dialects, an infinite loop
+ // isn't dead even if it computes nothing useful. In general, DCE needs
+  //    to prove a noncountable loop finite before safely deleting it.)
+ //
+ // - If the loop also performs something else, it remains alive.
+ // Since it is transformed to countable form, it can be aggressively
+ // optimized by some optimizations which are in general not applicable
+ // to a noncountable loop.
+ //
+  // After this step, this loop (conceptually) would look like the following:
+ // newcnt = __builtin_ctpop(x);
+ // t = newcnt;
+ // if (x)
+  //     do { cnt++; x &= x-1; t--; } while (t > 0);
+ BasicBlock *Body = *(CurLoop->block_begin());
+ {
+ auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+ Type *Ty = TripCnt->getType();
+
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+ Builder.SetInsertPoint(LbCond);
+ Instruction *TcDec = cast<Instruction>(
+ Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+ "tcdec", false, true));
+
+ TcPhi->addIncoming(TripCnt, PreHead);
+ TcPhi->addIncoming(TcDec, Body);
+
+ CmpInst::Predicate Pred =
+ (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
+ LbCond->setPredicate(Pred);
+ LbCond->setOperand(0, TcDec);
+ LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+ }
+
+ // Step 4: All the references to the original population counter outside
+ // the loop are replaced with the NewCount -- the value returned from
+ // __builtin_ctpop().
+ CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+ // step 5: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+ SE->forgetLoop(CurLoop);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
new file mode 100644
index 000000000000..40d468a084d4
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -0,0 +1,223 @@
+//===- LoopInstSimplify.cpp - Loop Instruction Simplification Pass --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs lightweight instruction simplification on loop bodies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/User.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <algorithm>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-instsimplify"
+
+STATISTIC(NumSimplified, "Number of redundant instructions simplified");
+
+static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI,
+ AssumptionCache *AC,
+ const TargetLibraryInfo *TLI) {
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+ array_pod_sort(ExitBlocks.begin(), ExitBlocks.end());
+
+ SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+
+ // The bit we are stealing from the pointer represents whether this basic
+ // block is the header of a subloop, in which case we only process its phis.
+ using WorklistItem = PointerIntPair<BasicBlock *, 1>;
+ SmallVector<WorklistItem, 16> VisitStack;
+ SmallPtrSet<BasicBlock *, 32> Visited;
+
+ bool Changed = false;
+ bool LocalChanged;
+ do {
+ LocalChanged = false;
+
+ VisitStack.clear();
+ Visited.clear();
+
+ VisitStack.push_back(WorklistItem(L->getHeader(), false));
+
+ while (!VisitStack.empty()) {
+ WorklistItem Item = VisitStack.pop_back_val();
+ BasicBlock *BB = Item.getPointer();
+ bool IsSubloopHeader = Item.getInt();
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ // Simplify instructions in the current basic block.
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+ Instruction *I = &*BI++;
+
+ // The first time through the loop ToSimplify is empty and we try to
+ // simplify all instructions. On later iterations ToSimplify is not
+ // empty and we only bother simplifying instructions that are in it.
+ if (!ToSimplify->empty() && !ToSimplify->count(I))
+ continue;
+
+ // Don't bother simplifying unused instructions.
+ if (!I->use_empty()) {
+ Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC});
+ if (V && LI->replacementPreservesLCSSAForm(I, V)) {
+ // Mark all uses for resimplification next time round the loop.
+ for (User *U : I->users())
+ Next->insert(cast<Instruction>(U));
+
+ I->replaceAllUsesWith(V);
+ LocalChanged = true;
+ ++NumSimplified;
+ }
+ }
+ if (RecursivelyDeleteTriviallyDeadInstructions(I, TLI)) {
+        // RecursivelyDeleteTriviallyDeadInstructions can remove more than one
+ // instruction, so simply incrementing the iterator does not work.
+ // When instructions get deleted re-iterate instead.
+ BI = BB->begin();
+ BE = BB->end();
+ LocalChanged = true;
+ }
+
+ if (IsSubloopHeader && !isa<PHINode>(I))
+ break;
+ }
+
+      // Add all successors to the worklist, except for loop exit blocks and
+      // the bodies of subloops. We visit the headers of loops so that we can
+      // process their phis, but we contract the rest of the subloop body and
+      // only follow edges leading back to the original loop.
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE;
+ ++SI) {
+ BasicBlock *SuccBB = *SI;
+ if (!Visited.insert(SuccBB).second)
+ continue;
+
+ const Loop *SuccLoop = LI->getLoopFor(SuccBB);
+ if (SuccLoop && SuccLoop->getHeader() == SuccBB &&
+ L->contains(SuccLoop)) {
+ VisitStack.push_back(WorklistItem(SuccBB, true));
+
+ SmallVector<BasicBlock *, 8> SubLoopExitBlocks;
+ SuccLoop->getExitBlocks(SubLoopExitBlocks);
+
+ for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) {
+ BasicBlock *ExitBB = SubLoopExitBlocks[i];
+ if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB).second)
+ VisitStack.push_back(WorklistItem(ExitBB, false));
+ }
+
+ continue;
+ }
+
+ bool IsExitBlock =
+ std::binary_search(ExitBlocks.begin(), ExitBlocks.end(), SuccBB);
+ if (IsExitBlock)
+ continue;
+
+ VisitStack.push_back(WorklistItem(SuccBB, false));
+ }
+ }
+
+ // Place the list of instructions to simplify on the next loop iteration
+ // into ToSimplify.
+ std::swap(ToSimplify, Next);
+ Next->clear();
+
+ Changed |= LocalChanged;
+ } while (LocalChanged);
+
+ return Changed;
+}
+
+namespace {
+
+class LoopInstSimplifyLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ LoopInstSimplifyLegacyPass() : LoopPass(ID) {
+ initializeLoopInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ DominatorTreeWrapperPass *DTWP =
+ getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ return SimplifyLoopInst(L, DT, LI, AC, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.setPreservesCFG();
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI))
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+char LoopInstSimplifyLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopInstSimplifyLegacyPass, "loop-instsimplify",
+ "Simplify instructions in loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LoopInstSimplifyLegacyPass, "loop-instsimplify",
+ "Simplify instructions in loops", false, false)
+
+Pass *llvm::createLoopInstSimplifyPass() {
+ return new LoopInstSimplifyLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
new file mode 100644
index 000000000000..4f8dafef230a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -0,0 +1,1427 @@
+//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements the loop interchange transform: it interchanges loops
+// to provide more cache-friendly memory access patterns.
+//
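+// As a small illustrative example (with hypothetical arrays A and B), a nest
+// such as
+//
+//   for (int i = 0; i < N; ++i)
+//     for (int j = 0; j < M; ++j)
+//       A[j][i] += B[j][i];
+//
+// walks the row-major arrays column by column; after interchanging the two
+// loops the innermost accesses become unit-stride:
+//
+//   for (int j = 0; j < M; ++j)
+//     for (int i = 0; i < N; ++i)
+//       A[j][i] += B[j][i];
+//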
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <cassert>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-interchange"
+
+static cl::opt<int> LoopInterchangeCostThreshold(
+ "loop-interchange-threshold", cl::init(0), cl::Hidden,
+ cl::desc("Interchange if you gain more than this number"));
+
+namespace {
+
+using LoopVector = SmallVector<Loop *, 8>;
+
+// TODO: Check if we can use a sparse matrix here.
+using CharMatrix = std::vector<std::vector<char>>;
+
+} // end anonymous namespace
+
+// Maximum number of dependencies that can be handled in the dependency matrix.
+static const unsigned MaxMemInstrCount = 100;
+
+// Maximum loop depth supported.
+static const unsigned MaxLoopNestDepth = 10;
+
+#ifdef DUMP_DEP_MATRICIES
+static void printDepMatrix(CharMatrix &DepMatrix) {
+ for (auto &Row : DepMatrix) {
+ for (auto D : Row)
+ DEBUG(dbgs() << D << " ");
+ DEBUG(dbgs() << "\n");
+ }
+}
+#endif
+
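+// The dependency matrix built below has one row per analyzed pair of memory
+// instructions and one column per loop level, outermost first. Each entry is a
+// direction character as computed below: '<', '=', or '>' from a known
+// distance or direction, 'S' for a scalar level, '*' when the direction is
+// unknown, and 'I' to pad the row up to Level entries. As an illustrative
+// reading, a row "= <" in a two-deep nest says the dependence is independent
+// of the outer loop and carried by the inner loop.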
+static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
+ Loop *L, DependenceInfo *DI) {
+ using ValueVector = SmallVector<Value *, 16>;
+
+ ValueVector MemInstr;
+
+ // For each block.
+ for (BasicBlock *BB : L->blocks()) {
+ // Scan the BB and collect legal loads and stores.
+ for (Instruction &I : *BB) {
+ if (!isa<Instruction>(I))
+ return false;
+ if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+ if (!Ld->isSimple())
+ return false;
+ MemInstr.push_back(&I);
+ } else if (auto *St = dyn_cast<StoreInst>(&I)) {
+ if (!St->isSimple())
+ return false;
+ MemInstr.push_back(&I);
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "Found " << MemInstr.size()
+ << " Loads and Stores to analyze\n");
+
+ ValueVector::iterator I, IE, J, JE;
+
+ for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
+ for (J = I, JE = MemInstr.end(); J != JE; ++J) {
+ std::vector<char> Dep;
+ Instruction *Src = cast<Instruction>(*I);
+ Instruction *Dst = cast<Instruction>(*J);
+ if (Src == Dst)
+ continue;
+ // Ignore Input dependencies.
+ if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
+ continue;
+ // Track Output, Flow, and Anti dependencies.
+ if (auto D = DI->depends(Src, Dst, true)) {
+ assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+ DEBUG(StringRef DepType =
+ D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
+ dbgs() << "Found " << DepType
+ << " dependency between Src and Dst\n"
+ << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
+ unsigned Levels = D->getLevels();
+ char Direction;
+ for (unsigned II = 1; II <= Levels; ++II) {
+ const SCEV *Distance = D->getDistance(II);
+ const SCEVConstant *SCEVConst =
+ dyn_cast_or_null<SCEVConstant>(Distance);
+ if (SCEVConst) {
+ const ConstantInt *CI = SCEVConst->getValue();
+ if (CI->isNegative())
+ Direction = '<';
+ else if (CI->isZero())
+ Direction = '=';
+ else
+ Direction = '>';
+ Dep.push_back(Direction);
+ } else if (D->isScalar(II)) {
+ Direction = 'S';
+ Dep.push_back(Direction);
+ } else {
+ unsigned Dir = D->getDirection(II);
+ if (Dir == Dependence::DVEntry::LT ||
+ Dir == Dependence::DVEntry::LE)
+ Direction = '<';
+ else if (Dir == Dependence::DVEntry::GT ||
+ Dir == Dependence::DVEntry::GE)
+ Direction = '>';
+ else if (Dir == Dependence::DVEntry::EQ)
+ Direction = '=';
+ else
+ Direction = '*';
+ Dep.push_back(Direction);
+ }
+ }
+ while (Dep.size() != Level) {
+ Dep.push_back('I');
+ }
+
+ DepMatrix.push_back(Dep);
+ if (DepMatrix.size() > MaxMemInstrCount) {
+ DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
+ << " dependencies inside loop\n");
+ return false;
+ }
+ }
+ }
+ }
+
+ // If we don't have a DepMatrix to check legality against, return false.
+ if (DepMatrix.empty())
+ return false;
+ return true;
+}
+
+// A loop is moved from index 'from' to an index 'to'. Update the Dependence
+// matrix by exchanging the two columns.
+static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
+ unsigned ToIndx) {
+ unsigned numRows = DepMatrix.size();
+ for (unsigned i = 0; i < numRows; ++i) {
+ char TmpVal = DepMatrix[i][ToIndx];
+ DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx];
+ DepMatrix[i][FromIndx] = TmpVal;
+ }
+}
+
+// Checks if the outermost non-'=', non-'S', non-'I' dependence in the given
+// row of the dependence matrix (up to and including Column) is '>'.
+static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
+ unsigned Column) {
+ for (unsigned i = 0; i <= Column; ++i) {
+ if (DepMatrix[Row][i] == '<')
+ return false;
+ if (DepMatrix[Row][i] == '>')
+ return true;
+ }
+ // All dependencies were '=', 'S', or 'I'.
+ return false;
+}
+
+// Checks that no dependence (other than '=', 'S', or 'I') exists in Row of the
+// dependency matrix before Column.
+static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
+ unsigned Column) {
+ for (unsigned i = 0; i < Column; ++i) {
+ if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' &&
+ DepMatrix[Row][i] != 'I')
+ return false;
+ }
+ return true;
+}
+
+static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
+ unsigned OuterLoopId, char InnerDep,
+ char OuterDep) {
+ if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
+ return false;
+
+ if (InnerDep == OuterDep)
+ return true;
+
+ // It is legal to interchange if and only if after interchange no row has a
+ // '>' direction as the leftmost non-'='.
+
+ if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I')
+ return true;
+
+ if (InnerDep == '<')
+ return true;
+
+ if (InnerDep == '>') {
+ // If OuterLoopId is the outermost loop then interchanging will make '>' the
+ // first dependence direction, which is illegal.
+ if (OuterLoopId == 0)
+ return false;
+
+ // If all dependencies before OuterLoopId are '=', 'S', or 'I', then
+ // interchanging would give this row an outermost non-'=' dependence of '>',
+ // which is illegal; otherwise an earlier '<' keeps the row legal.
+ if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
+ return true;
+ }
+
+ return false;
+}
+
+// Checks if it is legal to interchange 2 loops.
+// [Theorem] A permutation of the loops in a perfect nest is legal if and only
+// if the direction matrix, after the same permutation is applied to its
+// columns, has no ">" direction as the leftmost non-"=" direction in any row.
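+// For example, with a two-deep nest a single dependence row "= >" becomes
+// "> =" after interchange; its leftmost non-'=' direction is '>', so the
+// interchange is rejected. A row "< =" becomes "= <" and remains legal.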
+static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
+ unsigned InnerLoopId,
+ unsigned OuterLoopId) {
+ unsigned NumRows = DepMatrix.size();
+ // For each row check if it is valid to interchange.
+ for (unsigned Row = 0; Row < NumRows; ++Row) {
+ char InnerDep = DepMatrix[Row][InnerLoopId];
+ char OuterDep = DepMatrix[Row][OuterLoopId];
+ if (InnerDep == '*' || OuterDep == '*')
+ return false;
+ if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep))
+ return false;
+ }
+ return true;
+}
+
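+// Collect the chain of loops rooted at L in which every loop has exactly one
+// subloop, outermost first, and append it to V as one candidate loop nest.
+// Nests where some loop has several subloops are not tightly nested and are
+// discarded.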
+static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
+ DEBUG(dbgs() << "Calling populateWorklist on Func: "
+ << L.getHeader()->getParent()->getName() << " Loop: %"
+ << L.getHeader()->getName() << '\n');
+ LoopVector LoopList;
+ Loop *CurrentLoop = &L;
+ const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
+ while (!Vec->empty()) {
+ // The current loop has multiple subloops, so it is not tightly nested.
+ // Discard all loops collected so far.
+ if (Vec->size() != 1) {
+ LoopList.clear();
+ return;
+ }
+ LoopList.push_back(CurrentLoop);
+ CurrentLoop = Vec->front();
+ Vec = &CurrentLoop->getSubLoops();
+ }
+ LoopList.push_back(CurrentLoop);
+ V.push_back(std::move(LoopList));
+}
+
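+// Return the PHI node acting as the induction variable of L: the canonical
+// induction variable if one exists, otherwise the first header PHI that SCEV
+// recognizes as an affine add-recurrence with a constant step, or nullptr if
+// none is found.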
+static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
+ PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
+ if (InnerIndexVar)
+ return InnerIndexVar;
+ if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
+ return nullptr;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PhiVar = cast<PHINode>(I);
+ Type *PhiTy = PhiVar->getType();
+ if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy())
+ return nullptr;
+ const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
+ if (!AddRec || !AddRec->isAffine())
+ continue;
+ const SCEV *Step = AddRec->getStepRecurrence(*SE);
+ if (!isa<SCEVConstant>(Step))
+ continue;
+ // Found the induction variable.
+ // FIXME: Handle loops with more than one induction variable. Note that,
+ // currently, legality makes sure we have only one induction variable.
+ return PhiVar;
+ }
+ return nullptr;
+}
+
+namespace {
+
+/// LoopInterchangeLegality checks if it is legal to interchange the loop.
+class LoopInterchangeLegality {
+public:
+ LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA,
+ OptimizationRemarkEmitter *ORE)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
+ PreserveLCSSA(PreserveLCSSA), ORE(ORE) {}
+
+ /// Check if the loops can be interchanged.
+ bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
+ CharMatrix &DepMatrix);
+
+ /// Check if the loop structure is understood. We do not handle triangular
+ /// loops for now.
+ bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
+
+ bool currentLimitations();
+
+ bool hasInnerLoopReduction() { return InnerLoopHasReduction; }
+
+private:
+ bool tightlyNested(Loop *Outer, Loop *Inner);
+ bool containsUnsafeInstructionsInHeader(BasicBlock *BB);
+ bool areAllUsesReductions(Instruction *Ins, Loop *L);
+ bool containsUnsafeInstructionsInLatch(BasicBlock *BB);
+ bool findInductionAndReductions(Loop *L,
+ SmallVector<PHINode *, 8> &Inductions,
+ SmallVector<PHINode *, 8> &Reductions);
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ bool PreserveLCSSA;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
+ bool InnerLoopHasReduction = false;
+};
+
+/// LoopInterchangeProfitability checks if it is profitable to interchange the
+/// loop.
+class LoopInterchangeProfitability {
+public:
+ LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
+
+ /// Check if the loop interchange is profitable.
+ bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
+ CharMatrix &DepMatrix);
+
+private:
+ int getInstrOrderCost();
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+};
+
+/// LoopInterchangeTransform interchanges the loop.
+class LoopInterchangeTransform {
+public:
+ LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ LoopInfo *LI, DominatorTree *DT,
+ BasicBlock *LoopNestExit,
+ bool InnerLoopContainsReductions)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
+ LoopExit(LoopNestExit),
+ InnerLoopHasReduction(InnerLoopContainsReductions) {}
+
+ /// Interchange OuterLoop and InnerLoop.
+ bool transform();
+ void restructureLoops(Loop *InnerLoop, Loop *OuterLoop);
+ void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
+
+private:
+ void splitInnerLoopLatch(Instruction *);
+ void splitInnerLoopHeader();
+ bool adjustLoopLinks();
+ void adjustLoopPreheaders();
+ bool adjustLoopBranches();
+ void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred,
+ BasicBlock *NewPred);
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+
+ LoopInfo *LI;
+ DominatorTree *DT;
+ BasicBlock *LoopExit;
+ bool InnerLoopHasReduction;
+};
+
+// Main LoopInterchange Pass.
+struct LoopInterchange : public FunctionPass {
+ static char ID;
+ ScalarEvolution *SE = nullptr;
+ LoopInfo *LI = nullptr;
+ DependenceInfo *DI = nullptr;
+ DominatorTree *DT = nullptr;
+ bool PreserveLCSSA;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
+ LoopInterchange() : FunctionPass(ID) {
+ initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DependenceAnalysisWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+ // Build up a worklist of loop pairs to analyze.
+ SmallVector<LoopVector, 8> Worklist;
+
+ for (Loop *L : *LI)
+ populateWorklist(*L, Worklist);
+
+ DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
+ bool Changed = false;
+ while (!Worklist.empty()) {
+ LoopVector LoopList = Worklist.pop_back_val();
+ Changed |= processLoopList(LoopList, F);
+ }
+ return Changed;
+ }
+
+ bool isComputableLoopNest(LoopVector LoopList) {
+ for (Loop *L : LoopList) {
+ const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
+ if (ExitCountOuter == SE->getCouldNotCompute()) {
+ DEBUG(dbgs() << "Couldn't compute backedge count\n");
+ return false;
+ }
+ if (L->getNumBackEdges() != 1) {
+ DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
+ return false;
+ }
+ if (!L->getExitingBlock()) {
+ DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
+ return false;
+ }
+ }
+ return true;
+ }
+
+ unsigned selectLoopForInterchange(const LoopVector &LoopList) {
+ // TODO: Add a better heuristic to select the loop to be interchanged based
+ // on the dependence matrix. Currently we select the innermost loop.
+ return LoopList.size() - 1;
+ }
+
+ bool processLoopList(LoopVector LoopList, Function &F) {
+ bool Changed = false;
+ unsigned LoopNestDepth = LoopList.size();
+ if (LoopNestDepth < 2) {
+ DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
+ return false;
+ }
+ if (LoopNestDepth > MaxLoopNestDepth) {
+ DEBUG(dbgs() << "Cannot handle loops of depth greater than "
+ << MaxLoopNestDepth << "\n");
+ return false;
+ }
+ if (!isComputableLoopNest(LoopList)) {
+ DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
+ return false;
+ }
+
+ DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth << "\n");
+
+ CharMatrix DependencyMatrix;
+ Loop *OuterMostLoop = *(LoopList.begin());
+ if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
+ OuterMostLoop, DI)) {
+ DEBUG(dbgs() << "Populating dependency matrix failed\n");
+ return false;
+ }
+#ifdef DUMP_DEP_MATRICIES
+ DEBUG(dbgs() << "Dependence before interchange\n");
+ printDepMatrix(DependencyMatrix);
+#endif
+
+ BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch();
+ BranchInst *OuterMostLoopLatchBI =
+ dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator());
+ if (!OuterMostLoopLatchBI)
+ return false;
+
+ // Since we currently do not handle LCSSA PHIs, any failure in the loop
+ // condition will now branch to LoopNestExit.
+ // TODO: This should be removed once we handle LCSSA PHI nodes.
+
+ // Get the Outermost loop exit.
+ BasicBlock *LoopNestExit;
+ if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader())
+ LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1);
+ else
+ LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0);
+
+ if (isa<PHINode>(LoopNestExit->begin())) {
+ DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now "
+ "since on failure all loops branch to loop nest exit.\n");
+ return false;
+ }
+
+ unsigned SelecLoopId = selectLoopForInterchange(LoopList);
+ // Move the selected loop outwards to the best possible position.
+ for (unsigned i = SelecLoopId; i > 0; i--) {
+ bool Interchanged =
+ processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
+ if (!Interchanged)
+ return Changed;
+ // The loops were interchanged; reflect that in LoopList.
+ std::swap(LoopList[i - 1], LoopList[i]);
+
+ // Update the DependencyMatrix
+ interChangeDependencies(DependencyMatrix, i, i - 1);
+ DT->recalculate(F);
+#ifdef DUMP_DEP_MATRICIES
+ DEBUG(dbgs() << "Dependence after interchange\n");
+ printDepMatrix(DependencyMatrix);
+#endif
+ Changed |= Interchanged;
+ }
+ return Changed;
+ }
+
+ bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
+ unsigned OuterLoopId, BasicBlock *LoopNestExit,
+ std::vector<std::vector<char>> &DependencyMatrix) {
+ DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId << "\n");
+ Loop *InnerLoop = LoopList[InnerLoopId];
+ Loop *OuterLoop = LoopList[OuterLoopId];
+
+ LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT,
+ PreserveLCSSA, ORE);
+ if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+ DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
+ return false;
+ }
+ DEBUG(dbgs() << "Loops are legal to interchange\n");
+ LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
+ if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+ DEBUG(dbgs() << "Interchanging loops not profitable\n");
+ return false;
+ }
+
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Interchanged",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Loop interchanged with enclosing loop.";
+ });
+
+ LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
+ LoopNestExit, LIL.hasInnerLoopReduction());
+ LIT.transform();
+ DEBUG(dbgs() << "Loops interchanged\n");
+ return true;
+ }
+};
+
+} // end anonymous namespace
+
+bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
+ return llvm::none_of(Ins->users(), [=](User *U) -> bool {
+ auto *UserIns = dyn_cast<PHINode>(U);
+ RecurrenceDescriptor RD;
+ return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD);
+ });
+}
+
+bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
+ BasicBlock *BB) {
+ for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+ // Loads corresponding to reduction PHIs are safe when concluding whether the
+ // loops are tightly nested.
+ if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+ if (!areAllUsesReductions(L, InnerLoop))
+ return true;
+ } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ return true;
+ }
+ return false;
+}
+
+bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch(
+ BasicBlock *BB) {
+ for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+ // Stores corresponding to reductions are safe when concluding whether the
+ // loops are tightly nested.
+ if (StoreInst *L = dyn_cast<StoreInst>(I)) {
+ if (!isa<PHINode>(L->getOperand(0)))
+ return true;
+ } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ return true;
+ }
+ return false;
+}
+
+bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+
+ DEBUG(dbgs() << "Checking if loops are tightly nested\n");
+
+ // A perfectly nested loop will not have any branch in between the outer and
+ // inner blocks, i.e. the outer header will branch to either the inner
+ // preheader or the outer loop latch.
+ BranchInst *OuterLoopHeaderBI =
+ dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+ if (!OuterLoopHeaderBI)
+ return false;
+
+ for (BasicBlock *Succ : OuterLoopHeaderBI->successors())
+ if (Succ != InnerLoopPreHeader && Succ != OuterLoopLatch)
+ return false;
+
+ DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
+ // There is no basic block in between now; make sure the outer header and the
+ // outer loop latch don't contain any unsafe instructions.
+ if (containsUnsafeInstructionsInHeader(OuterLoopHeader) ||
+ containsUnsafeInstructionsInLatch(OuterLoopLatch))
+ return false;
+
+ DEBUG(dbgs() << "Loops are perfectly nested\n");
+ // We have a perfect loop nest.
+ return true;
+}
+
+bool LoopInterchangeLegality::isLoopStructureUnderstood(
+ PHINode *InnerInduction) {
+ unsigned Num = InnerInduction->getNumOperands();
+ BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
+ for (unsigned i = 0; i < Num; ++i) {
+ Value *Val = InnerInduction->getOperand(i);
+ if (isa<Constant>(Val))
+ continue;
+ Instruction *I = dyn_cast<Instruction>(Val);
+ if (!I)
+ return false;
+ // TODO: Handle triangular loops.
+ // e.g. for(int i=0;i<N;i++)
+ // for(int j=i;j<N;j++)
+ unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
+ if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
+ InnerLoopPreheader &&
+ !OuterLoop->isLoopInvariant(I)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool LoopInterchangeLegality::findInductionAndReductions(
+ Loop *L, SmallVector<PHINode *, 8> &Inductions,
+ SmallVector<PHINode *, 8> &Reductions) {
+ if (!L->getLoopLatch() || !L->getLoopPredecessor())
+ return false;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ RecurrenceDescriptor RD;
+ InductionDescriptor ID;
+ PHINode *PHI = cast<PHINode>(I);
+ if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID))
+ Inductions.push_back(PHI);
+ else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
+ Reductions.push_back(PHI);
+ else {
+ DEBUG(
+ dbgs() << "Failed to recognize PHI as an induction or reduction.\n");
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
+ for (auto I = Block->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PHI = cast<PHINode>(I);
+ // A reduction LCSSA PHI has only one incoming block, the loop latch.
+ if (PHI->getNumIncomingValues() > 1)
+ return false;
+ Instruction *Ins = dyn_cast<Instruction>(PHI->getIncomingValue(0));
+ if (!Ins)
+ return false;
+ // The incoming value of an LCSSA PHI in the outer loop exit can only be the
+ // inner loop's exit LCSSA PHI, otherwise the loops would not be tightly
+ // nested.
+ if (!isa<PHINode>(Ins) && isOuterLoopExitBlock)
+ return false;
+ }
+ return true;
+}
+
+static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock,
+ BasicBlock *LoopHeader) {
+ if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) {
+ assert(BI->getNumSuccessors() == 2 &&
+ "Branch leaving loop latch must have 2 successors");
+ for (BasicBlock *Succ : BI->successors()) {
+ if (Succ == LoopHeader)
+ continue;
+ return Succ;
+ }
+ }
+ return nullptr;
+}
+
+// This function indicates the current limitations in the transform as a result
+// of which we do not proceed.
+bool LoopInterchangeLegality::currentLimitations() {
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+
+ PHINode *InnerInductionVar;
+ SmallVector<PHINode *, 8> Inductions;
+ SmallVector<PHINode *, 8> Reductions;
+ if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
+ DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Only inner loops with induction or reduction PHI nodes can be"
+ " interchange currently.";
+ });
+ return true;
+ }
+
+ // TODO: Currently we handle only loops with 1 induction variable.
+ if (Inductions.size() != 1) {
+ DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
+ << "Failed to interchange due to current limitation\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Only inner loops with 1 induction variable can be "
+ "interchanged currently.";
+ });
+ return true;
+ }
+ if (Reductions.size() > 0)
+ InnerLoopHasReduction = true;
+
+ InnerInductionVar = Inductions.pop_back_val();
+ Reductions.clear();
+ if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
+ DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Only outer loops with induction or reduction PHI nodes can be"
+ " interchanged currently.";
+ });
+ return true;
+ }
+
+ // Outer loop cannot have reduction because then loops will not be tightly
+ // nested.
+ if (!Reductions.empty()) {
+ DEBUG(dbgs() << "Outer loops with reductions are not supported "
+ << "currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Outer loops with reductions cannot be interchangeed "
+ "currently.";
+ });
+ return true;
+ }
+ // TODO: Currently we handle only loops with 1 induction variable.
+ if (Inductions.size() != 1) {
+ DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
+ << "supported currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Only outer loops with 1 induction variable can be "
+ "interchanged currently.";
+ });
+ return true;
+ }
+
+ // TODO: Triangular loops are not handled for now.
+ if (!isLoopStructureUnderstood(InnerInductionVar)) {
+ DEBUG(dbgs() << "Loop structure not understood by pass\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Inner loop structure not understood currently.";
+ });
+ return true;
+ }
+
+ // TODO: We only handle LCSSA PHI's corresponding to reduction for now.
+ BasicBlock *LoopExitBlock =
+ getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
+ if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) {
+ DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Only outer loops with LCSSA PHIs can be interchange "
+ "currently.";
+ });
+ return true;
+ }
+
+ LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
+ if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) {
+ DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Only inner loops with LCSSA PHIs can be interchange "
+ "currently.";
+ });
+ return true;
+ }
+
+ // TODO: Current limitation: Since we split the inner loop latch at the point
+ // where the induction variable is incremented (induction.next), we cannot
+ // have more than one user of induction.next, since that would result in
+ // broken code after the split.
+ // e.g.
+ // for(i=0;i<N;i++) {
+ // for(j = 0;j<M;j++) {
+ // A[j+1][i+2] = A[j][i]+k;
+ // }
+ // }
+ Instruction *InnerIndexVarInc = nullptr;
+ if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
+ InnerIndexVarInc =
+ dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
+ else
+ InnerIndexVarInc =
+ dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
+
+ if (!InnerIndexVarInc) {
+ DEBUG(dbgs() << "Did not find an instruction to increment the induction "
+ << "variable.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "The inner loop does not increment the induction variable.";
+ });
+ return true;
+ }
+
+ // Since we split the inner loop latch on this induction variable, make sure
+ // we do not have any instruction between the induction variable and the
+ // branch instruction.
+
+ bool FoundInduction = false;
+ for (const Instruction &I : llvm::reverse(*InnerLoopLatch)) {
+ if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
+ isa<ZExtInst>(I))
+ continue;
+
+ // We found an instruction. If it is not the induction variable increment,
+ // then it is not safe to split this loop latch.
+ if (!I.isIdenticalTo(InnerIndexVarInc)) {
+ DEBUG(dbgs() << "Found unsupported instructions between induction "
+ << "variable increment and branch.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "UnsupportedInsBetweenInduction",
+ InnerLoop->getStartLoc(), InnerLoop->getHeader())
+ << "Found unsupported instruction between induction variable "
+ "increment and branch.";
+ });
+ return true;
+ }
+
+ FoundInduction = true;
+ break;
+ }
+ // The loop latch ended and we didn't find the induction variable; return as
+ // a current limitation.
+ if (!FoundInduction) {
+ DEBUG(dbgs() << "Did not find the induction variable.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Did not find the induction variable.";
+ });
+ return true;
+ }
+ return false;
+}
+
+bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+ if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
+ DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId
+ << " due to dependence\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Cannot interchange loops due to dependences.";
+ });
+ return false;
+ }
+
+ // Check if outer and inner loop contain legal instructions only.
+ for (auto *BB : OuterLoop->blocks())
+ for (Instruction &I : *BB)
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ // readnone functions do not prevent interchanging.
+ if (CI->doesNotReadMemory())
+ continue;
+ DEBUG(dbgs() << "Loops with call instructions cannot be interchanged "
+ << "safely.");
+ return false;
+ }
+
+ // Create unique preheaders if we do not already have them.
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+
+ // Create a unique outer preheader -
+ // 1) If OuterLoop preheader is not present.
+ // 2) If OuterLoop Preheader is same as OuterLoop Header
+ // 3) If OuterLoop Preheader is same as Header of the previous loop.
+ // 4) If OuterLoop Preheader is Entry node.
+ if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() ||
+ isa<PHINode>(OuterLoopPreHeader->begin()) ||
+ !OuterLoopPreHeader->getUniquePredecessor()) {
+ OuterLoopPreHeader =
+ InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA);
+ }
+
+ if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() ||
+ InnerLoopPreHeader == OuterLoop->getHeader()) {
+ InnerLoopPreHeader =
+ InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA);
+ }
+
+ // Bail out if the loops cannot be interchanged due to current limitations in
+ // the transform module. TODO: lift these limitations.
+ if (currentLimitations()) {
+ DEBUG(dbgs() << "Not legal because of current transform limitation\n");
+ return false;
+ }
+
+ // Check if the loops are tightly nested.
+ if (!tightlyNested(OuterLoop, InnerLoop)) {
+ DEBUG(dbgs() << "Loops not tightly nested\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Cannot interchange loops because they are not tightly "
+ "nested.";
+ });
+ return false;
+ }
+
+ return true;
+}
+
+int LoopInterchangeProfitability::getInstrOrderCost() {
+ unsigned GoodOrder, BadOrder;
+ BadOrder = GoodOrder = 0;
+ for (BasicBlock *BB : InnerLoop->blocks()) {
+ for (Instruction &Ins : *BB) {
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
+ unsigned NumOp = GEP->getNumOperands();
+ bool FoundInnerInduction = false;
+ bool FoundOuterInduction = false;
+ for (unsigned i = 0; i < NumOp; ++i) {
+ const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
+ if (!AR)
+ continue;
+
+ // If we find the inner induction after an outer induction e.g.
+ // for(int i=0;i<N;i++)
+ // for(int j=0;j<N;j++)
+ // A[i][j] = A[i-1][j-1]+k;
+ // then it is a good order.
+ if (AR->getLoop() == InnerLoop) {
+ // We found an InnerLoop induction after OuterLoop induction. It is
+ // a good order.
+ FoundInnerInduction = true;
+ if (FoundOuterInduction) {
+ GoodOrder++;
+ break;
+ }
+ }
+ // If we find the outer induction after an inner induction e.g.
+ // for(int i=0;i<N;i++)
+ // for(int j=0;j<N;j++)
+ // A[j][i] = A[j-1][i-1]+k;
+ // then it is a bad order.
+ if (AR->getLoop() == OuterLoop) {
+ // We found an OuterLoop induction after InnerLoop induction. It is
+ // a bad order.
+ FoundOuterInduction = true;
+ if (FoundInnerInduction) {
+ BadOrder++;
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ return GoodOrder - BadOrder;
+}
+
+static bool isProfitableForVectorization(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+ // TODO: Improve this heuristic to catch more cases.
+ // If the inner loop is loop-independent or doesn't carry any dependency, it
+ // is profitable to move it to the outer position.
+ for (auto &Row : DepMatrix) {
+ if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
+ return false;
+ // TODO: We need to improve this heuristic.
+ if (Row[OuterLoopId] != '=')
+ return false;
+ }
+ // If outer loop has dependence and inner loop is loop independent then it is
+ // profitable to interchange to enable parallelism.
+ return true;
+}
+
+bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+ // TODO: Add better profitability checks.
+ // e.g
+ // 1) Construct dependency matrix and move the one with no loop carried dep
+ // inside to enable vectorization.
+
+ // This is a rough cost estimation algorithm. It counts the good and bad
+ // orderings of induction variables in the instructions and allows reordering
+ // if the number of bad orderings is greater than the number of good ones.
+ int Cost = getInstrOrderCost();
+ DEBUG(dbgs() << "Cost = " << Cost << "\n");
+ if (Cost < -LoopInterchangeCostThreshold)
+ return true;
+
+ // It is not profitable as per current cache profitability model. But check if
+ // we can move this loop outside to improve parallelism.
+ if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
+ return true;
+
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Interchanging loops is too costly (cost="
+ << ore::NV("Cost", Cost) << ", threshold="
+ << ore::NV("Threshold", LoopInterchangeCostThreshold)
+ << ") and it does not improve parallelism.";
+ });
+ return false;
+}
+
+void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
+ Loop *InnerLoop) {
+ for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E;
+ ++I) {
+ if (*I == InnerLoop) {
+ OuterLoop->removeChildLoop(I);
+ return;
+ }
+ }
+ llvm_unreachable("Couldn't find loop");
+}
+
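+// Update LoopInfo after interchanging: detach InnerLoop from OuterLoop, hand
+// InnerLoop's children over to OuterLoop, and make OuterLoop a child of
+// InnerLoop, which takes OuterLoop's former position in the loop nest.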
+void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop,
+ Loop *OuterLoop) {
+ Loop *OuterLoopParent = OuterLoop->getParentLoop();
+ if (OuterLoopParent) {
+ // Remove the loop from its parent loop.
+ removeChildLoop(OuterLoopParent, OuterLoop);
+ removeChildLoop(OuterLoop, InnerLoop);
+ OuterLoopParent->addChildLoop(InnerLoop);
+ } else {
+ removeChildLoop(OuterLoop, InnerLoop);
+ LI->changeTopLevelLoop(OuterLoop, InnerLoop);
+ }
+
+ while (!InnerLoop->empty())
+ OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin()));
+
+ InnerLoop->addChildLoop(OuterLoop);
+}
+
+bool LoopInterchangeTransform::transform() {
+ bool Transformed = false;
+ Instruction *InnerIndexVar;
+
+ if (InnerLoop->getSubLoops().empty()) {
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ DEBUG(dbgs() << "Calling Split Inner Loop\n");
+ PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
+ if (!InductionPHI) {
+ DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
+ return false;
+ }
+
+ if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
+ InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
+ else
+ InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
+
+ // Ensure that InductionPHI is the first Phi node as required by
+ // splitInnerLoopHeader
+ if (&InductionPHI->getParent()->front() != InductionPHI)
+ InductionPHI->moveBefore(&InductionPHI->getParent()->front());
+
+ // Split at the place where the induction variable is
+ // incremented/decremented.
+ // TODO: This splitting logic may not work always. Fix this.
+ splitInnerLoopLatch(InnerIndexVar);
+ DEBUG(dbgs() << "splitInnerLoopLatch done\n");
+
+ // Splits the inner loops phi nodes out into a separate basic block.
+ splitInnerLoopHeader();
+ DEBUG(dbgs() << "splitInnerLoopHeader done\n");
+ }
+
+ Transformed |= adjustLoopLinks();
+ if (!Transformed) {
+ DEBUG(dbgs() << "adjustLoopLinks failed\n");
+ return false;
+ }
+
+ restructureLoops(InnerLoop, OuterLoop);
+ return true;
+}
+
+void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *InnerLoopLatchPred = InnerLoopLatch;
+ InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI);
+}
+
+void LoopInterchangeTransform::splitInnerLoopHeader() {
+ // Split the inner loop header out, making sure that the reduction PHIs stay
+ // in the inner loop body.
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ if (InnerLoopHasReduction) {
+ // Note: The induction PHI must be the first PHI for this to work
+ BasicBlock *New = InnerLoopHeader->splitBasicBlock(
+ ++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split");
+ if (LI)
+ if (Loop *L = LI->getLoopFor(InnerLoopHeader))
+ L->addBasicBlockToLoop(New, *LI);
+
+ // Adjust Reduction PHI's in the block.
+ SmallVector<PHINode *, 8> PHIVec;
+ for (auto I = New->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PHI = dyn_cast<PHINode>(I);
+ Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader);
+ PHI->replaceAllUsesWith(V);
+ PHIVec.push_back((PHI));
+ }
+ for (PHINode *P : PHIVec) {
+ P->eraseFromParent();
+ }
+ } else {
+ SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
+ }
+
+ DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
+ "InnerLoopHeader\n");
+}
+
+/// \brief Move all instructions except the terminator from FromBB right before
+/// InsertBefore
+static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
+ auto &ToList = InsertBefore->getParent()->getInstList();
+ auto &FromList = FromBB->getInstList();
+
+ ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(),
+ FromBB->getTerminator()->getIterator());
+}
+
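+// Rewrite every PHI node in CurrBlock so that incoming edges recorded as
+// coming from OldPred now come from NewPred.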
+void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
+ BasicBlock *OldPred,
+ BasicBlock *NewPred) {
+ for (auto I = CurrBlock->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PHI = cast<PHINode>(I);
+ unsigned Num = PHI->getNumIncomingValues();
+ for (unsigned i = 0; i < Num; ++i) {
+ if (PHI->getIncomingBlock(i) == OldPred)
+ PHI->setIncomingBlock(i, NewPred);
+ }
+ }
+}
+
+bool LoopInterchangeTransform::adjustLoopBranches() {
+ DEBUG(dbgs() << "adjustLoopBranches called\n");
+ // Adjust the loop preheader
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchPredecessor =
+ InnerLoopLatch->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchSuccessor;
+ BasicBlock *OuterLoopLatchSuccessor;
+
+ BranchInst *OuterLoopLatchBI =
+ dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
+ BranchInst *InnerLoopLatchBI =
+ dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
+ BranchInst *OuterLoopHeaderBI =
+ dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+ BranchInst *InnerLoopHeaderBI =
+ dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
+
+ if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
+ !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
+ !InnerLoopHeaderBI)
+ return false;
+
+ BranchInst *InnerLoopLatchPredecessorBI =
+ dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
+ BranchInst *OuterLoopPredecessorBI =
+ dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
+
+ if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
+ return false;
+ BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
+ if (!InnerLoopHeaderSuccessor)
+ return false;
+
+ // Adjust Loop Preheader and headers
+
+ unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader)
+ OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader);
+ }
+
+ NumSucc = OuterLoopHeaderBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)
+ OuterLoopHeaderBI->setSuccessor(i, LoopExit);
+ else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader)
+ OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor);
+ }
+
+ // Adjust reduction PHI's now that the incoming block has changed.
+ updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader,
+ OuterLoopHeader);
+
+ BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI);
+ InnerLoopHeaderBI->eraseFromParent();
+
+ // -------------Adjust loop latches-----------
+ if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
+ else
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
+
+ NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch)
+ InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor);
+ }
+
+ // Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with
+ // the value and remove this PHI node from inner loop.
+ SmallVector<PHINode *, 8> LcssaVec;
+ for (auto I = InnerLoopLatchSuccessor->begin(); isa<PHINode>(I); ++I) {
+ PHINode *LcssaPhi = cast<PHINode>(I);
+ LcssaVec.push_back(LcssaPhi);
+ }
+ for (PHINode *P : LcssaVec) {
+ Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch);
+ P->replaceAllUsesWith(Incoming);
+ P->eraseFromParent();
+ }
+
+ if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
+ else
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
+
+ if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor)
+ InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor);
+ else
+ InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor);
+
+ updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch);
+
+ if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor) {
+ OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch);
+ } else {
+ OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch);
+ }
+
+ return true;
+}
+
+void LoopInterchangeTransform::adjustLoopPreheaders() {
+ // We have interchanged the preheaders so we need to interchange the data in
+ // the preheader as well.
+ // This is because the content of inner preheader was previously executed
+ // inside the outer loop.
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BranchInst *InnerTermBI =
+ cast<BranchInst>(InnerLoopPreHeader->getTerminator());
+
+ // These instructions should now be executed inside the loop.
+ // Move instruction into a new block after outer header.
+ moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator());
+ // These instructions were not executed previously in the loop so move them to
+ // the older inner loop preheader.
+ moveBBContents(OuterLoopPreHeader, InnerTermBI);
+}
+
+bool LoopInterchangeTransform::adjustLoopLinks() {
+ // Adjust all branches in the inner and outer loop.
+ bool Changed = adjustLoopBranches();
+ if (Changed)
+ adjustLoopPreheaders();
+ return Changed;
+}
+
+char LoopInterchange::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange",
+ "Interchanges loops for cache reuse", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+
+INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
+ "Interchanges loops for cache reuse", false, false)
+
+Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
new file mode 100644
index 000000000000..dfa5ec1f354d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -0,0 +1,679 @@
+//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a loop-aware load elimination pass.
+//
+// It uses LoopAccessAnalysis to identify loop-carried dependences with a
+// distance of one between stores and loads. These form the candidates for the
+// transformation. The source value of each store is then propagated to the
+// users of the corresponding load. This makes the load dead.
+//
+// The pass can also version the loop and add memchecks in order to prove that
+// may-aliasing stores can't change the value in memory before it's read by the
+// load.
+//
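+// As a simple illustration (with hypothetical arrays A, B and C), in
+//
+//   for (unsigned i = 0; i < N - 1; ++i) {
+//     A[i + 1] = B[i] + 1;   // store
+//     C[i]     = A[i] * 2;   // loads the value stored one iteration earlier
+//   }
+//
+// the load of A[i] can be replaced by the value stored to A[i + 1] on the
+// previous iteration, carried across iterations through a PHI, which makes the
+// load dead.
+//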
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include <algorithm>
+#include <cassert>
+#include <forward_list>
+#include <set>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define LLE_OPTION "loop-load-elim"
+#define DEBUG_TYPE LLE_OPTION
+
+static cl::opt<unsigned> CheckPerElim(
+ "runtime-check-per-loop-load-elim", cl::Hidden,
+ cl::desc("Max number of memchecks allowed per eliminated load on average"),
+ cl::init(1));
+
+static cl::opt<unsigned> LoadElimSCEVCheckThreshold(
+ "loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed for Loop "
+ "Load Elimination"));
+
+STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
+
+namespace {
+
+/// \brief Represent a store-to-forwarding candidate.
+struct StoreToLoadForwardingCandidate {
+ LoadInst *Load;
+ StoreInst *Store;
+
+ StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
+ : Load(Load), Store(Store) {}
+
+ /// \brief Return true if the dependence from the store to the load has a
+ /// distance of one. E.g. A[i+1] = A[i]
+ bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
+ Loop *L) const {
+ Value *LoadPtr = Load->getPointerOperand();
+ Value *StorePtr = Store->getPointerOperand();
+ Type *LoadPtrType = LoadPtr->getType();
+ Type *LoadType = LoadPtrType->getPointerElementType();
+
+ assert(LoadPtrType->getPointerAddressSpace() ==
+ StorePtr->getType()->getPointerAddressSpace() &&
+ LoadType == StorePtr->getType()->getPointerElementType() &&
+ "Should be a known dependence");
+
+ // Currently we only support accesses with unit stride. FIXME: we should be
+ // able to handle non-unit stride as well, as long as the stride is equal to
+ // the dependence distance.
+ if (getPtrStride(PSE, LoadPtr, L) != 1 ||
+ getPtrStride(PSE, StorePtr, L) != 1)
+ return false;
+
+ auto &DL = Load->getParent()->getModule()->getDataLayout();
+ unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType));
+
+ auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr));
+ auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr));
+
+ // We don't need to check non-wrapping here because forward/backward
+ // dependence wouldn't be valid if these weren't monotonic accesses.
+ auto *Dist = cast<SCEVConstant>(
+ PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV));
+ const APInt &Val = Dist->getAPInt();
+ return Val == TypeByteSize;
+ }
+
+ Value *getLoadPtr() const { return Load->getPointerOperand(); }
+
+#ifndef NDEBUG
+ friend raw_ostream &operator<<(raw_ostream &OS,
+ const StoreToLoadForwardingCandidate &Cand) {
+ OS << *Cand.Store << " -->\n";
+ OS.indent(2) << *Cand.Load << "\n";
+ return OS;
+ }
+#endif
+};
+
+} // end anonymous namespace
+
+/// \brief Check if the store dominates all latches, so as long as there is no
+/// intervening store this value will be loaded in the next iteration.
+static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
+ DominatorTree *DT) {
+ SmallVector<BasicBlock *, 8> Latches;
+ L->getLoopLatches(Latches);
+ return llvm::all_of(Latches, [&](const BasicBlock *Latch) {
+ return DT->dominates(StoreBlock, Latch);
+ });
+}
+
+/// \brief Return true if the load is not executed on all paths in the loop.
+static bool isLoadConditional(LoadInst *Load, Loop *L) {
+ return Load->getParent() != L->getHeader();
+}
+
+namespace {
+
+/// \brief The per-loop class that does most of the work.
+class LoadEliminationForLoop {
+public:
+ LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
+ DominatorTree *DT)
+ : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.getPSE()) {}
+
+ /// \brief Look through the loop-carried and loop-independent dependences in
+ /// this loop and find store->load dependences.
+ ///
+ /// Note that no candidate is returned if LAA has failed to analyze the loop
+ /// (e.g. if it's not bottom-tested, contains volatile memops, etc.)
+ std::forward_list<StoreToLoadForwardingCandidate>
+ findStoreToLoadDependences(const LoopAccessInfo &LAI) {
+ std::forward_list<StoreToLoadForwardingCandidate> Candidates;
+
+ const auto *Deps = LAI.getDepChecker().getDependences();
+ if (!Deps)
+ return Candidates;
+
+ // Find store->load dependences (consequently true dep). Both lexically
+ // forward and backward dependences qualify. Disqualify loads that have
+ // other unknown dependences.
+
+ SmallSet<Instruction *, 4> LoadsWithUnknownDepedence;
+
+ for (const auto &Dep : *Deps) {
+ Instruction *Source = Dep.getSource(LAI);
+ Instruction *Destination = Dep.getDestination(LAI);
+
+ if (Dep.Type == MemoryDepChecker::Dependence::Unknown) {
+ if (isa<LoadInst>(Source))
+ LoadsWithUnknownDependence.insert(Source);
+ if (isa<LoadInst>(Destination))
+ LoadsWithUnknownDependence.insert(Destination);
+ continue;
+ }
+
+ if (Dep.isBackward())
+ // Note that the designations source and destination follow the program
+ // order, i.e. source is always first. (The direction is given by the
+ // DepType.)
+ std::swap(Source, Destination);
+ else
+ assert(Dep.isForward() && "Needs to be a forward dependence");
+
+ auto *Store = dyn_cast<StoreInst>(Source);
+ if (!Store)
+ continue;
+ auto *Load = dyn_cast<LoadInst>(Destination);
+ if (!Load)
+ continue;
+
+ // Only propagate the value if the pointer operands have the same type.
+ if (Store->getPointerOperandType() != Load->getPointerOperandType())
+ continue;
+
+ Candidates.emplace_front(Load, Store);
+ }
+
+ if (!LoadsWithUnknownDependence.empty())
+ Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) {
+ return LoadsWithUnknownDependence.count(C.Load);
+ });
+
+ return Candidates;
+ }
+
+ /// \brief Return the index of the instruction according to program order.
+ unsigned getInstrIndex(Instruction *Inst) {
+ auto I = InstOrder.find(Inst);
+ assert(I != InstOrder.end() && "No index for instruction");
+ return I->second;
+ }
+
+ /// \brief If a load has multiple candidates associated (i.e. different
+ /// stores), it means that it could be forwarding from multiple stores
+ /// depending on control flow. Remove these candidates.
+ ///
+ /// Here, we rely on LAA to include the relevant loop-independent dependences.
+ /// LAA is known to omit these in the very simple case when the read and the
+ /// write within an alias set always take place using the *same* pointer.
+ ///
+ /// However, we know that this is not the case here, i.e. we can rely on LAA
+ /// to provide us with loop-independent dependences for the cases we're
+ /// interested in. Consider for example the case where a loop-independent
+ /// dependence S1->S2 invalidates the forwarding S3->S2.
+ ///
+ /// A[i] = ... (S1)
+ /// ... = A[i] (S2)
+ /// A[i+1] = ... (S3)
+ ///
+ /// LAA will perform dependence analysis here because there are two
+ /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]).
+ void removeDependencesFromMultipleStores(
+ std::forward_list<StoreToLoadForwardingCandidate> &Candidates) {
+ // If Store is nullptr it means that we have multiple stores forwarding to
+ // this load.
+ using LoadToSingleCandT =
+ DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *>;
+ LoadToSingleCandT LoadToSingleCand;
+
+ for (const auto &Cand : Candidates) {
+ bool NewElt;
+ LoadToSingleCandT::iterator Iter;
+
+ std::tie(Iter, NewElt) =
+ LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand));
+ if (!NewElt) {
+ const StoreToLoadForwardingCandidate *&OtherCand = Iter->second;
+ // Already multiple stores forward to this load.
+ if (OtherCand == nullptr)
+ continue;
+
+ // Handle the very basic case when the two stores are in the same block
+ // so deciding which one forwards is easy. The later one forwards as
+ // long as they both have a dependence distance of one to the load.
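+ // (For example, with A[i+1] = x; A[i+1] = y; ... = A[i]; both stores have
+ // a dependence distance of one to the load, and it is the later store of y
+ // whose value reaches the load on the next iteration.)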
+ if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
+ Cand.isDependenceDistanceOfOne(PSE, L) &&
+ OtherCand->isDependenceDistanceOfOne(PSE, L)) {
+ // They are in the same block, the later one will forward to the load.
+ if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
+ OtherCand = &Cand;
+ } else
+ OtherCand = nullptr;
+ }
+ }
+
+ Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) {
+ if (LoadToSingleCand[Cand.Load] != &Cand) {
+ DEBUG(dbgs() << "Removing from candidates: \n" << Cand
+ << " The load may have multiple stores forwarding to "
+ << "it\n");
+ return true;
+ }
+ return false;
+ });
+ }
+
+ /// \brief Given two pointer operations identified by their
+ /// RuntimePointerChecking indices, return true if they require an alias
+ /// check.
+ ///
+ /// We need a check if one is a pointer for a candidate load and the other is
+ /// a pointer for a possibly intervening store.
+ bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
+ const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath,
+ const std::set<Value *> &CandLoadPtrs) {
+ Value *Ptr1 =
+ LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
+ Value *Ptr2 =
+ LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue;
+ return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) ||
+ (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1)));
+ }
+
+ /// \brief Return pointers that are possibly written to on the path from a
+ /// forwarding store to a load.
+ ///
+ /// These pointers need to be alias-checked against the forwarding candidates.
+ SmallSet<Value *, 4> findPointersWrittenOnForwardingPath(
+ const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+ // From FirstStore to LastLoad, none of the elimination candidate loads
+ // should overlap with any of the stores.
+ //
+ // E.g.:
+ //
+ // st1 C[i]
+ // ld1 B[i] <-------,
+ // ld0 A[i] <----, | * LastLoad
+ // ... | |
+ // st2 E[i] | |
+ // st3 B[i+1] -- | -' * FirstStore
+ // st0 A[i+1] ---'
+ // st4 D[i]
+ //
+ // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with
+ // ld0.
+
+ LoadInst *LastLoad =
+ std::max_element(Candidates.begin(), Candidates.end(),
+ [&](const StoreToLoadForwardingCandidate &A,
+ const StoreToLoadForwardingCandidate &B) {
+ return getInstrIndex(A.Load) < getInstrIndex(B.Load);
+ })
+ ->Load;
+ StoreInst *FirstStore =
+ std::min_element(Candidates.begin(), Candidates.end(),
+ [&](const StoreToLoadForwardingCandidate &A,
+ const StoreToLoadForwardingCandidate &B) {
+ return getInstrIndex(A.Store) <
+ getInstrIndex(B.Store);
+ })
+ ->Store;
+
+ // We're looking for stores after the first forwarding store until the end
+ // of the loop, then from the beginning of the loop until the last
+ // forwarded-to load. Collect the pointers of those stores.
+ SmallSet<Value *, 4> PtrsWrittenOnFwdingPath;
+
+ auto InsertStorePtr = [&](Instruction *I) {
+ if (auto *S = dyn_cast<StoreInst>(I))
+ PtrsWrittenOnFwdingPath.insert(S->getPointerOperand());
+ };
+ const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions();
+ std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1,
+ MemInstrs.end(), InsertStorePtr);
+ std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)],
+ InsertStorePtr);
+
+ return PtrsWrittenOnFwdingPath;
+ }
+
+ /// \brief Determine the pointer alias checks to prove that there are no
+ /// intervening stores.
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks(
+ const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+
+ SmallSet<Value *, 4> PtrsWrittenOnFwdingPath =
+ findPointersWrittenOnForwardingPath(Candidates);
+
+ // Collect the pointers of the candidate loads.
+ // FIXME: SmallSet does not work with std::inserter.
+ std::set<Value *> CandLoadPtrs;
+ transform(Candidates,
+ std::inserter(CandLoadPtrs, CandLoadPtrs.begin()),
+ std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr));
+
+ const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
+
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ for (auto PtrIdx1 : Check.first->Members)
+ for (auto PtrIdx2 : Check.second->Members)
+ if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath,
+ CandLoadPtrs))
+ return true;
+ return false;
+ });
+
+ DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n");
+ DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+
+ return Checks;
+ }
+
+ /// \brief Perform the transformation for a candidate.
+ void
+ propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
+ SCEVExpander &SEE) {
+ // loop:
+ // %x = load %gep_i
+ // = ... %x
+ // store %y, %gep_i_plus_1
+ //
+ // =>
+ //
+ // ph:
+ // %x.initial = load %gep_0
+ // loop:
+ // %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+ // %x = load %gep_i <---- now dead
+ // = ... %x.storeforward
+ // store %y, %gep_i_plus_1
+
+ Value *Ptr = Cand.Load->getPointerOperand();
+ auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr));
+ auto *PH = L->getLoopPreheader();
+ Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
+ PH->getTerminator());
+ Value *Initial =
+ new LoadInst(InitialPtr, "load_initial", /* isVolatile */ false,
+ Cand.Load->getAlignment(), PH->getTerminator());
+
+ PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
+ &L->getHeader()->front());
+ PHI->addIncoming(Initial, PH);
+ PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch());
+
+ Cand.Load->replaceAllUsesWith(PHI);
+ }
+
+ /// \brief Top-level driver for each loop: find store->load forwarding
+ /// candidates, add run-time checks and perform transformation.
+ bool processLoop() {
+ DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
+ << "\" checking " << *L << "\n");
+
+ // Look for store-to-load forwarding cases across the
+ // backedge. E.g.:
+ //
+ // loop:
+ // %x = load %gep_i
+ // = ... %x
+ // store %y, %gep_i_plus_1
+ //
+ // =>
+ //
+ // ph:
+ // %x.initial = load %gep_0
+ // loop:
+ // %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+ // %x = load %gep_i <---- now dead
+ // = ... %x.storeforward
+ // store %y, %gep_i_plus_1
+
+ // First start with store->load dependences.
+ auto StoreToLoadDependences = findStoreToLoadDependences(LAI);
+ if (StoreToLoadDependences.empty())
+ return false;
+
+ // Generate an index for each load and store according to the original
+ // program order. This will be used later.
+ InstOrder = LAI.getDepChecker().generateInstructionOrderMap();
+
+ // To keep things simple for now, remove those where the load is potentially
+ // fed by multiple stores.
+ removeDependencesFromMultipleStores(StoreToLoadDependences);
+ if (StoreToLoadDependences.empty())
+ return false;
+
+ // Filter the candidates further.
+ SmallVector<StoreToLoadForwardingCandidate, 4> Candidates;
+ unsigned NumForwarding = 0;
+ for (const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) {
+ DEBUG(dbgs() << "Candidate " << Cand);
+
+ // Make sure that the stored value is available everywhere in the loop in
+ // the next iteration.
+ if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT))
+ continue;
+
+ // If the load is conditional we can't hoist its 0-iteration instance to
+ // the preheader because that would make it unconditional. Thus we would
+ // access a memory location that the original loop did not access.
+ if (isLoadConditional(Cand.Load, L))
+ continue;
+
+ // Check whether the SCEV difference is the same as the induction step,
+ // thus we load the value in the next iteration.
+ if (!Cand.isDependenceDistanceOfOne(PSE, L))
+ continue;
+
+ ++NumForwarding;
+ DEBUG(dbgs()
+ << NumForwarding
+ << ". Valid store-to-load forwarding across the loop backedge\n");
+ Candidates.push_back(Cand);
+ }
+ if (Candidates.empty())
+ return false;
+
+ // Check intervening may-alias stores. These need runtime checks for alias
+ // disambiguation.
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks =
+ collectMemchecks(Candidates);
+
+ // Too many checks are likely to outweigh the benefits of forwarding.
+ if (Checks.size() > Candidates.size() * CheckPerElim) {
+ DEBUG(dbgs() << "Too many run-time checks needed.\n");
+ return false;
+ }
+
+ if (LAI.getPSE().getUnionPredicate().getComplexity() >
+ LoadElimSCEVCheckThreshold) {
+ DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
+ return false;
+ }
+
+ if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
+ if (L->getHeader()->getParent()->optForSize()) {
+ DEBUG(dbgs() << "Versioning is needed but not allowed when optimizing "
+ "for size.\n");
+ return false;
+ }
+
+ if (!L->isLoopSimplifyForm()) {
+ DEBUG(dbgs() << "Loop is not is loop-simplify form");
+ return false;
+ }
+
+ // Point of no-return, start the transformation. First, version the loop
+ // if necessary.
+
+ LoopVersioning LV(LAI, L, LI, DT, PSE.getSE(), false);
+ LV.setAliasChecks(std::move(Checks));
+ LV.setSCEVChecks(LAI.getPSE().getUnionPredicate());
+ LV.versionLoop();
+ }
+
+ // Next, propagate the value stored by the store to the users of the load.
+ // Also for the first iteration, generate the initial value of the load.
+ SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(),
+ "storeforward");
+ for (const auto &Cand : Candidates)
+ propagateStoredValueToLoadUsers(Cand, SEE);
+ NumLoopLoadEliminted += NumForwarding;
+
+ return true;
+ }
+
+private:
+ Loop *L;
+
+ /// \brief Maps the load/store instructions to their index according to
+ /// program order.
+ DenseMap<Instruction *, unsigned> InstOrder;
+
+ // Analyses used.
+ LoopInfo *LI;
+ const LoopAccessInfo &LAI;
+ DominatorTree *DT;
+ PredicatedScalarEvolution PSE;
+};
+
+} // end anonymous namespace
+
+static bool
+eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
+ function_ref<const LoopAccessInfo &(Loop &)> GetLAI) {
+ // Build up a worklist of inner-loops to transform to avoid iterator
+ // invalidation.
+ // FIXME: This logic comes from other passes that actually change the loop
+ // nest structure. It isn't clear this is necessary (or useful) for a pass
+ // which merely optimizes the use of loads in a loop.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
+ if (L->empty())
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ // The actual work is performed by LoadEliminationForLoop.
+ LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT);
+ Changed |= LEL.processLoop();
+ }
+ return Changed;
+}
+
+namespace {
+
+/// \brief The pass. Most of the work is delegated to the per-loop
+/// LoadEliminationForLoop class.
+class LoopLoadElimination : public FunctionPass {
+public:
+ static char ID;
+
+ LoopLoadElimination() : FunctionPass(ID) {
+ initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ // Process each loop nest in the function.
+ return eliminateLoadsAcrossLoops(
+ F, LI, DT,
+ [&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); });
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char LoopLoadElimination::ID;
+
+static const char LLE_name[] = "Loop Load Elimination";
+
+INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+
+FunctionPass *llvm::createLoopLoadEliminationPass() {
+ return new LoopLoadElimination();
+}
+
+PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ bool Changed = eliminateLoadsAcrossLoops(
+ F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & {
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI,
+ SE, TLI, TTI, nullptr};
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ });
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
new file mode 100644
index 000000000000..10f6fcdcfdb7
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -0,0 +1,92 @@
+//===- LoopPassManager.cpp - Loop pass management -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+
+using namespace llvm;
+
+// Explicit template instantiations and specialization definitions for core
+// template typedefs.
+namespace llvm {
+template class PassManager<Loop, LoopAnalysisManager,
+ LoopStandardAnalysisResults &, LPMUpdater &>;
+
+/// Explicitly specialize the pass manager's run method to handle loop nest
+/// structure updates.
+template <>
+PreservedAnalyses
+PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
+ LPMUpdater &>::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &U) {
+ PreservedAnalyses PA = PreservedAnalyses::all();
+
+ if (DebugLogging)
+ dbgs() << "Starting Loop pass manager run.\n";
+
+ for (auto &Pass : Passes) {
+ if (DebugLogging)
+ dbgs() << "Running pass: " << Pass->name() << " on " << L;
+
+ PreservedAnalyses PassPA = Pass->run(L, AM, AR, U);
+
+ // If the loop was deleted, abort the run and return to the outer walk.
+ if (U.skipCurrentLoop()) {
+ PA.intersect(std::move(PassPA));
+ break;
+ }
+
+#ifndef NDEBUG
+ // Verify the loop structure and LCSSA form before visiting the loop.
+ L.verifyLoop();
+ assert(L.isRecursivelyLCSSAForm(AR.DT, AR.LI) &&
+ "Loops must remain in LCSSA form!");
+#endif
+
+ // Update the analysis manager as each pass runs and potentially
+ // invalidates analyses.
+ AM.invalidate(L, PassPA);
+
+ // Finally, we intersect the final preserved analyses to compute the
+ // aggregate preserved set for this pass manager.
+ PA.intersect(std::move(PassPA));
+
+ // FIXME: Historically, the pass managers all called the LLVM context's
+ // yield function here. We don't have a generic way to acquire the
+ // context and it isn't yet clear what the right pattern is for yielding
+ // in the new pass manager so it is currently omitted.
+ // ...getContext().yield();
+ }
+
+ // Invalidation for the current loop should be handled above, and other loop
+ // analysis results shouldn't be impacted by runs over this loop. Therefore,
+ // the remaining analysis results in the AnalysisManager are preserved. We
+ // mark this with a set so that we don't need to inspect each one
+ // individually.
+ // FIXME: This isn't correct! This loop and all nested loops' analyses should
+ // be preserved, but unrolling should invalidate the parent loop's analyses.
+ PA.preserveSet<AllAnalysesOn<Loop>>();
+
+ if (DebugLogging)
+ dbgs() << "Finished Loop pass manager run.\n";
+
+ return PA;
+}
+} // end namespace llvm
+
+PrintLoopPass::PrintLoopPass() : OS(dbgs()) {}
+PrintLoopPass::PrintLoopPass(raw_ostream &OS, const std::string &Banner)
+ : OS(OS), Banner(Banner) {}
+
+PreservedAnalyses PrintLoopPass::run(Loop &L, LoopAnalysisManager &,
+ LoopStandardAnalysisResults &,
+ LPMUpdater &) {
+ printLoop(L, OS, Banner);
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
new file mode 100644
index 000000000000..2e4c7b19e476
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -0,0 +1,750 @@
+//===-- LoopPredication.cpp - Guard based loop predication pass -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LoopPredication pass tries to convert loop variant range checks to loop
+// invariant by widening checks across loop iterations. For example, it will
+// convert
+//
+// for (i = 0; i < n; i++) {
+// guard(i < len);
+// ...
+// }
+//
+// to
+//
+// for (i = 0; i < n; i++) {
+// guard(n - 1 < len);
+// ...
+// }
+//
+// After this transformation the condition of the guard is loop invariant, so
+// loop-unswitch can later unswitch the loop by this condition which basically
+// predicates the loop by the widened condition:
+//
+// if (n - 1 < len)
+// for (i = 0; i < n; i++) {
+// ...
+// }
+// else
+// deoptimize
+//
+// It's tempting to rely on SCEV here, but it has proven to be problematic.
+// Generally the facts SCEV provides about the increment step of add
+// recurrences are true if the backedge of the loop is taken, which implicitly
+// assumes that the guard doesn't fail. Using these facts to optimize the
+// guard results in a circular logic where the guard is optimized under the
+// assumption that it never fails.
+//
+// For example, in the loop below the induction variable will be marked as nuw
+// based on the guard. Based on nuw, the guard predicate will be considered
+// monotonic. Given a monotonic condition it's tempting to replace the induction
+// variable in the condition with its value on the last iteration. But this
+// transformation is not correct, e.g. e = 4, b = 5 breaks the loop.
+//
+// for (int i = b; i != e; i++)
+// guard(i u< len)
+//
+// One of the ways to reason about this problem is to use an inductive proof
+// approach. Given the loop:
+//
+// if (B(0)) {
+// do {
+// I = PHI(0, I.INC)
+// I.INC = I + Step
+// guard(G(I));
+// } while (B(I));
+// }
+//
+// where B(x) and G(x) are predicates that map integers to booleans, we want a
+// loop invariant expression M such that the following program has the same
+// semantics as the above:
+//
+// if (B(0)) {
+// do {
+// I = PHI(0, I.INC)
+// I.INC = I + Step
+// guard(G(0) && M);
+// } while (B(I));
+// }
+//
+// One solution for M is M = forall X . (G(X) && B(X)) => G(X + Step)
+//
+// Informal proof that the transformation above is correct:
+//
+// By the definition of guards we can rewrite the guard condition to:
+// G(I) && G(0) && M
+//
+// Let's prove that for each iteration of the loop:
+// G(0) && M => G(I)
+// Then the guard condition above can be simplified to G(0) && M.
+//
+// Induction base.
+// G(0) && M => G(0)
+//
+// Induction step. Assuming G(0) && M => G(I), prove that the same holds on
+// the subsequent iteration, i.e. G(0) && M => G(I + Step):
+//
+// B(I) is true because it's the backedge condition.
+// G(I) is true because the backedge is guarded by this condition.
+//
+// So M = forall X . (G(X) && B(X)) => G(X + Step) implies G(I + Step).
+//
+// Note that we can use anything stronger than M, i.e. any condition which
+// implies M.
+//
+// When Step = 1 (i.e. a forward iterating loop), the transformation is supported
+// when:
+// * The loop has a single latch with the condition of the form:
+// B(X) = latchStart + X <pred> latchLimit,
+// where <pred> is u<, u<=, s<, or s<=.
+// * The guard condition is of the form
+// G(X) = guardStart + X u< guardLimit
+//
+// For the ult latch comparison case M is:
+// forall X . guardStart + X u< guardLimit && latchStart + X u< latchLimit =>
+// guardStart + X + 1 u< guardLimit
+//
+// The only way the antecedent can be true and the consequent can be false is
+// if
+// X == guardLimit - 1 - guardStart
+// (and guardLimit is non-zero, but we won't use this latter fact).
+// If X == guardLimit - 1 - guardStart then the second half of the antecedent is
+// latchStart + guardLimit - 1 - guardStart u< latchLimit
+// and its negation is
+// latchStart + guardLimit - 1 - guardStart u>= latchLimit
+//
+// In other words, if
+// latchLimit u<= latchStart + guardLimit - 1 - guardStart
+// then:
+// (the ranges below are written in ConstantRange notation, where [A, B) is the
+// set for (I = A; I != B; I++ /*maywrap*/) yield(I);)
+//
+// forall X . guardStart + X u< guardLimit &&
+// latchStart + X u< latchLimit =>
+// guardStart + X + 1 u< guardLimit
+// == forall X . guardStart + X u< guardLimit &&
+// latchStart + X u< latchStart + guardLimit - 1 - guardStart =>
+// guardStart + X + 1 u< guardLimit
+// == forall X . (guardStart + X) in [0, guardLimit) &&
+// (latchStart + X) in [0, latchStart + guardLimit - 1 - guardStart) =>
+// (guardStart + X + 1) in [0, guardLimit)
+// == forall X . X in [-guardStart, guardLimit - guardStart) &&
+// X in [-latchStart, guardLimit - 1 - guardStart) =>
+// X in [-guardStart - 1, guardLimit - guardStart - 1)
+// == true
+//
+// So the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart u>= latchLimit
+// Similarly for ule condition the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart u> latchLimit
+// For slt condition the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart s>= latchLimit
+// For sle condition the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart s> latchLimit
+//
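+// As a concrete illustration (using the example from the top of this comment
+// and assuming the rotated latch compares the incremented IV, so that
+// latchStart = 1): for the guard i u< len inside for (i = 0; i < n; i++) we
+// have guardStart = 0, guardLimit = len, latchStart = 1, latchLimit = n and
+// a u< latch predicate. The widened condition is then
+// 0 u< len && 1 + len - 1 - 0 u>= n
+// which simplifies to len != 0 && len u>= n, i.e. the loop-invariant
+// n - 1 u< len check shown in the introduction (given that the guard only
+// executes when the loop body runs, so n u>= 1).
+//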
+// When Step = -1 (i.e. a reverse iterating loop), the transformation is supported
+// when:
+// * The loop has a single latch with the condition of the form:
+// B(X) = X <pred> latchLimit, where <pred> is u> or s>.
+// * The guard condition is of the form
+// G(X) = X - 1 u< guardLimit
+//
+// For the ugt latch comparison case M is:
+// forall X. X-1 u< guardLimit and X u> latchLimit => X-2 u< guardLimit
+//
+// The only way the antecedent can be true and the consequent can be false is if
+// X == 1.
+// If X == 1 then the second half of the antecedent is
+// 1 u> latchLimit, and its negation is latchLimit u>= 1.
+//
+// So the widened condition is:
+// guardStart u< guardLimit && latchLimit u>= 1.
+// Similarly for sgt condition the widened condition is:
+// guardStart u< guardLimit && latchLimit s>= 1.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopPredication.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#define DEBUG_TYPE "loop-predication"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation",
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> EnableCountDownLoop("loop-predication-enable-count-down-loop",
+ cl::Hidden, cl::init(true));
+namespace {
+class LoopPredication {
+ /// Represents an induction variable check:
+ /// icmp Pred, <induction variable>, <loop invariant limit>
+ struct LoopICmp {
+ ICmpInst::Predicate Pred;
+ const SCEVAddRecExpr *IV;
+ const SCEV *Limit;
+ LoopICmp(ICmpInst::Predicate Pred, const SCEVAddRecExpr *IV,
+ const SCEV *Limit)
+ : Pred(Pred), IV(IV), Limit(Limit) {}
+ LoopICmp() {}
+ void dump() {
+ dbgs() << "LoopICmp Pred = " << Pred << ", IV = " << *IV
+ << ", Limit = " << *Limit << "\n";
+ }
+ };
+
+ ScalarEvolution *SE;
+
+ Loop *L;
+ const DataLayout *DL;
+ BasicBlock *Preheader;
+ LoopICmp LatchCheck;
+
+ bool isSupportedStep(const SCEV* Step);
+ Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI) {
+ return parseLoopICmp(ICI->getPredicate(), ICI->getOperand(0),
+ ICI->getOperand(1));
+ }
+ Optional<LoopICmp> parseLoopICmp(ICmpInst::Predicate Pred, Value *LHS,
+ Value *RHS);
+
+ Optional<LoopICmp> parseLoopLatchICmp();
+
+ bool CanExpand(const SCEV* S);
+ Value *expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder,
+ ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS,
+ Instruction *InsertAt);
+
+ Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
+ IRBuilder<> &Builder);
+ Optional<Value *> widenICmpRangeCheckIncrementingLoop(LoopICmp LatchCheck,
+ LoopICmp RangeCheck,
+ SCEVExpander &Expander,
+ IRBuilder<> &Builder);
+ Optional<Value *> widenICmpRangeCheckDecrementingLoop(LoopICmp LatchCheck,
+ LoopICmp RangeCheck,
+ SCEVExpander &Expander,
+ IRBuilder<> &Builder);
+ bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
+
+ // When the IV type is wider than the range operand type, we can still do loop
+ // predication, by generating SCEVs for the range and latch that are of the
+ // same type. We achieve this by generating a SCEV truncate expression for the
+ // latch IV. This is done iff truncation of the IV is a safe operation,
+ // without loss of information.
+ // Another way to achieve this is by generating a wider type SCEV for the
+ // range check operand, however, this needs a more involved check that
+ // operands do not overflow. This can lead to loss of information when the
+ // range operand is of the form: add i32 %offset, %iv. We need to prove that
+ // sext(x + y) is the same as sext(x) + sext(y).
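+ // (As an illustration of why this needs an overflow proof: with i32 values
+ // x = 2^31 - 1 and y = 1, x + y wraps to -2^31, so sext(x + y) yields
+ // -2^31 while sext(x) + sext(y) yields 2^31; the two only agree when the
+ // addition is known not to sign-wrap.)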
+ // This function returns true if we can safely represent the IV type in
+ // the RangeCheckType without loss of information.
+ bool isSafeToTruncateWideIVType(Type *RangeCheckType);
+ // Return the loopLatchCheck corresponding to the RangeCheckType if safe to do
+ // so.
+ Optional<LoopICmp> generateLoopLatchCheck(Type *RangeCheckType);
+public:
+ LoopPredication(ScalarEvolution *SE) : SE(SE) {}
+ bool runOnLoop(Loop *L);
+};
+
+class LoopPredicationLegacyPass : public LoopPass {
+public:
+ static char ID;
+ LoopPredicationLegacyPass() : LoopPass(ID) {
+ initializeLoopPredicationLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ getLoopAnalysisUsage(AU);
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LoopPredication LP(SE);
+ return LP.runOnLoop(L);
+ }
+};
+
+char LoopPredicationLegacyPass::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+
+Pass *llvm::createLoopPredicationPass() {
+ return new LoopPredicationLegacyPass();
+}
+
+PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ LoopPredication LP(&AR.SE);
+ if (!LP.runOnLoop(&L))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+Optional<LoopPredication::LoopICmp>
+LoopPredication::parseLoopICmp(ICmpInst::Predicate Pred, Value *LHS,
+ Value *RHS) {
+ const SCEV *LHSS = SE->getSCEV(LHS);
+ if (isa<SCEVCouldNotCompute>(LHSS))
+ return None;
+ const SCEV *RHSS = SE->getSCEV(RHS);
+ if (isa<SCEVCouldNotCompute>(RHSS))
+ return None;
+
+ // Canonicalize RHS to be the loop-invariant bound and LHS the
+ // loop-computable IV.
+ if (SE->isLoopInvariant(LHSS, L)) {
+ std::swap(LHS, RHS);
+ std::swap(LHSS, RHSS);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHSS);
+ if (!AR || AR->getLoop() != L)
+ return None;
+
+ return LoopICmp(Pred, AR, RHSS);
+}
+
+Value *LoopPredication::expandCheck(SCEVExpander &Expander,
+ IRBuilder<> &Builder,
+ ICmpInst::Predicate Pred, const SCEV *LHS,
+ const SCEV *RHS, Instruction *InsertAt) {
+ // TODO: we can check isLoopEntryGuardedByCond before emitting the check
+
+ Type *Ty = LHS->getType();
+ assert(Ty == RHS->getType() && "expandCheck operands have different types?");
+
+ if (SE->isLoopEntryGuardedByCond(L, Pred, LHS, RHS))
+ return Builder.getTrue();
+
+ Value *LHSV = Expander.expandCodeFor(LHS, Ty, InsertAt);
+ Value *RHSV = Expander.expandCodeFor(RHS, Ty, InsertAt);
+ return Builder.CreateICmp(Pred, LHSV, RHSV);
+}
+
+Optional<LoopPredication::LoopICmp>
+LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) {
+
+ auto *LatchType = LatchCheck.IV->getType();
+ if (RangeCheckType == LatchType)
+ return LatchCheck;
+ // For now, bail out if latch type is narrower than range type.
+ if (DL->getTypeSizeInBits(LatchType) < DL->getTypeSizeInBits(RangeCheckType))
+ return None;
+ if (!isSafeToTruncateWideIVType(RangeCheckType))
+ return None;
+ // We can now safely identify the truncated version of the IV and limit for
+ // RangeCheckType.
+ LoopICmp NewLatchCheck;
+ NewLatchCheck.Pred = LatchCheck.Pred;
+ NewLatchCheck.IV = dyn_cast<SCEVAddRecExpr>(
+ SE->getTruncateExpr(LatchCheck.IV, RangeCheckType));
+ if (!NewLatchCheck.IV)
+ return None;
+ NewLatchCheck.Limit = SE->getTruncateExpr(LatchCheck.Limit, RangeCheckType);
+ DEBUG(dbgs() << "IV of type: " << *LatchType
+ << "can be represented as range check type:" << *RangeCheckType
+ << "\n");
+ DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
+ DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
+ return NewLatchCheck;
+}
+
+bool LoopPredication::isSupportedStep(const SCEV* Step) {
+ return Step->isOne() || (Step->isAllOnesValue() && EnableCountDownLoop);
+}
+
+bool LoopPredication::CanExpand(const SCEV* S) {
+ return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
+}
+
+Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
+ LoopPredication::LoopICmp LatchCheck, LoopPredication::LoopICmp RangeCheck,
+ SCEVExpander &Expander, IRBuilder<> &Builder) {
+ auto *Ty = RangeCheck.IV->getType();
+ // Generate the widened condition for the forward loop:
+ // guardStart u< guardLimit &&
+ // latchLimit <pred> guardLimit - 1 - guardStart + latchStart
+ // where <pred> depends on the latch condition predicate. See the file
+ // header comment for the reasoning.
+ // guardLimit - guardStart + latchStart - 1
+ const SCEV *GuardStart = RangeCheck.IV->getStart();
+ const SCEV *GuardLimit = RangeCheck.Limit;
+ const SCEV *LatchStart = LatchCheck.IV->getStart();
+ const SCEV *LatchLimit = LatchCheck.Limit;
+
+ // guardLimit - guardStart + latchStart - 1
+ const SCEV *RHS =
+ SE->getAddExpr(SE->getMinusSCEV(GuardLimit, GuardStart),
+ SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));
+ if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
+ !CanExpand(LatchLimit) || !CanExpand(RHS)) {
+ DEBUG(dbgs() << "Can't expand limit check!\n");
+ return None;
+ }
+ ICmpInst::Predicate LimitCheckPred;
+ switch (LatchCheck.Pred) {
+ case ICmpInst::ICMP_ULT:
+ LimitCheckPred = ICmpInst::ICMP_ULE;
+ break;
+ case ICmpInst::ICMP_ULE:
+ LimitCheckPred = ICmpInst::ICMP_ULT;
+ break;
+ case ICmpInst::ICMP_SLT:
+ LimitCheckPred = ICmpInst::ICMP_SLE;
+ break;
+ case ICmpInst::ICMP_SLE:
+ LimitCheckPred = ICmpInst::ICMP_SLT;
+ break;
+ default:
+ llvm_unreachable("Unsupported loop latch!");
+ }
+
+ DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
+ DEBUG(dbgs() << "RHS: " << *RHS << "\n");
+ DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
+
+ Instruction *InsertAt = Preheader->getTerminator();
+ auto *LimitCheck =
+ expandCheck(Expander, Builder, LimitCheckPred, LatchLimit, RHS, InsertAt);
+ auto *FirstIterationCheck = expandCheck(Expander, Builder, RangeCheck.Pred,
+ GuardStart, GuardLimit, InsertAt);
+ return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
+}
+
+Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
+ LoopPredication::LoopICmp LatchCheck, LoopPredication::LoopICmp RangeCheck,
+ SCEVExpander &Expander, IRBuilder<> &Builder) {
+ auto *Ty = RangeCheck.IV->getType();
+ const SCEV *GuardStart = RangeCheck.IV->getStart();
+ const SCEV *GuardLimit = RangeCheck.Limit;
+ const SCEV *LatchLimit = LatchCheck.Limit;
+ if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
+ !CanExpand(LatchLimit)) {
+ DEBUG(dbgs() << "Can't expand limit check!\n");
+ return None;
+ }
+ // The decrement of the latch check IV should be the same as the
+ // rangeCheckIV.
+ auto *PostDecLatchCheckIV = LatchCheck.IV->getPostIncExpr(*SE);
+ if (RangeCheck.IV != PostDecLatchCheckIV) {
+ DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: "
+ << *PostDecLatchCheckIV
+ << " and RangeCheckIV: " << *RangeCheck.IV << "\n");
+ return None;
+ }
+
+ // Generate the widened condition for CountDownLoop:
+ // guardStart u< guardLimit &&
+ // latchLimit <pred> 1.
+ // See the header comment for reasoning of the checks.
+ Instruction *InsertAt = Preheader->getTerminator();
+ auto LimitCheckPred = ICmpInst::isSigned(LatchCheck.Pred)
+ ? ICmpInst::ICMP_SGE
+ : ICmpInst::ICMP_UGE;
+ auto *FirstIterationCheck = expandCheck(Expander, Builder, ICmpInst::ICMP_ULT,
+ GuardStart, GuardLimit, InsertAt);
+ auto *LimitCheck = expandCheck(Expander, Builder, LimitCheckPred, LatchLimit,
+ SE->getOne(Ty), InsertAt);
+ return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
+}
+
+/// If ICI can be widened to a loop invariant condition, emit the loop
+/// invariant condition in the loop preheader and return it; otherwise
+/// return None.
+Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
+ SCEVExpander &Expander,
+ IRBuilder<> &Builder) {
+ DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
+ DEBUG(ICI->dump());
+
+ // parseLoopStructure guarantees that the latch condition is:
+ // ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=.
+ // We are looking for the range checks of the form:
+ // i u< guardLimit
+ auto RangeCheck = parseLoopICmp(ICI);
+ if (!RangeCheck) {
+ DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ return None;
+ }
+ DEBUG(dbgs() << "Guard check:\n");
+ DEBUG(RangeCheck->dump());
+ if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
+ DEBUG(dbgs() << "Unsupported range check predicate(" << RangeCheck->Pred
+ << ")!\n");
+ return None;
+ }
+ auto *RangeCheckIV = RangeCheck->IV;
+ if (!RangeCheckIV->isAffine()) {
+ DEBUG(dbgs() << "Range check IV is not affine!\n");
+ return None;
+ }
+ auto *Step = RangeCheckIV->getStepRecurrence(*SE);
+ // We cannot just compare with latch IV step because the latch and range IVs
+ // may have different types.
+ if (!isSupportedStep(Step)) {
+ DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
+ return None;
+ }
+ auto *Ty = RangeCheckIV->getType();
+ auto CurrLatchCheckOpt = generateLoopLatchCheck(Ty);
+ if (!CurrLatchCheckOpt) {
+ DEBUG(dbgs() << "Failed to generate a loop latch check "
+ "corresponding to range type: "
+ << *Ty << "\n");
+ return None;
+ }
+
+ LoopICmp CurrLatchCheck = *CurrLatchCheckOpt;
+ // At this point, the range and latch step should have the same type, but need
+ // not have the same value (we support both 1 and -1 steps).
+ assert(Step->getType() ==
+ CurrLatchCheck.IV->getStepRecurrence(*SE)->getType() &&
+ "Range and latch steps should be of same type!");
+ if (Step != CurrLatchCheck.IV->getStepRecurrence(*SE)) {
+ DEBUG(dbgs() << "Range and latch have different step values!\n");
+ return None;
+ }
+
+ if (Step->isOne())
+ return widenICmpRangeCheckIncrementingLoop(CurrLatchCheck, *RangeCheck,
+ Expander, Builder);
+ else {
+ assert(Step->isAllOnesValue() && "Step should be -1!");
+ return widenICmpRangeCheckDecrementingLoop(CurrLatchCheck, *RangeCheck,
+ Expander, Builder);
+ }
+}
+
+bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
+ SCEVExpander &Expander) {
+ DEBUG(dbgs() << "Processing guard:\n");
+ DEBUG(Guard->dump());
+
+ IRBuilder<> Builder(cast<Instruction>(Preheader->getTerminator()));
+
+ // The guard condition is expected to be of the form:
+ // cond1 && cond2 && cond3 ...
+ // Iterate over the subconditions looking for icmp conditions which can be
+ // widened across loop iterations. While widening these conditions, remember
+ // the resulting list of subconditions in the Checks vector.
+ SmallVector<Value *, 4> Worklist(1, Guard->getOperand(0));
+ SmallPtrSet<Value *, 4> Visited;
+
+ SmallVector<Value *, 4> Checks;
+
+ unsigned NumWidened = 0;
+ do {
+ Value *Condition = Worklist.pop_back_val();
+ if (!Visited.insert(Condition).second)
+ continue;
+
+ Value *LHS, *RHS;
+ using namespace llvm::PatternMatch;
+ if (match(Condition, m_And(m_Value(LHS), m_Value(RHS)))) {
+ Worklist.push_back(LHS);
+ Worklist.push_back(RHS);
+ continue;
+ }
+
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) {
+ if (auto NewRangeCheck = widenICmpRangeCheck(ICI, Expander, Builder)) {
+ Checks.push_back(NewRangeCheck.getValue());
+ NumWidened++;
+ continue;
+ }
+ }
+
+ // Save the condition as is if we can't widen it
+ Checks.push_back(Condition);
+ } while (Worklist.size() != 0);
+
+ if (NumWidened == 0)
+ return false;
+
+ // Emit the new guard condition
+ Builder.SetInsertPoint(Guard);
+ Value *LastCheck = nullptr;
+ for (auto *Check : Checks)
+ if (!LastCheck)
+ LastCheck = Check;
+ else
+ LastCheck = Builder.CreateAnd(LastCheck, Check);
+ Guard->setOperand(0, LastCheck);
+
+ DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
+ return true;
+}
+
+Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
+ using namespace PatternMatch;
+
+ BasicBlock *LoopLatch = L->getLoopLatch();
+ if (!LoopLatch) {
+ DEBUG(dbgs() << "The loop doesn't have a single latch!\n");
+ return None;
+ }
+
+ ICmpInst::Predicate Pred;
+ Value *LHS, *RHS;
+ BasicBlock *TrueDest, *FalseDest;
+
+ if (!match(LoopLatch->getTerminator(),
+ m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), TrueDest,
+ FalseDest))) {
+ DEBUG(dbgs() << "Failed to match the latch terminator!\n");
+ return None;
+ }
+ assert((TrueDest == L->getHeader() || FalseDest == L->getHeader()) &&
+ "One of the latch's destinations must be the header");
+ if (TrueDest != L->getHeader())
+ Pred = ICmpInst::getInversePredicate(Pred);
+
+ auto Result = parseLoopICmp(Pred, LHS, RHS);
+ if (!Result) {
+ DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ return None;
+ }
+
+ // Check affine first, so if it's not we don't try to compute the step
+ // recurrence.
+ if (!Result->IV->isAffine()) {
+ DEBUG(dbgs() << "The induction variable is not affine!\n");
+ return None;
+ }
+
+ auto *Step = Result->IV->getStepRecurrence(*SE);
+ if (!isSupportedStep(Step)) {
+ DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
+ return None;
+ }
+
+ auto IsUnsupportedPredicate = [](const SCEV *Step, ICmpInst::Predicate Pred) {
+ if (Step->isOne()) {
+ return Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_SLT &&
+ Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE;
+ } else {
+ assert(Step->isAllOnesValue() && "Step should be -1!");
+ return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT;
+ }
+ };
+
+ if (IsUnsupportedPredicate(Step, Result->Pred)) {
+ DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
+ << ")!\n");
+ return None;
+ }
+ return Result;
+}
+
+// Returns true if it's safe to truncate the IV to RangeCheckType.
+bool LoopPredication::isSafeToTruncateWideIVType(Type *RangeCheckType) {
+ if (!EnableIVTruncation)
+ return false;
+ assert(DL->getTypeSizeInBits(LatchCheck.IV->getType()) >
+ DL->getTypeSizeInBits(RangeCheckType) &&
+ "Expected latch check IV type to be larger than range check operand "
+ "type!");
+ // The start and end values of the IV should be known. This is to guarantee
+ // that truncating the wide type will not lose information.
+ auto *Limit = dyn_cast<SCEVConstant>(LatchCheck.Limit);
+ auto *Start = dyn_cast<SCEVConstant>(LatchCheck.IV->getStart());
+ if (!Limit || !Start)
+ return false;
+ // This check makes sure that the IV does not change sign during loop
+ // iterations. Consider latchType = i64, LatchStart = 5, Pred = ICMP_SGE,
+ // LatchEnd = 2, rangeCheckType = i32. If it's not a monotonic predicate, the
+ // IV wraps around, and the truncation of the IV would lose the range of
+ // iterations between 2^32 and 2^64.
+ bool Increasing;
+ if (!SE->isMonotonicPredicate(LatchCheck.IV, LatchCheck.Pred, Increasing))
+ return false;
+ // The active bits should be less than the bits in the RangeCheckType. This
+ // guarantees that truncating the latch check to RangeCheckType is a safe
+ // operation.
+ auto RangeCheckTypeBitSize = DL->getTypeSizeInBits(RangeCheckType);
+ return Start->getAPInt().getActiveBits() < RangeCheckTypeBitSize &&
+ Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize;
+}
+
+bool LoopPredication::runOnLoop(Loop *Loop) {
+ L = Loop;
+
+ DEBUG(dbgs() << "Analyzing ");
+ DEBUG(L->dump());
+
+ Module *M = L->getHeader()->getModule();
+
+ // There is nothing to do if the module doesn't use guards
+ auto *GuardDecl =
+ M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard));
+ if (!GuardDecl || GuardDecl->use_empty())
+ return false;
+
+ DL = &M->getDataLayout();
+
+ Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ return false;
+
+ auto LatchCheckOpt = parseLoopLatchICmp();
+ if (!LatchCheckOpt)
+ return false;
+ LatchCheck = *LatchCheckOpt;
+
+ DEBUG(dbgs() << "Latch check:\n");
+ DEBUG(LatchCheck.dump());
+
+ // Collect all the guards into a vector and process later, so as not
+ // to invalidate the instruction iterator.
+ SmallVector<IntrinsicInst *, 4> Guards;
+ for (const auto BB : L->blocks())
+ for (auto &I : *BB)
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::experimental_guard)
+ Guards.push_back(II);
+
+ if (Guards.empty())
+ return false;
+
+ SCEVExpander Expander(*SE, *DL, "loop-predication");
+
+ bool Changed = false;
+ for (auto *Guard : Guards)
+ Changed |= widenGuardConditions(Guard, Expander);
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
new file mode 100644
index 000000000000..d1a54b877950
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -0,0 +1,1801 @@
+//===- LoopReroll.cpp - Loop rerolling pass -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop reroller.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <map>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-reroll"
+
+STATISTIC(NumRerolledLoops, "Number of rerolled loops");
+
+static cl::opt<unsigned>
+MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
+ cl::desc("The maximum increment for loop rerolling"));
+
+static cl::opt<unsigned>
+NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
+ cl::Hidden,
+ cl::desc("The maximum number of failures to tolerate"
+ " during fuzzy matching. (default: 400)"));
+
+// This loop re-rolling transformation aims to transform loops like this:
+//
+// int foo(int a);
+// void bar(int *x) {
+// for (int i = 0; i < 500; i += 3) {
+// foo(i);
+// foo(i+1);
+// foo(i+2);
+// }
+// }
+//
+// into a loop like this:
+//
+// void bar(int *x) {
+// for (int i = 0; i < 500; ++i)
+// foo(i);
+// }
+//
+// It does this by looking for loops that, besides the latch code, are composed
+// of isomorphic DAGs of instructions, with each DAG rooted at some increment
+// to the induction variable, and where each DAG is isomorphic to the DAG
+// rooted at the induction variable (excepting the sub-DAGs which root the
+// other induction-variable increments). In other words, we're looking for loop
+// bodies of the form:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1 <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2 <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// where each f(i) is a set of instructions that, collectively, are a function
+// only of i (and other loop-invariant values).
+//
+// As a special case, we can also reroll loops like this:
+//
+// int foo(int);
+// void bar(int *x) {
+// for (int i = 0; i < 500; ++i) {
+// x[3*i] = foo(0);
+// x[3*i+1] = foo(0);
+// x[3*i+2] = foo(0);
+// }
+// }
+//
+// into this:
+//
+// void bar(int *x) {
+// for (int i = 0; i < 1500; ++i)
+// x[i] = foo(0);
+// }
+//
+// in which case, we're looking for inputs like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// %scaled.iv = mul %iv, scale
+// f(%scaled.iv)
+// %scaled.iv.1 = add %scaled.iv, 1
+// f(%scaled.iv.1)
+// %scaled.iv.2 = add %scaled.iv, 2
+// f(%scaled.iv.2)
+// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
+// f(%scaled.iv.scale_m_1)
+// ...
+// %iv.next = add %iv, 1
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+
+namespace {
+
+ enum IterationLimits {
+ /// The maximum number of iterations that we'll try to reroll.
+ IL_MaxRerollIterations = 32,
+ /// The bitvector index used by loop induction variables and other
+ /// instructions that belong to all iterations.
+ IL_All,
+ IL_End
+ };
+
+ class LoopReroll : public LoopPass {
+ public:
+ static char ID; // Pass ID, replacement for typeid
+
+ LoopReroll() : LoopPass(ID) {
+ initializeLoopRerollPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ protected:
+ AliasAnalysis *AA;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ TargetLibraryInfo *TLI;
+ DominatorTree *DT;
+ bool PreserveLCSSA;
+
+ using SmallInstructionVector = SmallVector<Instruction *, 16>;
+ using SmallInstructionSet = SmallSet<Instruction *, 16>;
+
+ // Map between induction variable and its increment
+ DenseMap<Instruction *, int64_t> IVToIncMap;
+
+ // For a loop with multiple induction variables, remember the one used only
+ // to control the loop.
+ Instruction *LoopControlIV;
+
+ // A chain of isomorphic instructions, identified by a single-use PHI
+ // representing a reduction. Only the last value may be used outside the
+ // loop.
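+ // For example, a reduction chain suitable for a scale-3 reroll might look
+ // like this (illustrative IR):
+ // %sum = phi [ 0, %preheader ], [ %sum.3, %body ]
+ // %sum.1 = add %sum, %a
+ // %sum.2 = add %sum.1, %b
+ // %sum.3 = add %sum.2, %c <-- the only value usable outside the loop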
+ struct SimpleLoopReduction {
+ SimpleLoopReduction(Instruction *P, Loop *L) : Instructions(1, P) {
+ assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
+ add(L);
+ }
+
+ bool valid() const {
+ return Valid;
+ }
+
+ Instruction *getPHI() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.front();
+ }
+
+ Instruction *getReducedValue() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.back();
+ }
+
+ Instruction *get(size_t i) const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions[i+1];
+ }
+
+ Instruction *operator [] (size_t i) const { return get(i); }
+
+ // The size, ignoring the initial PHI.
+ size_t size() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.size()-1;
+ }
+
+ using iterator = SmallInstructionVector::iterator;
+ using const_iterator = SmallInstructionVector::const_iterator;
+
+ iterator begin() {
+ assert(Valid && "Using invalid reduction");
+ return std::next(Instructions.begin());
+ }
+
+ const_iterator begin() const {
+ assert(Valid && "Using invalid reduction");
+ return std::next(Instructions.begin());
+ }
+
+ iterator end() { return Instructions.end(); }
+ const_iterator end() const { return Instructions.end(); }
+
+ protected:
+ bool Valid = false;
+ SmallInstructionVector Instructions;
+
+ void add(Loop *L);
+ };
+
+ // The set of all reductions, and state tracking of possible reductions
+ // during loop instruction processing.
+ struct ReductionTracker {
+ using SmallReductionVector = SmallVector<SimpleLoopReduction, 16>;
+
+ // Add a new possible reduction.
+ void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); }
+
+ // Set up tracking of possible reductions corresponding to the provided
+ // rerolling scale. Only reductions with a number of non-PHI instructions
+ // that is divisible by the scale are considered. Three instruction sets
+ // are filled in:
+ // - A set of all possible instructions in eligible reductions.
+ // - A set of all PHIs in eligible reductions
+ // - A set of all reduced values (last instructions) in eligible
+ // reductions.
+ void restrictToScale(uint64_t Scale,
+ SmallInstructionSet &PossibleRedSet,
+ SmallInstructionSet &PossibleRedPHISet,
+ SmallInstructionSet &PossibleRedLastSet) {
+ PossibleRedIdx.clear();
+ PossibleRedIter.clear();
+ Reds.clear();
+
+ for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
+ if (PossibleReds[i].size() % Scale == 0) {
+ PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
+ PossibleRedPHISet.insert(PossibleReds[i].getPHI());
+
+ PossibleRedSet.insert(PossibleReds[i].getPHI());
+ PossibleRedIdx[PossibleReds[i].getPHI()] = i;
+ for (Instruction *J : PossibleReds[i]) {
+ PossibleRedSet.insert(J);
+ PossibleRedIdx[J] = i;
+ }
+ }
+ }
+
+ // The functions below are used while processing the loop instructions.
+
+ // Are the two instructions both from reductions, and furthermore, from
+ // the same reduction?
+ bool isPairInSame(Instruction *J1, Instruction *J2) {
+ DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1);
+ if (J1I != PossibleRedIdx.end()) {
+ DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2);
+ if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
+ return true;
+ }
+
+ return false;
+ }
+
+ // The two provided instructions, the first from the base iteration, and
+ // the second from iteration i, form a matched pair. If these are part of
+ // a reduction, record that fact.
+ void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
+ if (PossibleRedIdx.count(J1)) {
+ assert(PossibleRedIdx.count(J2) &&
+ "Recording reduction vs. non-reduction instruction?");
+
+ PossibleRedIter[J1] = 0;
+ PossibleRedIter[J2] = i;
+
+ int Idx = PossibleRedIdx[J1];
+ assert(Idx == PossibleRedIdx[J2] &&
+ "Recording pair from different reductions?");
+ Reds.insert(Idx);
+ }
+ }
+
+ // The functions below can be called after we've finished processing all
+ // instructions in the loop, and we know which reductions were selected.
+
+ bool validateSelected();
+ void replaceSelected();
+
+ protected:
+ // The vector of all possible reductions (for any scale).
+ SmallReductionVector PossibleReds;
+
+ DenseMap<Instruction *, int> PossibleRedIdx;
+ DenseMap<Instruction *, int> PossibleRedIter;
+ DenseSet<int> Reds;
+ };
+
+ // A DAGRootSet models an induction variable being used in a rerollable
+ // loop. For example,
+ //
+ // x[i*3+0] = y1
+ // x[i*3+1] = y2
+ // x[i*3+2] = y3
+ //
+ // Base instruction -> i*3
+ //              +---+----+
+ //             /    |     \
+ //         ST[y1]  +1     +2   <-- Roots
+ //                  |      |
+ //              ST[y2]  ST[y3]
+ //
+ // There may be multiple DAGRoots, for example:
+ //
+ // x[i*2+0] = ... (1)
+ // x[i*2+1] = ... (1)
+ // x[i*2+4] = ... (2)
+ // x[i*2+5] = ... (2)
+ // x[(i+1234)*2+5678] = ... (3)
+ // x[(i+1234)*2+5679] = ... (3)
+ //
+ // The loop will be rerolled by adding a new loop induction variable,
+ // one for the Base instruction in each DAGRootSet.
+ //
+ struct DAGRootSet {
+ Instruction *BaseInst;
+ SmallInstructionVector Roots;
+
+ // The instructions between IV and BaseInst (but not including BaseInst).
+ SmallInstructionSet SubsumedInsts;
+ };
+
+ // The set of all DAG roots, and state tracking of all roots
+ // for a particular induction variable.
+ struct DAGRootTracker {
+ DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,
+ ScalarEvolution *SE, AliasAnalysis *AA,
+ TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI,
+ bool PreserveLCSSA,
+ DenseMap<Instruction *, int64_t> &IncrMap,
+ Instruction *LoopCtrlIV)
+ : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI),
+ PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap),
+ LoopControlIV(LoopCtrlIV) {}
+
+ /// Stage 1: Find all the DAG roots for the induction variable.
+ bool findRoots();
+
+ /// Stage 2: Validate if the found roots are valid.
+ bool validate(ReductionTracker &Reductions);
+
+ /// Stage 3: Assuming validate() returned true, perform the
+ /// replacement.
+ /// @param IterCount The maximum iteration count of L.
+ void replace(const SCEV *IterCount);
+
+ protected:
+ using UsesTy = MapVector<Instruction *, BitVector>;
+
+ void findRootsRecursive(Instruction *IVU,
+ SmallInstructionSet SubsumedInsts);
+ bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts);
+ bool collectPossibleRoots(Instruction *Base,
+ std::map<int64_t,Instruction*> &Roots);
+ bool validateRootSet(DAGRootSet &DRS);
+
+ bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
+ void collectInLoopUserSet(const SmallInstructionVector &Roots,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users);
+ void collectInLoopUserSet(Instruction *Root,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users);
+
+ UsesTy::iterator nextInstr(int Val, UsesTy &In,
+ const SmallInstructionSet &Exclude,
+ UsesTy::iterator *StartI=nullptr);
+ bool isBaseInst(Instruction *I);
+ bool isRootInst(Instruction *I);
+ bool instrDependsOn(Instruction *I,
+ UsesTy::iterator Start,
+ UsesTy::iterator End);
+ void replaceIV(Instruction *Inst, Instruction *IV, const SCEV *IterCount);
+ void updateNonLoopCtrlIncr();
+
+ LoopReroll *Parent;
+
+ // Members of Parent, replicated here for brevity.
+ Loop *L;
+ ScalarEvolution *SE;
+ AliasAnalysis *AA;
+ TargetLibraryInfo *TLI;
+ DominatorTree *DT;
+ LoopInfo *LI;
+ bool PreserveLCSSA;
+
+ // The loop induction variable.
+ Instruction *IV;
+
+ // Loop step amount.
+ int64_t Inc;
+
+ // Loop reroll count; if Inc == 1, this records the scaling applied
+ // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;
+ // If Inc is not 1, Scale = Inc.
+ uint64_t Scale;
+
+ // The roots themselves.
+ SmallVector<DAGRootSet,16> RootSets;
+
+ // All increment instructions for IV.
+ SmallInstructionVector LoopIncs;
+
+ // Map of all instructions in the loop (in order) to the iterations
+ // they are used in (or specially, IL_All for instructions
+ // used in the loop increment mechanism).
+ UsesTy Uses;
+
+ // Map between induction variable and its increment
+ DenseMap<Instruction *, int64_t> &IVToIncMap;
+
+ Instruction *LoopControlIV;
+ };
+
+ // Check if it is a compare-like instruction whose user is a branch
+ bool isCompareUsedByBranch(Instruction *I) {
+ auto *TI = I->getParent()->getTerminator();
+ if (!isa<BranchInst>(TI) || !isa<CmpInst>(I))
+ return false;
+ return I->hasOneUse() && TI->getOperand(0) == I;
+ };
+
+ bool isLoopControlIV(Loop *L, Instruction *IV);
+ void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
+ void collectPossibleReductions(Loop *L,
+ ReductionTracker &Reductions);
+ bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
+ ReductionTracker &Reductions);
+ };
+
+} // end anonymous namespace
+
+char LoopReroll::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+
+Pass *llvm::createLoopRerollPass() {
+ return new LoopReroll;
+}
+
+// Returns true if the provided instruction is used outside the given loop.
+// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
+// non-loop blocks to be outside the loop.
+static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
+ for (User *U : I->users()) {
+ if (!L->contains(cast<Instruction>(U)))
+ return true;
+ }
+ return false;
+}
+
+static const SCEVConstant *getIncrmentFactorSCEV(ScalarEvolution *SE,
+ const SCEV *SCEVExpr,
+ Instruction &IV) {
+ const SCEVMulExpr *MulSCEV = dyn_cast<SCEVMulExpr>(SCEVExpr);
+
+ // If the StepRecurrence of SCEVExpr is a constant (c1 * c2, where c2 is the
+ // size of the pointed-to element type), return c1.
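+ // For example (illustrative values): an i64* IV advanced by three elements
+ // per iteration has a constant step of 24 = 3 * sizeof(i64), and this path
+ // recovers the factor 3.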
+ if (!MulSCEV && IV.getType()->isPointerTy())
+ if (const SCEVConstant *IncSCEV = dyn_cast<SCEVConstant>(SCEVExpr)) {
+ const PointerType *PTy = cast<PointerType>(IV.getType());
+ Type *ElTy = PTy->getElementType();
+ const SCEV *SizeOfExpr =
+ SE->getSizeOfExpr(SE->getEffectiveSCEVType(IV.getType()), ElTy);
+ if (IncSCEV->getValue()->getValue().isNegative()) {
+ const SCEV *NewSCEV =
+ SE->getUDivExpr(SE->getNegativeSCEV(SCEVExpr), SizeOfExpr);
+ return dyn_cast<SCEVConstant>(SE->getNegativeSCEV(NewSCEV));
+ } else {
+ return dyn_cast<SCEVConstant>(SE->getUDivExpr(SCEVExpr, SizeOfExpr));
+ }
+ }
+
+ if (!MulSCEV)
+ return nullptr;
+
+ // If the StepRecurrence of SCEVExpr is c * sizeof(x), where c is a constant,
+ // return c.
+ const SCEVConstant *CIncSCEV = nullptr;
+ for (const SCEV *Operand : MulSCEV->operands()) {
+ if (const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Operand)) {
+ CIncSCEV = Constant;
+ } else if (const SCEVUnknown *Unknown = dyn_cast<SCEVUnknown>(Operand)) {
+ Type *AllocTy;
+ if (!Unknown->isSizeOf(AllocTy))
+ break;
+ } else {
+ return nullptr;
+ }
+ }
+ return CIncSCEV;
+}
+
+// Check if an IV is only used to control the loop. There are two cases:
+// 1. It has only one use, which is the loop increment; the increment is only
+// used by the comparison and the PHI (there may be a sext of an nsw value in
+// between), and the comparison is only used by the branch.
+// 2. It is used by the loop increment and the comparison; the loop increment
+// is only used by the PHI, and the comparison is used only by the branch.
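+//
+// Schematically (case 1):
+//   %iv = phi [ (preheader, ...), (body, %iv.next) ]
+//   %iv.next = add nsw %iv, 1      <-- only use of %iv
+//   %cmp = icmp(%iv.next, ...)     <-- used only by the branch
+//   br %cmp, header, exit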
+bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) {
+ unsigned IVUses = IV->getNumUses();
+ if (IVUses != 2 && IVUses != 1)
+ return false;
+
+ for (auto *User : IV->users()) {
+ int32_t IncOrCmpUses = User->getNumUses();
+ bool IsCompInst = isCompareUsedByBranch(cast<Instruction>(User));
+
+ // User can only have one or two uses.
+ if (IncOrCmpUses != 2 && IncOrCmpUses != 1)
+ return false;
+
+ // Case 1
+ if (IVUses == 1) {
+ // The only user must be the loop increment.
+ // The loop increment must have two uses.
+ if (IsCompInst || IncOrCmpUses != 2)
+ return false;
+ }
+
+ // Case 2
+ if (IVUses == 2 && IncOrCmpUses != 1)
+ return false;
+
+ // Each user of the IV must be a binary operation or a comparison.
+ if (auto *BO = dyn_cast<BinaryOperator>(User)) {
+ if (BO->getOpcode() == Instruction::Add) {
+ // Loop Increment
+ // User of Loop Increment should be either PHI or CMP
+ for (auto *UU : User->users()) {
+ if (PHINode *PN = dyn_cast<PHINode>(UU)) {
+ if (PN != IV)
+ return false;
+ }
+ // Must be a CMP, or a sign extension (of an nsw value) followed by a CMP
+ else {
+ Instruction *UUser = dyn_cast<Instruction>(UU);
+ // Skip SExt if we are extending an nsw value
+ // TODO: Allow ZExt too
+ if (BO->hasNoSignedWrap() && UUser && UUser->hasOneUse() &&
+ isa<SExtInst>(UUser))
+ UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
+ if (!isCompareUsedByBranch(UUser))
+ return false;
+ }
+ }
+ } else
+ return false;
+ // Compare: can only have one use, and that use must be the branch
+ } else if (!IsCompInst)
+ return false;
+ }
+ return true;
+}
+
+// Collect the list of loop induction variables with respect to which it might
+// be possible to reroll the loop.
+void LoopReroll::collectPossibleIVs(Loop *L,
+ SmallInstructionVector &PossibleIVs) {
+ BasicBlock *Header = L->getHeader();
+ for (BasicBlock::iterator I = Header->begin(),
+ IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+ if (!isa<PHINode>(I))
+ continue;
+ if (!I->getType()->isIntegerTy() && !I->getType()->isPointerTy())
+ continue;
+
+ if (const SCEVAddRecExpr *PHISCEV =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) {
+ if (PHISCEV->getLoop() != L)
+ continue;
+ if (!PHISCEV->isAffine())
+ continue;
+ const SCEVConstant *IncSCEV = nullptr;
+ if (I->getType()->isPointerTy())
+ IncSCEV =
+ getIncrmentFactorSCEV(SE, PHISCEV->getStepRecurrence(*SE), *I);
+ else
+ IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
+ if (IncSCEV) {
+ const APInt &AInt = IncSCEV->getValue()->getValue().abs();
+ if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc))
+ continue;
+ IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue();
+ DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
+ << "\n");
+
+ if (isLoopControlIV(L, &*I)) {
+ assert(!LoopControlIV && "Found two loop-control-only IVs");
+ LoopControlIV = &(*I);
+ DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I << " = "
+ << *PHISCEV << "\n");
+ } else
+ PossibleIVs.push_back(&*I);
+ }
+ }
+ }
+}
+
+// Add the remainder of the reduction-variable chain to the instruction vector
+// (the initial PHINode has already been added). If successful, the object is
+// marked as valid.
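+//
+// Schematically, a candidate chain for a (floating-point) sum reduction is:
+//   %x   = phi [ (preheader, ...), (body, %x.2) ]
+//   %x.0 = fadd %x, %a      <-- single use (%x.1)
+//   %x.1 = fadd %x.0, %b    <-- single use (%x.2)
+//   %x.2 = fadd %x.1, %c    <-- used by the PHI and outside the loop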
+void LoopReroll::SimpleLoopReduction::add(Loop *L) {
+ assert(!Valid && "Cannot add to an already-valid chain");
+
+ // The reduction variable must be a chain of single-use instructions
+ // (including the PHI), except for the last value (which is used by the PHI
+ // and also outside the loop).
+ Instruction *C = Instructions.front();
+ if (C->user_empty())
+ return;
+
+ do {
+ C = cast<Instruction>(*C->user_begin());
+ if (C->hasOneUse()) {
+ if (!C->isBinaryOp())
+ return;
+
+ if (!(isa<PHINode>(Instructions.back()) ||
+ C->isSameOperationAs(Instructions.back())))
+ return;
+
+ Instructions.push_back(C);
+ }
+ } while (C->hasOneUse());
+
+ if (Instructions.size() < 2 ||
+ !C->isSameOperationAs(Instructions.back()) ||
+ C->use_empty())
+ return;
+
+ // C is now the (potential) last instruction in the reduction chain.
+ for (User *U : C->users()) {
+ // The only in-loop user can be the initial PHI.
+ if (L->contains(cast<Instruction>(U)))
+ if (cast<Instruction>(U) != Instructions.front())
+ return;
+ }
+
+ Instructions.push_back(C);
+ Valid = true;
+}
+
+// Collect the vector of possible reduction variables.
+void LoopReroll::collectPossibleReductions(Loop *L,
+ ReductionTracker &Reductions) {
+ BasicBlock *Header = L->getHeader();
+ for (BasicBlock::iterator I = Header->begin(),
+ IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+ if (!isa<PHINode>(I))
+ continue;
+ if (!I->getType()->isSingleValueType())
+ continue;
+
+ SimpleLoopReduction SLR(&*I, L);
+ if (!SLR.valid())
+ continue;
+
+ DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " <<
+ SLR.size() << " chained instructions)\n");
+ Reductions.addSLR(SLR);
+ }
+}
+
+// Collect the set of all users of the provided root instruction. This set of
+// users contains not only the direct users of the root instruction, but also
+// all users of those users, and so on. There are two exceptions:
+//
+// 1. Instructions in the set of excluded instructions are never added to the
+// use set (even if they are users). This is used, for example, to keep
+// root increments out of the use set of the primary IV.
+//
+// 2. Instructions in the set of final instructions are added to the use set
+// if they are users, but their users are not added. This is used, for
+// example, to prevent a reduction update from forcing all later reduction
+// updates into the use set.
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
+ Instruction *Root, const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users) {
+ SmallInstructionVector Queue(1, Root);
+ while (!Queue.empty()) {
+ Instruction *I = Queue.pop_back_val();
+ if (!Users.insert(I).second)
+ continue;
+
+ if (!Final.count(I))
+ for (Use &U : I->uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (PHINode *PN = dyn_cast<PHINode>(User)) {
+ // Ignore "wrap-around" uses to PHIs of this loop's header.
+ if (PN->getIncomingBlock(U) == L->getHeader())
+ continue;
+ }
+
+ if (L->contains(User) && !Exclude.count(User)) {
+ Queue.push_back(User);
+ }
+ }
+
+ // We also want to collect single-user "feeder" values.
+ for (User::op_iterator OI = I->op_begin(),
+ OIE = I->op_end(); OI != OIE; ++OI) {
+ if (Instruction *Op = dyn_cast<Instruction>(*OI))
+ if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
+ !Final.count(Op))
+ Queue.push_back(Op);
+ }
+ }
+}
+
+// Collect all of the users of all of the provided root instructions (combined
+// into a single set).
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
+ const SmallInstructionVector &Roots,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users) {
+ for (Instruction *Root : Roots)
+ collectInLoopUserSet(Root, Exclude, Final, Users);
+}
+
+static bool isUnorderedLoadStore(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isUnordered();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isUnordered();
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+ return !MI->isVolatile();
+ return false;
+}
+
+/// Return true if IVU is a "simple" arithmetic operation.
+/// This is used for narrowing the search space for DAGRoots; only arithmetic
+/// and GEPs can be part of a DAGRoot.
+static bool isSimpleArithmeticOp(User *IVU) {
+ if (Instruction *I = dyn_cast<Instruction>(IVU)) {
+ switch (I->getOpcode()) {
+ default: return false;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::Shl:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::GetElementPtr:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool isLoopIncrement(User *U, Instruction *IV) {
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(U);
+
+ if ((BO && BO->getOpcode() != Instruction::Add) ||
+ (!BO && !isa<GetElementPtrInst>(U)))
+ return false;
+
+ for (auto *UU : U->users()) {
+ PHINode *PN = dyn_cast<PHINode>(UU);
+ if (PN && PN == IV)
+ return true;
+ }
+ return false;
+}
+
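+// Collect the candidate roots for Base: users of Base of the form
+// "add/or Base, C" (or GEPs whose last index is a constant C), keyed in
+// Roots by |C|. Loop-increment users are recorded in LoopIncs instead, and
+// any other instruction user is treated as belonging to index zero (because
+// "add %a, 0" is folded away). Returns false if an offset repeats, a
+// non-instruction user appears, too few roots are found, or the roots' use
+// counts do not all match the base's.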
+bool LoopReroll::DAGRootTracker::
+collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
+ SmallInstructionVector BaseUsers;
+
+ for (auto *I : Base->users()) {
+ ConstantInt *CI = nullptr;
+
+ if (isLoopIncrement(I, IV)) {
+ LoopIncs.push_back(cast<Instruction>(I));
+ continue;
+ }
+
+ // The root nodes must be either GEPs, ORs or ADDs.
+ if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+ if (BO->getOpcode() == Instruction::Add ||
+ BO->getOpcode() == Instruction::Or)
+ CI = dyn_cast<ConstantInt>(BO->getOperand(1));
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1);
+ CI = dyn_cast<ConstantInt>(LastOperand);
+ }
+
+ if (!CI) {
+ if (Instruction *II = dyn_cast<Instruction>(I)) {
+ BaseUsers.push_back(II);
+ continue;
+ } else {
+ DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I << "\n");
+ return false;
+ }
+ }
+
+ int64_t V = std::abs(CI->getValue().getSExtValue());
+ if (Roots.find(V) != Roots.end())
+ // No duplicates, please.
+ return false;
+
+ Roots[V] = cast<Instruction>(I);
+ }
+
+ // Make sure we have at least two roots.
+ if (Roots.empty() || (Roots.size() == 1 && BaseUsers.empty()))
+ return false;
+
+ // If we found non-loop-inc, non-root users of Base, assume they are
+ // for the zeroth root index. This is because "add %a, 0" gets optimized
+ // away.
+ if (BaseUsers.size()) {
+ if (Roots.find(0) != Roots.end()) {
+ DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
+ return false;
+ }
+ Roots[0] = Base;
+ }
+
+ // Calculate the number of users of the base, or lowest indexed, iteration.
+ unsigned NumBaseUses = BaseUsers.size();
+ if (NumBaseUses == 0)
+ NumBaseUses = Roots.begin()->second->getNumUses();
+
+ // Check that every node has the same number of users.
+ for (auto &KV : Roots) {
+ if (KV.first == 0)
+ continue;
+ if (!KV.second->hasNUses(NumBaseUses)) {
+ DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
+ << "#Base=" << NumBaseUses << ", #Root=" <<
+ KV.second->getNumUses() << "\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void LoopReroll::DAGRootTracker::
+findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
+ // Does the user look like it could be part of a root set?
+ // All its users must be simple arithmetic ops.
+ if (I->hasNUsesOrMore(IL_MaxRerollIterations + 1))
+ return;
+
+ if (I != IV && findRootsBase(I, SubsumedInsts))
+ return;
+
+ SubsumedInsts.insert(I);
+
+ for (User *V : I->users()) {
+ Instruction *I = cast<Instruction>(V);
+ if (is_contained(LoopIncs, I))
+ continue;
+
+ if (!isSimpleArithmeticOp(I))
+ continue;
+
+ // The recursive call makes a copy of SubsumedInsts.
+ findRootsRecursive(I, SubsumedInsts);
+ }
+}
+
+bool LoopReroll::DAGRootTracker::validateRootSet(DAGRootSet &DRS) {
+ if (DRS.Roots.empty())
+ return false;
+
+ // Consider a DAGRootSet with N-1 roots (so N different values including
+ // BaseInst).
+ // Define d = Roots[0] - BaseInst, which should be the same as
+ // Roots[I] - Roots[I-1] for all I in [1..N).
+ // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
+ // loop iteration J.
+ //
+ // Now, for the loop iterations to be consecutive:
+ // D = d * N
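+ //
+ // E.g. for the x[i*3+{0,1,2}] example above: d = 1, N = 3, and the base
+ // AddRec must step by D = 3 on each loop iteration.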
+ const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
+ if (!ADR)
+ return false;
+ unsigned N = DRS.Roots.size() + 1;
+ const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), ADR);
+ const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
+ if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV))
+ return false;
+
+ return true;
+}
+
+bool LoopReroll::DAGRootTracker::
+findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
+ // The base of a RootSet must be an AddRec, so it can be erased.
+ const auto *IVU_ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IVU));
+ if (!IVU_ADR || IVU_ADR->getLoop() != L)
+ return false;
+
+ std::map<int64_t, Instruction*> V;
+ if (!collectPossibleRoots(IVU, V))
+ return false;
+
+ // If we didn't get a root for index zero, then IVU must be
+ // subsumed.
+ if (V.find(0) == V.end())
+ SubsumedInsts.insert(IVU);
+
+ // Partition the vector into monotonically increasing indexes.
+ DAGRootSet DRS;
+ DRS.BaseInst = nullptr;
+
+ SmallVector<DAGRootSet, 16> PotentialRootSets;
+
+ for (auto &KV : V) {
+ if (!DRS.BaseInst) {
+ DRS.BaseInst = KV.second;
+ DRS.SubsumedInsts = SubsumedInsts;
+ } else if (DRS.Roots.empty()) {
+ DRS.Roots.push_back(KV.second);
+ } else if (V.find(KV.first - 1) != V.end()) {
+ DRS.Roots.push_back(KV.second);
+ } else {
+ // Linear sequence terminated.
+ if (!validateRootSet(DRS))
+ return false;
+
+ // Construct a new DAGRootSet with the next sequence.
+ PotentialRootSets.push_back(DRS);
+ DRS.BaseInst = KV.second;
+ DRS.Roots.clear();
+ }
+ }
+
+ if (!validateRootSet(DRS))
+ return false;
+
+ PotentialRootSets.push_back(DRS);
+
+ RootSets.append(PotentialRootSets.begin(), PotentialRootSets.end());
+
+ return true;
+}
+
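+// Stage 1 driver: collect the loop-increment instructions and the DAG root
+// sets for IV (recursively when the increment is +/-1, via findRootsBase
+// otherwise), check that all root sets have the same size, and set Scale to
+// the common root-set size plus one (for the base).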
+bool LoopReroll::DAGRootTracker::findRoots() {
+ Inc = IVToIncMap[IV];
+
+ assert(RootSets.empty() && "Unclean state!");
+ if (std::abs(Inc) == 1) {
+ for (auto *IVU : IV->users()) {
+ if (isLoopIncrement(IVU, IV))
+ LoopIncs.push_back(cast<Instruction>(IVU));
+ }
+ findRootsRecursive(IV, SmallInstructionSet());
+ LoopIncs.push_back(IV);
+ } else {
+ if (!findRootsBase(IV, SmallInstructionSet()))
+ return false;
+ }
+
+ // Ensure all sets have the same size.
+ if (RootSets.empty()) {
+ DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
+ return false;
+ }
+ for (auto &V : RootSets) {
+ if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
+ DEBUG(dbgs()
+ << "LRR: Aborting because not all root sets have the same size\n");
+ return false;
+ }
+ }
+
+ Scale = RootSets[0].Roots.size() + 1;
+
+ if (Scale > IL_MaxRerollIterations) {
+ DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
+ << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations
+ << "\n");
+ return false;
+ }
+
+ DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n");
+
+ return true;
+}
+
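+// Record, for every instruction in the loop header, the iteration(s) it
+// belongs to: instructions reachable from a DRS base get bit 0, those
+// reachable from the i-th root get bit i, and subsumed instructions and the
+// loop-increment machinery get IL_All. Returns false if the per-iteration
+// use sets differ in size.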
+bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) {
+ // Populate the MapVector with all instructions in the block, in program
+ // order, so we can later iterate over the contents in that same order.
+ for (auto &I : *L->getHeader()) {
+ Uses[&I].resize(IL_End);
+ }
+
+ SmallInstructionSet Exclude;
+ for (auto &DRS : RootSets) {
+ Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
+ Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
+ Exclude.insert(DRS.BaseInst);
+ }
+ Exclude.insert(LoopIncs.begin(), LoopIncs.end());
+
+ for (auto &DRS : RootSets) {
+ DenseSet<Instruction*> VBase;
+ collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase);
+ for (auto *I : VBase) {
+ Uses[I].set(0);
+ }
+
+ unsigned Idx = 1;
+ for (auto *Root : DRS.Roots) {
+ DenseSet<Instruction*> V;
+ collectInLoopUserSet(Root, Exclude, PossibleRedSet, V);
+
+ // While we're here, check the use sets are the same size.
+ if (V.size() != VBase.size()) {
+ DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
+ return false;
+ }
+
+ for (auto *I : V) {
+ Uses[I].set(Idx);
+ }
+ ++Idx;
+ }
+
+ // Make sure our subsumed instructions are remembered too.
+ for (auto *I : DRS.SubsumedInsts) {
+ Uses[I].set(IL_All);
+ }
+ }
+
+ // Make sure the loop increments are also accounted for.
+
+ Exclude.clear();
+ for (auto &DRS : RootSets) {
+ Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
+ Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
+ Exclude.insert(DRS.BaseInst);
+ }
+
+ DenseSet<Instruction*> V;
+ collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V);
+ for (auto *I : V) {
+ Uses[I].set(IL_All);
+ }
+
+ return true;
+}
+
+/// Get the next instruction in "In" that is a member of set Val.
+/// Start searching from StartI, and do not return anything in Exclude.
+/// If StartI is not given, start from In.begin().
+LoopReroll::DAGRootTracker::UsesTy::iterator
+LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In,
+ const SmallInstructionSet &Exclude,
+ UsesTy::iterator *StartI) {
+ UsesTy::iterator I = StartI ? *StartI : In.begin();
+ while (I != In.end() && (I->second.test(Val) == 0 ||
+ Exclude.count(I->first) != 0))
+ ++I;
+ return I;
+}
+
+bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) {
+ for (auto &DRS : RootSets) {
+ if (DRS.BaseInst == I)
+ return true;
+ }
+ return false;
+}
+
+bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) {
+ for (auto &DRS : RootSets) {
+ if (is_contained(DRS.Roots, I))
+ return true;
+ }
+ return false;
+}
+
+/// Return true if instruction I depends on any instruction between
+/// Start and End.
+bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I,
+ UsesTy::iterator Start,
+ UsesTy::iterator End) {
+ for (auto *U : I->users()) {
+ for (auto It = Start; It != End; ++It)
+ if (U == It->first)
+ return true;
+ }
+ return false;
+}
+
+static bool isIgnorableInst(const Instruction *I) {
+ if (isa<DbgInfoIntrinsic>(I))
+ return true;
+ const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+ if (!II)
+ return false;
+ switch (II->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::annotation:
+ case Intrinsic::ptr_annotation:
+ case Intrinsic::var_annotation:
+ // TODO: the following intrinsics may also be whitelisted:
+ // lifetime_start, lifetime_end, invariant_start, invariant_end
+ return true;
+ }
+ return false;
+}
+
+bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
+ // We now need to check for equivalence of the use graph of each root with
+ // that of the primary induction variable (excluding the roots). Our goal
+ // here is not to solve the full graph isomorphism problem, but rather to
+ // catch common cases without a lot of work. As a result, we will assume
+ // that the relative order of the instructions in each unrolled iteration
+ // is the same (although we will not make an assumption about how the
+ // different iterations are intermixed). Note that while the order must be
+ // the same, the instructions may not be in the same basic block.
+
+ // An array of just the possible reductions for this scale factor. When we
+ // collect the set of all users of some root instructions, these reduction
+ // instructions are treated as 'final' (their uses are not considered).
+ // This is important because we don't want the root use set to search down
+ // the reduction chain.
+ SmallInstructionSet PossibleRedSet;
+ SmallInstructionSet PossibleRedLastSet;
+ SmallInstructionSet PossibleRedPHISet;
+ Reductions.restrictToScale(Scale, PossibleRedSet,
+ PossibleRedPHISet, PossibleRedLastSet);
+
+ // Populate "Uses" with where each instruction is used.
+ if (!collectUsedInstructions(PossibleRedSet))
+ return false;
+
+ // Make sure we mark the reduction PHIs as used in all iterations.
+ for (auto *I : PossibleRedPHISet) {
+ Uses[I].set(IL_All);
+ }
+
+ // Make sure we mark loop-control-only PHIs as used in all iterations. See
+ // comment above LoopReroll::isLoopControlIV for more information.
+ BasicBlock *Header = L->getHeader();
+ if (LoopControlIV && LoopControlIV != IV) {
+ for (auto *U : LoopControlIV->users()) {
+ Instruction *IVUser = dyn_cast<Instruction>(U);
+ // IVUser could be loop increment or compare
+ Uses[IVUser].set(IL_All);
+ for (auto *UU : IVUser->users()) {
+ Instruction *UUser = dyn_cast<Instruction>(UU);
+ // UUser could be compare, PHI or branch
+ Uses[UUser].set(IL_All);
+ // Skip SExt
+ if (isa<SExtInst>(UUser)) {
+ UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
+ Uses[UUser].set(IL_All);
+ }
+ // Is UUser a compare instruction?
+ if (UU->hasOneUse()) {
+ Instruction *BI = dyn_cast<BranchInst>(*UUser->user_begin());
+ if (BI == cast<BranchInst>(Header->getTerminator()))
+ Uses[BI].set(IL_All);
+ }
+ }
+ }
+ }
+
+ // Make sure all instructions in the loop are in one and only one
+ // set.
+ for (auto &KV : Uses) {
+ if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {
+ DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
+ << *KV.first << " (#uses=" << KV.second.count() << ")\n");
+ return false;
+ }
+ }
+
+ DEBUG(
+ for (auto &KV : Uses) {
+ dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
+ }
+ );
+
+ for (unsigned Iter = 1; Iter < Scale; ++Iter) {
+ // In addition to regular aliasing information, we need to look for
+ // instructions from later (future) iterations that have side effects
+ // preventing us from reordering them past other instructions with side
+ // effects.
+ bool FutureSideEffects = false;
+ AliasSetTracker AST(*AA);
+ // The map between instructions in f(%iv.(i+1)) and f(%iv).
+ DenseMap<Value *, Value *> BaseMap;
+
+ // Compare iteration Iter to the base.
+ SmallInstructionSet Visited;
+ auto BaseIt = nextInstr(0, Uses, Visited);
+ auto RootIt = nextInstr(Iter, Uses, Visited);
+ auto LastRootIt = Uses.begin();
+
+ while (BaseIt != Uses.end() && RootIt != Uses.end()) {
+ Instruction *BaseInst = BaseIt->first;
+ Instruction *RootInst = RootIt->first;
+
+ // Skip over the IV or root instructions; only match their users.
+ bool Continue = false;
+ if (isBaseInst(BaseInst)) {
+ Visited.insert(BaseInst);
+ BaseIt = nextInstr(0, Uses, Visited);
+ Continue = true;
+ }
+ if (isRootInst(RootInst)) {
+ LastRootIt = RootIt;
+ Visited.insert(RootInst);
+ RootIt = nextInstr(Iter, Uses, Visited);
+ Continue = true;
+ }
+ if (Continue) continue;
+
+ if (!BaseInst->isSameOperationAs(RootInst)) {
+ // Last chance saloon. We don't try and solve the full isomorphism
+ // problem, but try and at least catch the case where two instructions
+ // *of different types* are round the wrong way. We won't be able to
+ // efficiently tell, given two ADD instructions, which way around we
+ // should match them, but given an ADD and a SUB, we can at least infer
+ // which one is which.
+ //
+ // This should allow us to deal with a greater subset of the isomorphism
+ // problem. It does however change a linear algorithm into a quadratic
+ // one, so limit the number of probes we do.
+ auto TryIt = RootIt;
+ unsigned N = NumToleratedFailedMatches;
+ while (TryIt != Uses.end() &&
+ !BaseInst->isSameOperationAs(TryIt->first) &&
+ N--) {
+ ++TryIt;
+ TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
+ }
+
+ if (TryIt == Uses.end() || TryIt == RootIt ||
+ instrDependsOn(TryIt->first, RootIt, TryIt)) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << "\n");
+ return false;
+ }
+
+ RootIt = TryIt;
+ RootInst = TryIt->first;
+ }
+
+ // All instructions between the last root and this root
+ // may belong to some other iteration. If they belong to a
+ // future iteration, then they're dangerous to alias with.
+ //
+ // Note that because we allow a limited amount of flexibility in the order
+ // that we visit nodes, LastRootIt might be *before* RootIt, in which
+ // case we've already checked this set of instructions so we shouldn't
+ // do anything.
+ for (; LastRootIt < RootIt; ++LastRootIt) {
+ Instruction *I = LastRootIt->first;
+ if (LastRootIt->second.find_first() < (int)Iter)
+ continue;
+ if (I->mayWriteToMemory())
+ AST.add(I);
+ // Note: This is specifically guarded by a check on isa<PHINode>,
+ // which while a valid (somewhat arbitrary) micro-optimization, is
+ // needed because otherwise isSafeToSpeculativelyExecute returns
+ // false on PHI nodes.
+ if (!isa<PHINode>(I) && !isUnorderedLoadStore(I) &&
+ !isSafeToSpeculativelyExecute(I))
+ // Intervening instructions cause side effects.
+ FutureSideEffects = true;
+ }
+
+ // Make sure that this instruction, which is in the use set of this
+ // root instruction, does not also belong to the base set or the set of
+ // some other root instruction.
+ if (RootIt->second.count() > 1) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << " (prev. case overlap)\n");
+ return false;
+ }
+
+ // Make sure that we don't alias with any instruction in the alias set
+ // tracker. If we do, then we depend on a future iteration, and we
+ // can't reroll.
+ if (RootInst->mayReadFromMemory())
+ for (auto &K : AST) {
+ if (K.aliasesUnknownInst(RootInst, *AA)) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << " (depends on future store)\n");
+ return false;
+ }
+ }
+
+ // If we've passed an instruction from a future iteration that may have
+ // side effects, and this instruction might also, then we can't reorder
+ // them, and this matching fails. As an exception, we allow the alias
+ // set tracker to handle regular (unordered) load/store dependencies.
+ if (FutureSideEffects && ((!isUnorderedLoadStore(BaseInst) &&
+ !isSafeToSpeculativelyExecute(BaseInst)) ||
+ (!isUnorderedLoadStore(RootInst) &&
+ !isSafeToSpeculativelyExecute(RootInst)))) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst <<
+ " (side effects prevent reordering)\n");
+ return false;
+ }
+
+ // For instructions that are part of a reduction, if the operation is
+ // associative, then don't bother matching the operands (because we
+ // already know that the instructions are isomorphic, and the order
+ // within the iteration does not matter). For non-associative reductions,
+ // we do need to match the operands, because we need to reject
+ // out-of-order instructions within an iteration!
+ // For example (assume floating-point addition), we need to reject this:
+ // x += a[i]; x += b[i];
+ // x += a[i+1]; x += b[i+1];
+ // x += b[i+2]; x += a[i+2];
+ bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
+
+ if (!(InReduction && BaseInst->isAssociative())) {
+ bool Swapped = false, SomeOpMatched = false;
+ for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
+ Value *Op2 = RootInst->getOperand(j);
+
+ // If this is part of a reduction (and the operation is not
+ // associative), then we match all operands, but not those that are
+ // part of the reduction.
+ if (InReduction)
+ if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
+ if (Reductions.isPairInSame(RootInst, Op2I))
+ continue;
+
+ DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
+ if (BMI != BaseMap.end()) {
+ Op2 = BMI->second;
+ } else {
+ for (auto &DRS : RootSets) {
+ if (DRS.Roots[Iter-1] == (Instruction*) Op2) {
+ Op2 = DRS.BaseInst;
+ break;
+ }
+ }
+ }
+
+ if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
+ // If we've not already decided to swap the matched operands, and
+ // we've not already matched our first operand (note that we could
+ // have skipped matching the first operand because it is part of a
+ // reduction above), and the instruction is commutative, then try
+ // the swapped match.
+ if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
+ BaseInst->getOperand(!j) == Op2) {
+ Swapped = true;
+ } else {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (operand " << j << ")\n");
+ return false;
+ }
+ }
+
+ SomeOpMatched = true;
+ }
+ }
+
+ if ((!PossibleRedLastSet.count(BaseInst) &&
+ hasUsesOutsideLoop(BaseInst, L)) ||
+ (!PossibleRedLastSet.count(RootInst) &&
+ hasUsesOutsideLoop(RootInst, L))) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << " (uses outside loop)\n");
+ return false;
+ }
+
+ Reductions.recordPair(BaseInst, RootInst, Iter);
+ BaseMap.insert(std::make_pair(RootInst, BaseInst));
+
+ LastRootIt = RootIt;
+ Visited.insert(BaseInst);
+ Visited.insert(RootInst);
+ BaseIt = nextInstr(0, Uses, Visited);
+ RootIt = nextInstr(Iter, Uses, Visited);
+ }
+ assert(BaseIt == Uses.end() && RootIt == Uses.end() &&
+ "Mismatched set sizes!");
+ }
+
+ DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
+ *IV << "\n");
+
+ return true;
+}
+
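+// Stage 3: remove the instructions that belong to non-base iterations, then
+// replace each base induction value with a freshly expanded unit-step IV
+// (negated for decrementing IVs, scaled by the element size for pointer IVs)
+// and rewrite the loop-exit compare to use the rerolled iteration count.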
+void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
+ BasicBlock *Header = L->getHeader();
+ // Remove instructions associated with non-base iterations.
+ for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend();
+ J != JE;) {
+ unsigned I = Uses[&*J].find_first();
+ if (I > 0 && I < IL_All) {
+ DEBUG(dbgs() << "LRR: removing: " << *J << "\n");
+ J++->eraseFromParent();
+ continue;
+ }
+
+ ++J;
+ }
+
+ bool HasTwoIVs = LoopControlIV && LoopControlIV != IV;
+
+ if (HasTwoIVs) {
+ updateNonLoopCtrlIncr();
+ replaceIV(LoopControlIV, LoopControlIV, IterCount);
+ } else
+ // We need to create a new induction variable for each different BaseInst.
+ for (auto &DRS : RootSets)
+ // Insert the new induction variable.
+ replaceIV(DRS.BaseInst, IV, IterCount);
+
+ SimplifyInstructionsInBlock(Header, TLI);
+ DeleteDeadPHIs(Header, TLI);
+}
+
+// For non-loop-control IVs, we only need to update the last increment
+// with the right amount, and then we are done.
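+// For example (illustrative values), with Scale == 4 an increment of 12
+// becomes 3, and an increment of -12 becomes -3.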
+void LoopReroll::DAGRootTracker::updateNonLoopCtrlIncr() {
+ const SCEV *NewInc = nullptr;
+ for (auto *LoopInc : LoopIncs) {
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LoopInc);
+ const SCEVConstant *COp = nullptr;
+ if (GEP && LoopInc->getOperand(0)->getType()->isPointerTy()) {
+ COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1)));
+ } else {
+ COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(0)));
+ if (!COp)
+ COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1)));
+ }
+
+ assert(COp && "Didn't find constant operand of LoopInc!\n");
+
+ const APInt &AInt = COp->getValue()->getValue();
+ const SCEV *ScaleSCEV = SE->getConstant(COp->getType(), Scale);
+ if (AInt.isNegative()) {
+ NewInc = SE->getNegativeSCEV(COp);
+ NewInc = SE->getUDivExpr(NewInc, ScaleSCEV);
+ NewInc = SE->getNegativeSCEV(NewInc);
+ } else
+ NewInc = SE->getUDivExpr(COp, ScaleSCEV);
+
+ LoopInc->setOperand(1, dyn_cast<SCEVConstant>(NewInc)->getValue());
+ }
+}
+
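+// Replace the base-iteration uses of Inst with a new induction variable
+// {Start,+,+/-1} (multiplied by the element size for pointer IVs) and, if the
+// header's terminating branch belongs to the loop-increment machinery,
+// rewrite its condition to compare the new IV against the rerolled
+// iteration count.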
+void LoopReroll::DAGRootTracker::replaceIV(Instruction *Inst,
+ Instruction *InstIV,
+ const SCEV *IterCount) {
+ BasicBlock *Header = L->getHeader();
+ int64_t Inc = IVToIncMap[InstIV];
+ bool NeedNewIV = InstIV == LoopControlIV;
+ bool Negative = !NeedNewIV && Inc < 0;
+
+ const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(Inst));
+ const SCEV *Start = RealIVSCEV->getStart();
+
+ if (NeedNewIV)
+ Start = SE->getConstant(Start->getType(), 0);
+
+ const SCEV *SizeOfExpr = nullptr;
+ const SCEV *IncrExpr =
+ SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1);
+ if (auto *PTy = dyn_cast<PointerType>(Inst->getType())) {
+ Type *ElTy = PTy->getElementType();
+ SizeOfExpr =
+ SE->getSizeOfExpr(SE->getEffectiveSCEVType(Inst->getType()), ElTy);
+ IncrExpr = SE->getMulExpr(IncrExpr, SizeOfExpr);
+ }
+ const SCEV *NewIVSCEV =
+ SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap);
+
+ { // Limit the lifetime of SCEVExpander.
+ const DataLayout &DL = Header->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "reroll");
+ Value *NewIV = Expander.expandCodeFor(NewIVSCEV, Inst->getType(),
+ Header->getFirstNonPHIOrDbg());
+
+ for (auto &KV : Uses)
+ if (KV.second.find_first() == 0)
+ KV.first->replaceUsesOfWith(Inst, NewIV);
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
+ // FIXME: Why do we need this check?
+ if (Uses[BI].find_first() == IL_All) {
+ const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
+
+ if (NeedNewIV)
+ ICSCEV = SE->getMulExpr(IterCount,
+ SE->getConstant(IterCount->getType(), Scale));
+
+ // Iteration count SCEV minus or plus 1
+ const SCEV *MinusPlus1SCEV =
+ SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1);
+ if (Inst->getType()->isPointerTy()) {
+ assert(SizeOfExpr && "SizeOfExpr is not initialized");
+ MinusPlus1SCEV = SE->getMulExpr(MinusPlus1SCEV, SizeOfExpr);
+ }
+
+ const SCEV *ICMinusPlus1SCEV = SE->getMinusSCEV(ICSCEV, MinusPlus1SCEV);
+ // Iteration count minus 1
+ Instruction *InsertPtr = nullptr;
+ if (isa<SCEVConstant>(ICMinusPlus1SCEV)) {
+ InsertPtr = BI;
+ } else {
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
+ InsertPtr = Preheader->getTerminator();
+ }
+
+ if (!isa<PointerType>(NewIV->getType()) && NeedNewIV &&
+ (SE->getTypeSizeInBits(NewIV->getType()) <
+ SE->getTypeSizeInBits(ICMinusPlus1SCEV->getType()))) {
+ IRBuilder<> Builder(BI);
+ Builder.SetCurrentDebugLocation(BI->getDebugLoc());
+ NewIV = Builder.CreateSExt(NewIV, ICMinusPlus1SCEV->getType());
+ }
+ Value *ICMinusPlus1 = Expander.expandCodeFor(
+ ICMinusPlus1SCEV, NewIV->getType(), InsertPtr);
+
+ Value *Cond =
+ new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinusPlus1, "exitcond");
+ BI->setCondition(Cond);
+
+ if (BI->getSuccessor(1) != Header)
+ BI->swapSuccessors();
+ }
+ }
+ }
+}
+
+// Validate the selected reductions. All iterations must have an isomorphic
+// part of the reduction chain and, for non-associative reductions, the chain
+// entries must appear in order.
+bool LoopReroll::ReductionTracker::validateSelected() {
+ // For a non-associative reduction, the chain entries must appear in order.
+ for (int i : Reds) {
+ int PrevIter = 0, BaseCount = 0, Count = 0;
+ for (Instruction *J : PossibleReds[i]) {
+ // Note that all instructions in the chain must have been found because
+ // all instructions in the function must have been assigned to some
+ // iteration.
+ int Iter = PossibleRedIter[J];
+ if (Iter != PrevIter && Iter != PrevIter + 1 &&
+ !PossibleReds[i].getReducedValue()->isAssociative()) {
+ DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
+ J << "\n");
+ return false;
+ }
+
+ if (Iter != PrevIter) {
+ if (Count != BaseCount) {
+ DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
+ " reduction use count " << Count <<
+ " is not equal to the base use count " <<
+ BaseCount << "\n");
+ return false;
+ }
+
+ Count = 0;
+ }
+
+ ++Count;
+ if (Iter == 0)
+ ++BaseCount;
+
+ PrevIter = Iter;
+ }
+ }
+
+ return true;
+}
+
+// For all selected reductions, remove all parts except those in the first
+// iteration (and the PHI). Replace outside uses of the reduced value with uses
+// of the first-iteration reduced value (in other words, reroll the selected
+// reductions).
+void LoopReroll::ReductionTracker::replaceSelected() {
+ // Fixup reductions to refer to the last instruction associated with the
+ // first iteration (not the last).
+ for (int i : Reds) {
+ int j = 0;
+ for (int e = PossibleReds[i].size(); j != e; ++j)
+ if (PossibleRedIter[PossibleReds[i][j]] != 0) {
+ --j;
+ break;
+ }
+
+ // Replace users with the new end-of-chain value.
+ SmallInstructionVector Users;
+ for (User *U : PossibleReds[i].getReducedValue()->users()) {
+ Users.push_back(cast<Instruction>(U));
+ }
+
+ for (Instruction *User : Users)
+ User->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
+ PossibleReds[i][j]);
+ }
+}
+
+// Reroll the provided loop with respect to the provided induction variable.
+// Generally, we're looking for a loop like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1 <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2 <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
+// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
+// be intermixed with each other. The restriction imposed by this algorithm is
+// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
+// etc. be the same.
+//
+// First, we collect the use set of %iv, excluding the other increment roots.
+// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
+// times, having collected the use set of f(%iv.(i+1)), during which we:
+// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
+// the next unmatched instruction in f(%iv.(i+1)).
+// - Ensure that both matched instructions don't have any external users
+// (with the exception of last-in-chain reduction instructions).
+// - Track the (aliasing) write set, and other side effects, of all
+// instructions that belong to future iterations that come before the matched
+// instructions. If the matched instructions read from that write set, then
+// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
+// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
+// if any of these future instructions have side effects (cannot be
+// speculatively executed), and so do the matched instructions, then we
+// cannot reorder those side-effect-producing instructions, and rerolling
+// fails.
+//
+// Finally, we make sure that every loop instruction is either a loop-increment
+// root, part of the simple latch code, part of a validated reduction, part of
+// f(%iv), or part of some f(%iv.i). If all of that is true (and all reductions
+// have been validated), then we reroll the loop.
+bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+ const SCEV *IterCount,
+ ReductionTracker &Reductions) {
+ DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
+ IVToIncMap, LoopControlIV);
+
+ if (!DAGRoots.findRoots())
+ return false;
+ DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
+ *IV << "\n");
+
+ if (!DAGRoots.validate(Reductions))
+ return false;
+ if (!Reductions.validateSelected())
+ return false;
+ // At this point, we've validated the rerolling, and we're committed to
+ // making changes!
+
+ Reductions.replaceSelected();
+ DAGRoots.replace(IterCount);
+
+ ++NumRerolledLoops;
+ return true;
+}
+
+bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+ BasicBlock *Header = L->getHeader();
+ DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() <<
+ "] Loop %" << Header->getName() << " (" <<
+ L->getNumBlocks() << " block(s))\n");
+
+ // For now, we'll handle only single BB loops.
+ if (L->getNumBlocks() > 1)
+ return false;
+
+ if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+ return false;
+
+ const SCEV *LIBETC = SE->getBackedgeTakenCount(L);
+ const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType()));
+ DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n");
+ DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n");
+
+ // First, we need to find the induction variable with respect to which we can
+ // reroll (there may be several possible options).
+ SmallInstructionVector PossibleIVs;
+ IVToIncMap.clear();
+ LoopControlIV = nullptr;
+ collectPossibleIVs(L, PossibleIVs);
+
+ if (PossibleIVs.empty()) {
+ DEBUG(dbgs() << "LRR: No possible IVs found\n");
+ return false;
+ }
+
+ ReductionTracker Reductions;
+ collectPossibleReductions(L, Reductions);
+ bool Changed = false;
+
+ // For each possible IV, collect the associated possible set of 'root' nodes
+ // (i+1, i+2, etc.).
+ for (Instruction *PossibleIV : PossibleIVs)
+ if (reroll(PossibleIV, L, Header, IterCount, Reductions)) {
+ Changed = true;
+ break;
+ }
+ DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n");
+
+ // Trip count of L has changed so SE must be re-evaluated.
+ if (Changed)
+ SE->forgetLoop(L);
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
new file mode 100644
index 000000000000..a91f53ba663f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -0,0 +1,711 @@
+//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Loop Rotation Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopRotation.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-rotate"
+
+static cl::opt<unsigned> DefaultRotationThreshold(
+ "rotation-max-header-size", cl::init(16), cl::Hidden,
+ cl::desc("The default maximum header size for automatic loop rotation"));
+
+STATISTIC(NumRotated, "Number of loops rotated");
+
+namespace {
+/// A simple loop rotation transformation.
+class LoopRotate {
+ const unsigned MaxHeaderSize;
+ LoopInfo *LI;
+ const TargetTransformInfo *TTI;
+ AssumptionCache *AC;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ const SimplifyQuery &SQ;
+
+public:
+ LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ)
+ : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
+ SQ(SQ) {}
+ bool processLoop(Loop *L);
+
+private:
+ bool rotateLoop(Loop *L, bool SimplifiedLatch);
+ bool simplifyLoopLatch(Loop *L);
+};
+} // end anonymous namespace
+
+/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
+/// old header into the preheader. If there were uses of the values produced by
+/// these instructions that were outside of the loop, we have to insert PHI nodes
+/// to merge the two values. Do this now.
+static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
+ BasicBlock *OrigPreheader,
+ ValueToValueMapTy &ValueMap,
+ SmallVectorImpl<PHINode*> *InsertedPHIs) {
+ // Remove PHI node entries that are no longer live.
+ BasicBlock::iterator I, E = OrigHeader->end();
+ for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
+
+ // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
+ // as necessary.
+ SSAUpdater SSA(InsertedPHIs);
+ for (I = OrigHeader->begin(); I != E; ++I) {
+ Value *OrigHeaderVal = &*I;
+
+ // If there are no uses of the value (e.g. because it returns void), there
+ // is nothing to rewrite.
+ if (OrigHeaderVal->use_empty())
+ continue;
+
+ Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);
+
+ // The value now exists in two versions: the initial value in the preheader
+ // and the loop "next" value in the original header.
+ SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
+ SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
+ SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
+
+ // Visit each use of the OrigHeader instruction.
+ for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
+ UE = OrigHeaderVal->use_end();
+ UI != UE;) {
+ // Grab the use before incrementing the iterator.
+ Use &U = *UI;
+
+ // Increment the iterator before removing the use from the list.
+ ++UI;
+
+ // SSAUpdater can't handle a non-PHI use in the same block as an
+ // earlier def. We can easily handle those cases manually.
+ Instruction *UserInst = cast<Instruction>(U.getUser());
+ if (!isa<PHINode>(UserInst)) {
+ BasicBlock *UserBB = UserInst->getParent();
+
+ // The original users in the OrigHeader are already using the
+ // original definitions.
+ if (UserBB == OrigHeader)
+ continue;
+
+ // Users in the OrigPreHeader need to use the value to which the
+ // original definitions are mapped.
+ if (UserBB == OrigPreheader) {
+ U = OrigPreHeaderVal;
+ continue;
+ }
+ }
+
+ // Anything else can be handled by SSAUpdater.
+ SSA.RewriteUse(U);
+ }
+
+ // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
+ // intrinsics.
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ llvm::findDbgValues(DbgValues, OrigHeaderVal);
+ for (auto &DbgValue : DbgValues) {
+ // The original users in the OrigHeader are already using the original
+ // definitions.
+ BasicBlock *UserBB = DbgValue->getParent();
+ if (UserBB == OrigHeader)
+ continue;
+
+ // Users in the OrigPreHeader need to use the value to which the
+ // original definitions are mapped and anything else can be handled by
+ // the SSAUpdater. To avoid adding PHINodes, check if the value is
+ // available in UserBB, if not substitute undef.
+ Value *NewVal;
+ if (UserBB == OrigPreheader)
+ NewVal = OrigPreHeaderVal;
+ else if (SSA.HasValueForBlock(UserBB))
+ NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
+ else
+ NewVal = UndefValue::get(OrigHeaderVal->getType());
+ DbgValue->setOperand(0,
+ MetadataAsValue::get(OrigHeaderVal->getContext(),
+ ValueAsMetadata::get(NewVal)));
+ }
+ }
+}
+
+/// Propagate dbg.value intrinsics through the newly inserted Phis.
+static void insertDebugValues(BasicBlock *OrigHeader,
+ SmallVectorImpl<PHINode*> &InsertedPHIs) {
+ ValueToValueMapTy DbgValueMap;
+
+ // Map existing PHI nodes to their dbg.values.
+ for (auto &I : *OrigHeader) {
+ if (auto DbgII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
+ DbgValueMap.insert({Loc, DbgII});
+ }
+ }
+
+ // Then iterate through the new PHIs and look to see if they use one of the
+ // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
+ // propagate the info through the new PHI.
+ LLVMContext &C = OrigHeader->getContext();
+ for (auto PHI : InsertedPHIs) {
+ for (auto VI : PHI->operand_values()) {
+ auto V = DbgValueMap.find(VI);
+ if (V != DbgValueMap.end()) {
+ auto *DbgII = cast<DbgInfoIntrinsic>(V->second);
+ Instruction *NewDbgII = DbgII->clone();
+ auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
+ NewDbgII->setOperand(0, PhiMAV);
+ BasicBlock *Parent = PHI->getParent();
+ NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime());
+ }
+ }
+ }
+}
+
+/// Rotate loop LP. Return true if the loop is rotated.
+///
+/// \param SimplifiedLatch is true if the latch was just folded into the final
+/// loop exit. In this case we may want to rotate even though the new latch is
+/// now an exiting branch. This rotation would have happened had the latch not
+/// been simplified. However, if SimplifiedLatch is false, then we avoid
+/// rotating loops in which the latch exits to avoid excessive or endless
+/// rotation. LoopRotate should be repeatable and converge to a canonical
+/// form. This property is satisfied because simplifying the loop latch can only
+/// happen once across multiple invocations of the LoopRotate pass.
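+///
+/// Schematically, rotation duplicates the old header (which contains the exit
+/// test) into the preheader so that it acts as a loop guard, and the exit test
+/// is then taken at the bottom of the loop: a while-style loop becomes a
+/// do-while-style loop whose new header is the old header's in-loop successor.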
+bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
+ // If the loop has only one block then there is not much to rotate.
+ if (L->getBlocks().size() == 1)
+ return false;
+
+ BasicBlock *OrigHeader = L->getHeader();
+ BasicBlock *OrigLatch = L->getLoopLatch();
+
+ BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return false;
+
+ // If the loop header is not one of the loop exiting blocks then
+ // either this loop is already rotated or it is not
+ // suitable for loop rotation transformations.
+ if (!L->isLoopExiting(OrigHeader))
+ return false;
+
+ // If the loop latch already contains a branch that leaves the loop then the
+ // loop is already rotated.
+ if (!OrigLatch)
+ return false;
+
+ // Rotate if either the loop latch does *not* exit the loop, or if the loop
+ // latch was just simplified.
+ if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch)
+ return false;
+
+ // Check size of original header and reject loop if it is very big or we can't
+ // duplicate blocks inside it.
+ {
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ CodeMetrics Metrics;
+ Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
+ if (Metrics.notDuplicatable) {
+ DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
+ << " instructions: ";
+ L->dump());
+ return false;
+ }
+ if (Metrics.convergent) {
+ DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
+ "instructions: ";
+ L->dump());
+ return false;
+ }
+ if (Metrics.NumInsts > MaxHeaderSize)
+ return false;
+ }
+
+ // Now, this loop is suitable for rotation.
+ BasicBlock *OrigPreheader = L->getLoopPreheader();
+
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it, just give up.
+ if (!OrigPreheader)
+ return false;
+
+ // Anything ScalarEvolution may know about this loop or the PHI nodes
+ // in its header will soon be invalidated.
+ if (SE)
+ SE->forgetLoop(L);
+
+ DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+
+ // Find the new loop header. NewHeader is the header's one and only
+ // successor that is inside the loop. The header's other successor is
+ // outside the loop; otherwise the loop is not suitable for rotation.
+ BasicBlock *Exit = BI->getSuccessor(0);
+ BasicBlock *NewHeader = BI->getSuccessor(1);
+ if (L->contains(Exit))
+ std::swap(Exit, NewHeader);
+ assert(NewHeader && "Unable to determine new loop header");
+ assert(L->contains(NewHeader) && !L->contains(Exit) &&
+ "Unable to determine loop header and exit blocks");
+
+ // This code assumes that the new header has exactly one predecessor.
+ // Remove any single-entry PHI nodes in it.
+ assert(NewHeader->getSinglePredecessor() &&
+ "New header doesn't have one pred!");
+ FoldSingleEntryPHINodes(NewHeader);
+
+ // Begin by walking OrigHeader and populating ValueMap with an entry for
+ // each Instruction.
+ BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+ ValueToValueMapTy ValueMap;
+
+ // For PHI nodes, the value available in the OrigPreheader is just the
+ // incoming value from the OrigPreheader.
+ for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
+
+ // For the rest of the instructions, either hoist them to the OrigPreheader
+ // if possible or create a clone of them in the OrigPreheader if not.
+ TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
+
+ // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
+ using DbgIntrinsicHash =
+ std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>;
+ auto makeHash = [](DbgInfoIntrinsic *D) -> DbgIntrinsicHash {
+ return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()};
+ };
+ SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
+ for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
+ I != E; ++I) {
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&*I))
+ DbgIntrinsics.insert(makeHash(DII));
+ else
+ break;
+ }
+
+ while (I != E) {
+ Instruction *Inst = &*I++;
+
+ // If the instruction's operands are invariant and it doesn't read or write
+ // memory, then it is safe to hoist. Doing this doesn't change the order of
+ // execution in the preheader, but does prevent the instruction from
+ // executing in each iteration of the loop. This means it is safe to hoist
+ // something that might trap, but isn't safe to hoist something that reads
+ // memory (without proving that the loop doesn't write).
+ if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
+ !Inst->mayWriteToMemory() && !isa<TerminatorInst>(Inst) &&
+ !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
+ Inst->moveBefore(LoopEntryBranch);
+ continue;
+ }
+
+ // Otherwise, create a duplicate of the instruction.
+ Instruction *C = Inst->clone();
+
+ // Eagerly remap the operands of the instruction.
+ RemapInstruction(C, ValueMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+ // Avoid inserting the same intrinsic twice.
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(C))
+ if (DbgIntrinsics.count(makeHash(DII))) {
+ C->deleteValue();
+ continue;
+ }
+
+ // With the operands remapped, see if the instruction constant folds or is
+ // otherwise simplifiable. This commonly occurs because the entry from PHI
+ // nodes allows icmps and other instructions to fold.
+ Value *V = SimplifyInstruction(C, SQ);
+ if (V && LI->replacementPreservesLCSSAForm(C, V)) {
+ // If so, then delete the temporary instruction and stick the folded value
+ // in the map.
+ ValueMap[Inst] = V;
+ if (!C->mayHaveSideEffects()) {
+ C->deleteValue();
+ C = nullptr;
+ }
+ } else {
+ ValueMap[Inst] = C;
+ }
+ if (C) {
+ // Otherwise, stick the new instruction into the new block!
+ C->setName(Inst->getName());
+ C->insertBefore(LoopEntryBranch);
+
+ if (auto *II = dyn_cast<IntrinsicInst>(C))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
+
+ // Along with all the other instructions, we just cloned OrigHeader's
+ // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
+ // successors by duplicating their incoming values for OrigHeader.
+ TerminatorInst *TI = OrigHeader->getTerminator();
+ for (BasicBlock *SuccBB : TI->successors())
+ for (BasicBlock::iterator BI = SuccBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
+
+ // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
+ // OrigPreHeader's old terminator (the original branch into the loop), and
+ // remove the corresponding incoming values from the PHI nodes in OrigHeader.
+ LoopEntryBranch->eraseFromParent();
+
+
+ SmallVector<PHINode*, 2> InsertedPHIs;
+ // If there were any uses of instructions in the duplicated block outside the
+ // loop, update them, inserting PHI nodes as required
+ RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
+ &InsertedPHIs);
+
+ // Attach dbg.value intrinsics to the new phis if that phi uses a value that
+ // previously had debug metadata attached. This keeps the debug info
+ // up-to-date in the loop body.
+ if (!InsertedPHIs.empty())
+ insertDebugValues(OrigHeader, InsertedPHIs);
+
+ // NewHeader is now the header of the loop.
+ L->moveToHeader(NewHeader);
+ assert(L->getHeader() == NewHeader && "Latch block is our new header");
+
+ // Inform DT about changes to the CFG.
+ if (DT) {
+ // The OrigPreheader branches to the NewHeader and Exit now. Inform the DT
+ // about the two inserted edges and about the edge to the OrigHeader that
+ // was removed.
+ SmallVector<DominatorTree::UpdateType, 3> Updates;
+ Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
+ Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
+ Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
+ DT->applyUpdates(Updates);
+ }
+
+ // At this point, we've finished our major CFG changes. As part of cloning
+ // the loop into the preheader we've simplified instructions and the
+ // duplicated conditional branch may now be branching on a constant. If it is
+ // branching on a constant and if that constant means that we enter the loop,
+ // then we fold away the cond branch to an uncond branch. This simplifies the
+ // loop in cases important for nested loops, and it also means we don't have
+ // to split as many edges.
+ BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+ assert(PHBI->isConditional() && "Should be clone of BI condbr!");
+ if (!isa<ConstantInt>(PHBI->getCondition()) ||
+ PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
+ NewHeader) {
+ // The conditional branch can't be folded, handle the general case.
+ // Split edges as necessary to preserve LoopSimplify form.
+
+ // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
+ // thus is not a preheader anymore.
+ // Split the edge to form a real preheader.
+ BasicBlock *NewPH = SplitCriticalEdge(
+ OrigPreheader, NewHeader,
+ CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+ NewPH->setName(NewHeader->getName() + ".lr.ph");
+
+ // Preserve canonical loop form, which means that 'Exit' should have only
+ // one predecessor. Note that Exit could be an exit block for multiple
+ // nested loops, causing both of the edges to now be critical and need to
+ // be split.
+ SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
+ bool SplitLatchEdge = false;
+ for (BasicBlock *ExitPred : ExitPreds) {
+ // We only need to split loop exit edges.
+ Loop *PredLoop = LI->getLoopFor(ExitPred);
+ if (!PredLoop || PredLoop->contains(Exit))
+ continue;
+ if (isa<IndirectBrInst>(ExitPred->getTerminator()))
+ continue;
+ SplitLatchEdge |= L->getLoopLatch() == ExitPred;
+ BasicBlock *ExitSplit = SplitCriticalEdge(
+ ExitPred, Exit,
+ CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+ ExitSplit->moveBefore(Exit);
+ }
+ assert(SplitLatchEdge &&
+ "Despite splitting all preds, failed to split latch exit?");
+ } else {
+ // We can fold the conditional branch in the preheader, this makes things
+ // simpler. The first step is to remove the extra edge to the Exit block.
+ Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
+ BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
+ NewBI->setDebugLoc(PHBI->getDebugLoc());
+ PHBI->eraseFromParent();
+
+ // With our CFG finalized, update DomTree if it is available.
+ if (DT) DT->deleteEdge(OrigPreheader, Exit);
+ }
+
+ assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
+ assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+
+ // Now that the CFG and DomTree are in a consistent state again, try to merge
+ // the OrigHeader block into OrigLatch. This will succeed if they are
+ // connected by an unconditional branch. This is just a cleanup so the
+ // emitted code isn't too gross in this common case.
+ MergeBlockIntoPredecessor(OrigHeader, DT, LI);
+
+ DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+
+ ++NumRotated;
+ return true;
+}
+
+/// Determine whether the instructions in this range may be safely and cheaply
+/// speculated. This is not an important enough situation to develop complex
+/// heuristics. We handle a single arithmetic instruction along with any type
+/// conversions.
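+///
+/// For example (illustrative), a latch consisting of a single IV update such
+/// as "%iv.next = add nsw i32 %iv, 1" (or a GEP with all-constant indices),
+/// possibly accompanied by trunc/zext/sext conversions, is accepted.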
+static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
+ BasicBlock::iterator End, Loop *L) {
+ bool seenIncrement = false;
+ bool MultiExitLoop = false;
+
+ if (!L->getExitingBlock())
+ MultiExitLoop = true;
+
+ for (BasicBlock::iterator I = Begin; I != End; ++I) {
+
+ if (!isSafeToSpeculativelyExecute(&*I))
+ return false;
+
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ switch (I->getOpcode()) {
+ default:
+ return false;
+ case Instruction::GetElementPtr:
+ // GEPs are cheap if all indices are constant.
+ if (!cast<GEPOperator>(I)->hasAllConstantIndices())
+ return false;
+ // fall-thru to increment case
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr: {
+ Value *IVOpnd =
+ !isa<Constant>(I->getOperand(0))
+ ? I->getOperand(0)
+ : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
+ if (!IVOpnd)
+ return false;
+
+ // If the increment operand is used outside of the loop, this speculation
+ // could cause extra live-range interference.
+ if (MultiExitLoop) {
+ for (User *UseI : IVOpnd->users()) {
+ auto *UserInst = cast<Instruction>(UseI);
+ if (!L->contains(UserInst))
+ return false;
+ }
+ }
+
+ if (seenIncrement)
+ return false;
+ seenIncrement = true;
+ break;
+ }
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // ignore type conversions
+ break;
+ }
+ }
+ return true;
+}
+
+/// Fold the loop tail into the loop exit by speculating the loop tail
+/// instructions. Typically, this is a single post-increment. In the case of a
+/// simple 2-block loop, hoisting the increment can be much better than
+/// duplicating the entire loop header. In the case of loops with early exits,
+/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
+/// canonical form so downstream passes can handle it.
+///
+/// I don't believe this invalidates SCEV.
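+///
+/// A sketch of the simple 2-block case (illustrative only): before the
+/// transformation the exiting block ends in "br i1 %c, label %latch,
+/// label %exit" and the latch contains only "%iv.next = add i32 %iv, 1"
+/// followed by "br label %header". Afterwards the increment is hoisted into
+/// the exiting block, the branch targets %header directly, and the old latch
+/// block is deleted, making the exiting block the new latch.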
+bool LoopRotate::simplifyLoopLatch(Loop *L) {
+ BasicBlock *Latch = L->getLoopLatch();
+ if (!Latch || Latch->hasAddressTaken())
+ return false;
+
+ BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!Jmp || !Jmp->isUnconditional())
+ return false;
+
+ BasicBlock *LastExit = Latch->getSinglePredecessor();
+ if (!LastExit || !L->isLoopExiting(LastExit))
+ return false;
+
+ BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
+ if (!BI)
+ return false;
+
+ if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
+ return false;
+
+ DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
+ << LastExit->getName() << "\n");
+
+ // Hoist the instructions from Latch into LastExit.
+ LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
+ Latch->begin(), Jmp->getIterator());
+
+ unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
+ BasicBlock *Header = Jmp->getSuccessor(0);
+ assert(Header == L->getHeader() && "expected a backward branch");
+
+ // Remove Latch from the CFG so that LastExit becomes the new Latch.
+ BI->setSuccessor(FallThruPath, Header);
+ Latch->replaceSuccessorsPhiUsesWith(LastExit);
+ Jmp->eraseFromParent();
+
+ // Nuke the Latch block.
+ assert(Latch->empty() && "unable to evacuate Latch");
+ LI->removeBlock(Latch);
+ if (DT)
+ DT->eraseNode(Latch);
+ Latch->eraseFromParent();
+ return true;
+}
+
+/// Rotate \c L, and return true if any modification was made.
+bool LoopRotate::processLoop(Loop *L) {
+ // Save the loop metadata.
+ MDNode *LoopMD = L->getLoopID();
+
+ // Simplify the loop latch before attempting to rotate the header
+ // upward. Rotation may not be needed if the loop tail can be folded into the
+ // loop exit.
+ bool SimplifiedLatch = simplifyLoopLatch(L);
+
+ bool MadeChange = rotateLoop(L, SimplifiedLatch);
+ assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
+ "Loop latch should be exiting after loop-rotate.");
+
+ // Restore the loop metadata.
+ // NB! We presume LoopRotation DOESN'T ADD its own metadata.
+ if ((MadeChange || SimplifiedLatch) && LoopMD)
+ L->setLoopID(LoopMD);
+
+ return MadeChange || SimplifiedLatch;
+}
+
+LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication)
+ : EnableHeaderDuplication(EnableHeaderDuplication) {}
+
+PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ int Threshold = EnableHeaderDuplication ? DefaultRotationThreshold : 0;
+ const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
+ const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL);
+ LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
+ SQ);
+
+ bool Changed = LR.processLoop(&L);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+
+class LoopRotateLegacyPass : public LoopPass {
+ unsigned MaxHeaderSize;
+
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) {
+ initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry());
+ if (SpecifiedMaxHeaderSize == -1)
+ MaxHeaderSize = DefaultRotationThreshold;
+ else
+ MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize);
+ }
+
+ // LCSSA form makes instruction renaming easier.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ Function &F = *L->getHeader()->getParent();
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ auto *SE = SEWP ? &SEWP->getSE() : nullptr;
+ const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
+ LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE, SQ);
+ return LR.processLoop(L);
+ }
+};
+}
+
+char LoopRotateLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false,
+ false)
+
+Pass *llvm::createLoopRotatePass(int MaxHeaderSize) {
+ return new LoopRotateLegacyPass(MaxHeaderSize);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
new file mode 100644
index 000000000000..35c05e84fd68
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -0,0 +1,109 @@
+//===--------- LoopSimplifyCFG.cpp - Loop CFG Simplification Pass ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Loop SimplifyCFG Pass. This pass is responsible for
+// basic loop CFG cleanup, primarily to assist other loop passes. If you
+// encounter a noncanonical CFG construct that causes another loop pass to
+// perform suboptimally, this is the place to fix it up.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-simplifycfg"
+
+static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
+ bool Changed = false;
+ // Copy blocks into a temporary array to avoid iterator invalidation issues
+ // as we remove them.
+ SmallVector<WeakTrackingVH, 16> Blocks(L.blocks());
+
+ for (auto &Block : Blocks) {
+ // Attempt to merge blocks in the trivial case. Don't modify blocks which
+ // belong to other loops.
+ BasicBlock *Succ = cast_or_null<BasicBlock>(Block);
+ if (!Succ)
+ continue;
+
+ BasicBlock *Pred = Succ->getSinglePredecessor();
+ if (!Pred || !Pred->getSingleSuccessor() || LI.getLoopFor(Pred) != &L)
+ continue;
+
+ // Pred is going to disappear, so we need to update the loop info.
+ if (L.getHeader() == Pred)
+ L.moveToHeader(Succ);
+ LI.removeBlock(Pred);
+ MergeBasicBlockIntoOnlyPred(Succ, &DT);
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ if (!simplifyLoopCFG(L, AR.DT, AR.LI))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+class LoopSimplifyCFGLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopSimplifyCFGLegacyPass() : LoopPass(ID) {
+ initializeLoopSimplifyCFGLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &) override {
+ if (skipLoop(L))
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ return simplifyLoopCFG(*L, DT, LI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DependenceAnalysisWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+}
+
+char LoopSimplifyCFGLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopSimplifyCFGLegacyPass, "loop-simplifycfg",
+ "Simplify loop CFG", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopSimplifyCFGLegacyPass, "loop-simplifycfg",
+ "Simplify loop CFG", false, false)
+
+Pass *llvm::createLoopSimplifyCFGPass() {
+ return new LoopSimplifyCFGLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
new file mode 100644
index 000000000000..430a7085d93f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
@@ -0,0 +1,373 @@
+//===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does the inverse transformation of what LICM does.
+// It traverses all of the instructions in the loop's preheader and sinks
+// them into loop-body blocks whose frequency is lower than that of the
+// preheader. It differs from the Sink pass in the following ways:
+//
+// * It only handles sinking of instructions from the loop's preheader to the
+// loop's body
+// * It uses alias set tracker to get more accurate alias info
+// * It uses block frequency info to find the optimal sinking locations
+//
+// Overall algorithm:
+//
+// For I in Preheader:
+// InsertBBs = BBs that use I
+// For BB in sorted(LoopBBs):
+// DomBBs = BBs in InsertBBs that are dominated by BB
+// if freq(DomBBs) > freq(BB)
+// InsertBBs = InsertBBs - DomBBs + BB
+// For BB in InsertBBs:
+// Insert I at BB's beginning
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopSink.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loopsink"
+
+STATISTIC(NumLoopSunk, "Number of instructions sunk into loop");
+STATISTIC(NumLoopSunkCloned, "Number of cloned instructions sunk into loop");
+
+static cl::opt<unsigned> SinkFrequencyPercentThreshold(
+ "sink-freq-percent-threshold", cl::Hidden, cl::init(90),
+ cl::desc("Do not sink instructions that require cloning unless they "
+ "execute less than this percent of the time."));
+
+static cl::opt<unsigned> MaxNumberOfUseBBsForSinking(
+ "max-uses-for-sinking", cl::Hidden, cl::init(30),
+ cl::desc("Do not sink instructions that have too many uses."));
+
+/// Return adjusted total frequency of \p BBs.
+///
+/// * If there is only one BB, sinking the instruction will not increase code
+/// size. Thus there is no need to adjust the frequency.
+/// * If there is more than one BB, sinking would increase code size. In this
+/// case, we add some "tax" to the total frequency to make it harder
+/// to sink. E.g.
+/// Freq(Preheader) = 100
+/// Freq(BBs) = sum(50, 49) = 99
+/// Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheader to
+/// BBs as the difference is too small to justify the code size increase.
+/// To model this, the adjusted Freq(BBs) will be:
+/// AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold%
+static BlockFrequency adjustedSumFreq(SmallPtrSetImpl<BasicBlock *> &BBs,
+ BlockFrequencyInfo &BFI) {
+ BlockFrequency T = 0;
+ for (BasicBlock *B : BBs)
+ T += BFI.getBlockFreq(B);
+ if (BBs.size() > 1)
+ T /= BranchProbability(SinkFrequencyPercentThreshold, 100);
+ return T;
+}
+
+/// Return the set of basic blocks into which sunk instructions should be inserted.
+///
+/// The returned set of basic blocks (BBsToSinkInto) should satisfy:
+///
+/// * Inside the loop \p L
+/// * For each UseBB in \p UseBBs, there is at least one BB in BBsToSinkInto
+/// that dominates the UseBB
+/// * Has minimum total frequency that is no greater than preheader frequency
+///
+/// The purpose of the function is to find the optimal sinking points to
+/// minimize execution cost, which is defined as "sum of frequency of
+/// BBsToSinkInto".
+/// As a result, the returned BBsToSinkInto needs to have minimum total
+/// frequency.
+/// Additionally, if the total frequency of BBsToSinkInto exceeds preheader
+/// frequency, the optimal solution is not sinking (return empty set).
+///
+/// \p ColdLoopBBs is used to help find the optimal sinking locations.
+/// It stores a list of BBs that is:
+///
+/// * Inside the loop \p L
+/// * Has a frequency no larger than the loop's preheader
+/// * Sorted by BB frequency
+///
+/// The complexity of the function is O(UseBBs.size() * ColdLoopBBs.size()).
+/// To avoid expensive computation, we cap the maximum UseBBs.size() in its
+/// caller.
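+///
+/// A small worked example (illustrative): with Freq(Preheader) = 100,
+/// UseBBs = {B1, B2} where Freq(B1) = Freq(B2) = 30, and a ColdestBB D with
+/// Freq(D) = 50 that dominates both, the adjusted frequency of {B1, B2} is
+/// 60 / 90% = ~67 > 50, so {B1, B2} is replaced by {D}; since 50 <= 100, the
+/// instruction is sunk into D.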
+static SmallPtrSet<BasicBlock *, 2>
+findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs,
+ const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
+ DominatorTree &DT, BlockFrequencyInfo &BFI) {
+ SmallPtrSet<BasicBlock *, 2> BBsToSinkInto;
+ if (UseBBs.size() == 0)
+ return BBsToSinkInto;
+
+ BBsToSinkInto.insert(UseBBs.begin(), UseBBs.end());
+ SmallPtrSet<BasicBlock *, 2> BBsDominatedByColdestBB;
+
+ // For every iteration:
+ // * Pick the ColdestBB from ColdLoopBBs
+ // * Find the set BBsDominatedByColdestBB that satisfy:
+ // - BBsDominatedByColdestBB is a subset of BBsToSinkInto
+ // - Every BB in BBsDominatedByColdestBB is dominated by ColdestBB
+ // * If Freq(ColdestBB) < Freq(BBsDominatedByColdestBB), remove
+ // BBsDominatedByColdestBB from BBsToSinkInto, add ColdestBB to
+ // BBsToSinkInto
+ for (BasicBlock *ColdestBB : ColdLoopBBs) {
+ BBsDominatedByColdestBB.clear();
+ for (BasicBlock *SinkedBB : BBsToSinkInto)
+ if (DT.dominates(ColdestBB, SinkedBB))
+ BBsDominatedByColdestBB.insert(SinkedBB);
+ if (BBsDominatedByColdestBB.size() == 0)
+ continue;
+ if (adjustedSumFreq(BBsDominatedByColdestBB, BFI) >
+ BFI.getBlockFreq(ColdestBB)) {
+ for (BasicBlock *DominatedBB : BBsDominatedByColdestBB) {
+ BBsToSinkInto.erase(DominatedBB);
+ }
+ BBsToSinkInto.insert(ColdestBB);
+ }
+ }
+
+ // If the total frequency of BBsToSinkInto is larger than preheader frequency,
+ // do not sink.
+ if (adjustedSumFreq(BBsToSinkInto, BFI) >
+ BFI.getBlockFreq(L.getLoopPreheader()))
+ BBsToSinkInto.clear();
+ return BBsToSinkInto;
+}
+
+// Sinks \p I from the loop \p L's preheader to its uses. Returns true if
+// sinking is successful.
+// \p LoopBlockNumber is used to sort the insertion blocks to ensure
+// determinism.
+static bool sinkInstruction(Loop &L, Instruction &I,
+ const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
+ const SmallDenseMap<BasicBlock *, int, 16> &LoopBlockNumber,
+ LoopInfo &LI, DominatorTree &DT,
+ BlockFrequencyInfo &BFI) {
+ // Compute the set of blocks in loop L which contain a use of I.
+ SmallPtrSet<BasicBlock *, 2> BBs;
+ for (auto &U : I.uses()) {
+ Instruction *UI = cast<Instruction>(U.getUser());
+ // We cannot sink I to PHI-uses.
+ if (isa<PHINode>(UI))
+ return false;
+ // We cannot sink I if it has uses outside of the loop.
+ if (!L.contains(LI.getLoopFor(UI->getParent())))
+ return false;
+ BBs.insert(UI->getParent());
+ }
+
+ // findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
+ // BBs.size() to avoid expensive computation.
+ // FIXME: Handle code size growth for min_size and opt_size.
+ if (BBs.size() > MaxNumberOfUseBBsForSinking)
+ return false;
+
+ // Find the set of BBs into which we should insert a copy of I.
+ SmallPtrSet<BasicBlock *, 2> BBsToSinkInto =
+ findBBsToSinkInto(L, BBs, ColdLoopBBs, DT, BFI);
+ if (BBsToSinkInto.empty())
+ return false;
+
+ // Copy the final BBs into a vector and sort them using the total ordering
+ // of the loop block numbers as iterating the set doesn't give a useful
+ // order. No need to stable sort as the block numbers are a total ordering.
+ SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
+ SortedBBsToSinkInto.insert(SortedBBsToSinkInto.begin(), BBsToSinkInto.begin(),
+ BBsToSinkInto.end());
+ std::sort(SortedBBsToSinkInto.begin(), SortedBBsToSinkInto.end(),
+ [&](BasicBlock *A, BasicBlock *B) {
+ return *LoopBlockNumber.find(A) < *LoopBlockNumber.find(B);
+ });
+
+ BasicBlock *MoveBB = *SortedBBsToSinkInto.begin();
+ // FIXME: Optimize the efficiency for cloned value replacement. The current
+ // implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
+ for (BasicBlock *N : SortedBBsToSinkInto) {
+ if (N == MoveBB)
+ continue;
+ // Clone I and replace its uses.
+ Instruction *IC = I.clone();
+ IC->setName(I.getName());
+ IC->insertBefore(&*N->getFirstInsertionPt());
+ // Replaces uses of I with IC in N
+ for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;) {
+ Use &U = *UI++;
+ auto *I = cast<Instruction>(U.getUser());
+ if (I->getParent() == N)
+ U.set(IC);
+ }
+ // Replaces uses of I with IC in blocks dominated by N
+ replaceDominatedUsesWith(&I, IC, DT, N);
+ DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
+ << '\n');
+ NumLoopSunkCloned++;
+ }
+ DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
+ NumLoopSunk++;
+ I.moveBefore(&*MoveBB->getFirstInsertionPt());
+
+ return true;
+}
+
+/// Sinks instructions from loop's preheader to the loop body if the
+/// sum frequency of inserted copy is smaller than preheader's frequency.
+static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
+ DominatorTree &DT,
+ BlockFrequencyInfo &BFI,
+ ScalarEvolution *SE) {
+ BasicBlock *Preheader = L.getLoopPreheader();
+ if (!Preheader)
+ return false;
+
+ // Enable LoopSink only when runtime profile is available.
+ // With static profile, the sinking decision may be sub-optimal.
+ if (!Preheader->getParent()->hasProfileData())
+ return false;
+
+ const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader);
+ // If there are no basic blocks with lower frequency than the preheader then
+ // we can avoid the detailed analysis as we will never find profitable sinking
+ // opportunities.
+ if (all_of(L.blocks(), [&](const BasicBlock *BB) {
+ return BFI.getBlockFreq(BB) > PreheaderFreq;
+ }))
+ return false;
+
+ bool Changed = false;
+ AliasSetTracker CurAST(AA);
+
+ // Compute alias set.
+ for (BasicBlock *BB : L.blocks())
+ CurAST.add(*BB);
+
+ // Sort loop's basic blocks by frequency
+ SmallVector<BasicBlock *, 10> ColdLoopBBs;
+ SmallDenseMap<BasicBlock *, int, 16> LoopBlockNumber;
+ int i = 0;
+ for (BasicBlock *B : L.blocks())
+ if (BFI.getBlockFreq(B) < BFI.getBlockFreq(L.getLoopPreheader())) {
+ ColdLoopBBs.push_back(B);
+ LoopBlockNumber[B] = ++i;
+ }
+ std::stable_sort(ColdLoopBBs.begin(), ColdLoopBBs.end(),
+ [&](BasicBlock *A, BasicBlock *B) {
+ return BFI.getBlockFreq(A) < BFI.getBlockFreq(B);
+ });
+
+ // Traverse the preheader's instructions in reverse order because if A
+ // depends on B (A appears after B), A needs to be sunk first before B can
+ // be sunk.
+ for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) {
+ Instruction *I = &*II++;
+ // No need to check whether the instruction's operands are loop invariant.
+ assert(L.hasLoopInvariantOperands(I) &&
+ "Insts in a loop's preheader should have loop invariant operands!");
+ if (!canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr))
+ continue;
+ if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI))
+ Changed = true;
+ }
+
+ if (Changed && SE)
+ SE->forgetLoopDispositions(&L);
+ return Changed;
+}
+
+PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
+ LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+ // Nothing to do if there are no loops.
+ if (LI.empty())
+ return PreservedAnalyses::all();
+
+ AAResults &AA = FAM.getResult<AAManager>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+
+ // We want to do a postorder walk over the loops. Since loops are a tree this
+ // is equivalent to a reversed preorder walk and preorder is easy to compute
+ // without recursion. Since we reverse the preorder, we will visit siblings
+ // in reverse program order. This isn't expected to matter at all but is more
+ // consistent with sinking algorithms which generally work bottom-up.
+ SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder();
+
+ bool Changed = false;
+ do {
+ Loop &L = *PreorderLoops.pop_back_val();
+
+ // Note that we don't pass SCEV here because it is only used to invalidate
+ // loops in SCEV and we don't preserve (or request) SCEV at all making that
+ // unnecessary.
+ Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI,
+ /*ScalarEvolution*/ nullptr);
+ } while (!PreorderLoops.empty());
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+struct LegacyLoopSinkPass : public LoopPass {
+ static char ID;
+ LegacyLoopSinkPass() : LoopPass(ID) {
+ initializeLegacyLoopSinkPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ return sinkLoopInvariantInstructions(
+ *L, getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(),
+ SE ? &SE->getSE() : nullptr);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+}
+
+char LegacyLoopSinkPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false, false)
+
+Pass *llvm::createLoopSinkPass() { return new LegacyLoopSinkPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
new file mode 100644
index 000000000000..ff3e9eef16d9
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -0,0 +1,5518 @@
+//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into forms suitable for efficient execution
+// on the target.
+//
+// This pass performs a strength reduction on array references inside loops that
+// have the loop induction variable as one or more of their components; it
+// rewrites expressions to take advantage of scaled-index addressing modes
+// available on the target, and it performs a variety of other optimizations
+// related to loop induction variables.
+//
+// Terminology note: this code has a lot of handling for "post-increment" or
+// "post-inc" users. This is not talking about post-increment addressing modes;
+// it is instead talking about code like this:
+//
+// %i = phi [ 0, %entry ], [ %i.next, %latch ]
+// ...
+// %i.next = add %i, 1
+// %c = icmp eq %i.next, %n
+//
+// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
+// it's useful to think about these as the same register, with some uses using
+// the value of the register before the add and some using it after. In this
+// example, the icmp is a post-increment user, since it uses %i.next, which is
+// the value of the induction variable after the increment. The other common
+// case of post-increment users is users outside the loop.
+//
+// TODO: More sophistication in the way Formulae are generated and filtered.
+//
+// TODO: Handle multiple loops at a time.
+//
+// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
+// of a GlobalValue?
+//
+// TODO: When truncation is free, truncate ICmp users' operands to make it a
+// smaller encoding (on x86 at least).
+//
+// TODO: When a negated register is used by an add (such as in a list of
+// multiple base registers, or as the increment expression in an addrec),
+// we may not actually need both reg and (-1 * reg) in registers; the
+// negation can be implemented by using a sub instead of an add. The
+// lack of support for taking this into consideration when making
+// register pressure decisions is partly worked around by the "Special"
+// use kind.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionNormalization.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <limits>
+#include <map>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-reduce"
+
+/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
+/// bail out. This threshold is far beyond the number of users that LSR can
+/// conceivably solve, so it should not affect generated code, but catches the
+/// worst cases before LSR burns too much compile time and stack space.
+static const unsigned MaxIVUsers = 200;
+
+// Temporary flag to clean up congruent phis after LSR phi expansion.
+// It's currently disabled until we can determine whether it's truly useful or
+// not. The flag should be removed after the v3.0 release.
+// This is now needed for ivchains.
+static cl::opt<bool> EnablePhiElim(
+ "enable-lsr-phielim", cl::Hidden, cl::init(true),
+ cl::desc("Enable LSR phi elimination"));
+
+// This flag adds the instruction count to the solution cost comparison.
+static cl::opt<bool> InsnsCost(
+ "lsr-insns-cost", cl::Hidden, cl::init(true),
+ cl::desc("Add instruction count to a LSR cost model"));
+
+// Flag to choose how to narrow a complex LSR solution.
+static cl::opt<bool> LSRExpNarrow(
+ "lsr-exp-narrow", cl::Hidden, cl::init(false),
+ cl::desc("Narrow LSR complex solution using"
+ " expectation of registers number"));
+
+// Flag to narrow search space by filtering non-optimal formulae with
+// the same ScaledReg and Scale.
+static cl::opt<bool> FilterSameScaledReg(
+ "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
+ cl::desc("Narrow LSR search space by filtering non-optimal formulae"
+ " with the same ScaledReg and Scale"));
+
+#ifndef NDEBUG
+// Stress test IV chain generation.
+static cl::opt<bool> StressIVChain(
+ "stress-ivchain", cl::Hidden, cl::init(false),
+ cl::desc("Stress test LSR IV chains"));
+#else
+static bool StressIVChain = false;
+#endif
+
+namespace {
+
+struct MemAccessTy {
+ /// Used in situations where the accessed memory type is unknown.
+ static const unsigned UnknownAddressSpace =
+ std::numeric_limits<unsigned>::max();
+
+ Type *MemTy = nullptr;
+ unsigned AddrSpace = UnknownAddressSpace;
+
+ MemAccessTy() = default;
+ MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
+
+ bool operator==(MemAccessTy Other) const {
+ return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
+ }
+
+ bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
+
+ static MemAccessTy getUnknown(LLVMContext &Ctx,
+ unsigned AS = UnknownAddressSpace) {
+ return MemAccessTy(Type::getVoidTy(Ctx), AS);
+ }
+};
+
+/// This class holds data which is used to order reuse candidates.
+class RegSortData {
+public:
+ /// This represents the set of LSRUse indices which reference
+ /// a particular register.
+ SmallBitVector UsedByIndices;
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void RegSortData::print(raw_ostream &OS) const {
+ OS << "[NumUses=" << UsedByIndices.count() << ']';
+}
+
+LLVM_DUMP_METHOD void RegSortData::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+namespace {
+
+/// Map register candidates to information about how they are used.
+class RegUseTracker {
+ using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
+
+ RegUsesTy RegUsesMap;
+ SmallVector<const SCEV *, 16> RegSequence;
+
+public:
+ void countRegister(const SCEV *Reg, size_t LUIdx);
+ void dropRegister(const SCEV *Reg, size_t LUIdx);
+ void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
+
+ bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
+
+ const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
+
+ void clear();
+
+ using iterator = SmallVectorImpl<const SCEV *>::iterator;
+ using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
+
+ iterator begin() { return RegSequence.begin(); }
+ iterator end() { return RegSequence.end(); }
+ const_iterator begin() const { return RegSequence.begin(); }
+ const_iterator end() const { return RegSequence.end(); }
+};
+
+} // end anonymous namespace
+
+void
+RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
+ std::pair<RegUsesTy::iterator, bool> Pair =
+ RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
+ RegSortData &RSD = Pair.first->second;
+ if (Pair.second)
+ RegSequence.push_back(Reg);
+ RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
+ RSD.UsedByIndices.set(LUIdx);
+}
+
+void
+RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
+ RegUsesTy::iterator It = RegUsesMap.find(Reg);
+ assert(It != RegUsesMap.end());
+ RegSortData &RSD = It->second;
+ assert(RSD.UsedByIndices.size() > LUIdx);
+ RSD.UsedByIndices.reset(LUIdx);
+}
+
+void
+RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
+ assert(LUIdx <= LastLUIdx);
+
+ // Update RegUses. The data structure is not optimized for this purpose;
+ // we must iterate through it and update each of the bit vectors.
+ for (auto &Pair : RegUsesMap) {
+ SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
+ if (LUIdx < UsedByIndices.size())
+ UsedByIndices[LUIdx] =
+ LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
+ UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
+ }
+}
+
+bool
+RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
+ RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
+ if (I == RegUsesMap.end())
+ return false;
+ const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
+ int i = UsedByIndices.find_first();
+ if (i == -1) return false;
+ if ((size_t)i != LUIdx) return true;
+ return UsedByIndices.find_next(i) != -1;
+}
+
+const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
+ RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
+ assert(I != RegUsesMap.end() && "Unknown register!");
+ return I->second.UsedByIndices;
+}
+
+void RegUseTracker::clear() {
+ RegUsesMap.clear();
+ RegSequence.clear();
+}
+
+namespace {
+
+/// This class holds information that describes a formula for computing a
+/// value satisfying a use. It may include broken-out immediates and scaled
+/// registers.
+struct Formula {
+ /// Global base address used for complex addressing.
+ GlobalValue *BaseGV = nullptr;
+
+ /// Base offset for complex addressing.
+ int64_t BaseOffset = 0;
+
+ /// Whether any complex addressing has a base register.
+ bool HasBaseReg = false;
+
+ /// The scale of any complex addressing.
+ int64_t Scale = 0;
+
+ /// The list of "base" registers for this use. When this is non-empty. The
+ /// canonical representation of a formula is
+ /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
+ /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
+ /// 3. The reg containing the recurrent expr related to the current loop in
+ /// the formula should be put in the ScaledReg.
+ /// #1 enforces that the scaled register is always used when at least two
+ /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
+ /// #2 enforces that 1 * reg is reg.
+ /// #3 ensures invariant regs with respect to current loop can be combined
+ /// together in LSR codegen.
+ /// This invariant can be temporarily broken while building a formula.
+ /// However, every formula inserted into the LSRInstance must be in canonical
+ /// form.
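+ /// For example (illustrative), reg1 + reg2 is represented canonically as
+ /// BaseRegs = {reg1}, ScaledReg = reg2, Scale = 1, whereas a lone reg1 is
+ /// BaseRegs = {reg1} with ScaledReg == nullptr.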
+ SmallVector<const SCEV *, 4> BaseRegs;
+
+ /// The 'scaled' register for this use. This should be non-null when Scale is
+ /// not zero.
+ const SCEV *ScaledReg = nullptr;
+
+ /// An additional constant offset which added near the use. This requires a
+ /// temporary register, but the offset itself can live in an add immediate
+ /// field rather than a register.
+ int64_t UnfoldedOffset = 0;
+
+ Formula() = default;
+
+ void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
+
+ bool isCanonical(const Loop &L) const;
+
+ void canonicalize(const Loop &L);
+
+ bool unscale();
+
+ bool hasZeroEnd() const;
+
+ size_t getNumRegs() const;
+ Type *getType() const;
+
+ void deleteBaseReg(const SCEV *&S);
+
+ bool referencesReg(const SCEV *S) const;
+ bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
+ const RegUseTracker &RegUses) const;
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+/// Recursion helper for initialMatch.
+static void DoInitialMatch(const SCEV *S, Loop *L,
+ SmallVectorImpl<const SCEV *> &Good,
+ SmallVectorImpl<const SCEV *> &Bad,
+ ScalarEvolution &SE) {
+ // Collect expressions which properly dominate the loop header.
+ if (SE.properlyDominates(S, L->getHeader())) {
+ Good.push_back(S);
+ return;
+ }
+
+ // Look at add operands.
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ for (const SCEV *S : Add->operands())
+ DoInitialMatch(S, L, Good, Bad, SE);
+ return;
+ }
+
+ // Look at addrec operands.
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
+ if (!AR->getStart()->isZero() && AR->isAffine()) {
+ DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
+ DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
+ AR->getStepRecurrence(SE),
+ // FIXME: AR->getNoWrapFlags()
+ AR->getLoop(), SCEV::FlagAnyWrap),
+ L, Good, Bad, SE);
+ return;
+ }
+
+ // Handle a multiplication by -1 (negation) if it didn't fold.
+ if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
+ if (Mul->getOperand(0)->isAllOnesValue()) {
+ SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
+ const SCEV *NewMul = SE.getMulExpr(Ops);
+
+ SmallVector<const SCEV *, 4> MyGood;
+ SmallVector<const SCEV *, 4> MyBad;
+ DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
+ const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
+ SE.getEffectiveSCEVType(NewMul->getType())));
+ for (const SCEV *S : MyGood)
+ Good.push_back(SE.getMulExpr(NegOne, S));
+ for (const SCEV *S : MyBad)
+ Bad.push_back(SE.getMulExpr(NegOne, S));
+ return;
+ }
+
+ // Ok, we can't do anything interesting. Just stuff the whole thing into a
+ // register and hope for the best.
+ Bad.push_back(S);
+}
+
+/// Incorporate loop-variant parts of S into this Formula, attempting to keep
+/// all loop-invariant and loop-computable values in a single base register.
+void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
+ SmallVector<const SCEV *, 4> Good;
+ SmallVector<const SCEV *, 4> Bad;
+ DoInitialMatch(S, L, Good, Bad, SE);
+ if (!Good.empty()) {
+ const SCEV *Sum = SE.getAddExpr(Good);
+ if (!Sum->isZero())
+ BaseRegs.push_back(Sum);
+ HasBaseReg = true;
+ }
+ if (!Bad.empty()) {
+ const SCEV *Sum = SE.getAddExpr(Bad);
+ if (!Sum->isZero())
+ BaseRegs.push_back(Sum);
+ HasBaseReg = true;
+ }
+ canonicalize(*L);
+}
+
+/// \brief Check whether or not this formula satisfies the canonical
+/// representation.
+/// \see Formula::BaseRegs.
+bool Formula::isCanonical(const Loop &L) const {
+ if (!ScaledReg)
+ return BaseRegs.size() <= 1;
+
+ if (Scale != 1)
+ return true;
+
+ if (Scale == 1 && BaseRegs.empty())
+ return false;
+
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (SAR && SAR->getLoop() == &L)
+ return true;
+
+ // If ScaledReg is not a recurrent expr, or it is but its loop is not current
+ // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
+ // loop, we want to swap the reg in BaseRegs with ScaledReg.
+ auto I =
+ find_if(make_range(BaseRegs.begin(), BaseRegs.end()), [&](const SCEV *S) {
+ return isa<const SCEVAddRecExpr>(S) &&
+ (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+ });
+ return I == BaseRegs.end();
+}
+
+/// \brief Helper method to morph a formula into its canonical representation.
+/// \see Formula::BaseRegs.
+/// Every formula having more than one base register must use the ScaledReg
+/// field. Otherwise, we would have to do special cases everywhere in LSR
+/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
+/// On the other hand, 1*reg should be canonicalized into reg.
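+/// For example (illustrative), a temporarily non-canonical formula with
+/// BaseRegs = {reg1, reg2} and no ScaledReg is rewritten into
+/// BaseRegs = {reg1}, ScaledReg = reg2, Scale = 1.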
+void Formula::canonicalize(const Loop &L) {
+ if (isCanonical(L))
+ return;
+ // So far we did not need this case. This is easy to implement but it is
+ // useless to maintain dead code. Besides, it could hurt compile time.
+ assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
+
+ // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
+ if (!ScaledReg) {
+ ScaledReg = BaseRegs.back();
+ BaseRegs.pop_back();
+ Scale = 1;
+ }
+
+ // If ScaledReg is an invariant with respect to L, find the reg from
+ // BaseRegs containing the recurrent expr related with Loop L. Swap the
+ // reg with ScaledReg.
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (!SAR || SAR->getLoop() != &L) {
+ auto I = find_if(make_range(BaseRegs.begin(), BaseRegs.end()),
+ [&](const SCEV *S) {
+ return isa<const SCEVAddRecExpr>(S) &&
+ (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+ });
+ if (I != BaseRegs.end())
+ std::swap(ScaledReg, *I);
+ }
+}
+
+/// \brief Get rid of the scale in the formula.
+/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
+/// \return true if it was possible to get rid of the scale, false otherwise.
+/// \note After this operation the formula may not be in the canonical form.
+bool Formula::unscale() {
+ if (Scale != 1)
+ return false;
+ Scale = 0;
+ BaseRegs.push_back(ScaledReg);
+ ScaledReg = nullptr;
+ return true;
+}
+
+bool Formula::hasZeroEnd() const {
+ if (UnfoldedOffset || BaseOffset)
+ return false;
+ if (BaseRegs.size() != 1 || ScaledReg)
+ return false;
+ return true;
+}
+
+/// Return the total number of register operands used by this formula. This does
+/// not include register uses implied by non-constant addrec strides.
+size_t Formula::getNumRegs() const {
+ return !!ScaledReg + BaseRegs.size();
+}
+
+/// Return the type of this formula, if it has one, or null otherwise. This type
+/// is meaningless except for the bit size.
+Type *Formula::getType() const {
+ return !BaseRegs.empty() ? BaseRegs.front()->getType() :
+ ScaledReg ? ScaledReg->getType() :
+ BaseGV ? BaseGV->getType() :
+ nullptr;
+}
+
+/// Delete the given base reg from the BaseRegs list.
+void Formula::deleteBaseReg(const SCEV *&S) {
+ if (&S != &BaseRegs.back())
+ std::swap(S, BaseRegs.back());
+ BaseRegs.pop_back();
+}
+
+/// Test if this formula references the given register.
+bool Formula::referencesReg(const SCEV *S) const {
+ return S == ScaledReg || is_contained(BaseRegs, S);
+}
+
+/// Test whether this formula uses registers which are used by uses other than
+/// the use with the given index.
+bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
+ const RegUseTracker &RegUses) const {
+ if (ScaledReg)
+ if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
+ return true;
+ for (const SCEV *BaseReg : BaseRegs)
+ if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
+ return true;
+ return false;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void Formula::print(raw_ostream &OS) const {
+ bool First = true;
+ if (BaseGV) {
+ if (!First) OS << " + "; else First = false;
+ BaseGV->printAsOperand(OS, /*PrintType=*/false);
+ }
+ if (BaseOffset != 0) {
+ if (!First) OS << " + "; else First = false;
+ OS << BaseOffset;
+ }
+ for (const SCEV *BaseReg : BaseRegs) {
+ if (!First) OS << " + "; else First = false;
+ OS << "reg(" << *BaseReg << ')';
+ }
+ if (HasBaseReg && BaseRegs.empty()) {
+ if (!First) OS << " + "; else First = false;
+ OS << "**error: HasBaseReg**";
+ } else if (!HasBaseReg && !BaseRegs.empty()) {
+ if (!First) OS << " + "; else First = false;
+ OS << "**error: !HasBaseReg**";
+ }
+ if (Scale != 0) {
+ if (!First) OS << " + "; else First = false;
+ OS << Scale << "*reg(";
+ if (ScaledReg)
+ OS << *ScaledReg;
+ else
+ OS << "<unknown>";
+ OS << ')';
+ }
+ if (UnfoldedOffset != 0) {
+ if (!First) OS << " + ";
+ OS << "imm(" << UnfoldedOffset << ')';
+ }
+}
+
+LLVM_DUMP_METHOD void Formula::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+/// Return true if the given addrec can be sign-extended without changing its
+/// value.
+static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+ Type *WideTy =
+ IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
+ return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+}
+
+/// Return true if the given add can be sign-extended without changing its
+/// value.
+static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
+ Type *WideTy =
+ IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
+ return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
+}
+
+/// Return true if the given mul can be sign-extended without changing its
+/// value.
+static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
+ Type *WideTy =
+ IntegerType::get(SE.getContext(),
+ SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
+ return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
+}
+
+/// Return an expression for LHS /s RHS, if it can be determined and if the
+/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
+/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
+/// the multiplication may overflow, which is useful when the result will be
+/// used in a context where the most significant bits are ignored.
+static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
+ ScalarEvolution &SE,
+ bool IgnoreSignificantBits = false) {
+ // Handle the trivial case, which works for any SCEV type.
+ if (LHS == RHS)
+ return SE.getConstant(LHS->getType(), 1);
+
+ // Handle a few RHS special cases.
+ const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
+ if (RC) {
+ const APInt &RA = RC->getAPInt();
+ // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
+ // some folding.
+ if (RA.isAllOnesValue())
+ return SE.getMulExpr(LHS, RC);
+ // Handle x /s 1 as x.
+ if (RA == 1)
+ return LHS;
+ }
+
+ // Check for a division of a constant by a constant.
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
+ if (!RC)
+ return nullptr;
+ const APInt &LA = C->getAPInt();
+ const APInt &RA = RC->getAPInt();
+ if (LA.srem(RA) != 0)
+ return nullptr;
+ return SE.getConstant(LA.sdiv(RA));
+ }
+
+ // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
+ if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
+ const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
+ IgnoreSignificantBits);
+ if (!Step) return nullptr;
+ const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
+ IgnoreSignificantBits);
+ if (!Start) return nullptr;
+ // FlagNW is independent of the start value and step direction, and is
+ // preserved with smaller magnitude steps.
+ // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
+ }
+ return nullptr;
+ }
+
+ // Distribute the sdiv over add operands, if the add doesn't overflow.
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
+ if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
+ SmallVector<const SCEV *, 8> Ops;
+ for (const SCEV *S : Add->operands()) {
+ const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
+ if (!Op) return nullptr;
+ Ops.push_back(Op);
+ }
+ return SE.getAddExpr(Ops);
+ }
+ return nullptr;
+ }
+
+ // Check for a multiply operand that we can pull RHS out of.
+ if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
+ if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
+ SmallVector<const SCEV *, 4> Ops;
+ bool Found = false;
+ for (const SCEV *S : Mul->operands()) {
+ if (!Found)
+ if (const SCEV *Q = getExactSDiv(S, RHS, SE,
+ IgnoreSignificantBits)) {
+ S = Q;
+ Found = true;
+ }
+ Ops.push_back(S);
+ }
+ return Found ? SE.getMulExpr(Ops) : nullptr;
+ }
+ return nullptr;
+ }
+
+ // Otherwise we don't know.
+ return nullptr;
+}
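+
+// Worked examples (illustrative): getExactSDiv(12, 4, SE) yields 3, while
+// getExactSDiv(6, 4, SE) yields null because the remainder is non-zero.
+// Dividing by 1 returns LHS unchanged, and dividing by -1 is rewritten as a
+// multiplication by -1 so ScalarEvolution can fold it further.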
+
+/// If S involves the addition of a constant integer value, return that integer
+/// value, and mutate S to point to a new SCEV with that value excluded.
+static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
+ if (C->getAPInt().getMinSignedBits() <= 64) {
+ S = SE.getConstant(C->getType(), 0);
+ return C->getValue()->getSExtValue();
+ }
+ } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
+ int64_t Result = ExtractImmediate(NewOps.front(), SE);
+ if (Result != 0)
+ S = SE.getAddExpr(NewOps);
+ return Result;
+ } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
+ int64_t Result = ExtractImmediate(NewOps.front(), SE);
+ if (Result != 0)
+ S = SE.getAddRecExpr(NewOps, AR->getLoop(),
+ // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ SCEV::FlagAnyWrap);
+ return Result;
+ }
+ return 0;
+}
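+
+// Worked example (illustrative SCEV notation): given S = (42 + %n), this
+// returns 42 and mutates S to %n; given S = {40,+,4}<%loop>, it returns 40 and
+// leaves S as {0,+,4}<%loop> with the wrap flags conservatively dropped.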
+
+/// If S involves the addition of a GlobalValue address, return that symbol, and
+/// mutate S to point to a new SCEV with that value excluded.
+static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
+ S = SE.getConstant(GV->getType(), 0);
+ return GV;
+ }
+ } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
+ GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
+ if (Result)
+ S = SE.getAddExpr(NewOps);
+ return Result;
+ } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
+ GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
+ if (Result)
+ S = SE.getAddRecExpr(NewOps, AR->getLoop(),
+ // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ SCEV::FlagAnyWrap);
+ return Result;
+ }
+ return nullptr;
+}
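+
+// Worked example (illustrative): given S = (8 + @gv), this returns @gv and
+// mutates S to the constant 8. Note that for add expressions the symbol is
+// pulled from the last operand, where SCEV's operand ordering typically places
+// SCEVUnknowns such as global addresses.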
+
+/// Returns true if the specified instruction is using the specified value as an
+/// address.
+static bool isAddressUse(const TargetTransformInfo &TTI,
+ Instruction *Inst, Value *OperandVal) {
+ bool isAddress = isa<LoadInst>(Inst);
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (SI->getPointerOperand() == OperandVal)
+ isAddress = true;
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ // Addressing modes can also be folded into prefetches and a variety
+ // of intrinsics.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::memset:
+ case Intrinsic::prefetch:
+ if (II->getArgOperand(0) == OperandVal)
+ isAddress = true;
+ break;
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ if (II->getArgOperand(0) == OperandVal ||
+ II->getArgOperand(1) == OperandVal)
+ isAddress = true;
+ break;
+ default: {
+ MemIntrinsicInfo IntrInfo;
+ if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
+ if (IntrInfo.PtrVal == OperandVal)
+ isAddress = true;
+ }
+ }
+ }
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ if (RMW->getPointerOperand() == OperandVal)
+ isAddress = true;
+ } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ if (CmpX->getPointerOperand() == OperandVal)
+ isAddress = true;
+ }
+ return isAddress;
+}
+
+/// Return the type of the memory being accessed.
+static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
+ Instruction *Inst) {
+ MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
+ if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ AccessTy.MemTy = SI->getOperand(0)->getType();
+ AccessTy.AddrSpace = SI->getPointerAddressSpace();
+ } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ AccessTy.AddrSpace = LI->getPointerAddressSpace();
+ } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ AccessTy.AddrSpace = RMW->getPointerAddressSpace();
+ } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::prefetch:
+ AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
+ break;
+ default: {
+ MemIntrinsicInfo IntrInfo;
+ if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
+ AccessTy.AddrSpace
+ = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
+ }
+
+ break;
+ }
+ }
+ }
+
+ // All pointers have the same requirements, so canonicalize them to an
+ // arbitrary pointer type to minimize variation.
+ if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy))
+ AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
+ PTy->getAddressSpace());
+
+ return AccessTy;
+}
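+
+// For example (illustrative), a `store i32 %v, i32* %p` yields MemTy == i32
+// with the store's address space, while a store of a pointer value yields the
+// canonical i1* type so that all pointer-typed accesses compare equal.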
+
+/// Return true if this AddRec is already a phi in its loop.
+static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+ for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
+ if (SE.isSCEVable(PN.getType()) &&
+ (SE.getEffectiveSCEVType(PN.getType()) ==
+ SE.getEffectiveSCEVType(AR->getType())) &&
+ SE.getSCEV(&PN) == AR)
+ return true;
+ }
+ return false;
+}
+
+/// Check if expanding this expression is likely to incur significant cost. This
+/// is tricky because SCEV doesn't track which expressions are actually computed
+/// by the current IR.
+///
+/// We currently allow expansion of IV increments that involve adds,
+/// multiplication by constants, and AddRecs from existing phis.
+///
+/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
+/// obvious multiple of the UDivExpr.
+static bool isHighCostExpansion(const SCEV *S,
+ SmallPtrSetImpl<const SCEV*> &Processed,
+ ScalarEvolution &SE) {
+ // Zero/One operand expressions
+ switch (S->getSCEVType()) {
+ case scUnknown:
+ case scConstant:
+ return false;
+ case scTruncate:
+ return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
+ Processed, SE);
+ case scZeroExtend:
+ return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
+ Processed, SE);
+ case scSignExtend:
+ return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
+ Processed, SE);
+ }
+
+ if (!Processed.insert(S).second)
+ return false;
+
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ for (const SCEV *S : Add->operands()) {
+ if (isHighCostExpansion(S, Processed, SE))
+ return true;
+ }
+ return false;
+ }
+
+ if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+ if (Mul->getNumOperands() == 2) {
+ // Multiplication by a constant is ok
+ if (isa<SCEVConstant>(Mul->getOperand(0)))
+ return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
+
+ // If we have the value of one operand, check if an existing
+ // multiplication already generates this expression.
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
+ Value *UVal = U->getValue();
+ for (User *UR : UVal->users()) {
+ // If U is a constant, it may be used by a ConstantExpr.
+ Instruction *UI = dyn_cast<Instruction>(UR);
+ if (UI && UI->getOpcode() == Instruction::Mul &&
+ SE.isSCEVable(UI->getType())) {
+ return SE.getSCEV(UI) == Mul;
+ }
+ }
+ }
+ }
+ }
+
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ if (isExistingPhi(AR, SE))
+ return false;
+ }
+
+ // Fow now, consider any other type of expression (div/mul/min/max) high cost.
+ return true;
+}
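+
+// For example (illustrative), (4 * %n) is not considered high cost because it
+// is a multiplication by a constant, (%a * %b) is cheap only if an existing
+// `mul` instruction already computes it, and a udiv is always high cost here.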
+
+/// If any of the instructions in the specified set are trivially dead, delete
+/// them and see if this makes any of their operands subsequently dead.
+static bool
+DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
+ bool Changed = false;
+
+ while (!DeadInsts.empty()) {
+ Value *V = DeadInsts.pop_back_val();
+ Instruction *I = dyn_cast_or_null<Instruction>(V);
+
+ if (!I || !isInstructionTriviallyDead(I))
+ continue;
+
+ for (Use &O : I->operands())
+ if (Instruction *U = dyn_cast<Instruction>(O)) {
+ O = nullptr;
+ if (U->use_empty())
+ DeadInsts.emplace_back(U);
+ }
+
+ I->eraseFromParent();
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+namespace {
+
+class LSRUse;
+
+} // end anonymous namespace
+
+/// \brief Check if the addressing mode defined by \p F is completely
+/// folded in \p LU at isel time.
+/// This includes address-mode folding and special icmp tricks.
+/// This function returns true if \p LU can accommodate what \p F
+/// defines and up to 1 base + 1 scaled + offset.
+/// In other words, if \p F has several base registers, this function may
+/// still return true. Therefore, users still need to account for
+/// additional base registers and/or unfolded offsets to derive an
+/// accurate cost model.
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F);
+
+// Get the cost of the scaling factor used in F for LU.
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F,
+ const Loop &L);
+
+namespace {
+
+/// This class is used to measure and compare candidate formulae.
+class Cost {
+ TargetTransformInfo::LSRCost C;
+
+public:
+ Cost() {
+ C.Insns = 0;
+ C.NumRegs = 0;
+ C.AddRecCost = 0;
+ C.NumIVMuls = 0;
+ C.NumBaseAdds = 0;
+ C.ImmCost = 0;
+ C.SetupCost = 0;
+ C.ScaleCost = 0;
+ }
+
+ bool isLess(Cost &Other, const TargetTransformInfo &TTI);
+
+ void Lose();
+
+#ifndef NDEBUG
+ // Once any of the metrics loses, they must all remain losers.
+ bool isValid() {
+ return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
+ | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
+ || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
+ & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
+ }
+#endif
+
+ bool isLoser() {
+ assert(isValid() && "invalid cost");
+ return C.NumRegs == ~0u;
+ }
+
+ void RateFormula(const TargetTransformInfo &TTI,
+ const Formula &F,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const DenseSet<const SCEV *> &VisitedRegs,
+ const Loop *L,
+ ScalarEvolution &SE, DominatorTree &DT,
+ const LSRUse &LU,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+
+private:
+ void RateRegister(const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const Loop *L,
+ ScalarEvolution &SE, DominatorTree &DT);
+ void RatePrimaryRegister(const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const Loop *L,
+ ScalarEvolution &SE, DominatorTree &DT,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs);
+};
+
+/// An operand value in an instruction which is to be replaced with some
+/// equivalent, possibly strength-reduced, replacement.
+struct LSRFixup {
+ /// The instruction which will be updated.
+ Instruction *UserInst = nullptr;
+
+ /// The operand of the instruction which will be replaced. The operand may be
+ /// used more than once; every instance will be replaced.
+ Value *OperandValToReplace = nullptr;
+
+ /// If this user is to use the post-incremented value of an induction
+ /// variable, this set is non-empty and holds the loops associated with the
+ /// induction variable.
+ PostIncLoopSet PostIncLoops;
+
+ /// A constant offset to be added to the LSRUse expression. This allows
+ /// multiple fixups to share the same LSRUse with different offsets, for
+ /// example in an unrolled loop.
+ int64_t Offset = 0;
+
+ LSRFixup() = default;
+
+ bool isUseFullyOutsideLoop(const Loop *L) const;
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
+/// SmallVectors of const SCEV*.
+struct UniquifierDenseMapInfo {
+ static SmallVector<const SCEV *, 4> getEmptyKey() {
+ SmallVector<const SCEV *, 4> V;
+ V.push_back(reinterpret_cast<const SCEV *>(-1));
+ return V;
+ }
+
+ static SmallVector<const SCEV *, 4> getTombstoneKey() {
+ SmallVector<const SCEV *, 4> V;
+ V.push_back(reinterpret_cast<const SCEV *>(-2));
+ return V;
+ }
+
+ static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
+ const SmallVector<const SCEV *, 4> &RHS) {
+ return LHS == RHS;
+ }
+};
+
+/// This class holds the state that LSR keeps for each use in IVUsers, as well
+/// as uses invented by LSR itself. It includes information about what kinds of
+/// things can be folded into the user, information about the user itself, and
+/// information about how the use may be satisfied. TODO: Represent multiple
+/// users of the same expression in common?
+class LSRUse {
+ DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
+
+public:
+ /// An enum for a kind of use, indicating what types of scaled and immediate
+ /// operands it might support.
+ enum KindType {
+ Basic, ///< A normal use, with no folding.
+ Special, ///< A special case of basic, allowing -1 scales.
+ Address, ///< An address use; folding according to TargetLowering
+ ICmpZero ///< An equality icmp with both operands folded into one.
+ // TODO: Add a generic icmp too?
+ };
+
+ using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
+
+ KindType Kind;
+ MemAccessTy AccessTy;
+
+ /// The list of operands which are to be replaced.
+ SmallVector<LSRFixup, 8> Fixups;
+
+ /// Keep track of the min and max offsets of the fixups.
+ int64_t MinOffset = std::numeric_limits<int64_t>::max();
+ int64_t MaxOffset = std::numeric_limits<int64_t>::min();
+
+ /// This records whether all of the fixups using this LSRUse are outside of
+ /// the loop, in which case some special-case heuristics may be used.
+ bool AllFixupsOutsideLoop = true;
+
+ /// RigidFormula is set to true to guarantee that this use will be associated
+ /// with a single formula--the one that initially matched. Some SCEV
+ /// expressions cannot be expanded. This allows LSR to consider the registers
+ /// used by those expressions without the need to expand them later after
+ /// changing the formula.
+ bool RigidFormula = false;
+
+ /// This records the widest use type for any fixup using this
+ /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
+ /// fixup widths to be equivalent, because the narrower one may be relying on
+ /// the implicit truncation to truncate away bogus bits.
+ Type *WidestFixupType = nullptr;
+
+ /// A list of ways to build a value that can satisfy this user. After the
+ /// list is populated, one of these is selected heuristically and used to
+ /// formulate a replacement for OperandValToReplace in UserInst.
+ SmallVector<Formula, 12> Formulae;
+
+ /// The set of register candidates used by all formulae in this LSRUse.
+ SmallPtrSet<const SCEV *, 4> Regs;
+
+ LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
+
+ LSRFixup &getNewFixup() {
+ Fixups.push_back(LSRFixup());
+ return Fixups.back();
+ }
+
+ void pushFixup(LSRFixup &f) {
+ Fixups.push_back(f);
+ if (f.Offset > MaxOffset)
+ MaxOffset = f.Offset;
+ if (f.Offset < MinOffset)
+ MinOffset = f.Offset;
+ }
+
+ bool HasFormulaWithSameRegs(const Formula &F) const;
+ float getNotSelectedProbability(const SCEV *Reg) const;
+ bool InsertFormula(const Formula &F, const Loop &L);
+ void DeleteFormula(Formula &F);
+ void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg, int64_t Scale,
+ Instruction *Fixup = nullptr);
+
+/// Tally up interesting quantities from the given register.
+void Cost::RateRegister(const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const Loop *L,
+ ScalarEvolution &SE, DominatorTree &DT) {
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
+ // If this is an addrec for another loop, it should be an invariant
+ // with respect to L since L is the innermost loop (at least
+ // for now LSR only handles innermost loops).
+ if (AR->getLoop() != L) {
+ // If the AddRec exists, consider its register free and leave it alone.
+ if (isExistingPhi(AR, SE))
+ return;
+
+ // It is bad to allow LSR for the current loop to add induction variables
+ // for its sibling loops.
+ if (!AR->getLoop()->contains(L)) {
+ Lose();
+ return;
+ }
+
+ // Otherwise, it will be an invariant with respect to Loop L.
+ ++C.NumRegs;
+ return;
+ }
+ C.AddRecCost += 1; /// TODO: This should be a function of the stride.
+
+ // Add the step value register, if it needs one.
+ // TODO: The non-affine case isn't precisely modeled here.
+ if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
+ if (!Regs.count(AR->getOperand(1))) {
+ RateRegister(AR->getOperand(1), Regs, L, SE, DT);
+ if (isLoser())
+ return;
+ }
+ }
+ }
+ ++C.NumRegs;
+
+ // Rough heuristic; favor registers which don't require extra setup
+ // instructions in the preheader.
+ if (!isa<SCEVUnknown>(Reg) &&
+ !isa<SCEVConstant>(Reg) &&
+ !(isa<SCEVAddRecExpr>(Reg) &&
+ (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
+ isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
+ ++C.SetupCost;
+
+ C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
+ SE.hasComputableLoopEvolution(Reg, L);
+}
+
+/// Record this register in the set. If we haven't seen it before, rate
+/// it. Optional LoserRegs provides a way to declare any formula that refers to
+/// one of those regs an instant loser.
+void Cost::RatePrimaryRegister(const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const Loop *L,
+ ScalarEvolution &SE, DominatorTree &DT,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+ if (LoserRegs && LoserRegs->count(Reg)) {
+ Lose();
+ return;
+ }
+ if (Regs.insert(Reg).second) {
+ RateRegister(Reg, Regs, L, SE, DT);
+ if (LoserRegs && isLoser())
+ LoserRegs->insert(Reg);
+ }
+}
+
+void Cost::RateFormula(const TargetTransformInfo &TTI,
+ const Formula &F,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const DenseSet<const SCEV *> &VisitedRegs,
+ const Loop *L,
+ ScalarEvolution &SE, DominatorTree &DT,
+ const LSRUse &LU,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+ assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
+ // Tally up the registers.
+ unsigned PrevAddRecCost = C.AddRecCost;
+ unsigned PrevNumRegs = C.NumRegs;
+ unsigned PrevNumBaseAdds = C.NumBaseAdds;
+ if (const SCEV *ScaledReg = F.ScaledReg) {
+ if (VisitedRegs.count(ScaledReg)) {
+ Lose();
+ return;
+ }
+ RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
+ if (isLoser())
+ return;
+ }
+ for (const SCEV *BaseReg : F.BaseRegs) {
+ if (VisitedRegs.count(BaseReg)) {
+ Lose();
+ return;
+ }
+ RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
+ if (isLoser())
+ return;
+ }
+
+ // Determine how many (unfolded) adds we'll need inside the loop.
+ size_t NumBaseParts = F.getNumRegs();
+ if (NumBaseParts > 1)
+ // Do not count the base and a possible second register if the target
+ // allows folding two registers.
+ C.NumBaseAdds +=
+ NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
+ C.NumBaseAdds += (F.UnfoldedOffset != 0);
+
+ // Accumulate non-free scaling amounts.
+ C.ScaleCost += getScalingFactorCost(TTI, LU, F, *L);
+
+ // Tally up the non-zero immediates.
+ for (const LSRFixup &Fixup : LU.Fixups) {
+ int64_t O = Fixup.Offset;
+ int64_t Offset = (uint64_t)O + F.BaseOffset;
+ if (F.BaseGV)
+ C.ImmCost += 64; // Handle symbolic values conservatively.
+ // TODO: This should probably be the pointer size.
+ else if (Offset != 0)
+ C.ImmCost += APInt(64, Offset, true).getMinSignedBits();
+
+ // Check with target if this offset with this instruction is
+ // specifically not supported.
+ if (LU.Kind == LSRUse::Address && Offset != 0 &&
+ !isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
+ Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
+ C.NumBaseAdds++;
+ }
+
+ // If we don't count instruction cost, exit here.
+ if (!InsnsCost) {
+ assert(isValid() && "invalid cost");
+ return;
+ }
+
+ // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as an
+ // additional instruction (at least a fill).
+ unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+ if (C.NumRegs > TTIRegNum) {
+ // The cost already exceeded TTIRegNum, so only the newly added registers
+ // can add new instructions.
+ if (PrevNumRegs > TTIRegNum)
+ C.Insns += (C.NumRegs - PrevNumRegs);
+ else
+ C.Insns += (C.NumRegs - TTIRegNum);
+ }
+
+ // If an ICmpZero formula does not end in zero, it cannot be replaced by just
+ // an add or sub; we'll need to compare the final result of the AddRec, which
+ // means we'll need an additional instruction.
+ // For -10 + {0, +, 1}:
+ // i = i + 1;
+ // cmp i, 10
+ //
+ // For {-10, +, 1}:
+ // i = i + 1;
+ if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+ C.Insns++;
+ // Each new AddRec adds 1 instruction to calculation.
+ C.Insns += (C.AddRecCost - PrevAddRecCost);
+
+ // BaseAdds adds instructions for unfolded registers.
+ if (LU.Kind != LSRUse::ICmpZero)
+ C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
+ assert(isValid() && "invalid cost");
+}
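+
+// Illustrative example of the instruction-count heuristic above: with
+// TTI.getNumberOfRegisters(false) == 8, TTIRegNum is 7; a formula that raises
+// NumRegs from 6 to 9 therefore adds 9 - 7 = 2 to C.Insns, since register
+// pressure beyond the budget is assumed to cost at least a fill.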
+
+/// Set this cost to a losing value.
+void Cost::Lose() {
+ C.Insns = std::numeric_limits<unsigned>::max();
+ C.NumRegs = std::numeric_limits<unsigned>::max();
+ C.AddRecCost = std::numeric_limits<unsigned>::max();
+ C.NumIVMuls = std::numeric_limits<unsigned>::max();
+ C.NumBaseAdds = std::numeric_limits<unsigned>::max();
+ C.ImmCost = std::numeric_limits<unsigned>::max();
+ C.SetupCost = std::numeric_limits<unsigned>::max();
+ C.ScaleCost = std::numeric_limits<unsigned>::max();
+}
+
+/// Choose the lower cost.
+bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) {
+ if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
+ C.Insns != Other.C.Insns)
+ return C.Insns < Other.C.Insns;
+ return TTI.isLSRCostLess(C, Other.C);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void Cost::print(raw_ostream &OS) const {
+ if (InsnsCost)
+ OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
+ OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
+ if (C.AddRecCost != 0)
+ OS << ", with addrec cost " << C.AddRecCost;
+ if (C.NumIVMuls != 0)
+ OS << ", plus " << C.NumIVMuls << " IV mul"
+ << (C.NumIVMuls == 1 ? "" : "s");
+ if (C.NumBaseAdds != 0)
+ OS << ", plus " << C.NumBaseAdds << " base add"
+ << (C.NumBaseAdds == 1 ? "" : "s");
+ if (C.ScaleCost != 0)
+ OS << ", plus " << C.ScaleCost << " scale cost";
+ if (C.ImmCost != 0)
+ OS << ", plus " << C.ImmCost << " imm cost";
+ if (C.SetupCost != 0)
+ OS << ", plus " << C.SetupCost << " setup cost";
+}
+
+LLVM_DUMP_METHOD void Cost::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+/// Test whether this fixup always uses its value outside of the given loop.
+bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
+ // PHI nodes use their value in their incoming blocks.
+ if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == OperandValToReplace &&
+ L->contains(PN->getIncomingBlock(i)))
+ return false;
+ return true;
+ }
+
+ return !L->contains(UserInst);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LSRFixup::print(raw_ostream &OS) const {
+ OS << "UserInst=";
+ // Store is common and interesting enough to be worth special-casing.
+ if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
+ OS << "store ";
+ Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
+ } else if (UserInst->getType()->isVoidTy())
+ OS << UserInst->getOpcodeName();
+ else
+ UserInst->printAsOperand(OS, /*PrintType=*/false);
+
+ OS << ", OperandValToReplace=";
+ OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
+
+ for (const Loop *PIL : PostIncLoops) {
+ OS << ", PostIncLoop=";
+ PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
+ }
+
+ if (Offset != 0)
+ OS << ", Offset=" << Offset;
+}
+
+LLVM_DUMP_METHOD void LSRFixup::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+/// Test whether this use has a formula with the same registers as the given
+/// formula.
+bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
+ SmallVector<const SCEV *, 4> Key = F.BaseRegs;
+ if (F.ScaledReg) Key.push_back(F.ScaledReg);
+ // Unstable sort by host order ok, because this is only used for uniquifying.
+ std::sort(Key.begin(), Key.end());
+ return Uniquifier.count(Key);
+}
+
+/// Return the probability of selecting a formula that does not reference Reg.
+float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
+ unsigned FNum = 0;
+ for (const Formula &F : Formulae)
+ if (F.referencesReg(Reg))
+ FNum++;
+ return ((float)(Formulae.size() - FNum)) / Formulae.size();
+}
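+
+// For example (illustrative), with four formulae of which one references Reg,
+// the returned probability is (4 - 1) / 4 = 0.75.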
+
+/// If the given formula has not yet been inserted, add it to the list, and
+/// return true. Return false otherwise. The formula must be in canonical form.
+bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
+ assert(F.isCanonical(L) && "Invalid canonical representation");
+
+ if (!Formulae.empty() && RigidFormula)
+ return false;
+
+ SmallVector<const SCEV *, 4> Key = F.BaseRegs;
+ if (F.ScaledReg) Key.push_back(F.ScaledReg);
+ // Unstable sort by host order ok, because this is only used for uniquifying.
+ std::sort(Key.begin(), Key.end());
+
+ if (!Uniquifier.insert(Key).second)
+ return false;
+
+ // Using a register to hold the value of 0 is not profitable.
+ assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
+ "Zero allocated in a scaled register!");
+#ifndef NDEBUG
+ for (const SCEV *BaseReg : F.BaseRegs)
+ assert(!BaseReg->isZero() && "Zero allocated in a base register!");
+#endif
+
+ // Add the formula to the list.
+ Formulae.push_back(F);
+
+ // Record registers now being used by this use.
+ Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ if (F.ScaledReg)
+ Regs.insert(F.ScaledReg);
+
+ return true;
+}
+
+/// Remove the given formula from this use's list.
+void LSRUse::DeleteFormula(Formula &F) {
+ if (&F != &Formulae.back())
+ std::swap(F, Formulae.back());
+ Formulae.pop_back();
+}
+
+/// Recompute the Regs field, and update RegUses.
+void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
+ // Now that we've filtered out some formulae, recompute the Regs set.
+ SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
+ Regs.clear();
+ for (const Formula &F : Formulae) {
+ if (F.ScaledReg) Regs.insert(F.ScaledReg);
+ Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ }
+
+ // Update the RegTracker.
+ for (const SCEV *S : OldRegs)
+ if (!Regs.count(S))
+ RegUses.dropRegister(S, LUIdx);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LSRUse::print(raw_ostream &OS) const {
+ OS << "LSR Use: Kind=";
+ switch (Kind) {
+ case Basic: OS << "Basic"; break;
+ case Special: OS << "Special"; break;
+ case ICmpZero: OS << "ICmpZero"; break;
+ case Address:
+ OS << "Address of ";
+ if (AccessTy.MemTy->isPointerTy())
+ OS << "pointer"; // the full pointer type could be really verbose
+ else {
+ OS << *AccessTy.MemTy;
+ }
+
+ OS << " in addrspace(" << AccessTy.AddrSpace << ')';
+ }
+
+ OS << ", Offsets={";
+ bool NeedComma = false;
+ for (const LSRFixup &Fixup : Fixups) {
+ if (NeedComma) OS << ',';
+ OS << Fixup.Offset;
+ NeedComma = true;
+ }
+ OS << '}';
+
+ if (AllFixupsOutsideLoop)
+ OS << ", all-fixups-outside-loop";
+
+ if (WidestFixupType)
+ OS << ", widest fixup type: " << *WidestFixupType;
+}
+
+LLVM_DUMP_METHOD void LSRUse::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg, int64_t Scale,
+ Instruction *Fixup/*= nullptr*/) {
+ switch (Kind) {
+ case LSRUse::Address:
+ return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
+ HasBaseReg, Scale, AccessTy.AddrSpace, Fixup);
+
+ case LSRUse::ICmpZero:
+ // There's not even a target hook for querying whether it would be legal to
+ // fold a GV into an ICmp.
+ if (BaseGV)
+ return false;
+
+ // ICmp only has two operands; don't allow more than two non-trivial parts.
+ if (Scale != 0 && HasBaseReg && BaseOffset != 0)
+ return false;
+
+ // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
+ // putting the scaled register in the other operand of the icmp.
+ if (Scale != 0 && Scale != -1)
+ return false;
+
+ // If we have low-level target information, ask the target if it can fold an
+ // integer immediate on an icmp.
+ if (BaseOffset != 0) {
+ // We have one of:
+ // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
+ // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
+ // Offs is the ICmp immediate.
+ if (Scale == 0)
+ // The cast does the right thing with
+ // std::numeric_limits<int64_t>::min().
+ BaseOffset = -(uint64_t)BaseOffset;
+ return TTI.isLegalICmpImmediate(BaseOffset);
+ }
+
+ // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
+ return true;
+
+ case LSRUse::Basic:
+ // Only handle single-register values.
+ return !BaseGV && Scale == 0 && BaseOffset == 0;
+
+ case LSRUse::Special:
+ // Special case Basic to handle -1 scales.
+ return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
+ }
+
+ llvm_unreachable("Invalid LSRUse Kind!");
+}
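+
+// Illustrative example for the ICmpZero case above: a formula consisting of a
+// single base register plus an offset of 20 is considered folded only if
+// TTI.isLegalICmpImmediate(-20) holds, since the offset is negated to become
+// the icmp immediate.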
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ int64_t MinOffset, int64_t MaxOffset,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg, int64_t Scale) {
+ // Check for overflow.
+ if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
+ (MinOffset > 0))
+ return false;
+ MinOffset = (uint64_t)BaseOffset + MinOffset;
+ if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
+ (MaxOffset > 0))
+ return false;
+ MaxOffset = (uint64_t)BaseOffset + MaxOffset;
+
+ return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
+ HasBaseReg, Scale) &&
+ isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
+ HasBaseReg, Scale);
+}
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ int64_t MinOffset, int64_t MaxOffset,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ const Formula &F, const Loop &L) {
+ // For the purposes of isAMCompletelyFolded, either having a canonical formula
+ // or a scale not equal to zero is correct.
+ // Problems may arise from non-canonical formulae having a scale == 0.
+ // Strictly speaking, it would be best to just rely on canonical formulae.
+ // However, when we generate the scaled formulae, we first check that the
+ // scaling factor is profitable before computing the actual ScaledReg, for
+ // compile time's sake.
+ assert((F.isCanonical(L) || F.Scale != 0));
+ return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
+ F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
+}
+
+/// Test whether we know how to expand the current formula.
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind,
+ MemAccessTy AccessTy, GlobalValue *BaseGV,
+ int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
+ // We know how to expand completely foldable formulae.
+ return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
+ BaseOffset, HasBaseReg, Scale) ||
+ // Or formulae that use a base register produced by a sum of base
+ // registers.
+ (Scale == 1 &&
+ isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
+ BaseGV, BaseOffset, true, 0));
+}
+
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind,
+ MemAccessTy AccessTy, const Formula &F) {
+ return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
+ F.BaseOffset, F.HasBaseReg, F.Scale);
+}
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F) {
+ // Target may want to look at the user instructions.
+ if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
+ for (const LSRFixup &Fixup : LU.Fixups)
+ if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
+ (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
+ F.Scale, Fixup.UserInst))
+ return false;
+ return true;
+ }
+
+ return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
+ F.Scale);
+}
+
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F,
+ const Loop &L) {
+ if (!F.Scale)
+ return 0;
+
+ // If the use is not completely folded in that instruction, we will have to
+ // pay an extra cost only for scale != 1.
+ if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, F, L))
+ return F.Scale != 1;
+
+ switch (LU.Kind) {
+ case LSRUse::Address: {
+ // Check the scaling factor cost with both the min and max offsets.
+ int ScaleCostMinOffset = TTI.getScalingFactorCost(
+ LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
+ F.Scale, LU.AccessTy.AddrSpace);
+ int ScaleCostMaxOffset = TTI.getScalingFactorCost(
+ LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
+ F.Scale, LU.AccessTy.AddrSpace);
+
+ assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
+ "Legal addressing mode has an illegal cost!");
+ return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
+ }
+ case LSRUse::ICmpZero:
+ case LSRUse::Basic:
+ case LSRUse::Special:
+ // The use is completely folded, i.e., everything is folded into the
+ // instruction.
+ return 0;
+ }
+
+ llvm_unreachable("Invalid LSRUse Kind!");
+}
+
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg) {
+ // Fast-path: zero is always foldable.
+ if (BaseOffset == 0 && !BaseGV) return true;
+
+ // Conservatively, create an address with an immediate and a
+ // base and a scale.
+ int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
+
+ // Canonicalize a scale of 1 to a base register if the formula doesn't
+ // already have a base register.
+ if (!HasBaseReg && Scale == 1) {
+ Scale = 0;
+ HasBaseReg = true;
+ }
+
+ return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
+ HasBaseReg, Scale);
+}
+
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
+ ScalarEvolution &SE, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind,
+ MemAccessTy AccessTy, const SCEV *S,
+ bool HasBaseReg) {
+ // Fast-path: zero is always foldable.
+ if (S->isZero()) return true;
+
+ // Conservatively, create an address with an immediate and a
+ // base and a scale.
+ int64_t BaseOffset = ExtractImmediate(S, SE);
+ GlobalValue *BaseGV = ExtractSymbol(S, SE);
+
+ // If there's anything else involved, it's not foldable.
+ if (!S->isZero()) return false;
+
+ // Fast-path: zero is always foldable.
+ if (BaseOffset == 0 && !BaseGV) return true;
+
+ // Conservatively, create an address with an immediate and a
+ // base and a scale.
+ int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
+
+ return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
+ BaseOffset, HasBaseReg, Scale);
+}
+
+namespace {
+
+/// An individual increment in a Chain of IV increments. Relate an IV user to
+/// an expression that computes the IV it uses from the IV used by the previous
+/// link in the Chain.
+///
+/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
+/// original IVOperand. The head of the chain's IVOperand is only valid during
+/// chain collection, before LSR replaces IV users. During chain generation,
+/// IncExpr can be used to find the new IVOperand that computes the same
+/// expression.
+struct IVInc {
+ Instruction *UserInst;
+ Value* IVOperand;
+ const SCEV *IncExpr;
+
+ IVInc(Instruction *U, Value *O, const SCEV *E)
+ : UserInst(U), IVOperand(O), IncExpr(E) {}
+};
+
+// The list of IV increments in program order. We typically add the head of a
+// chain without finding subsequent links.
+struct IVChain {
+ SmallVector<IVInc, 1> Incs;
+ const SCEV *ExprBase = nullptr;
+
+ IVChain() = default;
+ IVChain(const IVInc &Head, const SCEV *Base)
+ : Incs(1, Head), ExprBase(Base) {}
+
+ using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
+
+ // Return the first increment in the chain.
+ const_iterator begin() const {
+ assert(!Incs.empty());
+ return std::next(Incs.begin());
+ }
+ const_iterator end() const {
+ return Incs.end();
+ }
+
+ // Returns true if this chain contains any increments.
+ bool hasIncs() const { return Incs.size() >= 2; }
+
+ // Add an IVInc to the end of this chain.
+ void add(const IVInc &X) { Incs.push_back(X); }
+
+ // Returns the last UserInst in the chain.
+ Instruction *tailUserInst() const { return Incs.back().UserInst; }
+
+ // Returns true if IncExpr can be profitably added to this chain.
+ bool isProfitableIncrement(const SCEV *OperExpr,
+ const SCEV *IncExpr,
+ ScalarEvolution&);
+};
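+
+// Illustrative example: for a pointer IV advanced by 4 each iteration and used
+// at p, p+4, and p+8, the chain head's IncExpr is the absolute expression for
+// p, while each subsequent IVInc's IncExpr is the step between links (here the
+// constant 4). begin()/end() iterate over the links after the head, and
+// hasIncs() becomes true only once a second entry has been added.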
+
+/// Helper for CollectChains to track multiple IV increment uses. Distinguish
+/// between FarUsers that definitely cross IV increments and NearUsers that may
+/// be used between IV increments.
+struct ChainUsers {
+ SmallPtrSet<Instruction*, 4> FarUsers;
+ SmallPtrSet<Instruction*, 4> NearUsers;
+};
+
+/// This class holds state for the main loop strength reduction logic.
+class LSRInstance {
+ IVUsers &IU;
+ ScalarEvolution &SE;
+ DominatorTree &DT;
+ LoopInfo &LI;
+ const TargetTransformInfo &TTI;
+ Loop *const L;
+ bool Changed = false;
+
+ /// This is the insert position at which the current loop's induction variable
+ /// increment should be placed. In simple loops, this is the latch block's
+ /// terminator. But in more complicated cases, this is a position which will
+ /// dominate all the in-loop post-increment users.
+ Instruction *IVIncInsertPos = nullptr;
+
+ /// Interesting factors between use strides.
+ ///
+ /// We explicitly use a SetVector which contains a SmallSet, instead of the
+ /// default, a SmallDenseSet, because we need to use the full range of
+ /// int64_ts, and there's currently no good way of doing that with
+ /// SmallDenseSet.
+ SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
+
+ /// Interesting use types, to facilitate truncation reuse.
+ SmallSetVector<Type *, 4> Types;
+
+ /// The list of interesting uses.
+ SmallVector<LSRUse, 16> Uses;
+
+ /// Track which uses use which register candidates.
+ RegUseTracker RegUses;
+
+ // Limit the number of chains to avoid quadratic behavior. We don't expect to
+ // have more than a few IV increment chains in a loop. Missing a Chain falls
+ // back to normal LSR behavior for those uses.
+ static const unsigned MaxChains = 8;
+
+ /// IV users can form a chain of IV increments.
+ SmallVector<IVChain, MaxChains> IVChainVec;
+
+ /// IV users that belong to profitable IVChains.
+ SmallPtrSet<Use*, MaxChains> IVIncSet;
+
+ void OptimizeShadowIV();
+ bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
+ ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
+ void OptimizeLoopTermCond();
+
+ void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
+ SmallVectorImpl<ChainUsers> &ChainUsersVec);
+ void FinalizeChain(IVChain &Chain);
+ void CollectChains();
+ void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts);
+
+ void CollectInterestingTypesAndFactors();
+ void CollectFixupsAndInitialFormulae();
+
+ // Support for sharing of LSRUses between LSRFixups.
+ using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
+ UseMapTy UseMap;
+
+ bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
+ LSRUse::KindType Kind, MemAccessTy AccessTy);
+
+ std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
+ MemAccessTy AccessTy);
+
+ void DeleteUse(LSRUse &LU, size_t LUIdx);
+
+ LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
+
+ void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
+ void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
+ void CountRegisters(const Formula &F, size_t LUIdx);
+ bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
+
+ void CollectLoopInvariantFixupsAndFormulae();
+
+ void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
+ unsigned Depth = 0);
+
+ void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base, unsigned Depth,
+ size_t Idx, bool IsScaledReg = false);
+ void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base, size_t Idx,
+ bool IsScaledReg = false);
+ void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base,
+ const SmallVectorImpl<int64_t> &Worklist,
+ size_t Idx, bool IsScaledReg = false);
+ void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateCrossUseConstantOffsets();
+ void GenerateAllReuseFormulae();
+
+ void FilterOutUndesirableDedicatedRegisters();
+
+ size_t EstimateSearchSpaceComplexity() const;
+ void NarrowSearchSpaceByDetectingSupersets();
+ void NarrowSearchSpaceByCollapsingUnrolledCode();
+ void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+ void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ void NarrowSearchSpaceByDeletingCostlyFormulas();
+ void NarrowSearchSpaceByPickingWinnerRegs();
+ void NarrowSearchSpaceUsingHeuristics();
+
+ void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
+ Cost &SolutionCost,
+ SmallVectorImpl<const Formula *> &Workspace,
+ const Cost &CurCost,
+ const SmallPtrSet<const SCEV *, 16> &CurRegs,
+ DenseSet<const SCEV *> &VisitedRegs) const;
+ void Solve(SmallVectorImpl<const Formula *> &Solution) const;
+
+ BasicBlock::iterator
+ HoistInsertPosition(BasicBlock::iterator IP,
+ const SmallVectorImpl<Instruction *> &Inputs) const;
+ BasicBlock::iterator
+ AdjustInsertPositionForExpand(BasicBlock::iterator IP,
+ const LSRFixup &LF,
+ const LSRUse &LU,
+ SCEVExpander &Rewriter) const;
+
+ Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ BasicBlock::iterator IP, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
+ void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
+ void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
+ void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
+
+public:
+ LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
+ LoopInfo &LI, const TargetTransformInfo &TTI);
+
+ bool getChanged() const { return Changed; }
+
+ void print_factors_and_types(raw_ostream &OS) const;
+ void print_fixups(raw_ostream &OS) const;
+ void print_uses(raw_ostream &OS) const;
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+/// If the IV is used in an int-to-float cast inside the loop, then try to
+/// eliminate the cast operation.
+void LSRInstance::OptimizeShadowIV() {
+ const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ return;
+
+ for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
+ UI != E; /* empty */) {
+ IVUsers::const_iterator CandidateUI = UI;
+ ++UI;
+ Instruction *ShadowUse = CandidateUI->getUser();
+ Type *DestTy = nullptr;
+ bool IsSigned = false;
+
+ /* If the shadow use is an int->float cast, then insert a second IV
+ to eliminate this cast.
+
+ for (unsigned i = 0; i < n; ++i)
+ foo((double)i);
+
+ is transformed into
+
+ double d = 0.0;
+ for (unsigned i = 0; i < n; ++i, ++d)
+ foo(d);
+ */
+ if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
+ IsSigned = false;
+ DestTy = UCast->getDestTy();
+ }
+ else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
+ IsSigned = true;
+ DestTy = SCast->getDestTy();
+ }
+ if (!DestTy) continue;
+
+ // If target does not support DestTy natively then do not apply
+ // this transformation.
+ if (!TTI.isTypeLegal(DestTy)) continue;
+
+ PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
+ if (!PH) continue;
+ if (PH->getNumIncomingValues() != 2) continue;
+
+ // If the calculation in integers overflows, the result in the FP type will
+ // differ. So we can only do this transformation if we are guaranteed not to
+ // deal with overflowing values.
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
+ if (!AR) continue;
+ if (IsSigned && !AR->hasNoSignedWrap()) continue;
+ if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
+
+ Type *SrcTy = PH->getType();
+ int Mantissa = DestTy->getFPMantissaWidth();
+ if (Mantissa == -1) continue;
+ if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
+ continue;
+
+ unsigned Entry, Latch;
+ if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
+ Entry = 0;
+ Latch = 1;
+ } else {
+ Entry = 1;
+ Latch = 0;
+ }
+
+ ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
+ if (!Init) continue;
+ Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
+ (double)Init->getSExtValue() :
+ (double)Init->getZExtValue());
+
+ BinaryOperator *Incr =
+ dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
+ if (!Incr) continue;
+ if (Incr->getOpcode() != Instruction::Add
+ && Incr->getOpcode() != Instruction::Sub)
+ continue;
+
+ /* Initialize new IV, double d = 0.0 in above example. */
+ ConstantInt *C = nullptr;
+ if (Incr->getOperand(0) == PH)
+ C = dyn_cast<ConstantInt>(Incr->getOperand(1));
+ else if (Incr->getOperand(1) == PH)
+ C = dyn_cast<ConstantInt>(Incr->getOperand(0));
+ else
+ continue;
+
+ if (!C) continue;
+
+ // Ignore negative constants, as the code below doesn't handle them
+ // correctly. TODO: Remove this restriction.
+ if (!C->getValue().isStrictlyPositive()) continue;
+
+ /* Add new PHINode. */
+ PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
+
+ /* create new increment. '++d' in above example. */
+ Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
+ BinaryOperator *NewIncr =
+ BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
+ Instruction::FAdd : Instruction::FSub,
+ NewPH, CFP, "IV.S.next.", Incr);
+
+ NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
+ NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
+
+ /* Remove cast operation */
+ ShadowUse->replaceAllUsesWith(NewPH);
+ ShadowUse->eraseFromParent();
+ Changed = true;
+ break;
+ }
+}
+
+/// If Cond has an operand that is an expression of an IV, set the IV user and
+/// stride information and return true, otherwise return false.
+bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
+ for (IVStrideUse &U : IU)
+ if (U.getUser() == Cond) {
+ // NOTE: we could handle setcc instructions with multiple uses here, but
+ // InstCombine handles the simple uses as well, and it's not clear that this
+ // occurs often enough in real life to be worth handling.
+ CondUse = &U;
+ return true;
+ }
+ return false;
+}
+
+/// Rewrite the loop's terminating condition if it uses a max computation.
+///
+/// This is a narrow solution to a specific, but acute, problem. For loops
+/// like this:
+///
+/// i = 0;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i < n);
+///
+/// the trip count isn't just 'n', because 'n' might not be positive. And
+/// unfortunately this can come up even for loops where the user didn't use
+/// a C do-while loop. For example, seemingly well-behaved top-test loops
+/// will commonly be lowered like this:
+///
+/// if (n > 0) {
+/// i = 0;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i < n);
+/// }
+///
+/// and then it's possible for subsequent optimization to obscure the if
+/// test in such a way that indvars can't find it.
+///
+/// When indvars can't find the if test in loops like this, it creates a
+/// max expression, which allows it to give the loop a canonical
+/// induction variable:
+///
+/// i = 0;
+/// max = n < 1 ? 1 : n;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i != max);
+///
+/// Canonical induction variables are necessary because the loop passes
+/// are designed around them. The most obvious example of this is the
+/// LoopInfo analysis, which doesn't remember trip count values. It
+/// expects to be able to rediscover the trip count each time it is
+/// needed, and it does this using a simple analysis that only succeeds if
+/// the loop has a canonical induction variable.
+///
+/// However, when it comes time to generate code, the maximum operation
+/// can be quite costly, especially if it's inside of an outer loop.
+///
+/// This function solves this problem by detecting loops of this type and
+/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and by deleting
+/// the instructions for the maximum computation.
+ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
+ // Check that the loop matches the pattern we're looking for.
+ if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
+ Cond->getPredicate() != CmpInst::ICMP_NE)
+ return Cond;
+
+ SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
+ if (!Sel || !Sel->hasOneUse()) return Cond;
+
+ const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ return Cond;
+ const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
+
+ // Add one to the backedge-taken count to get the trip count.
+ const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
+ if (IterationCount != SE.getSCEV(Sel)) return Cond;
+
+ // Check for a max calculation that matches the pattern. There's no check
+ // for ICMP_ULE here because the comparison would be with zero, which
+ // isn't interesting.
+ CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
+ const SCEVNAryExpr *Max = nullptr;
+ if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
+ Pred = ICmpInst::ICMP_SLE;
+ Max = S;
+ } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
+ Pred = ICmpInst::ICMP_SLT;
+ Max = S;
+ } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
+ Pred = ICmpInst::ICMP_ULT;
+ Max = U;
+ } else {
+ // No match; bail.
+ return Cond;
+ }
+
+ // To handle a max with more than two operands, this optimization would
+ // require additional checking and setup.
+ if (Max->getNumOperands() != 2)
+ return Cond;
+
+ const SCEV *MaxLHS = Max->getOperand(0);
+ const SCEV *MaxRHS = Max->getOperand(1);
+
+ // ScalarEvolution canonicalizes constants to the left. For < and >, look
+ // for a comparison with 1. For <= and >=, a comparison with zero.
+ if (!MaxLHS ||
+ (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
+ return Cond;
+
+ // Check the relevant induction variable for conformance to
+ // the pattern.
+ const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
+ if (!AR || !AR->isAffine() ||
+ AR->getStart() != One ||
+ AR->getStepRecurrence(SE) != One)
+ return Cond;
+
+ assert(AR->getLoop() == L &&
+ "Loop condition operand is an addrec in a different loop!");
+
+ // Check the right operand of the select, and remember it, as it will
+ // be used in the new comparison instruction.
+ Value *NewRHS = nullptr;
+ if (ICmpInst::isTrueWhenEqual(Pred)) {
+ // Look for n+1, and grab n.
+ if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
+ if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+ if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+ NewRHS = BO->getOperand(0);
+ if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
+ if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+ if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+ NewRHS = BO->getOperand(0);
+ if (!NewRHS)
+ return Cond;
+ } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
+ NewRHS = Sel->getOperand(1);
+ else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
+ NewRHS = Sel->getOperand(2);
+ else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
+ NewRHS = SU->getValue();
+ else
+ // Max doesn't match expected pattern.
+ return Cond;
+
+ // Determine the new comparison opcode. It may be signed or unsigned,
+ // and the original comparison may be either equality or inequality.
+ if (Cond->getPredicate() == CmpInst::ICMP_EQ)
+ Pred = CmpInst::getInversePredicate(Pred);
+
+ // Ok, everything looks ok to change the condition into an SLT or SGE and
+ // delete the max calculation.
+ ICmpInst *NewCond =
+ new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
+
+ // Delete the max calculation instructions.
+ Cond->replaceAllUsesWith(NewCond);
+ CondUse->setUser(NewCond);
+ Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
+ Cond->eraseFromParent();
+ Sel->eraseFromParent();
+ if (Cmp->use_empty())
+ Cmp->eraseFromParent();
+ return NewCond;
+}
+
+/// Change loop terminating condition to use the postinc iv when possible.
+void
+LSRInstance::OptimizeLoopTermCond() {
+ SmallPtrSet<Instruction *, 4> PostIncs;
+
+ // We need a different set of heuristics for rotated and non-rotated loops.
+ // If a loop is rotated then the latch is also an exiting block, so inserting
+ // post-inc expressions just before the latch is ideal. To reduce live ranges
+ // it also makes sense to rewrite the terminating condition to use the
+ // post-inc expression.
+ //
+ // If the loop is not rotated then the latch does not exit the loop; the exit
+ // check is done in the loop header. Adding post-inc expressions before the
+ // latch would create overlapping live ranges of the pre-inc and post-inc
+ // expressions in the loop body. In this case we do *not* want to use the
+ // post-inc expression in the exit check, and we want to insert the post-inc
+ // expression at the end of the latch, just before the backedge.
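+ // For example, a rotated loop "do { ...; ++i; } while (i != n)" tests the
+ // incremented value in the latch itself, while a head-tested loop
+ // "while (i != n) { ...; ++i; }" tests the pre-incremented value in the
+ // header and only branches back from the latch.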
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ SmallVector<BasicBlock*, 8> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ if (llvm::all_of(ExitingBlocks, [&LatchBlock](const BasicBlock *BB) {
+ return LatchBlock != BB;
+ })) {
+ // The backedge doesn't exit the loop; treat this as a head-tested loop.
+ IVIncInsertPos = LatchBlock->getTerminator();
+ return;
+ }
+
+ // Otherwise treat this as a rotated loop.
+ for (BasicBlock *ExitingBlock : ExitingBlocks) {
+ // Get the terminating condition for the loop if possible. If we
+ // can, we want to change it to use a post-incremented version of its
+ // induction variable, to allow coalescing the live ranges for the IV into
+ // one register value.
+
+ BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+ if (!TermBr)
+ continue;
+ // FIXME: Overly conservative; the termination condition could be an 'or' etc.
+ if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
+ continue;
+
+ // Search IVUsesByStride to find Cond's IVUse if there is one.
+ IVStrideUse *CondUse = nullptr;
+ ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
+ if (!FindIVUserForCond(Cond, CondUse))
+ continue;
+
+ // If the trip count is computed in terms of a max (due to ScalarEvolution
+ // being unable to find a sufficient guard, for example), change the loop
+ // comparison to use SLT or ULT instead of NE.
+ // One consequence of doing this now is that it disrupts the count-down
+ // optimization. That's not always a bad thing though, because in such
+ // cases it may still be worthwhile to avoid a max.
+ Cond = OptimizeMax(Cond, CondUse);
+
+ // If this exiting block dominates the latch block, it may also use
+ // the post-inc value if it won't be shared with other uses.
+ // Check for dominance.
+ if (!DT.dominates(ExitingBlock, LatchBlock))
+ continue;
+
+ // Conservatively avoid trying to use the post-inc value in non-latch
+ // exits if there may be pre-inc users in intervening blocks.
+ if (LatchBlock != ExitingBlock)
+ for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
+ // Test if the use is reachable from the exiting block. This dominator
+ // query is a conservative approximation of reachability.
+ if (&*UI != CondUse &&
+ !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
+ // Conservatively assume there may be reuse if the quotient of their
+ // strides could be a legal scale.
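+ // For example, if the exit condition strides by 4 and another use strides
+ // by 8, the quotient 2 may be a legal address scale for that use, so we
+ // keep the pre-inc value available and decline the post-inc form.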
+ const SCEV *A = IU.getStride(*CondUse, L);
+ const SCEV *B = IU.getStride(*UI, L);
+ if (!A || !B) continue;
+ if (SE.getTypeSizeInBits(A->getType()) !=
+ SE.getTypeSizeInBits(B->getType())) {
+ if (SE.getTypeSizeInBits(A->getType()) >
+ SE.getTypeSizeInBits(B->getType()))
+ B = SE.getSignExtendExpr(B, A->getType());
+ else
+ A = SE.getSignExtendExpr(A, B->getType());
+ }
+ if (const SCEVConstant *D =
+ dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
+ const ConstantInt *C = D->getValue();
+ // Stride of one or negative one can have reuse with non-addresses.
+ if (C->isOne() || C->isMinusOne())
+ goto decline_post_inc;
+ // Avoid weird situations.
+ if (C->getValue().getMinSignedBits() >= 64 ||
+ C->getValue().isMinSignedValue())
+ goto decline_post_inc;
+ // Check for possible scaled-address reuse.
+ MemAccessTy AccessTy = getAccessType(TTI, UI->getUser());
+ int64_t Scale = C->getSExtValue();
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ Scale = -Scale;
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ }
+ }
+
+ DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
+ << *Cond << '\n');
+
+ // It's possible for the setcc instruction to be anywhere in the loop, and
+ // possible for it to have multiple users. If it is not immediately before
+ // the exiting block branch, move it.
+ if (&*++BasicBlock::iterator(Cond) != TermBr) {
+ if (Cond->hasOneUse()) {
+ Cond->moveBefore(TermBr);
+ } else {
+ // Clone the terminating condition and insert it just before the exiting
+ // block's branch.
+ ICmpInst *OldCond = Cond;
+ Cond = cast<ICmpInst>(Cond->clone());
+ Cond->setName(L->getHeader()->getName() + ".termcond");
+ ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond);
+
+ // Clone the IVUse, as the old use still exists!
+ CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
+ TermBr->replaceUsesOfWith(OldCond, Cond);
+ }
+ }
+
+ // If we get to here, we know that we can transform the setcc instruction to
+ // use the post-incremented version of the IV, allowing us to coalesce the
+ // live ranges for the IV correctly.
+ CondUse->transformToPostInc(L);
+ Changed = true;
+
+ PostIncs.insert(Cond);
+ decline_post_inc:;
+ }
+
+ // Determine an insertion point for the loop induction variable increment. It
+ // must dominate all the post-inc comparisons we just set up, and it must
+ // dominate the loop latch edge.
+ IVIncInsertPos = L->getLoopLatch()->getTerminator();
+ for (Instruction *Inst : PostIncs) {
+ BasicBlock *BB =
+ DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
+ Inst->getParent());
+ if (BB == Inst->getParent())
+ IVIncInsertPos = Inst;
+ else if (BB != IVIncInsertPos->getParent())
+ IVIncInsertPos = BB->getTerminator();
+ }
+}
+
+/// Determine if the given use can accommodate a fixup at the given offset and
+/// other details. If so, update the use and return true.
+bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
+ bool HasBaseReg, LSRUse::KindType Kind,
+ MemAccessTy AccessTy) {
+ int64_t NewMinOffset = LU.MinOffset;
+ int64_t NewMaxOffset = LU.MaxOffset;
+ MemAccessTy NewAccessTy = AccessTy;
+
+ // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
+ // something conservative; however, this can pessimize, for example when one
+ // of the uses has all of its fixups outside the loop.
+ if (LU.Kind != Kind)
+ return false;
+
+ // Check for a mismatched access type, and fall back conservatively as needed.
+ // TODO: Be less conservative when the type is similar and can use the same
+ // addressing modes.
+ if (Kind == LSRUse::Address) {
+ if (AccessTy.MemTy != LU.AccessTy.MemTy) {
+ NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
+ AccessTy.AddrSpace);
+ }
+ }
+
+ // Conservatively assume HasBaseReg is true for now.
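+ // For example, if this use currently spans offsets [0, 8] and the new fixup
+ // needs offset 16, the range only grows to [0, 16] if an offset spread of
+ // 16 (NewOffset - MinOffset) is always foldable for this kind of use.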
+ if (NewOffset < LU.MinOffset) {
+ if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
+ LU.MaxOffset - NewOffset, HasBaseReg))
+ return false;
+ NewMinOffset = NewOffset;
+ } else if (NewOffset > LU.MaxOffset) {
+ if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
+ NewOffset - LU.MinOffset, HasBaseReg))
+ return false;
+ NewMaxOffset = NewOffset;
+ }
+
+ // Update the use.
+ LU.MinOffset = NewMinOffset;
+ LU.MaxOffset = NewMaxOffset;
+ LU.AccessTy = NewAccessTy;
+ return true;
+}
+
+/// Return an LSRUse index and an offset value for a fixup which needs the given
+/// expression, with the given kind and optional access type. Either reuse an
+/// existing use or create a new one, as needed.
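+///
+/// For example, a fixup expression like {(%p + 16),+,4} is first stripped to
+/// {%p,+,4} with an offset of 16 and looked up under that key; if an offset
+/// of 16 is not always foldable for this kind of use, the original expression
+/// is kept and the offset reverts to 0.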
+std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
+ LSRUse::KindType Kind,
+ MemAccessTy AccessTy) {
+ const SCEV *Copy = Expr;
+ int64_t Offset = ExtractImmediate(Expr, SE);
+
+ // Basic uses can't accept any offset, for example.
+ if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
+ Offset, /*HasBaseReg=*/ true)) {
+ Expr = Copy;
+ Offset = 0;
+ }
+
+ std::pair<UseMapTy::iterator, bool> P =
+ UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
+ if (!P.second) {
+ // A use already existed with this base.
+ size_t LUIdx = P.first->second;
+ LSRUse &LU = Uses[LUIdx];
+ if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
+ // Reuse this use.
+ return std::make_pair(LUIdx, Offset);
+ }
+
+ // Create a new use.
+ size_t LUIdx = Uses.size();
+ P.first->second = LUIdx;
+ Uses.push_back(LSRUse(Kind, AccessTy));
+ LSRUse &LU = Uses[LUIdx];
+
+ LU.MinOffset = Offset;
+ LU.MaxOffset = Offset;
+ return std::make_pair(LUIdx, Offset);
+}
+
+/// Delete the given use from the Uses list.
+void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
+ if (&LU != &Uses.back())
+ std::swap(LU, Uses.back());
+ Uses.pop_back();
+
+ // Update RegUses.
+ RegUses.swapAndDropUse(LUIdx, Uses.size());
+}
+
+/// Look for a use distinct from OrigLU which has a formula with the same
+/// registers as the given formula.
+LSRUse *
+LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
+ const LSRUse &OrigLU) {
+ // Search all uses for the formula. This could be more clever.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ // Check whether this use is close enough to OrigLU, to see whether it's
+ // worthwhile looking through its formulae.
+ // Ignore ICmpZero uses because they may contain formulae generated by
+ // GenerateICmpZeroScales, in which case adding fixup offsets may
+ // be invalid.
+ if (&LU != &OrigLU &&
+ LU.Kind != LSRUse::ICmpZero &&
+ LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
+ LU.WidestFixupType == OrigLU.WidestFixupType &&
+ LU.HasFormulaWithSameRegs(OrigF)) {
+ // Scan through this use's formulae.
+ for (const Formula &F : LU.Formulae) {
+ // Check to see if this formula has the same registers and symbols
+ // as OrigF.
+ if (F.BaseRegs == OrigF.BaseRegs &&
+ F.ScaledReg == OrigF.ScaledReg &&
+ F.BaseGV == OrigF.BaseGV &&
+ F.Scale == OrigF.Scale &&
+ F.UnfoldedOffset == OrigF.UnfoldedOffset) {
+ if (F.BaseOffset == 0)
+ return &LU;
+ // This is the formula where all the registers and symbols matched;
+ // there aren't going to be any others. Since we declined it, we
+ // can skip the rest of the formulae and proceed to the next LSRUse.
+ break;
+ }
+ }
+ }
+ }
+
+ // Nothing looked good.
+ return nullptr;
+}
+
+void LSRInstance::CollectInterestingTypesAndFactors() {
+ SmallSetVector<const SCEV *, 4> Strides;
+
+ // Collect interesting types and strides.
+ SmallVector<const SCEV *, 4> Worklist;
+ for (const IVStrideUse &U : IU) {
+ const SCEV *Expr = IU.getExpr(U);
+
+ // Collect interesting types.
+ Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
+
+ // Add strides for mentioned loops.
+ Worklist.push_back(Expr);
+ do {
+ const SCEV *S = Worklist.pop_back_val();
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ if (AR->getLoop() == L)
+ Strides.insert(AR->getStepRecurrence(SE));
+ Worklist.push_back(AR->getStart());
+ } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ Worklist.append(Add->op_begin(), Add->op_end());
+ }
+ } while (!Worklist.empty());
+ }
+
+ // Compute interesting factors from the set of interesting strides.
+ for (SmallSetVector<const SCEV *, 4>::const_iterator
+ I = Strides.begin(), E = Strides.end(); I != E; ++I)
+ for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
+ std::next(I); NewStrideIter != E; ++NewStrideIter) {
+ const SCEV *OldStride = *I;
+ const SCEV *NewStride = *NewStrideIter;
+
+ if (SE.getTypeSizeInBits(OldStride->getType()) !=
+ SE.getTypeSizeInBits(NewStride->getType())) {
+ if (SE.getTypeSizeInBits(OldStride->getType()) >
+ SE.getTypeSizeInBits(NewStride->getType()))
+ NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
+ else
+ OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
+ }
+ if (const SCEVConstant *Factor =
+ dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
+ SE, true))) {
+ if (Factor->getAPInt().getMinSignedBits() <= 64)
+ Factors.insert(Factor->getAPInt().getSExtValue());
+ } else if (const SCEVConstant *Factor =
+ dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
+ NewStride,
+ SE, true))) {
+ if (Factor->getAPInt().getMinSignedBits() <= 64)
+ Factors.insert(Factor->getAPInt().getSExtValue());
+ }
+ }
+
+ // If all uses use the same type, don't bother looking for truncation-based
+ // reuse.
+ if (Types.size() == 1)
+ Types.clear();
+
+ DEBUG(print_factors_and_types(dbgs()));
+}
+
+/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
+/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
+/// IVStrideUses, we could partially skip this.
+static User::op_iterator
+findIVOperand(User::op_iterator OI, User::op_iterator OE,
+ Loop *L, ScalarEvolution &SE) {
+ for(; OI != OE; ++OI) {
+ if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
+ if (!SE.isSCEVable(Oper->getType()))
+ continue;
+
+ if (const SCEVAddRecExpr *AR =
+ dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
+ if (AR->getLoop() == L)
+ break;
+ }
+ }
+ }
+ return OI;
+}
+
+/// IVChain logic must consistently peek through base TruncInst operands, so
+/// wrap it in a convenient helper.
+static Value *getWideOperand(Value *Oper) {
+ if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
+ return Trunc->getOperand(0);
+ return Oper;
+}
+
+/// Return true if we allow an IV chain to include both types.
+static bool isCompatibleIVType(Value *LVal, Value *RVal) {
+ Type *LType = LVal->getType();
+ Type *RType = RVal->getType();
+ return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() &&
+ // Different address spaces may mean
+ // different pointer representations,
+ // e.g. i16 vs i32, so disallow that.
+ (LType->getPointerAddressSpace() ==
+ RType->getPointerAddressSpace()));
+}
+
+/// Return an approximation of this SCEV expression's "base", or NULL for any
+/// constant. Returning the expression itself is conservative. Returning a
+/// deeper subexpression is more precise and valid as long as it isn't less
+/// complex than another subexpression. For expressions involving multiple
+/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
+/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
+/// IVInc==b-a.
+///
+/// Since SCEVUnknown sorts last among SCEV types, and pointer-typed unknowns
+/// sort last among unknowns, we simply return the rightmost SCEV operand.
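+///
+/// For example, the base of {(%a + 4),+,8} is the SCEVUnknown %a, while the
+/// base of a plain constant is null.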
+static const SCEV *getExprBase(const SCEV *S) {
+ switch (S->getSCEVType()) {
+ default: // including scUnknown.
+ return S;
+ case scConstant:
+ return nullptr;
+ case scTruncate:
+ return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
+ case scZeroExtend:
+ return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
+ case scSignExtend:
+ return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
+ case scAddExpr: {
+ // Skip over scaled operands (scMulExpr) to follow add operands as long as
+ // there's nothing more complex.
+ // FIXME: not sure if we want to recognize negation.
+ const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
+ for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
+ E(Add->op_begin()); I != E; ++I) {
+ const SCEV *SubExpr = *I;
+ if (SubExpr->getSCEVType() == scAddExpr)
+ return getExprBase(SubExpr);
+
+ if (SubExpr->getSCEVType() != scMulExpr)
+ return SubExpr;
+ }
+ return S; // all operands are scaled, be conservative.
+ }
+ case scAddRecExpr:
+ return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
+ }
+}
+
+/// Return true if the chain increment is profitable to expand into a loop
+/// invariant value, which may require its own register. A profitable chain
+/// increment will be an offset relative to the same base. We allow such offsets
+/// to potentially be used as the chain increment as long as they are not
+/// obviously expensive to expand using real instructions.
+bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
+ const SCEV *IncExpr,
+ ScalarEvolution &SE) {
+ // Aggressively form chains when -stress-ivchain.
+ if (StressIVChain)
+ return true;
+
+ // Do not replace a constant offset from IV head with a nonconstant IV
+ // increment.
+ if (!isa<SCEVConstant>(IncExpr)) {
+ const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
+ if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
+ return false;
+ }
+
+ SmallPtrSet<const SCEV*, 8> Processed;
+ return !isHighCostExpansion(IncExpr, Processed, SE);
+}
+
+/// Return true if the number of registers needed for the chain is estimated to
+/// be less than the number required for the individual IV users. First prohibit
+/// any IV users that keep the IV live across increments (the Users set should
+/// be empty). Next count the number and type of increments in the chain.
+///
+/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
+/// effectively use postinc addressing modes. Only consider it profitable if the
+/// increments can be computed in fewer registers when chained.
+///
+/// TODO: Consider IVInc free if it's already used in other chains.
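+///
+/// As a rough illustration of the accounting below: a chain starts at a cost
+/// of 1 for its own register; ending in the loop's existing header phi
+/// subtracts 1, as does having more than one foldable constant increment;
+/// each distinct loop-variant increment adds 1 and each reuse of one
+/// subtracts 1. The chain is kept only if the total is negative.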
+static bool
+isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
+ ScalarEvolution &SE, const TargetTransformInfo &TTI) {
+ if (StressIVChain)
+ return true;
+
+ if (!Chain.hasIncs())
+ return false;
+
+ if (!Users.empty()) {
+ DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
+ for (Instruction *Inst : Users) {
+ dbgs() << " " << *Inst << "\n";
+ });
+ return false;
+ }
+ assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
+
+ // The chain itself may require a register, so initialize the cost to 1.
+ int cost = 1;
+
+ // A complete chain likely eliminates the need for keeping the original IV in
+ // a register. LSR does not currently know how to form a complete chain unless
+ // the header phi already exists.
+ if (isa<PHINode>(Chain.tailUserInst())
+ && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
+ --cost;
+ }
+ const SCEV *LastIncExpr = nullptr;
+ unsigned NumConstIncrements = 0;
+ unsigned NumVarIncrements = 0;
+ unsigned NumReusedIncrements = 0;
+ for (const IVInc &Inc : Chain) {
+ if (Inc.IncExpr->isZero())
+ continue;
+
+ // Incrementing by zero or some constant is neutral. We assume constants can
+ // be folded into an addressing mode or an add's immediate operand.
+ if (isa<SCEVConstant>(Inc.IncExpr)) {
+ ++NumConstIncrements;
+ continue;
+ }
+
+ if (Inc.IncExpr == LastIncExpr)
+ ++NumReusedIncrements;
+ else
+ ++NumVarIncrements;
+
+ LastIncExpr = Inc.IncExpr;
+ }
+ // An IV chain with a single increment is handled by LSR's postinc
+ // uses. However, a chain with multiple increments requires keeping the IV's
+ // value live longer than it needs to be if chained.
+ if (NumConstIncrements > 1)
+ --cost;
+
+ // Materializing increment expressions in the preheader that didn't exist in
+ // the original code may cost a register. For example, sign-extended array
+ // indices can produce ridiculous increments like this:
+ // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
+ cost += NumVarIncrements;
+
+ // Reusing variable increments likely saves a register to hold the multiple of
+ // the stride.
+ cost -= NumReusedIncrements;
+
+ DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
+ << "\n");
+
+ return cost < 0;
+}
+
+/// Add this IV user to an existing chain or make it the head of a new chain.
+void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
+ SmallVectorImpl<ChainUsers> &ChainUsersVec) {
+ // When IVs are used in types of varying widths, they are generally converted
+ // to a wider type with some uses remaining narrow under a (free) trunc.
+ Value *const NextIV = getWideOperand(IVOper);
+ const SCEV *const OperExpr = SE.getSCEV(NextIV);
+ const SCEV *const OperExprBase = getExprBase(OperExpr);
+
+ // Visit all existing chains. Check whether this IVOper can be computed as a
+ // profitable loop-invariant increment from the last link in the chain.
+ unsigned ChainIdx = 0, NChains = IVChainVec.size();
+ const SCEV *LastIncExpr = nullptr;
+ for (; ChainIdx < NChains; ++ChainIdx) {
+ IVChain &Chain = IVChainVec[ChainIdx];
+
+ // Prune the solution space aggressively by checking that both IV operands
+ // are expressions that operate on the same unscaled SCEVUnknown. This
+ // "base" will be canceled by the subsequent getMinusSCEV call. Checking
+ // first avoids creating extra SCEV expressions.
+ if (!StressIVChain && Chain.ExprBase != OperExprBase)
+ continue;
+
+ Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
+ if (!isCompatibleIVType(PrevIV, NextIV))
+ continue;
+
+ // A phi node terminates a chain.
+ if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
+ continue;
+
+ // The increment must be loop-invariant so it can be kept in a register.
+ const SCEV *PrevExpr = SE.getSCEV(PrevIV);
+ const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
+ if (!SE.isLoopInvariant(IncExpr, L))
+ continue;
+
+ if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
+ LastIncExpr = IncExpr;
+ break;
+ }
+ }
+ // If we haven't found a chain, create a new one, unless we hit the max. Don't
+ // bother for phi nodes, because they must be last in the chain.
+ if (ChainIdx == NChains) {
+ if (isa<PHINode>(UserInst))
+ return;
+ if (NChains >= MaxChains && !StressIVChain) {
+ DEBUG(dbgs() << "IV Chain Limit\n");
+ return;
+ }
+ LastIncExpr = OperExpr;
+ // IVUsers may have skipped over sign/zero extensions. We don't currently
+ // attempt to form chains involving extensions unless they can be hoisted
+ // into this loop's AddRec.
+ if (!isa<SCEVAddRecExpr>(LastIncExpr))
+ return;
+ ++NChains;
+ IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
+ OperExprBase));
+ ChainUsersVec.resize(NChains);
+ DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
+ << ") IV=" << *LastIncExpr << "\n");
+ } else {
+ DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
+ << ") IV+" << *LastIncExpr << "\n");
+ // Add this IV user to the end of the chain.
+ IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
+ }
+ IVChain &Chain = IVChainVec[ChainIdx];
+
+ SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
+ // This chain's NearUsers become FarUsers.
+ if (!LastIncExpr->isZero()) {
+ ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
+ NearUsers.end());
+ NearUsers.clear();
+ }
+
+ // All other uses of IVOperand become near uses of the chain.
+ // We currently ignore intermediate values within SCEV expressions, assuming
+ // they will eventually be used by the current chain, or can be computed
+ // from one of the chain increments. To be more precise we could
+ // transitively follow its users and only add leaf IV users to the set.
+ for (User *U : IVOper->users()) {
+ Instruction *OtherUse = dyn_cast<Instruction>(U);
+ if (!OtherUse)
+ continue;
+ // Uses in the chain will no longer be uses if the chain is formed.
+ // Include the head of the chain in this iteration (not Chain.begin()).
+ IVChain::const_iterator IncIter = Chain.Incs.begin();
+ IVChain::const_iterator IncEnd = Chain.Incs.end();
+ for( ; IncIter != IncEnd; ++IncIter) {
+ if (IncIter->UserInst == OtherUse)
+ break;
+ }
+ if (IncIter != IncEnd)
+ continue;
+
+ if (SE.isSCEVable(OtherUse->getType())
+ && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
+ && IU.isIVUserOrOperand(OtherUse)) {
+ continue;
+ }
+ NearUsers.insert(OtherUse);
+ }
+
+ // Since this user is part of the chain, it's no longer considered a use
+ // of the chain.
+ ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
+}
+
+/// Populate the vector of Chains.
+///
+/// This decreases ILP at the architecture level. Targets with ample registers,
+/// multiple memory ports, and no register renaming probably don't want
+/// this. However, such targets should probably disable LSR altogether.
+///
+/// The job of LSR is to make a reasonable choice of induction variables across
+/// the loop. Subsequent passes can easily "unchain" computation exposing more
+/// ILP *within the loop* if the target wants it.
+///
+/// Finding the best IV chain is potentially a scheduling problem. Since LSR
+/// will not reorder memory operations, it will recognize this as a chain, but
+/// will generate redundant IV increments. Ideally this would be corrected later
+/// by a smart scheduler:
+/// = A[i]
+/// = A[i+x]
+/// A[i] =
+/// A[i+x] =
+///
+/// TODO: Walk the entire domtree within this loop, not just the path to the
+/// loop latch. This will discover chains on side paths, but requires
+/// maintaining multiple copies of the Chains state.
+void LSRInstance::CollectChains() {
+ DEBUG(dbgs() << "Collecting IV Chains.\n");
+ SmallVector<ChainUsers, 8> ChainUsersVec;
+
+ SmallVector<BasicBlock *,8> LatchPath;
+ BasicBlock *LoopHeader = L->getHeader();
+ for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
+ Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
+ LatchPath.push_back(Rung->getBlock());
+ }
+ LatchPath.push_back(LoopHeader);
+
+ // Walk the instruction stream from the loop header to the loop latch.
+ for (BasicBlock *BB : reverse(LatchPath)) {
+ for (Instruction &I : *BB) {
+ // Skip instructions that weren't seen by IVUsers analysis.
+ if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
+ continue;
+
+ // Ignore users that are part of a SCEV expression. This way we only
+ // consider leaf IV Users. This effectively rediscovers a portion of
+ // IVUsers analysis but in program order this time.
+ if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
+ continue;
+
+ // Remove this instruction from any NearUsers set it may be in.
+ for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
+ ChainIdx < NChains; ++ChainIdx) {
+ ChainUsersVec[ChainIdx].NearUsers.erase(&I);
+ }
+ // Search for operands that can be chained.
+ SmallPtrSet<Instruction*, 4> UniqueOperands;
+ User::op_iterator IVOpEnd = I.op_end();
+ User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
+ while (IVOpIter != IVOpEnd) {
+ Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
+ if (UniqueOperands.insert(IVOpInst).second)
+ ChainInstruction(&I, IVOpInst, ChainUsersVec);
+ IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
+ }
+ } // Continue walking down the instructions.
+ } // Continue walking down the domtree.
+ // Visit phi backedges to determine if the chain can generate the IV postinc.
+ for (PHINode &PN : L->getHeader()->phis()) {
+ if (!SE.isSCEVable(PN.getType()))
+ continue;
+
+ Instruction *IncV =
+ dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
+ if (IncV)
+ ChainInstruction(&PN, IncV, ChainUsersVec);
+ }
+ // Remove any unprofitable chains.
+ unsigned ChainIdx = 0;
+ for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
+ UsersIdx < NChains; ++UsersIdx) {
+ if (!isProfitableChain(IVChainVec[UsersIdx],
+ ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
+ continue;
+ // Preserve the chain at UsersIdx.
+ if (ChainIdx != UsersIdx)
+ IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
+ FinalizeChain(IVChainVec[ChainIdx]);
+ ++ChainIdx;
+ }
+ IVChainVec.resize(ChainIdx);
+}
+
+void LSRInstance::FinalizeChain(IVChain &Chain) {
+ assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
+ DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
+
+ for (const IVInc &Inc : Chain) {
+ DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
+ auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
+ assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
+ IVIncSet.insert(UseI);
+ }
+}
+
+/// Return true if the IVInc can be folded into an addressing mode.
+static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
+ Value *Operand, const TargetTransformInfo &TTI) {
+ const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
+ if (!IncConst || !isAddressUse(TTI, UserInst, Operand))
+ return false;
+
+ if (IncConst->getAPInt().getMinSignedBits() > 64)
+ return false;
+
+ MemAccessTy AccessTy = getAccessType(TTI, UserInst);
+ int64_t IncOffset = IncConst->getValue()->getSExtValue();
+ if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
+ IncOffset, /*HasBaseReg=*/false))
+ return false;
+
+ return true;
+}
+
+/// Generate an add or subtract for each IVInc in a chain to materialize the IV
+/// user's operand from the previous IV user's operand.
+void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
+ // Find the new IVOperand for the head of the chain. It may have been replaced
+ // by LSR.
+ const IVInc &Head = Chain.Incs[0];
+ User::op_iterator IVOpEnd = Head.UserInst->op_end();
+ // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
+ User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
+ IVOpEnd, L, SE);
+ Value *IVSrc = nullptr;
+ while (IVOpIter != IVOpEnd) {
+ IVSrc = getWideOperand(*IVOpIter);
+
+ // If this operand computes the expression that the chain needs, we may use
+ // it. (Check this after setting IVSrc which is used below.)
+ //
+ // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
+ // narrow for the chain, so we can no longer use it. We do allow using a
+ // wider phi, assuming LSR checked for free truncation. In that case we
+ // should already have a truncate on this operand such that
+ // getSCEV(IVSrc) == IncExpr.
+ if (SE.getSCEV(*IVOpIter) == Head.IncExpr
+ || SE.getSCEV(IVSrc) == Head.IncExpr) {
+ break;
+ }
+ IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
+ }
+ if (IVOpIter == IVOpEnd) {
+ // Gracefully give up on this chain.
+ DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
+ return;
+ }
+
+ DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
+ Type *IVTy = IVSrc->getType();
+ Type *IntTy = SE.getEffectiveSCEVType(IVTy);
+ const SCEV *LeftOverExpr = nullptr;
+ for (const IVInc &Inc : Chain) {
+ Instruction *InsertPt = Inc.UserInst;
+ if (isa<PHINode>(InsertPt))
+ InsertPt = L->getLoopLatch()->getTerminator();
+
+ // IVOper will replace the current IV User's operand. IVSrc is the IV
+ // value currently held in a register.
+ Value *IVOper = IVSrc;
+ if (!Inc.IncExpr->isZero()) {
+ // IncExpr was the result of subtraction of two narrow values, so must
+ // be signed.
+ const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
+ LeftOverExpr = LeftOverExpr ?
+ SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
+ }
+ if (LeftOverExpr && !LeftOverExpr->isZero()) {
+ // Expand the IV increment.
+ Rewriter.clearPostInc();
+ Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
+ const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
+ SE.getUnknown(IncV));
+ IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
+
+ // If an IV increment can't be folded, use it as the next IV value.
+ if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
+ assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
+ IVSrc = IVOper;
+ LeftOverExpr = nullptr;
+ }
+ }
+ Type *OperTy = Inc.IVOperand->getType();
+ if (IVTy != OperTy) {
+ assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
+ "cannot extend a chained IV");
+ IRBuilder<> Builder(InsertPt);
+ IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
+ }
+ Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
+ DeadInsts.emplace_back(Inc.IVOperand);
+ }
+ // If LSR created a new, wider phi, we may also replace its postinc. We only
+ // do this if we also found a wide value for the head of the chain.
+ if (isa<PHINode>(Chain.tailUserInst())) {
+ for (PHINode &Phi : L->getHeader()->phis()) {
+ if (!isCompatibleIVType(&Phi, IVSrc))
+ continue;
+ Instruction *PostIncV = dyn_cast<Instruction>(
+ Phi.getIncomingValueForBlock(L->getLoopLatch()));
+ if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
+ continue;
+ Value *IVOper = IVSrc;
+ Type *PostIncTy = PostIncV->getType();
+ if (IVTy != PostIncTy) {
+ assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
+ IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
+ Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
+ IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
+ }
+ Phi.replaceUsesOfWith(PostIncV, IVOper);
+ DeadInsts.emplace_back(PostIncV);
+ }
+ }
+}
+
+void LSRInstance::CollectFixupsAndInitialFormulae() {
+ for (const IVStrideUse &U : IU) {
+ Instruction *UserInst = U.getUser();
+ // Skip IV users that are part of profitable IV Chains.
+ User::op_iterator UseI =
+ find(UserInst->operands(), U.getOperandValToReplace());
+ assert(UseI != UserInst->op_end() && "cannot find IV operand");
+ if (IVIncSet.count(UseI)) {
+ DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
+ continue;
+ }
+
+ LSRUse::KindType Kind = LSRUse::Basic;
+ MemAccessTy AccessTy;
+ if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
+ Kind = LSRUse::Address;
+ AccessTy = getAccessType(TTI, UserInst);
+ }
+
+ const SCEV *S = IU.getExpr(U);
+ PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
+
+ // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
+ // (N - i == 0), and this allows (N - i) to be the expression that we work
+ // with rather than just N or i, so we can consider the register
+ // requirements for both N and i at the same time. Limiting this code to
+ // equality icmps is not a problem because all interesting loops use
+ // equality icmps, thanks to IndVarSimplify.
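+ // For example, the exit test "i != n" becomes an ICmpZero use of the
+ // expression (n - i), so the solver can weigh the registers needed for both
+ // n and i at once.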
+ if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst))
+ if (CI->isEquality()) {
+ // Swap the operands if needed to put the OperandValToReplace on the
+ // left, for consistency.
+ Value *NV = CI->getOperand(1);
+ if (NV == U.getOperandValToReplace()) {
+ CI->setOperand(1, CI->getOperand(0));
+ CI->setOperand(0, NV);
+ NV = CI->getOperand(1);
+ Changed = true;
+ }
+
+ // x == y --> x - y == 0
+ const SCEV *N = SE.getSCEV(NV);
+ if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
+ // S is normalized, so normalize N before folding it into S
+ // to keep the result normalized.
+ N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
+ Kind = LSRUse::ICmpZero;
+ S = SE.getMinusSCEV(N, S);
+ }
+
+ // -1 and the negations of all interesting strides (except the negation
+ // of -1) are now also interesting.
+ for (size_t i = 0, e = Factors.size(); i != e; ++i)
+ if (Factors[i] != -1)
+ Factors.insert(-(uint64_t)Factors[i]);
+ Factors.insert(-1);
+ }
+
+ // Get or create an LSRUse.
+ std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
+ size_t LUIdx = P.first;
+ int64_t Offset = P.second;
+ LSRUse &LU = Uses[LUIdx];
+
+ // Record the fixup.
+ LSRFixup &LF = LU.getNewFixup();
+ LF.UserInst = UserInst;
+ LF.OperandValToReplace = U.getOperandValToReplace();
+ LF.PostIncLoops = TmpPostIncLoops;
+ LF.Offset = Offset;
+ LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+
+ if (!LU.WidestFixupType ||
+ SE.getTypeSizeInBits(LU.WidestFixupType) <
+ SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+ LU.WidestFixupType = LF.OperandValToReplace->getType();
+
+ // If this is the first use of this LSRUse, give it a formula.
+ if (LU.Formulae.empty()) {
+ InsertInitialFormula(S, LU, LUIdx);
+ CountRegisters(LU.Formulae.back(), LUIdx);
+ }
+ }
+
+ DEBUG(print_fixups(dbgs()));
+}
+
+/// Insert a formula for the given expression into the given use, separating out
+/// loop-variant portions from loop-invariant and loop-computable portions.
+void
+LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
+ // Mark uses whose expressions cannot be expanded.
+ if (!isSafeToExpand(S, SE))
+ LU.RigidFormula = true;
+
+ Formula F;
+ F.initialMatch(S, L, SE);
+ bool Inserted = InsertFormula(LU, LUIdx, F);
+ assert(Inserted && "Initial formula already exists!"); (void)Inserted;
+}
+
+/// Insert a simple single-register formula for the given expression into the
+/// given use.
+void
+LSRInstance::InsertSupplementalFormula(const SCEV *S,
+ LSRUse &LU, size_t LUIdx) {
+ Formula F;
+ F.BaseRegs.push_back(S);
+ F.HasBaseReg = true;
+ bool Inserted = InsertFormula(LU, LUIdx, F);
+ assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
+}
+
+/// Note which registers are used by the given formula, updating RegUses.
+void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
+ if (F.ScaledReg)
+ RegUses.countRegister(F.ScaledReg, LUIdx);
+ for (const SCEV *BaseReg : F.BaseRegs)
+ RegUses.countRegister(BaseReg, LUIdx);
+}
+
+/// If the given formula has not yet been inserted, add it to the list, and
+/// return true. Return false otherwise.
+bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
+ // Do not insert formulae that we will not be able to expand.
+ assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
+ "Formula is illegal");
+
+ if (!LU.InsertFormula(F, *L))
+ return false;
+
+ CountRegisters(F, LUIdx);
+ return true;
+}
+
+/// Check for other uses of loop-invariant values which we're tracking. These
+/// other uses will pin these values in registers, making them less profitable
+/// for elimination.
+/// TODO: This currently misses non-constant addrec step registers.
+/// TODO: Should this give more weight to users inside the loop?
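+///
+/// For instance, a loop-invariant pointer that is also used after the loop
+/// must stay live in a register anyway, so formulae that would fold it away
+/// deserve less credit.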
+void
+LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
+ SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
+ SmallPtrSet<const SCEV *, 32> Visited;
+
+ while (!Worklist.empty()) {
+ const SCEV *S = Worklist.pop_back_val();
+
+ // Don't process the same SCEV twice.
+ if (!Visited.insert(S).second)
+ continue;
+
+ if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
+ Worklist.append(N->op_begin(), N->op_end());
+ else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
+ Worklist.push_back(C->getOperand());
+ else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
+ Worklist.push_back(D->getLHS());
+ Worklist.push_back(D->getRHS());
+ } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
+ const Value *V = US->getValue();
+ if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
+ // Look for instructions defined outside the loop.
+ if (L->contains(Inst)) continue;
+ } else if (isa<UndefValue>(V))
+ // Undef doesn't have a live range, so it doesn't matter.
+ continue;
+ for (const Use &U : V->uses()) {
+ const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
+ // Ignore non-instructions.
+ if (!UserInst)
+ continue;
+ // Ignore instructions in other functions (as can happen with
+ // Constants).
+ if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
+ continue;
+ // Ignore instructions not dominated by the loop.
+ const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
+ UserInst->getParent() :
+ cast<PHINode>(UserInst)->getIncomingBlock(
+ PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
+ if (!DT.dominates(L->getHeader(), UseBB))
+ continue;
+ // Don't bother if the instruction is in a BB which ends in an EHPad.
+ if (UseBB->getTerminator()->isEHPad())
+ continue;
+ // Don't bother rewriting PHIs in catchswitch blocks.
+ if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
+ continue;
+ // Ignore uses which are part of other SCEV expressions, to avoid
+ // analyzing them multiple times.
+ if (SE.isSCEVable(UserInst->getType())) {
+ const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
+ // If the user is a no-op, look through to its uses.
+ if (!isa<SCEVUnknown>(UserS))
+ continue;
+ if (UserS == US) {
+ Worklist.push_back(
+ SE.getUnknown(const_cast<Instruction *>(UserInst)));
+ continue;
+ }
+ }
+ // Ignore icmp instructions which are already being analyzed.
+ if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
+ unsigned OtherIdx = !U.getOperandNo();
+ Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
+ if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
+ continue;
+ }
+
+ std::pair<size_t, int64_t> P = getUse(
+ S, LSRUse::Basic, MemAccessTy());
+ size_t LUIdx = P.first;
+ int64_t Offset = P.second;
+ LSRUse &LU = Uses[LUIdx];
+ LSRFixup &LF = LU.getNewFixup();
+ LF.UserInst = const_cast<Instruction *>(UserInst);
+ LF.OperandValToReplace = U;
+ LF.Offset = Offset;
+ LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+ if (!LU.WidestFixupType ||
+ SE.getTypeSizeInBits(LU.WidestFixupType) <
+ SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+ LU.WidestFixupType = LF.OperandValToReplace->getType();
+ InsertSupplementalFormula(US, LU, LUIdx);
+ CountRegisters(LU.Formulae.back(), Uses.size() - 1);
+ break;
+ }
+ }
+ }
+}
+
+/// Split S into subexpressions which can be pulled out into separate
+/// registers. If C is non-null, multiply each subexpression by C.
+///
+/// Return the remainder expression after factoring out the subexpressions
+/// captured by Ops. If Ops is complete, return NULL.
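+///
+/// For example, a base register {(%a + 4),+,8}<%L> is split into the
+/// subexpressions 4, %a, and the stripped recurrence {0,+,8}<%L>.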
+static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
+ SmallVectorImpl<const SCEV *> &Ops,
+ const Loop *L,
+ ScalarEvolution &SE,
+ unsigned Depth = 0) {
+ // Arbitrarily cap recursion to protect compile time.
+ if (Depth >= 3)
+ return S;
+
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ // Break out add operands.
+ for (const SCEV *S : Add->operands()) {
+ const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
+ if (Remainder)
+ Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
+ }
+ return nullptr;
+ } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ // Split a non-zero base out of an addrec.
+ if (AR->getStart()->isZero() || !AR->isAffine())
+ return S;
+
+ const SCEV *Remainder = CollectSubexprs(AR->getStart(),
+ C, Ops, L, SE, Depth+1);
+ // Split the non-zero AddRec unless it is part of a nested recurrence that
+ // does not pertain to this loop.
+ if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
+ Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
+ Remainder = nullptr;
+ }
+ if (Remainder != AR->getStart()) {
+ if (!Remainder)
+ Remainder = SE.getConstant(AR->getType(), 0);
+ return SE.getAddRecExpr(Remainder,
+ AR->getStepRecurrence(SE),
+ AR->getLoop(),
+ //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ SCEV::FlagAnyWrap);
+ }
+ } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+ // Break (C * (a + b + c)) into C*a + C*b + C*c.
+ if (Mul->getNumOperands() != 2)
+ return S;
+ if (const SCEVConstant *Op0 =
+ dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
+ C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
+ const SCEV *Remainder =
+ CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
+ if (Remainder)
+ Ops.push_back(SE.getMulExpr(C, Remainder));
+ return nullptr;
+ }
+ }
+ return S;
+}
+
+/// \brief Helper function for LSRInstance::GenerateReassociations.
+void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base,
+ unsigned Depth, size_t Idx,
+ bool IsScaledReg) {
+ const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+ SmallVector<const SCEV *, 8> AddOps;
+ const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
+ if (Remainder)
+ AddOps.push_back(Remainder);
+
+ if (AddOps.size() == 1)
+ return;
+
+ for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
+ JE = AddOps.end();
+ J != JE; ++J) {
+ // Loop-variant "unknown" values are uninteresting; we won't be able to
+ // do anything meaningful with them.
+ if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
+ continue;
+
+ // Don't pull a constant into a register if the constant could be folded
+ // into an immediate field.
+ if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, *J, Base.getNumRegs() > 1))
+ continue;
+
+ // Collect all operands except *J.
+ SmallVector<const SCEV *, 8> InnerAddOps(
+ ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
+ InnerAddOps.append(std::next(J),
+ ((const SmallVector<const SCEV *, 8> &)AddOps).end());
+
+ // Don't leave just a constant behind in a register if the constant could
+ // be folded into an immediate field.
+ if (InnerAddOps.size() == 1 &&
+ isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
+ continue;
+
+ const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
+ if (InnerSum->isZero())
+ continue;
+ Formula F = Base;
+
+ // Add the remaining pieces of the add back into the new formula.
+ const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
+ if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
+ TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ InnerSumSC->getValue()->getZExtValue())) {
+ F.UnfoldedOffset =
+ (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
+ if (IsScaledReg)
+ F.ScaledReg = nullptr;
+ else
+ F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
+ } else if (IsScaledReg)
+ F.ScaledReg = InnerSum;
+ else
+ F.BaseRegs[Idx] = InnerSum;
+
+ // Add J as its own register, or an unfolded immediate.
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
+ if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
+ TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ SC->getValue()->getZExtValue()))
+ F.UnfoldedOffset =
+ (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
+ else
+ F.BaseRegs.push_back(*J);
+ // We may have changed the number of registers in the base regs, so adjust
+ // the formula accordingly.
+ F.canonicalize(*L);
+
+ if (InsertFormula(LU, LUIdx, F))
+ // If that formula hadn't been seen before, recurse to find more like
+ // it.
+ GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1);
+ }
+}
+
+/// Split out subexpressions from adds and the bases of addrecs.
+void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
+ Formula Base, unsigned Depth) {
+ assert(Base.isCanonical(*L) && "Input must be in the canonical form");
+ // Arbitrarily cap recursion to protect compile time.
+ if (Depth >= 3)
+ return;
+
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+ GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
+
+ if (Base.Scale == 1)
+ GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
+ /* Idx */ -1, /* IsScaledReg */ true);
+}
+
+/// Generate a formula consisting of all of the loop-dominating registers added
+/// into a single register.
+void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ // This method is only interesting when more than one register is involved.
+ if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1)
+ return;
+
+ // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
+ // processing the formula.
+ Base.unscale();
+ Formula F = Base;
+ F.BaseRegs.clear();
+ SmallVector<const SCEV *, 4> Ops;
+ for (const SCEV *BaseReg : Base.BaseRegs) {
+ if (SE.properlyDominates(BaseReg, L->getHeader()) &&
+ !SE.hasComputableLoopEvolution(BaseReg, L))
+ Ops.push_back(BaseReg);
+ else
+ F.BaseRegs.push_back(BaseReg);
+ }
+ if (Ops.size() > 1) {
+ const SCEV *Sum = SE.getAddExpr(Ops);
+ // TODO: If Sum is zero, it probably means ScalarEvolution missed an
+ // opportunity to fold something. For now, just ignore such cases
+ // rather than proceed with zero in a register.
+ if (!Sum->isZero()) {
+ F.BaseRegs.push_back(Sum);
+ F.canonicalize(*L);
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ }
+}
+
+/// \brief Helper function for LSRInstance::GenerateSymbolicOffsets.
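+///
+/// For example, a base register of (@tab + %i) can be rewritten so that @tab
+/// becomes the formula's BaseGV while %i stays behind in the register, when
+/// that is still legal for the use.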
+void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base, size_t Idx,
+ bool IsScaledReg) {
+ const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+ GlobalValue *GV = ExtractSymbol(G, SE);
+ if (G->isZero() || !GV)
+ return;
+ Formula F = Base;
+ F.BaseGV = GV;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
+ return;
+ if (IsScaledReg)
+ F.ScaledReg = G;
+ else
+ F.BaseRegs[Idx] = G;
+ (void)InsertFormula(LU, LUIdx, F);
+}
+
+/// Generate reuse formulae using symbolic offsets.
+void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ // We can't add a symbolic offset if the address already contains one.
+ if (Base.BaseGV) return;
+
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+ GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
+ if (Base.Scale == 1)
+ GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
+ /* IsScaledReg */ true);
+}
+
+/// \brief Helper function for LSRInstance::GenerateConstantOffsets.
+void LSRInstance::GenerateConstantOffsetsImpl(
+ LSRUse &LU, unsigned LUIdx, const Formula &Base,
+ const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
+ const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+ for (int64_t Offset : Worklist) {
+ Formula F = Base;
+ F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
+ if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
+ LU.AccessTy, F)) {
+ // Add the offset to the base register.
+ const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
+ // If it cancelled out, drop the base register; otherwise, update it.
+ if (NewG->isZero()) {
+ if (IsScaledReg) {
+ F.Scale = 0;
+ F.ScaledReg = nullptr;
+ } else
+ F.deleteBaseReg(F.BaseRegs[Idx]);
+ F.canonicalize(*L);
+ } else if (IsScaledReg)
+ F.ScaledReg = NewG;
+ else
+ F.BaseRegs[Idx] = NewG;
+
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ }
+
+ int64_t Imm = ExtractImmediate(G, SE);
+ if (G->isZero() || Imm == 0)
+ return;
+ Formula F = Base;
+ F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
+ return;
+ if (IsScaledReg)
+ F.ScaledReg = G;
+ else
+ F.BaseRegs[Idx] = G;
+ (void)InsertFormula(LU, LUIdx, F);
+}
+
+/// Generate reuse formulae using constant offsets.
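+///
+/// For example, if a use's fixups span offsets [0, 64], a base register
+/// {%p,+,4} can be re-expressed as {(%p + 64),+,4} with the formula's
+/// immediate reduced by 64, when that is still legal for the use.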
+void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ // TODO: For now, just add the min and max offset, because it usually isn't
+ // worthwhile looking at everything in between.
+ SmallVector<int64_t, 2> Worklist;
+ Worklist.push_back(LU.MinOffset);
+ if (LU.MaxOffset != LU.MinOffset)
+ Worklist.push_back(LU.MaxOffset);
+
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+ GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
+ if (Base.Scale == 1)
+ GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
+ /* IsScaledReg */ true);
+}
+
+/// For ICmpZero, check to see if we can scale up the comparison. For example, x
+/// == y -> x*c == y*c.
+void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ if (LU.Kind != LSRUse::ICmpZero) return;
+
+ // Determine the integer type for the base formula.
+ Type *IntTy = Base.getType();
+ if (!IntTy) return;
+ if (SE.getTypeSizeInBits(IntTy) > 64) return;
+
+ // Don't do this if there is more than one offset.
+ if (LU.MinOffset != LU.MaxOffset) return;
+
+ // Check that the transformation is valid. It is illegal to multiply a pointer.
+ if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
+ return;
+ for (const SCEV *BaseReg : Base.BaseRegs)
+ if (BaseReg->getType()->isPointerTy())
+ return;
+ assert(!Base.BaseGV && "ICmpZero use is not legal!");
+
+ // Check each interesting stride.
+ for (int64_t Factor : Factors) {
+ // Check that the multiplication doesn't overflow.
+ if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1)
+ continue;
+ int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
+ if (NewBaseOffset / Factor != Base.BaseOffset)
+ continue;
+ // If the offset will be truncated at this use, check that it is in bounds.
+ if (!IntTy->isPointerTy() &&
+ !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
+ continue;
+
+ // Check that multiplying with the use offset doesn't overflow.
+ int64_t Offset = LU.MinOffset;
+ if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
+ continue;
+ Offset = (uint64_t)Offset * Factor;
+ if (Offset / Factor != LU.MinOffset)
+ continue;
+ // If the offset will be truncated at this use, check that it is in bounds.
+ if (!IntTy->isPointerTy() &&
+ !ConstantInt::isValueValidForType(IntTy, Offset))
+ continue;
+
+ Formula F = Base;
+ F.BaseOffset = NewBaseOffset;
+
+ // Check that this scale is legal.
+ if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
+ continue;
+
+ // Compensate for the use having MinOffset built into it.
+ F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
+
+ const SCEV *FactorS = SE.getConstant(IntTy, Factor);
+
+ // Check that multiplying with each base register doesn't overflow.
+ for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
+ F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
+ if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
+ goto next;
+ }
+
+ // Check that multiplying with the scaled register doesn't overflow.
+ if (F.ScaledReg) {
+ F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
+ if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
+ continue;
+ }
+
+ // Check that multiplying with the unfolded offset doesn't overflow.
+ if (F.UnfoldedOffset != 0) {
+ if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() &&
+ Factor == -1)
+ continue;
+ F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
+ if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
+ continue;
+ // If the offset will be truncated, check that it is in bounds.
+ if (!IntTy->isPointerTy() &&
+ !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
+ continue;
+ }
+
+ // If we make it here and it's legal, add it.
+ (void)InsertFormula(LU, LUIdx, F);
+ next:;
+ }
+}
+
+/// Generate stride factor reuse formulae by making use of scaled-offset address
+/// modes, for example.
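+/// With an interesting factor of 4, for instance, a base register {0,+,4} can
+/// become a scaled register {0,+,1} with Scale = 4, which an addressing mode
+/// such as [base + index*4] may then absorb for free.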
+void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
+ // Determine the integer type for the base formula.
+ Type *IntTy = Base.getType();
+ if (!IntTy) return;
+
+ // If this Formula already has a scaled register, we can't add another one.
+ // Try to unscale the formula to generate a better scale.
+ if (Base.Scale != 0 && !Base.unscale())
+ return;
+
+ assert(Base.Scale == 0 && "unscale did not do its job!");
+
+ // Check each interesting stride.
+ for (int64_t Factor : Factors) {
+ Base.Scale = Factor;
+ Base.HasBaseReg = Base.BaseRegs.size() > 1;
+ // Check whether this scale is going to be legal.
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ Base)) {
+ // As a special case, convert out-of-loop Basic uses to Special uses.
+ // TODO: Reconsider this special case.
+ if (LU.Kind == LSRUse::Basic &&
+ isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
+ LU.AccessTy, Base) &&
+ LU.AllFixupsOutsideLoop)
+ LU.Kind = LSRUse::Special;
+ else
+ continue;
+ }
+ // For an ICmpZero, negating a solitary base register won't lead to
+ // new solutions.
+ if (LU.Kind == LSRUse::ICmpZero &&
+ !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
+ continue;
+ // For each addrec base reg, if its loop is the current loop, apply the scale.
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
+ if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
+ const SCEV *FactorS = SE.getConstant(IntTy, Factor);
+ if (FactorS->isZero())
+ continue;
+ // Divide out the factor, ignoring high bits, since we'll be
+ // scaling the value back up in the end.
+ if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
+ // TODO: This could be optimized to avoid all the copying.
+ Formula F = Base;
+ F.ScaledReg = Quotient;
+ F.deleteBaseReg(F.BaseRegs[i]);
+ // The canonical representation of 1*reg is reg, which is already in
+ // Base. In that case, do not try to insert the formula; it will be
+ // rejected anyway.
+ if (F.Scale == 1 && (F.BaseRegs.empty() ||
+ (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
+ continue;
+ // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate a
+ // non-canonical Formula whose ScaledReg's loop is not L.
+ if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
+ F.canonicalize(*L);
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ }
+ }
+ }
+}
+
+/// Generate reuse formulae from different IV types.
+void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
+ // Don't bother truncating symbolic values.
+ if (Base.BaseGV) return;
+
+ // Determine the integer type for the base formula.
+ Type *DstTy = Base.getType();
+ if (!DstTy) return;
+ DstTy = SE.getEffectiveSCEVType(DstTy);
+
+ for (Type *SrcTy : Types) {
+ if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
+ Formula F = Base;
+
+ if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
+ for (const SCEV *&BaseReg : F.BaseRegs)
+ BaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
+
+ // TODO: This assumes we've done basic processing on all uses and
+ // have an idea what the register usage is.
+ if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
+ continue;
+
+ F.canonicalize(*L);
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ }
+}
+
+namespace {
+
+/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
+/// modifications so that the search phase doesn't have to worry about the data
+/// structures moving underneath it.
+struct WorkItem {
+ size_t LUIdx;
+ int64_t Imm;
+ const SCEV *OrigReg;
+
+ WorkItem(size_t LI, int64_t I, const SCEV *R)
+ : LUIdx(LI), Imm(I), OrigReg(R) {}
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void WorkItem::print(raw_ostream &OS) const {
+ OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
+ << " , add offset " << Imm;
+}
+
+LLVM_DUMP_METHOD void WorkItem::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+/// Look for registers which are a constant distance apart and try to form reuse
+/// opportunities between them.
+void LSRInstance::GenerateCrossUseConstantOffsets() {
+ // Group the registers by their value without any added constant offset.
+ using ImmMapTy = std::map<int64_t, const SCEV *>;
+
+ DenseMap<const SCEV *, ImmMapTy> Map;
+ DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
+ SmallVector<const SCEV *, 8> Sequence;
+ for (const SCEV *Use : RegUses) {
+ const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
+ int64_t Imm = ExtractImmediate(Reg, SE);
+ auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
+ if (Pair.second)
+ Sequence.push_back(Reg);
+ Pair.first->second.insert(std::make_pair(Imm, Use));
+ UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
+ }
+
+ // Now examine each set of registers with the same base value. Build up
+ // a list of work to do and do the work in a separate step so that we're
+ // not adding formulae and register counts while we're searching.
+ SmallVector<WorkItem, 32> WorkItems;
+ SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
+ for (const SCEV *Reg : Sequence) {
+ const ImmMapTy &Imms = Map.find(Reg)->second;
+
+ // It's not worthwhile looking for reuse if there's only one offset.
+ if (Imms.size() == 1)
+ continue;
+
+ DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
+ for (const auto &Entry : Imms)
+ dbgs() << ' ' << Entry.first;
+ dbgs() << '\n');
+
+ // Examine each offset.
+ for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
+ J != JE; ++J) {
+ const SCEV *OrigReg = J->second;
+
+ int64_t JImm = J->first;
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
+
+ if (!isa<SCEVConstant>(OrigReg) &&
+ UsedByIndicesMap[Reg].count() == 1) {
+ DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
+ continue;
+ }
+
+      // Conservatively examine offsets between this orig reg and a few
+      // selected other orig regs.
+ ImmMapTy::const_iterator OtherImms[] = {
+ Imms.begin(), std::prev(Imms.end()),
+ Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) /
+ 2)
+ };
+ for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
+ ImmMapTy::const_iterator M = OtherImms[i];
+ if (M == J || M == JE) continue;
+
+ // Compute the difference between the two.
+ int64_t Imm = (uint64_t)JImm - M->first;
+ for (unsigned LUIdx : UsedByIndices.set_bits())
+ // Make a memo of this use, offset, and register tuple.
+ if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
+ WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
+ }
+ }
+ }
+
+ Map.clear();
+ Sequence.clear();
+ UsedByIndicesMap.clear();
+ UniqueItems.clear();
+
+ // Now iterate through the worklist and add new formulae.
+ for (const WorkItem &WI : WorkItems) {
+ size_t LUIdx = WI.LUIdx;
+ LSRUse &LU = Uses[LUIdx];
+ int64_t Imm = WI.Imm;
+ const SCEV *OrigReg = WI.OrigReg;
+
+ Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
+ const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
+ unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
+
+ // TODO: Use a more targeted data structure.
+ for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
+ Formula F = LU.Formulae[L];
+ // FIXME: The code for the scaled and unscaled registers looks
+ // very similar but slightly different. Investigate if they
+ // could be merged. That way, we would not have to unscale the
+ // Formula.
+ F.unscale();
+ // Use the immediate in the scaled register.
+ if (F.ScaledReg == OrigReg) {
+ int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
+ // Don't create 50 + reg(-50).
+ if (F.referencesReg(SE.getSCEV(
+ ConstantInt::get(IntTy, -(uint64_t)Offset))))
+ continue;
+ Formula NewF = F;
+ NewF.BaseOffset = Offset;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ NewF))
+ continue;
+ NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
+
+ // If the new scale is a constant in a register, and adding the constant
+ // value to the immediate would produce a value closer to zero than the
+ // immediate itself, then the formula isn't worthwhile.
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
+ if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
+ (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
+ .ule(std::abs(NewF.BaseOffset)))
+ continue;
+
+ // OK, looks good.
+ NewF.canonicalize(*this->L);
+ (void)InsertFormula(LU, LUIdx, NewF);
+ } else {
+ // Use the immediate in a base register.
+ for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
+ const SCEV *BaseReg = F.BaseRegs[N];
+ if (BaseReg != OrigReg)
+ continue;
+ Formula NewF = F;
+ NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
+ LU.Kind, LU.AccessTy, NewF)) {
+ if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
+ continue;
+ NewF = F;
+ NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
+ }
+ NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
+
+ // If the new formula has a constant in a register, and adding the
+ // constant value to the immediate would produce a value closer to
+ // zero than the immediate itself, then the formula isn't worthwhile.
+ for (const SCEV *NewReg : NewF.BaseRegs)
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
+ if ((C->getAPInt() + NewF.BaseOffset)
+ .abs()
+ .slt(std::abs(NewF.BaseOffset)) &&
+ (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >=
+ countTrailingZeros<uint64_t>(NewF.BaseOffset))
+ goto skip_formula;
+
+ // Ok, looks good.
+ NewF.canonicalize(*this->L);
+ (void)InsertFormula(LU, LUIdx, NewF);
+ break;
+ skip_formula:;
+ }
+ }
+ }
+ }
+}
+
+/// Generate formulae for each use.
+void
+LSRInstance::GenerateAllReuseFormulae() {
+ // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
+ // queries are more precise.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
+ }
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateScales(LU, LUIdx, LU.Formulae[i]);
+ }
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
+ }
+
+ GenerateCrossUseConstantOffsets();
+
+ DEBUG(dbgs() << "\n"
+ "After generating reuse formulae:\n";
+ print_uses(dbgs()));
+}
+
+/// If there are multiple formulae with the same set of registers used
+/// by other uses, pick the best one and delete the others.
+void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
+ DenseSet<const SCEV *> VisitedRegs;
+ SmallPtrSet<const SCEV *, 16> Regs;
+ SmallPtrSet<const SCEV *, 16> LoserRegs;
+#ifndef NDEBUG
+ bool ChangedFormulae = false;
+#endif
+
+ // Collect the best formula for each unique set of shared registers. This
+ // is reset for each use.
+ using BestFormulaeTy =
+ DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
+
+ BestFormulaeTy BestFormulae;
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+
+ bool Any = false;
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size();
+ FIdx != NumForms; ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+
+ // Some formulas are instant losers. For example, they may depend on
+ // nonexistent AddRecs from other loops. These need to be filtered
+ // immediately, otherwise heuristics could choose them over others leading
+ // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
+ // avoids the need to recompute this information across formulae using the
+ // same bad AddRec. Passing LoserRegs is also essential unless we remove
+ // the corresponding bad register from the Regs set.
+ Cost CostF;
+ Regs.clear();
+ CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, SE, DT, LU, &LoserRegs);
+ if (CostF.isLoser()) {
+ // During initial formula generation, undesirable formulae are generated
+ // by uses within other loops that have some non-trivial address mode or
+ // use the postinc form of the IV. LSR needs to provide these formulae
+ // as the basis of rediscovering the desired formula that uses an AddRec
+ // corresponding to the existing phi. Once all formulae have been
+ // generated, these initial losers may be pruned.
+ DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
+ dbgs() << "\n");
+ }
+ else {
+ SmallVector<const SCEV *, 4> Key;
+ for (const SCEV *Reg : F.BaseRegs) {
+ if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
+ Key.push_back(Reg);
+ }
+ if (F.ScaledReg &&
+ RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
+ Key.push_back(F.ScaledReg);
+        // An unstable sort by host pointer order is OK, because this is only
+        // used for uniquifying.
+ std::sort(Key.begin(), Key.end());
+
+ std::pair<BestFormulaeTy::const_iterator, bool> P =
+ BestFormulae.insert(std::make_pair(Key, FIdx));
+ if (P.second)
+ continue;
+
+ Formula &Best = LU.Formulae[P.first->second];
+
+ Cost CostBest;
+ Regs.clear();
+ CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
+ if (CostF.isLess(CostBest, TTI))
+ std::swap(F, Best);
+ DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula "; Best.print(dbgs());
+ dbgs() << '\n');
+ }
+#ifndef NDEBUG
+ ChangedFormulae = true;
+#endif
+ LU.DeleteFormula(F);
+ --FIdx;
+ --NumForms;
+ Any = true;
+ }
+
+ // Now that we've filtered out some formulae, recompute the Regs set.
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ // Reset this to prepare for the next use.
+ BestFormulae.clear();
+ }
+
+ DEBUG(if (ChangedFormulae) {
+ dbgs() << "\n"
+ "After filtering out undesirable candidates:\n";
+ print_uses(dbgs());
+ });
+}
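At its core the filter above is a keep-the-best-per-key map: each formula is keyed by the sorted list of registers it shares with other uses, and only the cheapest formula per key survives. A self-contained sketch of that pattern, with stand-in types and an integer cost rather than the pass's Cost class:

#include <algorithm>
#include <cstddef>
#include <map>
#include <vector>

struct Candidate {
  std::vector<int> SharedRegs; // stand-in for registers shared with other uses
  int Cost;                    // stand-in for the rated cost of the formula
};

// Keep only the cheapest candidate for each distinct set of shared registers.
static std::vector<Candidate> keepBestPerKey(std::vector<Candidate> Candidates) {
  std::map<std::vector<int>, std::size_t> BestByKey; // key -> index into Kept
  std::vector<Candidate> Kept;
  for (Candidate &C : Candidates) {
    // The key only needs to be canonical, so any consistent order is fine.
    std::sort(C.SharedRegs.begin(), C.SharedRegs.end());
    auto It = BestByKey.find(C.SharedRegs);
    if (It == BestByKey.end()) {
      BestByKey.emplace(C.SharedRegs, Kept.size());
      Kept.push_back(C);
    } else if (C.Cost < Kept[It->second].Cost) {
      Kept[It->second] = C; // a cheaper formula with the same key replaces it
    }
  }
  return Kept;
}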
+
+// This is a rough guess that seems to work fairly well.
+static const size_t ComplexityLimit = std::numeric_limits<uint16_t>::max();
+
+/// Estimate the worst-case number of solutions the solver might have to
+/// consider. It almost never considers this many solutions because it prunes the
+/// search space, but the pruning isn't always sufficient.
+size_t LSRInstance::EstimateSearchSpaceComplexity() const {
+ size_t Power = 1;
+ for (const LSRUse &LU : Uses) {
+ size_t FSize = LU.Formulae.size();
+ if (FSize >= ComplexityLimit) {
+ Power = ComplexityLimit;
+ break;
+ }
+ Power *= FSize;
+ if (Power >= ComplexityLimit)
+ break;
+ }
+ return Power;
+}
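The estimate is simply a saturating product of the per-use formula counts: clamping at ComplexityLimit both signals "too complex" early and keeps the product from overflowing. An equivalent standalone version, assuming nothing beyond the standard library:

#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

static std::size_t estimateComplexity(const std::vector<std::size_t> &FormulaCounts) {
  const std::size_t Limit = std::numeric_limits<std::uint16_t>::max(); // same rough guess
  std::size_t Power = 1;
  for (std::size_t Count : FormulaCounts) {
    if (Count >= Limit)
      return Limit; // a single huge use already saturates the estimate
    Power *= Count;
    if (Power >= Limit)
      return Limit; // saturate instead of overflowing
  }
  return Power;
}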
+
+/// When one formula uses a superset of the registers of another formula, it
+/// won't help reduce register pressure (though it may not necessarily hurt
+/// register pressure); remove it to simplify the system.
+void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
+ if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+ DEBUG(dbgs() << "The search space is too complex.\n");
+
+ DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
+ "which use a superset of registers used by other "
+ "formulae.\n");
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ bool Any = false;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ // Look for a formula with a constant or GV in a register. If the use
+ // also has a formula with that same value in an immediate field,
+ // delete the one that uses a register.
+ for (SmallVectorImpl<const SCEV *>::const_iterator
+ I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
+ Formula NewF = F;
+ NewF.BaseOffset += C->getValue()->getSExtValue();
+ NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+ (I - F.BaseRegs.begin()));
+ if (LU.HasFormulaWithSameRegs(NewF)) {
+ DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+ LU.DeleteFormula(F);
+ --i;
+ --e;
+ Any = true;
+ break;
+ }
+ } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
+ if (!F.BaseGV) {
+ Formula NewF = F;
+ NewF.BaseGV = GV;
+ NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+ (I - F.BaseRegs.begin()));
+ if (LU.HasFormulaWithSameRegs(NewF)) {
+ DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
+ LU.DeleteFormula(F);
+ --i;
+ --e;
+ Any = true;
+ break;
+ }
+ }
+ }
+ }
+ }
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+ }
+
+ DEBUG(dbgs() << "After pre-selection:\n";
+ print_uses(dbgs()));
+ }
+}
+
+/// When there are many registers for expressions like A, A+1, A+2, etc.,
+/// allocate a single register for them.
+void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+
+ DEBUG(dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by assuming that uses separated "
+ "by a constant offset will use the same registers.\n");
+
+ // This is especially useful for unrolled loops.
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (const Formula &F : LU.Formulae) {
+ if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
+ continue;
+
+ LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
+ if (!LUThatHas)
+ continue;
+
+ if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
+ LU.Kind, LU.AccessTy))
+ continue;
+
+ DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
+
+ LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
+
+ // Transfer the fixups of LU to LUThatHas.
+ for (LSRFixup &Fixup : LU.Fixups) {
+ Fixup.Offset += F.BaseOffset;
+ LUThatHas->pushFixup(Fixup);
+ DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
+ }
+
+ // Delete formulae from the new use which are no longer legal.
+ bool Any = false;
+ for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
+ Formula &F = LUThatHas->Formulae[i];
+ if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
+ LUThatHas->Kind, LUThatHas->AccessTy, F)) {
+ DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
+ LUThatHas->DeleteFormula(F);
+ --i;
+ --e;
+ Any = true;
+ }
+ }
+
+ if (Any)
+ LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
+
+ // Delete the old use.
+ DeleteUse(LU, LUIdx);
+ --LUIdx;
+ --NumUses;
+ break;
+ }
+ }
+
+ DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+}
+
+/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
+/// we've done more filtering, as it may be able to find more formulae to
+/// eliminate.
+void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
+ if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+ DEBUG(dbgs() << "The search space is too complex.\n");
+
+ DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
+ "undesirable dedicated registers.\n");
+
+ FilterOutUndesirableDedicatedRegisters();
+
+ DEBUG(dbgs() << "After pre-selection:\n";
+ print_uses(dbgs()));
+ }
+}
+
+/// If an LSRUse has multiple formulae with the same ScaledReg and Scale, pick
+/// the best one and delete the others.
+/// This narrowing heuristic keeps as many formulae with different
+/// Scale/ScaledReg pairs as possible while narrowing the search space.
+/// The benefit is that a formulae set with more Scale and ScaledReg variations
+/// is more likely to yield a better solution than a set sharing the same Scale
+/// and ScaledReg. The winner-reg-picking heuristic often keeps the formulae
+/// with the same Scale and ScaledReg and filters out the others, and we want
+/// to avoid that if possible.
+void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+
+ DEBUG(dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by choosing the best Formula "
+ "from the Formulae with the same Scale and ScaledReg.\n");
+
+ // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
+ using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
+
+ BestFormulaeTy BestFormulae;
+#ifndef NDEBUG
+ bool ChangedFormulae = false;
+#endif
+ DenseSet<const SCEV *> VisitedRegs;
+ SmallPtrSet<const SCEV *, 16> Regs;
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+
+ // Return true if Formula FA is better than Formula FB.
+ auto IsBetterThan = [&](Formula &FA, Formula &FB) {
+      // First we will try to choose the Formula with fewer new registers.
+      // For a register used by the current Formula, the more that register is
+      // shared among LSRUses, the less it increases the formula's register
+      // count.
+ size_t FARegNum = 0;
+ for (const SCEV *Reg : FA.BaseRegs) {
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+ FARegNum += (NumUses - UsedByIndices.count() + 1);
+ }
+ size_t FBRegNum = 0;
+ for (const SCEV *Reg : FB.BaseRegs) {
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+ FBRegNum += (NumUses - UsedByIndices.count() + 1);
+ }
+ if (FARegNum != FBRegNum)
+ return FARegNum < FBRegNum;
+
+ // If the new register numbers are the same, choose the Formula with
+ // less Cost.
+ Cost CostFA, CostFB;
+ Regs.clear();
+ CostFA.RateFormula(TTI, FA, Regs, VisitedRegs, L, SE, DT, LU);
+ Regs.clear();
+ CostFB.RateFormula(TTI, FB, Regs, VisitedRegs, L, SE, DT, LU);
+ return CostFA.isLess(CostFB, TTI);
+ };
+
+ bool Any = false;
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
+ ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+ if (!F.ScaledReg)
+ continue;
+ auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
+ if (P.second)
+ continue;
+
+ Formula &Best = LU.Formulae[P.first->second];
+ if (IsBetterThan(F, Best))
+ std::swap(F, Best);
+ DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula ";
+ Best.print(dbgs()); dbgs() << '\n');
+#ifndef NDEBUG
+ ChangedFormulae = true;
+#endif
+ LU.DeleteFormula(F);
+ --FIdx;
+ --NumForms;
+ Any = true;
+ }
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ // Reset this to prepare for the next use.
+ BestFormulae.clear();
+ }
+
+ DEBUG(if (ChangedFormulae) {
+ dbgs() << "\n"
+ "After filtering out undesirable candidates:\n";
+ print_uses(dbgs());
+ });
+}
+
+/// This function deletes formulas with a high expected register count.
+/// Assuming we don't know the value of each formula (all inefficient ones
+/// have already been deleted), compute the probability of not being selected
+/// for each register.
+/// For example,
+/// Use1:
+/// reg(a) + reg({0,+,1})
+/// reg(a) + reg({-1,+,1}) + 1
+/// reg({a,+,1})
+/// Use2:
+/// reg(b) + reg({0,+,1})
+/// reg(b) + reg({-1,+,1}) + 1
+/// reg({b,+,1})
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1})
+/// reg(c) + reg({b,+,1})
+///
+/// Probability of not selecting
+/// Use1 Use2 Use3
+/// reg(a) (1/3) * 1 * 1
+/// reg(b) 1 * (1/3) * (1/2)
+/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
+/// reg({-1,+,1}) (2/3) * (2/3) * 1
+/// reg({a,+,1}) (2/3) * 1 * 1
+/// reg({b,+,1}) 1 * (2/3) * (2/3)
+/// reg(c) 1 * 1 * 0
+///
+/// Now compute the expected register count for each formula.
+/// Note that for each use we exclude the probability of not being selected for
+/// that use itself. For example, for Use1 the contribution of reg(a) would be
+/// just 1 * 1 (excluding the 1/3 probability of not being selected for Use1).
+/// Use1:
+/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
+/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
+/// reg({a,+,1}) 1
+/// Use2:
+/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
+/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
+/// reg({b,+,1}) 2/3
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
+/// reg(c) + reg({b,+,1}) 1 + 2/3
+void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+  // Ok, we have too many formulae on our hands to handle conveniently.
+ // Use a rough heuristic to thin out the list.
+
+  // Set of Regs which are certain to be used in the final solution, i.e. which
+  // appear in every formula of a solution (in the example above this is
+  // reg(c)). We can skip them in the calculations.
+ SmallPtrSet<const SCEV *, 4> UniqRegs;
+ DEBUG(dbgs() << "The search space is too complex.\n");
+
+  // Map each register to the probability of it not being selected.
+ DenseMap <const SCEV *, float> RegNumMap;
+ for (const SCEV *Reg : RegUses) {
+ if (UniqRegs.count(Reg))
+ continue;
+ float PNotSel = 1;
+ for (const LSRUse &LU : Uses) {
+ if (!LU.Regs.count(Reg))
+ continue;
+ float P = LU.getNotSelectedProbability(Reg);
+ if (P != 0.0)
+ PNotSel *= P;
+ else
+ UniqRegs.insert(Reg);
+ }
+ RegNumMap.insert(std::make_pair(Reg, PNotSel));
+ }
+
+ DEBUG(dbgs() << "Narrowing the search space by deleting costly formulas\n");
+
+  // Delete formulas whose expected register count is high.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+    // If there is nothing to delete, continue.
+ if (LU.Formulae.size() < 2)
+ continue;
+    // This is a temporary solution to test performance. Float should be
+    // replaced with a rounding-independent type (based on integers) to avoid
+    // getting different results for different target builds.
+ float FMinRegNum = LU.Formulae[0].getNumRegs();
+ float FMinARegNum = LU.Formulae[0].getNumRegs();
+ size_t MinIdx = 0;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ float FRegNum = 0;
+ float FARegNum = 0;
+ for (const SCEV *BaseReg : F.BaseRegs) {
+ if (UniqRegs.count(BaseReg))
+ continue;
+ FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ if (isa<SCEVAddRecExpr>(BaseReg))
+ FARegNum +=
+ RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ }
+ if (const SCEV *ScaledReg = F.ScaledReg) {
+ if (!UniqRegs.count(ScaledReg)) {
+ FRegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ if (isa<SCEVAddRecExpr>(ScaledReg))
+ FARegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ }
+ }
+ if (FMinRegNum > FRegNum ||
+ (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
+ FMinRegNum = FRegNum;
+ FMinARegNum = FARegNum;
+ MinIdx = i;
+ }
+ }
+ DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
+ dbgs() << " with min reg num " << FMinRegNum << '\n');
+ if (MinIdx != 0)
+ std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
+ while (LU.Formulae.size() != 1) {
+ DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
+ dbgs() << '\n');
+ LU.Formulae.pop_back();
+ }
+ LU.RecomputeRegs(LUIdx, RegUses);
+ assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
+ Formula &F = LU.Formulae[0];
+ DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
+ // When we choose the formula, the regs become unique.
+ UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ if (F.ScaledReg)
+ UniqRegs.insert(F.ScaledReg);
+ }
+ DEBUG(dbgs() << "After pre-selection:\n";
+ print_uses(dbgs()));
+}
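To make the worked example in the comment above concrete, the following standalone program recomputes the "probability of not selecting" column for the registers in Use1-Use3. The Formula and Use types here are simplified stand-ins for illustration, not the pass's data structures:

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using Formula = std::set<std::string>; // the registers a formula references
using Use = std::vector<Formula>;      // the candidate formulas of one use

int main() {
  std::vector<Use> Uses = {
      /*Use1*/ {{"a", "{0,+,1}"}, {"a", "{-1,+,1}"}, {"{a,+,1}"}},
      /*Use2*/ {{"b", "{0,+,1}"}, {"b", "{-1,+,1}"}, {"{b,+,1}"}},
      /*Use3*/ {{"c", "b", "{0,+,1}"}, {"c", "{b,+,1}"}},
  };

  // P(reg not selected) = product over uses of
  //   (#formulas in the use not referencing reg) / (#formulas in the use).
  std::set<std::string> AllRegs;
  for (const Use &U : Uses)
    for (const Formula &F : U)
      AllRegs.insert(F.begin(), F.end());

  for (const std::string &Reg : AllRegs) {
    double PNotSel = 1.0;
    for (const Use &U : Uses) {
      unsigned Without = 0;
      for (const Formula &F : U)
        if (!F.count(Reg))
          ++Without;
      PNotSel *= double(Without) / double(U.size());
    }
    // reg(c) comes out as 0 because every formula of Use3 references it,
    // which is exactly why such registers can be treated as certainly used.
    std::cout << Reg << ": " << PNotSel << '\n';
  }
  return 0;
}

Running it reproduces the table above, for example roughly 0.333 for reg(a) and 0 for reg(c).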
+
+/// Pick a register which seems likely to be profitable, and then in any use
+/// which has any reference to that register, delete all formulae which do not
+/// reference that register.
+void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
+ // With all other options exhausted, loop until the system is simple
+ // enough to handle.
+ SmallPtrSet<const SCEV *, 4> Taken;
+ while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+    // Ok, we have too many formulae on our hands to handle conveniently.
+ // Use a rough heuristic to thin out the list.
+ DEBUG(dbgs() << "The search space is too complex.\n");
+
+ // Pick the register which is used by the most LSRUses, which is likely
+ // to be a good reuse register candidate.
+ const SCEV *Best = nullptr;
+ unsigned BestNum = 0;
+ for (const SCEV *Reg : RegUses) {
+ if (Taken.count(Reg))
+ continue;
+ if (!Best) {
+ Best = Reg;
+ BestNum = RegUses.getUsedByIndices(Reg).count();
+ } else {
+ unsigned Count = RegUses.getUsedByIndices(Reg).count();
+ if (Count > BestNum) {
+ Best = Reg;
+ BestNum = Count;
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
+ << " will yield profitable reuse.\n");
+ Taken.insert(Best);
+
+    // In any use with formulae which reference this register, delete the
+    // formulae which don't reference it.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ if (!LU.Regs.count(Best)) continue;
+
+ bool Any = false;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ if (!F.referencesReg(Best)) {
+ DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+ LU.DeleteFormula(F);
+ --e;
+ --i;
+ Any = true;
+ assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
+ continue;
+ }
+ }
+
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+ }
+
+ DEBUG(dbgs() << "After pre-selection:\n";
+ print_uses(dbgs()));
+ }
+}
+
+/// If there are an extraordinary number of formulae to choose from, use some
+/// rough heuristics to prune down the number of formulae. This keeps the main
+/// solver from taking an extraordinary amount of time in some worst-case
+/// scenarios.
+void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
+ NarrowSearchSpaceByDetectingSupersets();
+ NarrowSearchSpaceByCollapsingUnrolledCode();
+ NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+ if (FilterSameScaledReg)
+ NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ if (LSRExpNarrow)
+ NarrowSearchSpaceByDeletingCostlyFormulas();
+ else
+ NarrowSearchSpaceByPickingWinnerRegs();
+}
+
+/// This is the recursive solver.
+void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
+ Cost &SolutionCost,
+ SmallVectorImpl<const Formula *> &Workspace,
+ const Cost &CurCost,
+ const SmallPtrSet<const SCEV *, 16> &CurRegs,
+ DenseSet<const SCEV *> &VisitedRegs) const {
+ // Some ideas:
+ // - prune more:
+ // - use more aggressive filtering
+  //    - sort the formulae so that the most profitable solutions are found first
+ // - sort the uses too
+ // - search faster:
+  //    - don't compute a cost and then compare; compare while computing the
+  //      cost and bail early.
+ // - track register sets with SmallBitVector
+
+ const LSRUse &LU = Uses[Workspace.size()];
+
+ // If this use references any register that's already a part of the
+ // in-progress solution, consider it a requirement that a formula must
+ // reference that register in order to be considered. This prunes out
+ // unprofitable searching.
+ SmallSetVector<const SCEV *, 4> ReqRegs;
+ for (const SCEV *S : CurRegs)
+ if (LU.Regs.count(S))
+ ReqRegs.insert(S);
+
+ SmallPtrSet<const SCEV *, 16> NewRegs;
+ Cost NewCost;
+ for (const Formula &F : LU.Formulae) {
+ // Ignore formulae which may not be ideal in terms of register reuse of
+ // ReqRegs. The formula should use all required registers before
+ // introducing new ones.
+ int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
+ for (const SCEV *Reg : ReqRegs) {
+ if ((F.ScaledReg && F.ScaledReg == Reg) ||
+ is_contained(F.BaseRegs, Reg)) {
+ --NumReqRegsToFind;
+ if (NumReqRegsToFind == 0)
+ break;
+ }
+ }
+ if (NumReqRegsToFind != 0) {
+ // If none of the formulae satisfied the required registers, then we could
+ // clear ReqRegs and try again. Currently, we simply give up in this case.
+ continue;
+ }
+
+ // Evaluate the cost of the current formula. If it's already worse than
+ // the current best, prune the search at that point.
+ NewCost = CurCost;
+ NewRegs = CurRegs;
+ NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU);
+ if (NewCost.isLess(SolutionCost, TTI)) {
+ Workspace.push_back(&F);
+ if (Workspace.size() != Uses.size()) {
+ SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
+ NewRegs, VisitedRegs);
+ if (F.getNumRegs() == 1 && Workspace.size() == 1)
+ VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
+ } else {
+ DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
+ dbgs() << ".\n Regs:";
+ for (const SCEV *S : NewRegs)
+ dbgs() << ' ' << *S;
+ dbgs() << '\n');
+
+ SolutionCost = NewCost;
+ Solution = Workspace;
+ }
+ Workspace.pop_back();
+ }
+ }
+}
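Stripped of the register bookkeeping, the recursion above is a standard branch-and-bound search: choose one formula per use, carry the partial cost down, and abandon any branch that is already no cheaper than the best complete solution found so far. A minimal sketch with a purely additive integer cost (which the real Cost model is not); the caller seeds BestCost with a large "lose" sentinel, much as Solve() does with SolutionCost.Lose():

#include <cstddef>
#include <vector>

// CostPerUse[U][F] is the (made-up, purely additive) cost of picking formula F
// for use U. Workspace holds the in-progress choice; BestSolution/BestCost
// record the cheapest complete assignment seen so far.
static void solveRecurse(const std::vector<std::vector<int>> &CostPerUse,
                         std::size_t UseIdx, int CurCost,
                         std::vector<std::size_t> &Workspace, int &BestCost,
                         std::vector<std::size_t> &BestSolution) {
  if (UseIdx == CostPerUse.size()) {
    BestCost = CurCost; // record the now-cheapest complete solution
    BestSolution = Workspace;
    return;
  }
  for (std::size_t F = 0, E = CostPerUse[UseIdx].size(); F != E; ++F) {
    int NewCost = CurCost + CostPerUse[UseIdx][F];
    if (NewCost >= BestCost)
      continue; // prune: this branch cannot improve on the current best
    Workspace.push_back(F);
    solveRecurse(CostPerUse, UseIdx + 1, NewCost, Workspace, BestCost,
                 BestSolution);
    Workspace.pop_back();
  }
}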
+
+/// Choose one formula from each use. Return the results in the given Solution
+/// vector.
+void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
+ SmallVector<const Formula *, 8> Workspace;
+ Cost SolutionCost;
+ SolutionCost.Lose();
+ Cost CurCost;
+ SmallPtrSet<const SCEV *, 16> CurRegs;
+ DenseSet<const SCEV *> VisitedRegs;
+ Workspace.reserve(Uses.size());
+
+ // SolveRecurse does all the work.
+ SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
+ CurRegs, VisitedRegs);
+ if (Solution.empty()) {
+ DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
+ return;
+ }
+
+ // Ok, we've now made all our decisions.
+ DEBUG(dbgs() << "\n"
+ "The chosen solution requires "; SolutionCost.print(dbgs());
+ dbgs() << ":\n";
+ for (size_t i = 0, e = Uses.size(); i != e; ++i) {
+ dbgs() << " ";
+ Uses[i].print(dbgs());
+ dbgs() << "\n"
+ " ";
+ Solution[i]->print(dbgs());
+ dbgs() << '\n';
+ });
+
+ assert(Solution.size() == Uses.size() && "Malformed solution!");
+}
+
+/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as
+/// far as we can go while still being dominated by the input positions. This
+/// helps canonicalize the insert position, which encourages sharing.
+BasicBlock::iterator
+LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
+ const SmallVectorImpl<Instruction *> &Inputs)
+ const {
+ Instruction *Tentative = &*IP;
+ while (true) {
+ bool AllDominate = true;
+ Instruction *BetterPos = nullptr;
+    // Don't bother attempting to insert before a catchswitch; its basic block
+    // cannot contain other non-PHI instructions.
+ if (isa<CatchSwitchInst>(Tentative))
+ return IP;
+
+ for (Instruction *Inst : Inputs) {
+ if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
+ AllDominate = false;
+ break;
+ }
+ // Attempt to find an insert position in the middle of the block,
+ // instead of at the end, so that it can be used for other expansions.
+ if (Tentative->getParent() == Inst->getParent() &&
+ (!BetterPos || !DT.dominates(Inst, BetterPos)))
+ BetterPos = &*std::next(BasicBlock::iterator(Inst));
+ }
+ if (!AllDominate)
+ break;
+ if (BetterPos)
+ IP = BetterPos->getIterator();
+ else
+ IP = Tentative->getIterator();
+
+ const Loop *IPLoop = LI.getLoopFor(IP->getParent());
+ unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
+
+ BasicBlock *IDom;
+ for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
+ if (!Rung) return IP;
+ Rung = Rung->getIDom();
+ if (!Rung) return IP;
+ IDom = Rung->getBlock();
+
+ // Don't climb into a loop though.
+ const Loop *IDomLoop = LI.getLoopFor(IDom);
+ unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
+ if (IDomDepth <= IPLoopDepth &&
+ (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
+ break;
+ }
+
+ Tentative = IDom->getTerminator();
+ }
+
+ return IP;
+}
+
+/// Determine an input position which will be dominated by the operands and
+/// which will dominate the result.
+BasicBlock::iterator
+LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
+ const LSRFixup &LF,
+ const LSRUse &LU,
+ SCEVExpander &Rewriter) const {
+ // Collect some instructions which must be dominated by the
+ // expanding replacement. These must be dominated by any operands that
+ // will be required in the expansion.
+ SmallVector<Instruction *, 4> Inputs;
+ if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
+ Inputs.push_back(I);
+ if (LU.Kind == LSRUse::ICmpZero)
+ if (Instruction *I =
+ dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
+ Inputs.push_back(I);
+ if (LF.PostIncLoops.count(L)) {
+ if (LF.isUseFullyOutsideLoop(L))
+ Inputs.push_back(L->getLoopLatch()->getTerminator());
+ else
+ Inputs.push_back(IVIncInsertPos);
+ }
+  // The expansion must also be dominated by the increment positions of any
+  // loops for which it is using post-inc mode.
+ for (const Loop *PIL : LF.PostIncLoops) {
+ if (PIL == L) continue;
+
+ // Be dominated by the loop exit.
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ PIL->getExitingBlocks(ExitingBlocks);
+ if (!ExitingBlocks.empty()) {
+ BasicBlock *BB = ExitingBlocks[0];
+ for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
+ BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
+ Inputs.push_back(BB->getTerminator());
+ }
+ }
+
+ assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
+ && !isa<DbgInfoIntrinsic>(LowestIP) &&
+ "Insertion point must be a normal instruction");
+
+ // Then, climb up the immediate dominator tree as far as we can go while
+ // still being dominated by the input positions.
+ BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
+
+ // Don't insert instructions before PHI nodes.
+ while (isa<PHINode>(IP)) ++IP;
+
+ // Ignore landingpad instructions.
+ while (IP->isEHPad()) ++IP;
+
+ // Ignore debug intrinsics.
+ while (isa<DbgInfoIntrinsic>(IP)) ++IP;
+
+ // Set IP below instructions recently inserted by SCEVExpander. This keeps the
+ // IP consistent across expansions and allows the previously inserted
+ // instructions to be reused by subsequent expansion.
+ while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
+ ++IP;
+
+ return IP;
+}
+
+/// Emit instructions for the leading candidate expression for this LSRUse (this
+/// is called "expanding").
+Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, BasicBlock::iterator IP,
+ SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
+ if (LU.RigidFormula)
+ return LF.OperandValToReplace;
+
+ // Determine an input position which will be dominated by the operands and
+ // which will dominate the result.
+ IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
+ Rewriter.setInsertPoint(&*IP);
+
+ // Inform the Rewriter if we have a post-increment use, so that it can
+ // perform an advantageous expansion.
+ Rewriter.setPostInc(LF.PostIncLoops);
+
+ // This is the type that the user actually needs.
+ Type *OpTy = LF.OperandValToReplace->getType();
+ // This will be the type that we'll initially expand to.
+ Type *Ty = F.getType();
+ if (!Ty)
+ // No type known; just expand directly to the ultimate type.
+ Ty = OpTy;
+ else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
+ // Expand directly to the ultimate type if it's the right size.
+ Ty = OpTy;
+ // This is the type to do integer arithmetic in.
+ Type *IntTy = SE.getEffectiveSCEVType(Ty);
+
+ // Build up a list of operands to add together to form the full base.
+ SmallVector<const SCEV *, 8> Ops;
+
+ // Expand the BaseRegs portion.
+ for (const SCEV *Reg : F.BaseRegs) {
+ assert(!Reg->isZero() && "Zero allocated in a base register!");
+
+ // If we're expanding for a post-inc user, make the post-inc adjustment.
+ Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
+ Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
+ }
+
+ // Expand the ScaledReg portion.
+ Value *ICmpScaledV = nullptr;
+ if (F.Scale != 0) {
+ const SCEV *ScaledS = F.ScaledReg;
+
+ // If we're expanding for a post-inc user, make the post-inc adjustment.
+ PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
+ ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
+
+ if (LU.Kind == LSRUse::ICmpZero) {
+ // Expand ScaleReg as if it was part of the base regs.
+ if (F.Scale == 1)
+ Ops.push_back(
+ SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
+ else {
+ // An interesting way of "folding" with an icmp is to use a negated
+ // scale, which we'll implement by inserting it into the other operand
+ // of the icmp.
+ assert(F.Scale == -1 &&
+ "The only scale supported by ICmpZero uses is -1!");
+ ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
+ }
+ } else {
+ // Otherwise just expand the scaled register and an explicit scale,
+ // which is expected to be matched as part of the address.
+
+      // Flush the operand list to suppress SCEVExpander hoisting address
+      // modes, unless the addressing mode will not be folded.
+ if (!Ops.empty() && LU.Kind == LSRUse::Address &&
+ isAMCompletelyFolded(TTI, LU, F)) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
+ ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
+ if (F.Scale != 1)
+ ScaledS =
+ SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
+ Ops.push_back(ScaledS);
+ }
+ }
+
+ // Expand the GV portion.
+ if (F.BaseGV) {
+ // Flush the operand list to suppress SCEVExpander hoisting.
+ if (!Ops.empty()) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
+ Ops.push_back(SE.getUnknown(F.BaseGV));
+ }
+
+ // Flush the operand list to suppress SCEVExpander hoisting of both folded and
+ // unfolded offsets. LSR assumes they both live next to their uses.
+ if (!Ops.empty()) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
+
+ // Expand the immediate portion.
+ int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
+ if (Offset != 0) {
+ if (LU.Kind == LSRUse::ICmpZero) {
+ // The other interesting way of "folding" with an ICmpZero is to use a
+ // negated immediate.
+ if (!ICmpScaledV)
+ ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
+ else {
+ Ops.push_back(SE.getUnknown(ICmpScaledV));
+ ICmpScaledV = ConstantInt::get(IntTy, Offset);
+ }
+ } else {
+ // Just add the immediate values. These again are expected to be matched
+ // as part of the address.
+ Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
+ }
+ }
+
+ // Expand the unfolded offset portion.
+ int64_t UnfoldedOffset = F.UnfoldedOffset;
+ if (UnfoldedOffset != 0) {
+ // Just add the immediate values.
+ Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
+ UnfoldedOffset)));
+ }
+
+ // Emit instructions summing all the operands.
+ const SCEV *FullS = Ops.empty() ?
+ SE.getConstant(IntTy, 0) :
+ SE.getAddExpr(Ops);
+ Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
+
+ // We're done expanding now, so reset the rewriter.
+ Rewriter.clearPostInc();
+
+ // An ICmpZero Formula represents an ICmp which we're handling as a
+ // comparison against zero. Now that we've expanded an expression for that
+ // form, update the ICmp's other operand.
+ if (LU.Kind == LSRUse::ICmpZero) {
+ ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
+ DeadInsts.emplace_back(CI->getOperand(1));
+ assert(!F.BaseGV && "ICmp does not support folding a global value and "
+ "a scale at the same time!");
+ if (F.Scale == -1) {
+ if (ICmpScaledV->getType() != OpTy) {
+ Instruction *Cast =
+ CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
+ OpTy, false),
+ ICmpScaledV, OpTy, "tmp", CI);
+ ICmpScaledV = Cast;
+ }
+ CI->setOperand(1, ICmpScaledV);
+ } else {
+ // A scale of 1 means that the scale has been expanded as part of the
+ // base regs.
+ assert((F.Scale == 0 || F.Scale == 1) &&
+ "ICmp does not support folding a global value and "
+ "a scale at the same time!");
+ Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
+ -(uint64_t)Offset);
+ if (C->getType() != OpTy)
+ C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
+ OpTy, false),
+ C, OpTy);
+
+ CI->setOperand(1, C);
+ }
+ }
+
+ return FullV;
+}
+
+/// Helper for Rewrite. PHI nodes are special because the use of their operands
+/// effectively happens in their predecessor blocks, so the expression may need
+/// to be expanded in multiple places.
+void LSRInstance::RewriteForPHI(
+ PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ SCEVExpander &Rewriter, SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
+ DenseMap<BasicBlock *, Value *> Inserted;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
+ BasicBlock *BB = PN->getIncomingBlock(i);
+
+ // If this is a critical edge, split the edge so that we do not insert
+ // the code on all predecessor/successor paths. We do this unless this
+ // is the canonical backedge for this loop, which complicates post-inc
+ // users.
+ if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
+ !isa<IndirectBrInst>(BB->getTerminator()) &&
+ !isa<CatchSwitchInst>(BB->getTerminator())) {
+ BasicBlock *Parent = PN->getParent();
+ Loop *PNLoop = LI.getLoopFor(Parent);
+ if (!PNLoop || Parent != PNLoop->getHeader()) {
+ // Split the critical edge.
+ BasicBlock *NewBB = nullptr;
+ if (!Parent->isLandingPad()) {
+ NewBB = SplitCriticalEdge(BB, Parent,
+ CriticalEdgeSplittingOptions(&DT, &LI)
+ .setMergeIdenticalEdges()
+ .setDontDeleteUselessPHIs());
+ } else {
+ SmallVector<BasicBlock*, 2> NewBBs;
+ SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
+ NewBB = NewBBs[0];
+ }
+ // If NewBB==NULL, then SplitCriticalEdge refused to split because all
+ // phi predecessors are identical. The simple thing to do is skip
+ // splitting in this case rather than complicate the API.
+ if (NewBB) {
+ // If PN is outside of the loop and BB is in the loop, we want to
+ // move the block to be immediately before the PHI block, not
+ // immediately after BB.
+ if (L->contains(BB) && !L->contains(PN))
+ NewBB->moveBefore(PN->getParent());
+
+ // Splitting the edge can reduce the number of PHI entries we have.
+ e = PN->getNumIncomingValues();
+ BB = NewBB;
+ i = PN->getBasicBlockIndex(BB);
+ }
+ }
+ }
+
+ std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
+ Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
+ if (!Pair.second)
+ PN->setIncomingValue(i, Pair.first->second);
+ else {
+ Value *FullV = Expand(LU, LF, F, BB->getTerminator()->getIterator(),
+ Rewriter, DeadInsts);
+
+ // If this is reuse-by-noop-cast, insert the noop cast.
+ Type *OpTy = LF.OperandValToReplace->getType();
+ if (FullV->getType() != OpTy)
+ FullV =
+ CastInst::Create(CastInst::getCastOpcode(FullV, false,
+ OpTy, false),
+ FullV, LF.OperandValToReplace->getType(),
+ "tmp", BB->getTerminator());
+
+ PN->setIncomingValue(i, FullV);
+ Pair.first->second = FullV;
+ }
+ }
+}
+
+/// Emit instructions for the leading candidate expression for this LSRUse (this
+/// is called "expanding"), and update the UserInst to reference the newly
+/// expanded value.
+void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
+ // First, find an insertion point that dominates UserInst. For PHI nodes,
+ // find the nearest block which dominates all the relevant uses.
+ if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
+ RewriteForPHI(PN, LU, LF, F, Rewriter, DeadInsts);
+ } else {
+ Value *FullV =
+ Expand(LU, LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
+
+ // If this is reuse-by-noop-cast, insert the noop cast.
+ Type *OpTy = LF.OperandValToReplace->getType();
+ if (FullV->getType() != OpTy) {
+ Instruction *Cast =
+ CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
+ FullV, OpTy, "tmp", LF.UserInst);
+ FullV = Cast;
+ }
+
+ // Update the user. ICmpZero is handled specially here (for now) because
+ // Expand may have updated one of the operands of the icmp already, and
+ // its new value may happen to be equal to LF.OperandValToReplace, in
+ // which case doing replaceUsesOfWith leads to replacing both operands
+ // with the same value. TODO: Reorganize this.
+ if (LU.Kind == LSRUse::ICmpZero)
+ LF.UserInst->setOperand(0, FullV);
+ else
+ LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
+ }
+
+ DeadInsts.emplace_back(LF.OperandValToReplace);
+}
+
+/// Rewrite all the fixup locations with new values, following the chosen
+/// solution.
+void LSRInstance::ImplementSolution(
+ const SmallVectorImpl<const Formula *> &Solution) {
+ // Keep track of instructions we may have made dead, so that
+ // we can remove them after we are done working.
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+
+ SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(),
+ "lsr");
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+ Rewriter.disableCanonicalMode();
+ Rewriter.enableLSRMode();
+ Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
+
+ // Mark phi nodes that terminate chains so the expander tries to reuse them.
+ for (const IVChain &Chain : IVChainVec) {
+ if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
+ Rewriter.setChainedPhi(PN);
+ }
+
+ // Expand the new value definitions and update the users.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
+ for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
+ Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], Rewriter, DeadInsts);
+ Changed = true;
+ }
+
+ for (const IVChain &Chain : IVChainVec) {
+ GenerateIVChain(Chain, Rewriter, DeadInsts);
+ Changed = true;
+ }
+ // Clean up after ourselves. This must be done before deleting any
+ // instructions.
+ Rewriter.clear();
+
+ Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
+}
+
+LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
+ DominatorTree &DT, LoopInfo &LI,
+ const TargetTransformInfo &TTI)
+ : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L) {
+ // If LoopSimplify form is not available, stay out of trouble.
+ if (!L->isLoopSimplifyForm())
+ return;
+
+ // If there's no interesting work to be done, bail early.
+ if (IU.empty()) return;
+
+ // If there's too much analysis to be done, bail early. We won't be able to
+ // model the problem anyway.
+ unsigned NumUsers = 0;
+ for (const IVStrideUse &U : IU) {
+ if (++NumUsers > MaxIVUsers) {
+ (void)U;
+ DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U << "\n");
+ return;
+ }
+ // Bail out if we have a PHI on an EHPad that gets a value from a
+ // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
+ // no good place to stick any instructions.
+ if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
+ auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
+ if (isa<FuncletPadInst>(FirstNonPHI) ||
+ isa<CatchSwitchInst>(FirstNonPHI))
+ for (BasicBlock *PredBB : PN->blocks())
+ if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
+ return;
+ }
+ }
+
+#ifndef NDEBUG
+ // All dominating loops must have preheaders, or SCEVExpander may not be able
+ // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
+ //
+ // IVUsers analysis should only create users that are dominated by simple loop
+ // headers. Since this loop should dominate all of its users, its user list
+ // should be empty if this loop itself is not within a simple loop nest.
+ for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
+ Rung; Rung = Rung->getIDom()) {
+ BasicBlock *BB = Rung->getBlock();
+ const Loop *DomLoop = LI.getLoopFor(BB);
+ if (DomLoop && DomLoop->getHeader() == BB) {
+ assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
+ }
+ }
+#endif // NDEBUG
+
+ DEBUG(dbgs() << "\nLSR on loop ";
+ L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
+ dbgs() << ":\n");
+
+ // First, perform some low-level loop optimizations.
+ OptimizeShadowIV();
+ OptimizeLoopTermCond();
+
+ // If loop preparation eliminates all interesting IV users, bail.
+ if (IU.empty()) return;
+
+ // Skip nested loops until we can model them better with formulae.
+ if (!L->empty()) {
+ DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
+ return;
+ }
+
+ // Start collecting data and preparing for the solver.
+ CollectChains();
+ CollectInterestingTypesAndFactors();
+ CollectFixupsAndInitialFormulae();
+ CollectLoopInvariantFixupsAndFormulae();
+
+ assert(!Uses.empty() && "IVUsers reported at least one use");
+ DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
+ print_uses(dbgs()));
+
+ // Now use the reuse data to generate a bunch of interesting ways
+ // to formulate the values needed for the uses.
+ GenerateAllReuseFormulae();
+
+ FilterOutUndesirableDedicatedRegisters();
+ NarrowSearchSpaceUsingHeuristics();
+
+ SmallVector<const Formula *, 8> Solution;
+ Solve(Solution);
+
+ // Release memory that is no longer needed.
+ Factors.clear();
+ Types.clear();
+ RegUses.clear();
+
+ if (Solution.empty())
+ return;
+
+#ifndef NDEBUG
+ // Formulae should be legal.
+ for (const LSRUse &LU : Uses) {
+ for (const Formula &F : LU.Formulae)
+ assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ F) && "Illegal formula generated!");
+ };
+#endif
+
+ // Now that we've decided what we want, make it so.
+ ImplementSolution(Solution);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
+ if (Factors.empty() && Types.empty()) return;
+
+ OS << "LSR has identified the following interesting factors and types: ";
+ bool First = true;
+
+ for (int64_t Factor : Factors) {
+ if (!First) OS << ", ";
+ First = false;
+ OS << '*' << Factor;
+ }
+
+ for (Type *Ty : Types) {
+ if (!First) OS << ", ";
+ First = false;
+ OS << '(' << *Ty << ')';
+ }
+ OS << '\n';
+}
+
+void LSRInstance::print_fixups(raw_ostream &OS) const {
+ OS << "LSR is examining the following fixup sites:\n";
+ for (const LSRUse &LU : Uses)
+ for (const LSRFixup &LF : LU.Fixups) {
+ dbgs() << " ";
+ LF.print(OS);
+ OS << '\n';
+ }
+}
+
+void LSRInstance::print_uses(raw_ostream &OS) const {
+ OS << "LSR is examining the following uses:\n";
+ for (const LSRUse &LU : Uses) {
+ dbgs() << " ";
+ LU.print(OS);
+ OS << '\n';
+ for (const Formula &F : LU.Formulae) {
+ OS << " ";
+ F.print(OS);
+ OS << '\n';
+ }
+ }
+}
+
+void LSRInstance::print(raw_ostream &OS) const {
+ print_factors_and_types(OS);
+ print_fixups(OS);
+ print_uses(OS);
+}
+
+LLVM_DUMP_METHOD void LSRInstance::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+namespace {
+
+class LoopStrengthReduce : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ LoopStrengthReduce();
+
+private:
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+} // end anonymous namespace
+
+LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
+ initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
+}
+
+void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
+ // We split critical edges, so we change the CFG. However, we do update
+ // many analyses if they are around.
+ AU.addPreservedID(LoopSimplifyID);
+
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ // Requiring LoopSimplify a second time here prevents IVUsers from running
+ // twice, since LoopSimplify was invalidated by running ScalarEvolution.
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<IVUsersWrapperPass>();
+ AU.addPreserved<IVUsersWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+}
+
+static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
+ DominatorTree &DT, LoopInfo &LI,
+ const TargetTransformInfo &TTI) {
+ bool Changed = false;
+
+ // Run the main LSR transformation.
+ Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged();
+
+ // Remove any extra phis created by processing inner loops.
+ Changed |= DeleteDeadPHIs(L->getHeader());
+ if (EnablePhiElim && L->isLoopSimplifyForm()) {
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ SCEVExpander Rewriter(SE, DL, "lsr");
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+ unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
+ if (numFolded) {
+ Changed = true;
+ DeleteTriviallyDeadInstructions(DeadInsts);
+ DeleteDeadPHIs(L->getHeader());
+ }
+ }
+ return Changed;
+}
+
+bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
+ if (skipLoop(L))
+ return false;
+
+ auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent());
+ return ReduceLoopStrength(L, IU, SE, DT, LI, TTI);
+}
+
+PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
+ AR.DT, AR.LI, AR.TTI))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+char LoopStrengthReduce::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
+ "Loop Strength Reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
+ "Loop Strength Reduction", false, false)
+
+Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
new file mode 100644
index 000000000000..15e7da5e1a7a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -0,0 +1,1353 @@
+//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop unroller. It works best when loops have
+// been canonicalized by the -indvars pass, allowing it to determine the trip
+// counts of loops easily.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopUnrollAnalyzer.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll"
+
+static cl::opt<unsigned>
+ UnrollThreshold("unroll-threshold", cl::Hidden,
+ cl::desc("The cost threshold for loop unrolling"));
+
+static cl::opt<unsigned> UnrollPartialThreshold(
+ "unroll-partial-threshold", cl::Hidden,
+ cl::desc("The cost threshold for partial loop unrolling"));
+
+static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
+ "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
+ cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
+ "to the threshold when aggressively unrolling a loop due to the "
+ "dynamic cost savings. If completely unrolling a loop will reduce "
+ "the total runtime from X to Y, we boost the loop unroll "
+ "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
+ "X/Y). This limit avoids excessive code bloat."));
+
+static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
+ "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
+ cl::desc("Don't allow loop unrolling to simulate more than this number of"
+ "iterations when checking full unroll profitability"));
+
+static cl::opt<unsigned> UnrollCount(
+ "unroll-count", cl::Hidden,
+ cl::desc("Use this unroll count for all loops including those with "
+ "unroll_count pragma values, for testing purposes"));
+
+static cl::opt<unsigned> UnrollMaxCount(
+ "unroll-max-count", cl::Hidden,
+ cl::desc("Set the max unroll count for partial and runtime unrolling, for"
+ "testing purposes"));
+
+static cl::opt<unsigned> UnrollFullMaxCount(
+ "unroll-full-max-count", cl::Hidden,
+ cl::desc(
+ "Set the max unroll count for full unrolling, for testing purposes"));
+
+static cl::opt<unsigned> UnrollPeelCount(
+ "unroll-peel-count", cl::Hidden,
+ cl::desc("Set the unroll peeling count, for testing purposes"));
+
+static cl::opt<bool>
+ UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
+ cl::desc("Allows loops to be partially unrolled until "
+ "-unroll-threshold loop size is reached."));
+
+static cl::opt<bool> UnrollAllowRemainder(
+ "unroll-allow-remainder", cl::Hidden,
+ cl::desc("Allow generation of a loop remainder (extra iterations) "
+ "when unrolling a loop."));
+
+static cl::opt<bool>
+ UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("Unroll loops with run-time trip counts"));
+
+static cl::opt<unsigned> UnrollMaxUpperBound(
+ "unroll-max-upperbound", cl::init(8), cl::Hidden,
+ cl::desc(
+ "The max of trip count upper bound that is considered in unrolling"));
+
+static cl::opt<unsigned> PragmaUnrollThreshold(
+ "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
+ cl::desc("Unrolled size limit for loops with an unroll(full) or "
+ "unroll_count pragma."));
+
+static cl::opt<unsigned> FlatLoopTripCountThreshold(
+ "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
+ cl::desc("If the runtime tripcount for the loop is lower than the "
+ "threshold, the loop is considered as flat and will be less "
+ "aggressively unrolled."));
+
+static cl::opt<bool>
+ UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
+ cl::desc("Allows loops to be peeled when the dynamic "
+ "trip count is known to be low."));
+
+static cl::opt<bool> UnrollUnrollRemainder(
+ "unroll-remainder", cl::Hidden,
+ cl::desc("Allow the loop remainder to be unrolled."));
+
+// This option isn't ever intended to be enabled; it serves to allow
+// experiments to check the assumptions about when this kind of revisit is
+// necessary.
+static cl::opt<bool> UnrollRevisitChildLoops(
+ "unroll-revisit-child-loops", cl::Hidden,
+ cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
+ "This shouldn't typically be needed as child loops (or their "
+ "clones) were already visited."));
+
+/// A magic value for use with the Threshold parameter to indicate
+/// that the loop unroll should be performed regardless of how much
+/// code expansion would result.
+static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
+
+/// Gather the various unrolling parameters based on the defaults, compiler
+/// flags, TTI overrides, and user-specified parameters.
+static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
+ Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
+ Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
+ Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
+ Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling) {
+ TargetTransformInfo::UnrollingPreferences UP;
+
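+ // Settings are applied in increasing order of priority: defaults first,
+ // then TTI overrides, size attributes, cl::opt flags, and finally any
+ // explicit arguments passed in by the caller.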
+ // Set up the defaults
+ UP.Threshold = OptLevel > 2 ? 300 : 150;
+ UP.MaxPercentThresholdBoost = 400;
+ UP.OptSizeThreshold = 0;
+ UP.PartialThreshold = 150;
+ UP.PartialOptSizeThreshold = 0;
+ UP.Count = 0;
+ UP.PeelCount = 0;
+ UP.DefaultUnrollRuntimeCount = 8;
+ UP.MaxCount = std::numeric_limits<unsigned>::max();
+ UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
+ UP.BEInsns = 2;
+ UP.Partial = false;
+ UP.Runtime = false;
+ UP.AllowRemainder = true;
+ UP.UnrollRemainder = false;
+ UP.AllowExpensiveTripCount = false;
+ UP.Force = false;
+ UP.UpperBound = false;
+ UP.AllowPeeling = true;
+
+ // Override with any target specific settings
+ TTI.getUnrollingPreferences(L, SE, UP);
+
+ // Apply size attributes
+ if (L->getHeader()->getParent()->optForSize()) {
+ UP.Threshold = UP.OptSizeThreshold;
+ UP.PartialThreshold = UP.PartialOptSizeThreshold;
+ }
+
+ // Apply any user values specified by cl::opt
+ if (UnrollThreshold.getNumOccurrences() > 0)
+ UP.Threshold = UnrollThreshold;
+ if (UnrollPartialThreshold.getNumOccurrences() > 0)
+ UP.PartialThreshold = UnrollPartialThreshold;
+ if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
+ UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
+ if (UnrollMaxCount.getNumOccurrences() > 0)
+ UP.MaxCount = UnrollMaxCount;
+ if (UnrollFullMaxCount.getNumOccurrences() > 0)
+ UP.FullUnrollMaxCount = UnrollFullMaxCount;
+ if (UnrollPeelCount.getNumOccurrences() > 0)
+ UP.PeelCount = UnrollPeelCount;
+ if (UnrollAllowPartial.getNumOccurrences() > 0)
+ UP.Partial = UnrollAllowPartial;
+ if (UnrollAllowRemainder.getNumOccurrences() > 0)
+ UP.AllowRemainder = UnrollAllowRemainder;
+ if (UnrollRuntime.getNumOccurrences() > 0)
+ UP.Runtime = UnrollRuntime;
+ if (UnrollMaxUpperBound == 0)
+ UP.UpperBound = false;
+ if (UnrollAllowPeeling.getNumOccurrences() > 0)
+ UP.AllowPeeling = UnrollAllowPeeling;
+ if (UnrollUnrollRemainder.getNumOccurrences() > 0)
+ UP.UnrollRemainder = UnrollUnrollRemainder;
+
+ // Apply user values provided by argument
+ if (UserThreshold.hasValue()) {
+ UP.Threshold = *UserThreshold;
+ UP.PartialThreshold = *UserThreshold;
+ }
+ if (UserCount.hasValue())
+ UP.Count = *UserCount;
+ if (UserAllowPartial.hasValue())
+ UP.Partial = *UserAllowPartial;
+ if (UserRuntime.hasValue())
+ UP.Runtime = *UserRuntime;
+ if (UserUpperBound.hasValue())
+ UP.UpperBound = *UserUpperBound;
+ if (UserAllowPeeling.hasValue())
+ UP.AllowPeeling = *UserAllowPeeling;
+
+ return UP;
+}
+
+namespace {
+
+/// A struct to densely store the state of an instruction after unrolling at
+/// each iteration.
+///
+/// This is designed to work like a tuple of <Instruction *, int> for the
+/// purposes of hashing and lookup, but to be able to associate two boolean
+/// states with each key.
+struct UnrolledInstState {
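+ // The 30-bit Iteration field and the two single-bit flags below pack into
+ // one 32-bit word.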
+ Instruction *I;
+ int Iteration : 30;
+ unsigned IsFree : 1;
+ unsigned IsCounted : 1;
+};
+
+/// Hashing and equality testing for a set of the instruction states.
+struct UnrolledInstStateKeyInfo {
+ using PtrInfo = DenseMapInfo<Instruction *>;
+ using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;
+
+ static inline UnrolledInstState getEmptyKey() {
+ return {PtrInfo::getEmptyKey(), 0, 0, 0};
+ }
+
+ static inline UnrolledInstState getTombstoneKey() {
+ return {PtrInfo::getTombstoneKey(), 0, 0, 0};
+ }
+
+ static inline unsigned getHashValue(const UnrolledInstState &S) {
+ return PairInfo::getHashValue({S.I, S.Iteration});
+ }
+
+ static inline bool isEqual(const UnrolledInstState &LHS,
+ const UnrolledInstState &RHS) {
+ return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
+ }
+};
+
+struct EstimatedUnrollCost {
+ /// \brief The estimated cost after unrolling.
+ unsigned UnrolledCost;
+
+ /// \brief The estimated dynamic cost of executing the instructions in the
+ /// rolled form.
+ unsigned RolledDynamicCost;
+};
+
+} // end anonymous namespace
+
+/// \brief Figure out if the loop is worth full unrolling.
+///
+/// Complete loop unrolling can make some loads constant, and we need to know
+/// if that would expose any further optimization opportunities. This routine
+/// estimates this optimization. It computes the cost of the unrolled loop
+/// (UnrolledCost) and the dynamic cost of the original loop
+/// (RolledDynamicCost). By dynamic cost we mean that we won't count the costs
+/// of blocks that are known not to be executed (i.e. if we have a branch in
+/// the loop and we know that at the given iteration its condition would be
+/// resolved to true, we won't add up the cost of the 'false'-block).
+/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
+/// the analysis failed (no benefits expected from the unrolling, or the loop is
+/// too big to analyze), the returned value is None.
+static Optional<EstimatedUnrollCost>
+analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
+ ScalarEvolution &SE, const TargetTransformInfo &TTI,
+ unsigned MaxUnrolledLoopSize) {
+ // We want to be able to scale offsets by the trip count and add more offsets
+ // to them without checking for overflows, and we already don't want to
+ // analyze *massive* trip counts, so we force the max to be reasonably small.
+ assert(UnrollMaxIterationsCountToAnalyze <
+ (unsigned)(std::numeric_limits<int>::max() / 2) &&
+ "The unroll iterations max is too large!");
+
+ // Only analyze inner loops. We can't properly estimate cost of nested loops
+ // and we won't visit inner loops again anyway.
+ if (!L->empty())
+ return None;
+
+ // Don't simulate loops with a big or unknown tripcount
+ if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
+ TripCount > UnrollMaxIterationsCountToAnalyze)
+ return None;
+
+ SmallSetVector<BasicBlock *, 16> BBWorklist;
+ SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
+ DenseMap<Value *, Constant *> SimplifiedValues;
+ SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;
+
+ // The estimated cost of the unrolled form of the loop. We try to estimate
+ // this by simplifying as much as we can while computing the estimate.
+ unsigned UnrolledCost = 0;
+
+ // We also track the estimated dynamic (that is, actually executed) cost in
+ // the rolled form. This helps identify cases when the savings from unrolling
+ // aren't just exposing dead control flows, but actual reduced dynamic
+ // instructions due to the simplifications which we expect to occur after
+ // unrolling.
+ unsigned RolledDynamicCost = 0;
+
+ // We track the simplification of each instruction in each iteration. We use
+ // this to recursively merge costs into the unrolled cost on-demand so that
+ // we don't count the cost of any dead code. This is essentially a map from
+ // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
+ DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
+
+ // A small worklist used to accumulate cost of instructions from each
+ // observable and reached root in the loop.
+ SmallVector<Instruction *, 16> CostWorklist;
+
+ // PHI-used worklist used between iterations while accumulating cost.
+ SmallVector<Instruction *, 4> PHIUsedList;
+
+ // Helper function to accumulate cost for instructions in the loop.
+ auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
+ assert(Iteration >= 0 && "Cannot have a negative iteration!");
+ assert(CostWorklist.empty() && "Must start with an empty cost list");
+ assert(PHIUsedList.empty() && "Must start with an empty phi used list");
+ CostWorklist.push_back(&RootI);
+ for (;; --Iteration) {
+ do {
+ Instruction *I = CostWorklist.pop_back_val();
+
+ // InstCostMap only uses I and Iteration as a key, the other two values
+ // don't matter here.
+ auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
+ if (CostIter == InstCostMap.end())
+ // If an input to a PHI node comes from a dead path through the loop
+ // we may have no cost data for it here. What that actually means is
+ // that it is free.
+ continue;
+ auto &Cost = *CostIter;
+ if (Cost.IsCounted)
+ // Already counted this instruction.
+ continue;
+
+ // Mark that we are counting the cost of this instruction now.
+ Cost.IsCounted = true;
+
+ // If this is a PHI node in the loop header, just add it to the PHI set.
+ if (auto *PhiI = dyn_cast<PHINode>(I))
+ if (PhiI->getParent() == L->getHeader()) {
+ assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
+ "inherently simplify during unrolling.");
+ if (Iteration == 0)
+ continue;
+
+ // Push the incoming value from the backedge into the PHI used list
+ // if it is an in-loop instruction. We'll use this to populate the
+ // cost worklist for the next iteration (as we count backwards).
+ if (auto *OpI = dyn_cast<Instruction>(
+ PhiI->getIncomingValueForBlock(L->getLoopLatch())))
+ if (L->contains(OpI))
+ PHIUsedList.push_back(OpI);
+ continue;
+ }
+
+ // First accumulate the cost of this instruction.
+ if (!Cost.IsFree) {
+ UnrolledCost += TTI.getUserCost(I);
+ DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration
+ << "): ");
+ DEBUG(I->dump());
+ }
+
+ // We must count the cost of every operand which is not free,
+ // recursively. If we reach a loop PHI node, simply add it to the set
+ // to be considered on the next iteration (backwards!).
+ for (Value *Op : I->operands()) {
+ // Check whether this operand is free due to being a constant or
+ // outside the loop.
+ auto *OpI = dyn_cast<Instruction>(Op);
+ if (!OpI || !L->contains(OpI))
+ continue;
+
+ // Otherwise accumulate its cost.
+ CostWorklist.push_back(OpI);
+ }
+ } while (!CostWorklist.empty());
+
+ if (PHIUsedList.empty())
+ // We've exhausted the search.
+ break;
+
+ assert(Iteration > 0 &&
+ "Cannot track PHI-used values past the first iteration!");
+ CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
+ PHIUsedList.clear();
+ }
+ };
+
+ // Ensure that we don't violate the loop structure invariants relied on by
+ // this analysis.
+ assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
+ assert(L->isLCSSAForm(DT) &&
+ "Must have loops in LCSSA form to track live-out values.");
+
+ DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
+
+ // Simulate execution of each iteration of the loop, counting the
+ // instructions that would be simplified.
+ // Since the same load will take different values on different iterations,
+ // we literally have to go through all of the loop's iterations.
+ for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
+ DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
+
+ // Prepare for the iteration by collecting any simplified entry or backedge
+ // inputs.
+ for (Instruction &I : *L->getHeader()) {
+ auto *PHI = dyn_cast<PHINode>(&I);
+ if (!PHI)
+ break;
+
+ // The loop header PHI nodes must have exactly two inputs: one from the
+ // loop preheader and one from the loop latch.
+ assert(
+ PHI->getNumIncomingValues() == 2 &&
+ "Must have an incoming value only for the preheader and the latch.");
+
+ Value *V = PHI->getIncomingValueForBlock(
+ Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
+ Constant *C = dyn_cast<Constant>(V);
+ if (Iteration != 0 && !C)
+ C = SimplifiedValues.lookup(V);
+ if (C)
+ SimplifiedInputValues.push_back({PHI, C});
+ }
+
+ // Now clear and re-populate the map for the next iteration.
+ SimplifiedValues.clear();
+ while (!SimplifiedInputValues.empty())
+ SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
+
+ UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
+
+ BBWorklist.clear();
+ BBWorklist.insert(L->getHeader());
+ // Note that we *must not* cache the size, this loop grows the worklist.
+ for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
+ BasicBlock *BB = BBWorklist[Idx];
+
+ // Visit all instructions in the given basic block and try to simplify
+ // them. We don't change the actual IR, just count optimization
+ // opportunities.
+ for (Instruction &I : *BB) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ // Track this instruction's expected baseline cost when executing the
+ // rolled loop form.
+ RolledDynamicCost += TTI.getUserCost(&I);
+
+ // Visit the instruction to analyze its loop cost after unrolling,
+ // and if the visitor returns true, mark the instruction as free after
+ // unrolling and continue.
+ bool IsFree = Analyzer.visit(I);
+ bool Inserted = InstCostMap.insert({&I, (int)Iteration,
+ (unsigned)IsFree,
+ /*IsCounted*/ false}).second;
+ (void)Inserted;
+ assert(Inserted && "Cannot have a state for an unvisited instruction!");
+
+ if (IsFree)
+ continue;
+
+ // Can't properly model a cost of a call.
+ // FIXME: With a proper cost model we should be able to do it.
+ if (isa<CallInst>(&I))
+ return None;
+
+ // If the instruction might have a side-effect recursively account for
+ // the cost of it and all the instructions leading up to it.
+ if (I.mayHaveSideEffects())
+ AddCostRecursively(I, Iteration);
+
+ // If unrolled body turns out to be too big, bail out.
+ if (UnrolledCost > MaxUnrolledLoopSize) {
+ DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost
+ << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
+ << "\n");
+ return None;
+ }
+ }
+
+ TerminatorInst *TI = BB->getTerminator();
+
+ // Add in the live successors by first checking whether we have a terminator
+ // that may be simplified based on the values simplified by this call.
+ BasicBlock *KnownSucc = nullptr;
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isConditional()) {
+ if (Constant *SimpleCond =
+ SimplifiedValues.lookup(BI->getCondition())) {
+ // Just take the first successor if condition is undef
+ if (isa<UndefValue>(SimpleCond))
+ KnownSucc = BI->getSuccessor(0);
+ else if (ConstantInt *SimpleCondVal =
+ dyn_cast<ConstantInt>(SimpleCond))
+ KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
+ }
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ if (Constant *SimpleCond =
+ SimplifiedValues.lookup(SI->getCondition())) {
+ // Just take the first successor if condition is undef
+ if (isa<UndefValue>(SimpleCond))
+ KnownSucc = SI->getSuccessor(0);
+ else if (ConstantInt *SimpleCondVal =
+ dyn_cast<ConstantInt>(SimpleCond))
+ KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
+ }
+ }
+ if (KnownSucc) {
+ if (L->contains(KnownSucc))
+ BBWorklist.insert(KnownSucc);
+ else
+ ExitWorklist.insert({BB, KnownSucc});
+ continue;
+ }
+
+ // Add BB's successors to the worklist.
+ for (BasicBlock *Succ : successors(BB))
+ if (L->contains(Succ))
+ BBWorklist.insert(Succ);
+ else
+ ExitWorklist.insert({BB, Succ});
+ AddCostRecursively(*TI, Iteration);
+ }
+
+ // If we found no optimization opportunities on the first iteration, we
+ // won't find them on later ones either.
+ if (UnrolledCost == RolledDynamicCost) {
+ DEBUG(dbgs() << " No opportunities found.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost << "\n");
+ return None;
+ }
+ }
+
+ while (!ExitWorklist.empty()) {
+ BasicBlock *ExitingBB, *ExitBB;
+ std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
+
+ for (Instruction &I : *ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+
+ Value *Op = PN->getIncomingValueForBlock(ExitingBB);
+ if (auto *OpI = dyn_cast<Instruction>(Op))
+ if (L->contains(OpI))
+ AddCostRecursively(*OpI, TripCount - 1);
+ }
+ }
+
+ DEBUG(dbgs() << "Analysis finished:\n"
+ << "UnrolledCost: " << UnrolledCost << ", "
+ << "RolledDynamicCost: " << RolledDynamicCost << "\n");
+ return {{UnrolledCost, RolledDynamicCost}};
+}
+
+/// ApproximateLoopSize - Approximate the size of the loop.
+static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
+ bool &NotDuplicatable, bool &Convergent,
+ const TargetTransformInfo &TTI,
+ AssumptionCache *AC, unsigned BEInsns) {
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ CodeMetrics Metrics;
+ for (BasicBlock *BB : L->blocks())
+ Metrics.analyzeBasicBlock(BB, TTI, EphValues);
+ NumCalls = Metrics.NumInlineCandidates;
+ NotDuplicatable = Metrics.notDuplicatable;
+ Convergent = Metrics.convergent;
+
+ unsigned LoopSize = Metrics.NumInsts;
+
+ // Don't allow an estimate of size zero. This would allow unrolling of loops
+ // with huge iteration counts, which is a compile time problem even if it's
+ // not a problem for code quality. Also, the code using this size may assume
+ // that each loop has at least three instructions (likely a conditional
+ // branch, a comparison feeding that branch, and some kind of loop increment
+ // feeding that comparison instruction).
+ LoopSize = std::max(LoopSize, BEInsns + 1);
+
+ return LoopSize;
+}
+
+// Returns the loop hint metadata node with the given name (for example,
+// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
+// returned.
+static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+ if (MDNode *LoopID = L->getLoopID())
+ return GetUnrollMetadata(LoopID, Name);
+ return nullptr;
+}
+
+// Returns true if the loop has an unroll(full) pragma.
+static bool HasUnrollFullPragma(const Loop *L) {
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
+}
+
+// Returns true if the loop has an unroll(enable) pragma. This metadata is used
+// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
+static bool HasUnrollEnablePragma(const Loop *L) {
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
+}
+
+// Returns true if the loop has an unroll(disable) pragma.
+static bool HasUnrollDisablePragma(const Loop *L) {
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable");
+}
+
+// Returns true if the loop has a runtime unroll(disable) pragma.
+static bool HasRuntimeUnrollDisablePragma(const Loop *L) {
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
+}
+
+// If the loop has an unroll_count pragma, return the (necessarily
+// positive) value from the pragma. Otherwise return 0.
+static unsigned UnrollCountPragmaValue(const Loop *L) {
+ MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
+ if (MD) {
+ assert(MD->getNumOperands() == 2 &&
+ "Unroll count hint metadata should have two operands.");
+ unsigned Count =
+ mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
+ assert(Count >= 1 && "Unroll count must be positive.");
+ return Count;
+ }
+ return 0;
+}
+
+// Computes the boosting factor for complete unrolling.
+// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
+// be beneficial to fully unroll the loop even if UnrolledCost is large. We
+// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
+// the unroll threshold.
+static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
+ unsigned MaxPercentThresholdBoost) {
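+ // Guard against overflow in the 100 * RolledDynamicCost computation below by
+ // falling back to no boost (100%) when the rolled cost is already huge.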
+ if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
+ return 100;
+ else if (Cost.UnrolledCost != 0)
+ // The boosting factor is RolledDynamicCost / UnrolledCost
+ return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
+ MaxPercentThresholdBoost);
+ else
+ return MaxPercentThresholdBoost;
+}
+
+// Returns loop size estimation for unrolled loop.
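+// The backedge instructions (BEInsns) are counted only once because they are
+// not replicated for each unrolled iteration.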
+static uint64_t getUnrolledLoopSize(
+ unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
+ return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
+}
+
+// Returns true if unroll count was set explicitly.
+// Calculates unroll count and writes it to UP.Count.
+static bool computeUnrollCount(
+ Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
+ ScalarEvolution &SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
+ unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
+ // Check for explicit Count.
+ // 1st priority is unroll count set by "unroll-count" option.
+ bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
+ if (UserUnrollCount) {
+ UP.Count = UnrollCount;
+ UP.AllowExpensiveTripCount = true;
+ UP.Force = true;
+ if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
+ return true;
+ }
+
+ // 2nd priority is unroll count set by pragma.
+ unsigned PragmaCount = UnrollCountPragmaValue(L);
+ if (PragmaCount > 0) {
+ UP.Count = PragmaCount;
+ UP.Runtime = true;
+ UP.AllowExpensiveTripCount = true;
+ UP.Force = true;
+ if (UP.AllowRemainder &&
+ getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+ return true;
+ }
+ bool PragmaFullUnroll = HasUnrollFullPragma(L);
+ if (PragmaFullUnroll && TripCount != 0) {
+ UP.Count = TripCount;
+ if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+ return false;
+ }
+
+ bool PragmaEnableUnroll = HasUnrollEnablePragma(L);
+ bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
+ PragmaEnableUnroll || UserUnrollCount;
+
+ if (ExplicitUnroll && TripCount != 0) {
+ // If the loop has an unrolling pragma, we want to be more aggressive with
+ // unrolling limits. Set thresholds to at least the PragmaThreshold value
+ // which is larger than the default limits.
+ UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
+ UP.PartialThreshold =
+ std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
+ }
+
+ // 3rd priority is full unroll count.
+ // Full unroll makes sense only when TripCount or its upper bound could be
+ // statically calculated.
+ // Also we need to check if we exceed FullUnrollMaxCount.
+ // If using the upper bound to unroll, TripMultiple should be set to 1 because
+ // we do not know when loop may exit.
+ // MaxTripCount and ExactTripCount cannot both be non zero since we only
+ // compute the former when the latter is zero.
+ unsigned ExactTripCount = TripCount;
+ assert((ExactTripCount == 0 || MaxTripCount == 0) &&
+ "ExtractTripCound and MaxTripCount cannot both be non zero.");
+ unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount;
+ UP.Count = FullUnrollTripCount;
+ if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
+ // When computing the unrolled size, note that BEInsns are not replicated
+ // like the rest of the loop body.
+ if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
+ UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+ TripCount = FullUnrollTripCount;
+ TripMultiple = UP.UpperBound ? 1 : TripMultiple;
+ return ExplicitUnroll;
+ } else {
+ // The loop isn't that small, but we still can fully unroll it if that
+ // helps to remove a significant number of instructions.
+ // To check that, run additional analysis on the loop.
+ if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
+ L, FullUnrollTripCount, DT, SE, TTI,
+ UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
+ unsigned Boost =
+ getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
+ if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
+ UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+ TripCount = FullUnrollTripCount;
+ TripMultiple = UP.UpperBound ? 1 : TripMultiple;
+ return ExplicitUnroll;
+ }
+ }
+ }
+ }
+
+ // 4th priority is loop peeling
+ computePeelCount(L, LoopSize, UP, TripCount);
+ if (UP.PeelCount) {
+ UP.Runtime = false;
+ UP.Count = 1;
+ return ExplicitUnroll;
+ }
+
+ // 5th priority is partial unrolling.
+ // Try partial unroll only when TripCount could be statically calculated.
+ if (TripCount) {
+ UP.Partial |= ExplicitUnroll;
+ if (!UP.Partial) {
+ DEBUG(dbgs() << " will not try to unroll partially because "
+ << "-unroll-allow-partial not given\n");
+ UP.Count = 0;
+ return false;
+ }
+ if (UP.Count == 0)
+ UP.Count = TripCount;
+ if (UP.PartialThreshold != NoThreshold) {
+ // Reduce unroll count to a divisor of TripCount for partial unrolling.
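+ // The quotient below inverts getUnrolledLoopSize(): it yields the largest
+ // Count for which (LoopSize - BEInsns) * Count + BEInsns stays within
+ // PartialThreshold.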
+ if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+ UP.Count =
+ (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
+ (LoopSize - UP.BEInsns);
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
+ while (UP.Count != 0 && TripCount % UP.Count != 0)
+ UP.Count--;
+ if (UP.AllowRemainder && UP.Count <= 1) {
+ // If there is no Count that evenly divides TripCount, set Count to the
+ // largest power-of-two factor that satisfies the threshold limit.
+ // As we'll create a fixup loop, do this type of unrolling only if a
+ // remainder loop is allowed.
+ UP.Count = UP.DefaultUnrollRuntimeCount;
+ while (UP.Count != 0 &&
+ getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+ UP.Count >>= 1;
+ }
+ if (UP.Count < 2) {
+ if (PragmaEnableUnroll)
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "UnrollAsDirectedTooLarge",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to unroll loop as directed by unroll(enable) "
+ "pragma "
+ "because unrolled size is too large.";
+ });
+ UP.Count = 0;
+ }
+ } else {
+ UP.Count = TripCount;
+ }
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
+ if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
+ UP.Count != TripCount)
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "FullUnrollAsDirectedTooLarge",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to fully unroll loop as directed by unroll pragma "
+ "because "
+ "unrolled size is too large.";
+ });
+ return ExplicitUnroll;
+ }
+ assert(TripCount == 0 &&
+ "All cases when TripCount is constant should be covered here.");
+ if (PragmaFullUnroll)
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to fully unroll loop as directed by unroll(full) "
+ "pragma "
+ "because loop has a runtime trip count.";
+ });
+
+ // 6th priority is runtime unrolling.
+ // Don't unroll a runtime trip count loop when it is disabled.
+ if (HasRuntimeUnrollDisablePragma(L)) {
+ UP.Count = 0;
+ return false;
+ }
+
+ // Check if the runtime trip count is too small when a profile is available.
+ if (L->getHeader()->getParent()->hasProfileData()) {
+ if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
+ if (*ProfileTripCount < FlatLoopTripCountThreshold)
+ return false;
+ else
+ UP.AllowExpensiveTripCount = true;
+ }
+ }
+
+ // Reduce count based on the type of unrolling and the threshold values.
+ UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
+ if (!UP.Runtime) {
+ DEBUG(dbgs() << " will not try to unroll loop with runtime trip count "
+ << "-unroll-runtime not given\n");
+ UP.Count = 0;
+ return false;
+ }
+ if (UP.Count == 0)
+ UP.Count = UP.DefaultUnrollRuntimeCount;
+
+ // Reduce unroll count to be the largest power-of-two factor of
+ // the original count which satisfies the threshold limit.
+ while (UP.Count != 0 &&
+ getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+ UP.Count >>= 1;
+
+#ifndef NDEBUG
+ unsigned OrigCount = UP.Count;
+#endif
+
+ if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
+ while (UP.Count != 0 && TripMultiple % UP.Count != 0)
+ UP.Count >>= 1;
+ DEBUG(dbgs() << "Remainder loop is restricted (that could architecture "
+ "specific or because the loop contains a convergent "
+ "instruction), so unroll count must divide the trip "
+ "multiple, "
+ << TripMultiple << ". Reducing unroll count from "
+ << OrigCount << " to " << UP.Count << ".\n");
+
+ using namespace ore;
+
+ if (PragmaCount > 0 && !UP.AllowRemainder)
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "DifferentUnrollCountFromDirected",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to unroll loop the number of times directed by "
+ "unroll_count pragma because remainder loop is restricted "
+ "(that could architecture specific or because the loop "
+ "contains a convergent instruction) and so must have an "
+ "unroll "
+ "count that divides the loop trip multiple of "
+ << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
+ << NV("UnrollCount", UP.Count) << " time(s).";
+ });
+ }
+
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
+ DEBUG(dbgs() << " partially unrolling with count: " << UP.Count << "\n");
+ if (UP.Count < 2)
+ UP.Count = 0;
+ return ExplicitUnroll;
+}
+
+static LoopUnrollResult tryToUnrollLoop(
+ Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
+ const TargetTransformInfo &TTI, AssumptionCache &AC,
+ OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel,
+ Optional<unsigned> ProvidedCount, Optional<unsigned> ProvidedThreshold,
+ Optional<bool> ProvidedAllowPartial, Optional<bool> ProvidedRuntime,
+ Optional<bool> ProvidedUpperBound, Optional<bool> ProvidedAllowPeeling) {
+ DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName()
+ << "] Loop %" << L->getHeader()->getName() << "\n");
+ if (HasUnrollDisablePragma(L))
+ return LoopUnrollResult::Unmodified;
+ if (!L->isLoopSimplifyForm()) {
+ DEBUG(
+ dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ unsigned NumInlineCandidates;
+ bool NotDuplicatable;
+ bool Convergent;
+ TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
+ L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount,
+ ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
+ ProvidedAllowPeeling);
+ // Exit early if unrolling is disabled.
+ if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
+ return LoopUnrollResult::Unmodified;
+ unsigned LoopSize = ApproximateLoopSize(
+ L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC, UP.BEInsns);
+ DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
+ if (NotDuplicatable) {
+ DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
+ << " instructions.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ if (NumInlineCandidates != 0) {
+ DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // Find trip count and trip multiple if count is not available
+ unsigned TripCount = 0;
+ unsigned MaxTripCount = 0;
+ unsigned TripMultiple = 1;
+ // If there are multiple exiting blocks but one of them is the latch, use the
+ // latch for the trip count estimation. Otherwise insist on a single exiting
+ // block for the trip count estimation.
+ BasicBlock *ExitingBlock = L->getLoopLatch();
+ if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
+ ExitingBlock = L->getExitingBlock();
+ if (ExitingBlock) {
+ TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
+ TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
+ }
+
+ // If the loop contains a convergent operation, the prelude we'd add
+ // to do the first few instructions before we hit the unrolled loop
+ // is unsafe -- it adds a control-flow dependency to the convergent
+ // operation. Therefore restrict the remainder loop (try unrolling without it).
+ //
+ // TODO: This is quite conservative. In practice, convergent_op()
+ // is likely to be called unconditionally in the loop. In this
+ // case, the program would be ill-formed (on most architectures)
+ // unless n were the same on all threads in a thread group.
+ // Assuming n is the same on all threads, any kind of unrolling is
+ // safe. But currently llvm's notion of convergence isn't powerful
+ // enough to express this.
+ if (Convergent)
+ UP.AllowRemainder = false;
+
+ // Try to find the trip count upper bound if we cannot find the exact trip
+ // count.
+ bool MaxOrZero = false;
+ if (!TripCount) {
+ MaxTripCount = SE.getSmallConstantMaxTripCount(L);
+ MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
+ // We can unroll by the upper bound amount if it's generally allowed or if
+ // we know that the loop is executed either the upper bound or zero times.
+ // (MaxOrZero unrolling keeps only the first loop test, so the number of
+ // loop tests remains the same compared to the non-unrolled version, whereas
+ // the generic upper bound unrolling keeps all but the last loop test so the
+ // number of loop tests goes up, which may end up being worse on targets with
+ // constrained branch predictor resources, so it is controlled by an option.)
+ // In addition we only unroll small upper bounds.
+ if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) {
+ MaxTripCount = 0;
+ }
+ }
+
+ // computeUnrollCount() decides whether it is beneficial to use upper bound to
+ // fully unroll the loop.
+ bool UseUpperBound = false;
+ bool IsCountSetExplicitly =
+ computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount,
+ TripMultiple, LoopSize, UP, UseUpperBound);
+ if (!UP.Count)
+ return LoopUnrollResult::Unmodified;
+ // Unroll factor (Count) must be less or equal to TripCount.
+ if (TripCount && UP.Count > TripCount)
+ UP.Count = TripCount;
+
+ // Unroll the loop.
+ LoopUnrollResult UnrollResult = UnrollLoop(
+ L, UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
+ UseUpperBound, MaxOrZero, TripMultiple, UP.PeelCount, UP.UnrollRemainder,
+ LI, &SE, &DT, &AC, &ORE, PreserveLCSSA);
+ if (UnrollResult == LoopUnrollResult::Unmodified)
+ return LoopUnrollResult::Unmodified;
+
+ // If the loop has an unroll count pragma or was unrolled by an explicitly
+ // set count, mark it as unrolled to prevent unrolling beyond what was
+ // requested.
+ // If the loop was peeled, we already "used up" the profile information
+ // we had, so we don't want to unroll or peel again.
+ if (UnrollResult != LoopUnrollResult::FullyUnrolled &&
+ (IsCountSetExplicitly || UP.PeelCount))
+ L->setLoopAlreadyUnrolled();
+
+ return UnrollResult;
+}
+
+namespace {
+
+class LoopUnroll : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ int OptLevel;
+ Optional<unsigned> ProvidedCount;
+ Optional<unsigned> ProvidedThreshold;
+ Optional<bool> ProvidedAllowPartial;
+ Optional<bool> ProvidedRuntime;
+ Optional<bool> ProvidedUpperBound;
+ Optional<bool> ProvidedAllowPeeling;
+
+ LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None,
+ Optional<unsigned> Count = None,
+ Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
+ Optional<bool> UpperBound = None,
+ Optional<bool> AllowPeeling = None)
+ : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)),
+ ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
+ ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
+ ProvidedAllowPeeling(AllowPeeling) {
+ initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ Function &F = *L->getHeader()->getParent();
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(&F);
+ bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+ LoopUnrollResult Result = tryToUnrollLoop(
+ L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, ProvidedCount,
+ ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime,
+ ProvidedUpperBound, ProvidedAllowPeeling);
+
+ if (Result == LoopUnrollResult::FullyUnrolled)
+ LPM.markLoopAsDeleted(*L);
+
+ return Result != LoopUnrollResult::Unmodified;
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ // FIXME: Loop passes are required to preserve domtree, and for now we just
+ // recreate dom info if anything gets unrolled.
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char LoopUnroll::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
+
+Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count,
+ int AllowPartial, int Runtime, int UpperBound,
+ int AllowPeeling) {
+ // TODO: It would make more sense for this function to take the optionals
+ // directly, but that's dangerous since it would silently break out of tree
+ // callers.
+ return new LoopUnroll(
+ OptLevel, Threshold == -1 ? None : Optional<unsigned>(Threshold),
+ Count == -1 ? None : Optional<unsigned>(Count),
+ AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
+ Runtime == -1 ? None : Optional<bool>(Runtime),
+ UpperBound == -1 ? None : Optional<bool>(UpperBound),
+ AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling));
+}
+
+Pass *llvm::createSimpleLoopUnrollPass(int OptLevel) {
+ return createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0, 0);
+}
+
+PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &Updater) {
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+ Function *F = L.getHeader()->getParent();
+
+ auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
+ // FIXME: This should probably be optional rather than required.
+ if (!ORE)
+ report_fatal_error(
+ "LoopFullUnrollPass: OptimizationRemarkEmitterAnalysis not "
+ "cached at a higher level");
+
+ // Keep track of the previous loop structure so we can identify new loops
+ // created by unrolling.
+ Loop *ParentL = L.getParentLoop();
+ SmallPtrSet<Loop *, 4> OldLoops;
+ if (ParentL)
+ OldLoops.insert(ParentL->begin(), ParentL->end());
+ else
+ OldLoops.insert(AR.LI.begin(), AR.LI.end());
+
+ std::string LoopName = L.getName();
+
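+ // Only full unrolling is attempted here: partial, runtime, upper-bound, and
+ // peeling unrolling are all explicitly disabled in this call.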
+ bool Changed =
+ tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
+ /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
+ /*Threshold*/ None, /*AllowPartial*/ false,
+ /*Runtime*/ false, /*UpperBound*/ false,
+ /*AllowPeeling*/ false) != LoopUnrollResult::Unmodified;
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ // The parent must not be damaged by unrolling!
+#ifndef NDEBUG
+ if (ParentL)
+ ParentL->verifyLoop();
+#endif
+
+ // Unrolling can do several things to introduce new loops into a loop nest:
+ // - Full unrolling clones child loops within the current loop but then
+ // removes the current loop making all of the children appear to be new
+ // sibling loops.
+ //
+ // When a new loop appears as a sibling loop after fully unrolling,
+ // its nesting structure has fundamentally changed and we want to revisit
+ // it to reflect that.
+ //
+ // When unrolling has removed the current loop, we need to tell the
+ // infrastructure that it is gone.
+ //
+ // Finally, we support a debugging/testing mode where we revisit child loops
+ // as well. These are not expected to require further optimizations as either
+ // they or the loop they were cloned from have been directly visited already.
+ // But the debugging mode allows us to check this assumption.
+ bool IsCurrentLoopValid = false;
+ SmallVector<Loop *, 4> SibLoops;
+ if (ParentL)
+ SibLoops.append(ParentL->begin(), ParentL->end());
+ else
+ SibLoops.append(AR.LI.begin(), AR.LI.end());
+ erase_if(SibLoops, [&](Loop *SibLoop) {
+ if (SibLoop == &L) {
+ IsCurrentLoopValid = true;
+ return true;
+ }
+
+ // Otherwise erase the loop from the list if it was in the old loops.
+ return OldLoops.count(SibLoop) != 0;
+ });
+ Updater.addSiblingLoops(SibLoops);
+
+ if (!IsCurrentLoopValid) {
+ Updater.markLoopAsDeleted(L, LoopName);
+ } else {
+ // We can only walk child loops if the current loop remained valid.
+ if (UnrollRevisitChildLoops) {
+ // Walk *all* of the child loops.
+ SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
+ Updater.addChildLoops(ChildLoops);
+ }
+ }
+
+ return getLoopPassPreservedAnalyses();
+}
+
+template <typename RangeT>
+static SmallVector<Loop *, 8> appendLoopsToWorklist(RangeT &&Loops) {
+ SmallVector<Loop *, 8> Worklist;
+ // We use an internal worklist to build up the preorder traversal without
+ // recursion.
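+ // Parents are appended before their children, so consumers that pop loops
+ // off the back of the returned worklist visit inner loops first.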
+ SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist;
+
+ for (Loop *RootL : Loops) {
+ assert(PreOrderLoops.empty() && "Must start with an empty preorder walk.");
+ assert(PreOrderWorklist.empty() &&
+ "Must start with an empty preorder walk worklist.");
+ PreOrderWorklist.push_back(RootL);
+ do {
+ Loop *L = PreOrderWorklist.pop_back_val();
+ PreOrderWorklist.append(L->begin(), L->end());
+ PreOrderLoops.push_back(L);
+ } while (!PreOrderWorklist.empty());
+
+ Worklist.append(PreOrderLoops.begin(), PreOrderLoops.end());
+ PreOrderLoops.clear();
+ }
+ return Worklist;
+}
+
+PreservedAnalyses LoopUnrollPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ LoopAnalysisManager *LAM = nullptr;
+ if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
+ LAM = &LAMProxy->getManager();
+
+ const ModuleAnalysisManager &MAM =
+ AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
+ ProfileSummaryInfo *PSI =
+ MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+
+ bool Changed = false;
+
+ // The unroller requires loops to be in simplified form, and also needs LCSSA.
+ // Since simplification may add new inner loops, it has to run before the
+ // legality and profitability checks. This means running the loop unroller
+ // will simplify all loops, regardless of whether anything ends up being
+ // unrolled.
+ for (auto &L : LI) {
+ Changed |= simplifyLoop(L, &DT, &LI, &SE, &AC, false /* PreserveLCSSA */);
+ Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
+ }
+
+ SmallVector<Loop *, 8> Worklist = appendLoopsToWorklist(LI);
+
+ while (!Worklist.empty()) {
+ // Because the LoopInfo stores the loops in RPO, we walk the worklist
+ // from back to front so that we work forward across the CFG, which
+ // for unrolling is only needed to get optimization remarks emitted in
+ // a forward order.
+ Loop &L = *Worklist.pop_back_val();
+#ifndef NDEBUG
+ Loop *ParentL = L.getParentLoop();
+#endif
+
+ // The API here is quite complex to call, but there are only two interesting
+ // states we support: partial and full (or "simple") unrolling. However, to
+ // enable these things we actually pass "None" in for the optional to avoid
+ // providing an explicit choice.
+ Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam,
+ AllowPeeling;
+ // Check if the profile summary indicates that the profiled application
+ // has a huge working set size, in which case we disable peeling to avoid
+ // bloating it further.
+ if (PSI && PSI->hasHugeWorkingSetSize())
+ AllowPeeling = false;
+ std::string LoopName = L.getName();
+ LoopUnrollResult Result =
+ tryToUnrollLoop(&L, DT, &LI, SE, TTI, AC, ORE,
+ /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
+ /*Threshold*/ None, AllowPartialParam, RuntimeParam,
+ UpperBoundParam, AllowPeeling);
+ Changed |= Result != LoopUnrollResult::Unmodified;
+
+ // The parent must not be damaged by unrolling!
+#ifndef NDEBUG
+ if (Result != LoopUnrollResult::Unmodified && ParentL)
+ ParentL->verifyLoop();
+#endif
+
+ // Clear any cached analysis results for L if we removed it completely.
+ if (LAM && Result == LoopUnrollResult::FullyUnrolled)
+ LAM->clear(L, LoopName);
+ }
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
new file mode 100644
index 000000000000..f2405d9b0c03
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -0,0 +1,1613 @@
+//===- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops that contain branches on loop-invariant conditions
+// to multiple loops. For example, it turns the left into the right code:
+//
+// for (...) if (lic)
+// A for (...)
+// if (lic) A; B; C
+// B else
+// C for (...)
+// A; C
+//
+// This can increase the size of the code exponentially (doubling it every time
+// a loop is unswitched) so we only unswitch if the resultant code will be
+// smaller than a threshold.
+//
+// This pass expects LICM to be run before it to hoist invariant conditions out
+// of the loop, to make the unswitching opportunity obvious.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <map>
+#include <set>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unswitch"
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumGuards, "Number of guards unswitched");
+STATISTIC(NumSelects, "Number of selects unswitched");
+STATISTIC(NumTrivial, "Number of unswitches that are trivial");
+STATISTIC(NumSimplify, "Number of simplifications of unswitched code");
+STATISTIC(TotalInsts, "Total number of instructions analyzed");
+
+// The specific value of 100 here was chosen based only on intuition and a
+// few specific examples.
+static cl::opt<unsigned>
+Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
+ cl::init(100), cl::Hidden);
+
+namespace {
+
+ class LUAnalysisCache {
+ using UnswitchedValsMap =
+ DenseMap<const SwitchInst *, SmallPtrSet<const Value *, 8>>;
+ using UnswitchedValsIt = UnswitchedValsMap::iterator;
+
+ struct LoopProperties {
+ unsigned CanBeUnswitchedCount;
+ unsigned WasUnswitchedCount;
+ unsigned SizeEstimation;
+ UnswitchedValsMap UnswitchedVals;
+ };
+
+ // Here we use std::map instead of DenseMap, since we need to keep a valid
+ // LoopProperties pointer for the current loop for better performance.
+ using LoopPropsMap = std::map<const Loop *, LoopProperties>;
+ using LoopPropsMapIt = LoopPropsMap::iterator;
+
+ LoopPropsMap LoopsProperties;
+ UnswitchedValsMap *CurLoopInstructions = nullptr;
+ LoopProperties *CurrentLoopProperties = nullptr;
+
+ // A loop unswitching with an estimated cost above this threshold
+ // is not performed. MaxSize is turned into unswitching quota for
+ // the current loop, and reduced correspondingly, though note that
+ // the quota is returned by releaseMemory() when the loop has been
+ // processed, so that MaxSize will return to its previous
+ // value. So in most cases MaxSize will equal the Threshold flag
+ // when a new loop is processed. An exception to that is that
+ // MaxSize will have a smaller value while processing nested loops
+ // that were introduced due to loop unswitching of an outer loop.
+ //
+ // FIXME: The way that MaxSize works is subtle and depends on the
+ // pass manager processing loops and calling releaseMemory() in a
+ // specific order. It would be good to find a more straightforward
+ // way of doing what MaxSize does.
+ unsigned MaxSize;
+
+ public:
+ LUAnalysisCache() : MaxSize(Threshold) {}
+
+ // Analyze the loop. Check its size and calculate whether it is possible
+ // to unswitch it. Returns true if we can unswitch this loop.
+ bool countLoop(const Loop *L, const TargetTransformInfo &TTI,
+ AssumptionCache *AC);
+
+ // Clean all data related to given loop.
+ void forgetLoop(const Loop *L);
+
+ // Mark case value as unswitched.
+ // Since a SwitchInst can be partly unswitched, keep track of all
+ // unswitched values in order to avoid extra unswitching in cloned loops.
+ void setUnswitched(const SwitchInst *SI, const Value *V);
+
+    // Check whether this case value has been unswitched before or not.
+ bool isUnswitched(const SwitchInst *SI, const Value *V);
+
+ // Returns true if another unswitching could be done within the cost
+ // threshold.
+ bool CostAllowsUnswitching();
+
+    // Clone all loop-unswitch related loop properties.
+    // Redistribute unswitching quotas.
+    // Note that the new loop's data is found through the VMap.
+ void cloneData(const Loop *NewLoop, const Loop *OldLoop,
+ const ValueToValueMapTy &VMap);
+ };
+
+ class LoopUnswitch : public LoopPass {
+ LoopInfo *LI; // Loop information
+ LPPassManager *LPM;
+ AssumptionCache *AC;
+
+    // Used to check whether the second loop needs processing after
+    // RewriteLoopBodyWithConditionConstant rewrites the first loop.
+ std::vector<Loop*> LoopProcessWorklist;
+
+ LUAnalysisCache BranchesInfo;
+
+ bool OptimizeForSize;
+ bool redoLoop = false;
+
+ Loop *currentLoop = nullptr;
+ DominatorTree *DT = nullptr;
+ BasicBlock *loopHeader = nullptr;
+ BasicBlock *loopPreheader = nullptr;
+
+ bool SanitizeMemory;
+ LoopSafetyInfo SafetyInfo;
+
+ // LoopBlocks contains all of the basic blocks of the loop, including the
+ // preheader of the loop, the body of the loop, and the exit blocks of the
+ // loop, in that order.
+ std::vector<BasicBlock*> LoopBlocks;
+    // NewBlocks contains the cloned copies of the basic blocks in LoopBlocks.
+ std::vector<BasicBlock*> NewBlocks;
+
+ bool hasBranchDivergence;
+
+ public:
+ static char ID; // Pass ID, replacement for typeid
+
+ explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false)
+ : LoopPass(ID), OptimizeForSize(Os),
+ hasBranchDivergence(hasBranchDivergence) {
+ initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ bool processCurrentLoop();
+ bool isUnreachableDueToPreviousUnswitching(BasicBlock *);
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG.
+ ///
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
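+      // The divergence analysis is only needed for targets with divergent
+      // branches (e.g. GPUs); UnswitchIfProfitable consults it to reject
+      // unswitching on divergent conditions.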
+ if (hasBranchDivergence)
+ AU.addRequired<DivergenceAnalysis>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ private:
+ void releaseMemory() override {
+ BranchesInfo.forgetLoop(currentLoop);
+ }
+
+ void initLoopData() {
+ loopHeader = currentLoop->getHeader();
+ loopPreheader = currentLoop->getLoopPreheader();
+ }
+
+ /// Split all of the edges from inside the loop to their exit blocks.
+ /// Update the appropriate Phi nodes as we do so.
+ void SplitExitEdges(Loop *L,
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+ bool TryTrivialLoopUnswitch(bool &Changed);
+
+ bool UnswitchIfProfitable(Value *LoopCond, Constant *Val,
+ TerminatorInst *TI = nullptr);
+ void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+ BasicBlock *ExitBlock, TerminatorInst *TI);
+ void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L,
+ TerminatorInst *TI);
+
+ void RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+ Constant *Val, bool isEqual);
+
+ void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+ BasicBlock *TrueDest,
+ BasicBlock *FalseDest,
+ BranchInst *OldBranch,
+ TerminatorInst *TI);
+
+ void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
+
+ /// Given that the Invariant is not equal to Val. Simplify instructions
+ /// in the loop.
+ Value *SimplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant,
+ Constant *Val);
+ };
+
+} // end anonymous namespace
+
+// Analyze the loop: check its size and determine whether it is possible to
+// unswitch it. Returns true if we can unswitch this loop.
+bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
+ AssumptionCache *AC) {
+ LoopPropsMapIt PropsIt;
+ bool Inserted;
+ std::tie(PropsIt, Inserted) =
+ LoopsProperties.insert(std::make_pair(L, LoopProperties()));
+
+ LoopProperties &Props = PropsIt->second;
+
+ if (Inserted) {
+ // New loop.
+
+ // Limit the number of instructions to avoid causing significant code
+ // expansion, and the number of basic blocks, to avoid loops with
+ // large numbers of branches which cause loop unswitching to go crazy.
+ // This is a very ad-hoc heuristic.
+
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ // FIXME: This is overly conservative because it does not take into
+ // consideration code simplification opportunities and code that can
+ // be shared by the resultant unswitched loops.
+ CodeMetrics Metrics;
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
+ ++I)
+ Metrics.analyzeBasicBlock(*I, TTI, EphValues);
+
+ Props.SizeEstimation = Metrics.NumInsts;
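+    // Each unswitch roughly duplicates the loop, so the per-loop quota is the
+    // number of copies of this loop that still fit into the remaining global
+    // budget (MaxSize); that amount is then reserved out of the budget.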
+ Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation);
+ Props.WasUnswitchedCount = 0;
+ MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount;
+
+ if (Metrics.notDuplicatable) {
+ DEBUG(dbgs() << "NOT unswitching loop %"
+ << L->getHeader()->getName() << ", contents cannot be "
+ << "duplicated!\n");
+ return false;
+ }
+ }
+
+  // Be careful: these pointers are valid only until a new loop is added.
+ CurrentLoopProperties = &Props;
+ CurLoopInstructions = &Props.UnswitchedVals;
+
+ return true;
+}
+
+// Clean all data related to the given loop.
+void LUAnalysisCache::forgetLoop(const Loop *L) {
+ LoopPropsMapIt LIt = LoopsProperties.find(L);
+
+ if (LIt != LoopsProperties.end()) {
+ LoopProperties &Props = LIt->second;
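+    // Return this loop's unswitching quota, both the used and the unused
+    // portion, to the global budget.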
+ MaxSize += (Props.CanBeUnswitchedCount + Props.WasUnswitchedCount) *
+ Props.SizeEstimation;
+ LoopsProperties.erase(LIt);
+ }
+
+ CurrentLoopProperties = nullptr;
+ CurLoopInstructions = nullptr;
+}
+
+// Mark the case value as unswitched.
+// Since a SwitchInst can be partially unswitched, keep track of all
+// unswitched values to avoid extra unswitching in cloned loops.
+void LUAnalysisCache::setUnswitched(const SwitchInst *SI, const Value *V) {
+ (*CurLoopInstructions)[SI].insert(V);
+}
+
+// Check whether this case value has been unswitched before or not.
+bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) {
+ return (*CurLoopInstructions)[SI].count(V);
+}
+
+bool LUAnalysisCache::CostAllowsUnswitching() {
+ return CurrentLoopProperties->CanBeUnswitchedCount > 0;
+}
+
+// Clone all loop-unswitch related loop properties.
+// Redistribute unswitching quotas.
+// Note that the new loop's data is found through the VMap.
+void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
+ const ValueToValueMapTy &VMap) {
+ LoopProperties &NewLoopProps = LoopsProperties[NewLoop];
+ LoopProperties &OldLoopProps = *CurrentLoopProperties;
+ UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals;
+
+ // Reallocate "can-be-unswitched quota"
+
+ --OldLoopProps.CanBeUnswitchedCount;
+ ++OldLoopProps.WasUnswitchedCount;
+ NewLoopProps.WasUnswitchedCount = 0;
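+  // Split the remaining quota roughly evenly between the two loop versions;
+  // the original loop keeps the extra unit when the quota is odd.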
+ unsigned Quota = OldLoopProps.CanBeUnswitchedCount;
+ NewLoopProps.CanBeUnswitchedCount = Quota / 2;
+ OldLoopProps.CanBeUnswitchedCount = Quota - Quota / 2;
+
+ NewLoopProps.SizeEstimation = OldLoopProps.SizeEstimation;
+
+  // Clone the unswitched values info:
+  // for the new loop's switches we clone the info about values that were
+  // already unswitched and have redundant successors.
+ for (UnswitchedValsIt I = Insts.begin(); I != Insts.end(); ++I) {
+ const SwitchInst *OldInst = I->first;
+ Value *NewI = VMap.lookup(OldInst);
+ const SwitchInst *NewInst = cast_or_null<SwitchInst>(NewI);
+ assert(NewInst && "All instructions that are in SrcBB must be in VMap.");
+
+ NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst];
+ }
+}
+
+char LoopUnswitch::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
+ false, false)
+
+Pass *llvm::createLoopUnswitchPass(bool Os, bool hasBranchDivergence) {
+ return new LoopUnswitch(Os, hasBranchDivergence);
+}
+
+/// Operator chain lattice.
+enum OperatorChain {
+ OC_OpChainNone, ///< There is no operator.
+ OC_OpChainOr, ///< There are only ORs.
+ OC_OpChainAnd, ///< There are only ANDs.
+ OC_OpChainMixed ///< There are ANDs and ORs.
+};
+
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant. Otherwise, return null.
+///
+/// NOTE: FindLIVLoopCondition will not return a partial LIV by walking up a
+/// mixed operator chain, as we cannot reliably find a value which will simplify
+/// the operator chain. If the chain is AND-only or OR-only, we can use 0 or ~0
+/// to simplify the chain.
+///
+/// NOTE: In the case of a partial LIV and a mixed operator chain, we may be
+/// able to simplify the condition itself to a loop-variant condition, but at
+/// the cost of creating an entirely new loop.
+static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
+ OperatorChain &ParentChain,
+ DenseMap<Value *, Value *> &Cache) {
+ auto CacheIt = Cache.find(Cond);
+ if (CacheIt != Cache.end())
+ return CacheIt->second;
+
+  // We started analyzing a new instruction; increment the counter of scanned
+  // instructions.
+ ++TotalInsts;
+
+ // We can never unswitch on vector conditions.
+ if (Cond->getType()->isVectorTy())
+ return nullptr;
+
+ // Constants should be folded, not unswitched on!
+ if (isa<Constant>(Cond)) return nullptr;
+
+ // TODO: Handle: br (VARIANT|INVARIANT).
+
+ // Hoist simple values out.
+ if (L->makeLoopInvariant(Cond, Changed)) {
+ Cache[Cond] = Cond;
+ return Cond;
+ }
+
+ // Walk up the operator chain to find partial invariant conditions.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
+ if (BO->getOpcode() == Instruction::And ||
+ BO->getOpcode() == Instruction::Or) {
+ // Given the previous operator, compute the current operator chain status.
+ OperatorChain NewChain;
+ switch (ParentChain) {
+ case OC_OpChainNone:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainOr;
+ break;
+ case OC_OpChainOr:
+ NewChain = BO->getOpcode() == Instruction::Or ? OC_OpChainOr :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainAnd:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainMixed:
+ NewChain = OC_OpChainMixed;
+ break;
+ }
+
+      // If we reach a Mixed state, we do not want to keep walking up, as we
+      // cannot reliably find a value that will simplify the chain. With this
+      // check we return null at the first sight of a mixed chain, and the
+      // caller will either backtrack to find a partial LIV in the other
+      // operand or return null.
+ if (NewChain != OC_OpChainMixed) {
+ // Update the current operator chain type before we search up the chain.
+ ParentChain = NewChain;
+ // If either the left or right side is invariant, we can unswitch on this,
+ // which will cause the branch to go away in one loop and the condition to
+ // simplify in the other one.
+ if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed,
+ ParentChain, Cache)) {
+ Cache[Cond] = LHS;
+ return LHS;
+ }
+ // We did not manage to find a partial LIV in operand(0). Backtrack and try
+ // operand(1).
+ ParentChain = NewChain;
+ if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed,
+ ParentChain, Cache)) {
+ Cache[Cond] = RHS;
+ return RHS;
+ }
+ }
+ }
+
+ Cache[Cond] = nullptr;
+ return nullptr;
+}
+
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant along with the operator chain type.
+/// Otherwise, return null.
+static std::pair<Value *, OperatorChain> FindLIVLoopCondition(Value *Cond,
+ Loop *L,
+ bool &Changed) {
+ DenseMap<Value *, Value *> Cache;
+ OperatorChain OpChain = OC_OpChainNone;
+ Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache);
+
+  // If we do find an LIV, it cannot have been obtained by walking up a mixed
+  // operator chain.
+ assert((!FCond || OpChain != OC_OpChainMixed) &&
+ "Do not expect a partial LIV with mixed operator chain");
+ return {FCond, OpChain};
+}
+
+bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
+ if (skipLoop(L))
+ return false;
+
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ LPM = &LPM_Ref;
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ currentLoop = L;
+ Function *F = currentLoop->getHeader()->getParent();
+
+ SanitizeMemory = F->hasFnAttribute(Attribute::SanitizeMemory);
+ if (SanitizeMemory)
+ computeLoopSafetyInfo(&SafetyInfo, L);
+
+ bool Changed = false;
+ do {
+ assert(currentLoop->isLCSSAForm(*DT));
+ redoLoop = false;
+ Changed |= processCurrentLoop();
+ } while(redoLoop);
+
+ return Changed;
+}
+
+// Return true if the BasicBlock BB is unreachable from the loop header.
+// Return false otherwise.
+bool LoopUnswitch::isUnreachableDueToPreviousUnswitching(BasicBlock *BB) {
+ auto *Node = DT->getNode(BB)->getIDom();
+ BasicBlock *DomBB = Node->getBlock();
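+  // Walk up the immediate-dominator chain inside the loop. Previous
+  // unswitching leaves behind branches on constant conditions; if the dead
+  // successor of such a branch dominates BB, then BB is unreachable.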
+ while (currentLoop->contains(DomBB)) {
+ BranchInst *BInst = dyn_cast<BranchInst>(DomBB->getTerminator());
+
+ Node = DT->getNode(DomBB)->getIDom();
+ DomBB = Node->getBlock();
+
+ if (!BInst || !BInst->isConditional())
+ continue;
+
+ Value *Cond = BInst->getCondition();
+ if (!isa<ConstantInt>(Cond))
+ continue;
+
+ BasicBlock *UnreachableSucc =
+ Cond == ConstantInt::getTrue(Cond->getContext())
+ ? BInst->getSuccessor(1)
+ : BInst->getSuccessor(0);
+
+ if (DT->dominates(UnreachableSucc, BB))
+ return true;
+ }
+ return false;
+}
+
+/// FIXME: Remove this workaround when freeze related patches are done.
+/// LoopUnswitch and equality propagation in GVN disagree about whether
+/// branching on undef/poison is undefined behavior. This check rules out some
+/// common cases in which we have already seen that discrepancy cause problems.
+/// Details can be found in PR31652. Note that if this function returns true,
+/// the transform is unsafe; if it returns false, the transform is not
+/// necessarily safe.
+static bool EqualityPropUnSafe(Value &LoopCond) {
+ ICmpInst *CI = dyn_cast<ICmpInst>(&LoopCond);
+ if (!CI || !CI->isEquality())
+ return false;
+
+ Value *LHS = CI->getOperand(0);
+ Value *RHS = CI->getOperand(1);
+ if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS))
+ return true;
+
+ auto hasUndefInPHI = [](PHINode &PN) {
+ for (Value *Opd : PN.incoming_values()) {
+ if (isa<UndefValue>(Opd))
+ return true;
+ }
+ return false;
+ };
+ PHINode *LPHI = dyn_cast<PHINode>(LHS);
+ PHINode *RPHI = dyn_cast<PHINode>(RHS);
+ if ((LPHI && hasUndefInPHI(*LPHI)) || (RPHI && hasUndefInPHI(*RPHI)))
+ return true;
+
+ auto hasUndefInSelect = [](SelectInst &SI) {
+ if (isa<UndefValue>(SI.getTrueValue()) ||
+ isa<UndefValue>(SI.getFalseValue()))
+ return true;
+ return false;
+ };
+ SelectInst *LSI = dyn_cast<SelectInst>(LHS);
+ SelectInst *RSI = dyn_cast<SelectInst>(RHS);
+ if ((LSI && hasUndefInSelect(*LSI)) || (RSI && hasUndefInSelect(*RSI)))
+ return true;
+ return false;
+}
+
+/// Do actual work and unswitch loop if possible and profitable.
+bool LoopUnswitch::processCurrentLoop() {
+ bool Changed = false;
+
+ initLoopData();
+
+ // If LoopSimplify was unable to form a preheader, don't do any unswitching.
+ if (!loopPreheader)
+ return false;
+
+ // Loops with indirectbr cannot be cloned.
+ if (!currentLoop->isSafeToClone())
+ return false;
+
+ // Without dedicated exits, splitting the exit edge may fail.
+ if (!currentLoop->hasDedicatedExits())
+ return false;
+
+ LLVMContext &Context = loopHeader->getContext();
+
+  // Analyze the loop cost, and stop unswitching if the loop's contents cannot
+  // be duplicated.
+ if (!BranchesInfo.countLoop(
+ currentLoop, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *currentLoop->getHeader()->getParent()),
+ AC))
+ return false;
+
+  // Try trivial unswitching first, before looping over the other basic blocks
+  // in the loop.
+ if (TryTrivialLoopUnswitch(Changed)) {
+ return true;
+ }
+
+ // Run through the instructions in the loop, keeping track of three things:
+ //
+ // - That we do not unswitch loops containing convergent operations, as we
+ // might be making them control dependent on the unswitch value when they
+ // were not before.
+ // FIXME: This could be refined to only bail if the convergent operation is
+ // not already control-dependent on the unswitch value.
+ //
+ // - That basic blocks in the loop contain invokes whose predecessor edges we
+ // cannot split.
+ //
+  // - The set of guard intrinsics encountered (these are non-terminator
+  //   instructions that are also profitable to unswitch).
+
+ SmallVector<IntrinsicInst *, 4> Guards;
+
+ for (const auto BB : currentLoop->blocks()) {
+ for (auto &I : *BB) {
+ auto CS = CallSite(&I);
+ if (!CS) continue;
+ if (CS.hasFnAttr(Attribute::Convergent))
+ return false;
+ if (auto *II = dyn_cast<InvokeInst>(&I))
+ if (!II->getUnwindDest()->canSplitPredecessors())
+ return false;
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::experimental_guard)
+ Guards.push_back(II);
+ }
+ }
+
+ // Do not do non-trivial unswitch while optimizing for size.
+ // FIXME: Use Function::optForSize().
+ if (OptimizeForSize ||
+ loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
+ return false;
+
+ for (IntrinsicInst *Guard : Guards) {
+ Value *LoopCond =
+ FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first;
+ if (LoopCond &&
+ UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
+ // NB! Unswitching (if successful) could have erased some of the
+ // instructions in Guards leaving dangling pointers there. This is fine
+ // because we're returning now, and won't look at Guards again.
+ ++NumGuards;
+ return true;
+ }
+ }
+
+ // Loop over all of the basic blocks in the loop. If we find an interior
+ // block that is branching on a loop-invariant condition, we can unswitch this
+ // loop.
+ for (Loop::block_iterator I = currentLoop->block_begin(),
+ E = currentLoop->block_end(); I != E; ++I) {
+ TerminatorInst *TI = (*I)->getTerminator();
+
+ // Unswitching on a potentially uninitialized predicate is not
+ // MSan-friendly. Limit this to the cases when the original predicate is
+ // guaranteed to execute, to avoid creating a use-of-uninitialized-value
+ // in the code that did not have one.
+ // This is a workaround for the discrepancy between LLVM IR and MSan
+ // semantics. See PR28054 for more details.
+ if (SanitizeMemory &&
+ !isGuaranteedToExecute(*TI, DT, currentLoop, &SafetyInfo))
+ continue;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ // Some branches may be rendered unreachable because of previous
+ // unswitching.
+ // Unswitch only those branches that are reachable.
+ if (isUnreachableDueToPreviousUnswitching(*I))
+ continue;
+
+ // If this isn't branching on an invariant condition, we can't unswitch
+ // it.
+ if (BI->isConditional()) {
+ // See if this, or some part of it, is loop invariant. If so, we can
+ // unswitch on it if we desire.
+ Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
+ currentLoop, Changed).first;
+ if (LoopCond && !EqualityPropUnSafe(*LoopCond) &&
+ UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
+ ++NumBranches;
+ return true;
+ }
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ Value *SC = SI->getCondition();
+ Value *LoopCond;
+ OperatorChain OpChain;
+ std::tie(LoopCond, OpChain) =
+ FindLIVLoopCondition(SC, currentLoop, Changed);
+
+ unsigned NumCases = SI->getNumCases();
+ if (LoopCond && NumCases) {
+ // Find a value to unswitch on:
+        // FIXME: this should choose the most expensive case!
+ // FIXME: scan for a case with a non-critical edge?
+ Constant *UnswitchVal = nullptr;
+ // Find a case value such that at least one case value is unswitched
+ // out.
+ if (OpChain == OC_OpChainAnd) {
+          // If the chain has only ANDs and the switch has a case value of 0,
+          // feeding a 0 into the chain will unswitch out the 0 case value.
+ auto *AllZero = cast<ConstantInt>(Constant::getNullValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllZero))
+ continue;
+ // We are unswitching 0 out.
+ UnswitchVal = AllZero;
+ } else if (OpChain == OC_OpChainOr) {
+          // If the chain has only ORs and the switch has a case value of ~0,
+          // feeding a ~0 into the chain will unswitch out the ~0 case value.
+ auto *AllOne = cast<ConstantInt>(Constant::getAllOnesValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllOne))
+ continue;
+ // We are unswitching ~0 out.
+ UnswitchVal = AllOne;
+ } else {
+ assert(OpChain == OC_OpChainNone &&
+ "Expect to unswitch on trivial chain");
+          // Do not process the same value again and again.
+          // At this point some case values are already unswitched and some are
+          // not yet; find the first one that has not been unswitched.
+ for (auto Case : SI->cases()) {
+ Constant *UnswitchValCandidate = Case.getCaseValue();
+ if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
+ UnswitchVal = UnswitchValCandidate;
+ break;
+ }
+ }
+ }
+
+ if (!UnswitchVal)
+ continue;
+
+ if (UnswitchIfProfitable(LoopCond, UnswitchVal)) {
+ ++NumSwitches;
+          // In case of a full LIV, UnswitchVal is the value we unswitched out.
+          // In case of a partial LIV, we only unswitch when it is an AND-chain
+          // or an OR-chain. In both cases the switch's input value simplifies
+          // to UnswitchVal.
+ BranchesInfo.setUnswitched(SI, UnswitchVal);
+ return true;
+ }
+ }
+ }
+
+ // Scan the instructions to check for unswitchable values.
+ for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end();
+ BBI != E; ++BBI)
+ if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
+ Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
+ currentLoop, Changed).first;
+ if (LoopCond && UnswitchIfProfitable(LoopCond,
+ ConstantInt::getTrue(Context))) {
+ ++NumSelects;
+ return true;
+ }
+ }
+ }
+ return Changed;
+}
+
+/// Check to see if all paths from BB exit the loop with no side effects
+/// (including infinite loops).
+///
+/// If so, return true and set ExitBB to the block we exit through.
+///
+static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
+ BasicBlock *&ExitBB,
+ std::set<BasicBlock*> &Visited) {
+ if (!Visited.insert(BB).second) {
+ // Already visited. Without more analysis, this could indicate an infinite
+ // loop.
+ return false;
+ }
+ if (!L->contains(BB)) {
+    // Otherwise, this is a loop exit; that is fine so long as it is the first
+    // exit.
+ if (ExitBB) return false;
+ ExitBB = BB;
+ return true;
+ }
+
+ // Otherwise, this is an unvisited intra-loop node. Check all successors.
+ for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) {
+ // Check to see if the successor is a trivial loop exit.
+ if (!isTrivialLoopExitBlockHelper(L, *SI, ExitBB, Visited))
+ return false;
+ }
+
+  // Okay, everything after this looks good; check to make sure that this block
+  // doesn't include any side effects.
+ for (Instruction &I : *BB)
+ if (I.mayHaveSideEffects())
+ return false;
+
+ return true;
+}
+
+/// Return true if the specified block unconditionally leads to an exit from
+/// the specified loop, and has no side-effects in the process. If so, return
+/// the block that is exited to, otherwise return null.
+static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
+ std::set<BasicBlock*> Visited;
+ Visited.insert(L->getHeader()); // Branches to header make infinite loops.
+ BasicBlock *ExitBB = nullptr;
+ if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited))
+ return ExitBB;
+ return nullptr;
+}
+
+/// We have found that we can unswitch currentLoop when LoopCond == Val to
+/// simplify the loop. If we decide that this is profitable,
+/// unswitch the loop, reprocess the pieces, then return true.
+bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
+ TerminatorInst *TI) {
+ // Check to see if it would be profitable to unswitch current loop.
+ if (!BranchesInfo.CostAllowsUnswitching()) {
+    DEBUG(dbgs() << "NOT unswitching loop %"
+                 << currentLoop->getHeader()->getName()
+                 << " at non-trivial condition '" << *Val
+                 << "' == " << *LoopCond << ". Cost too high.\n");
+ return false;
+ }
+ if (hasBranchDivergence &&
+ getAnalysis<DivergenceAnalysis>().isDivergent(LoopCond)) {
+    DEBUG(dbgs() << "NOT unswitching loop %"
+                 << currentLoop->getHeader()->getName()
+                 << " at non-trivial condition '" << *Val
+                 << "' == " << *LoopCond << ". Condition is divergent.\n");
+ return false;
+ }
+
+ UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI);
+ return true;
+}
+
+/// Recursively clone the specified loop and all of its children,
+/// mapping the blocks with the specified map.
+static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
+ LoopInfo *LI, LPPassManager *LPM) {
+ Loop &New = *LI->AllocateLoop();
+ if (PL)
+ PL->addChildLoop(&New);
+ else
+ LI->addTopLevelLoop(&New);
+ LPM->addLoop(New);
+
+ // Add all of the blocks in L to the new loop.
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I)
+ if (LI->getLoopFor(*I) == L)
+ New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI);
+
+ // Add all of the subloops to the new loop.
+ for (Loop *I : *L)
+ CloneLoop(I, &New, VM, LI, LPM);
+
+ return &New;
+}
+
+/// Emit a conditional branch on two values: if LIC == Val, branch to TrueDest,
+/// otherwise branch to FalseDest. Insert the code immediately before OldBranch
+/// and remove (but do not erase!) it from the function.
+void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+ BasicBlock *TrueDest,
+ BasicBlock *FalseDest,
+ BranchInst *OldBranch,
+ TerminatorInst *TI) {
+ assert(OldBranch->isUnconditional() && "Preheader is not split correctly");
+ // Insert a conditional branch on LIC to the two preheaders. The original
+ // code is the true version and the new code is the false version.
+ Value *BranchVal = LIC;
+ bool Swapped = false;
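+  // If Val is not a boolean constant (e.g. it is a switch case value), branch
+  // on an explicit LIC == Val comparison instead of on LIC itself.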
+ if (!isa<ConstantInt>(Val) ||
+ Val->getType() != Type::getInt1Ty(LIC->getContext()))
+ BranchVal = new ICmpInst(OldBranch, ICmpInst::ICMP_EQ, LIC, Val);
+ else if (Val != ConstantInt::getTrue(Val->getContext())) {
+ // We want to enter the new loop when the condition is true.
+ std::swap(TrueDest, FalseDest);
+ Swapped = true;
+ }
+
+ // Old branch will be removed, so save its parent and successor to update the
+ // DomTree.
+ auto *OldBranchSucc = OldBranch->getSuccessor(0);
+ auto *OldBranchParent = OldBranch->getParent();
+
+ // Insert the new branch.
+ BranchInst *BI =
+ IRBuilder<>(OldBranch).CreateCondBr(BranchVal, TrueDest, FalseDest, TI);
+ if (Swapped)
+ BI->swapProfMetadata();
+
+ // Remove the old branch so there is only one branch at the end. This is
+ // needed to perform DomTree's internal DFS walk on the function's CFG.
+ OldBranch->removeFromParent();
+
+ // Inform the DT about the new branch.
+ if (DT) {
+ // First, add both successors.
+ SmallVector<DominatorTree::UpdateType, 3> Updates;
+ if (TrueDest != OldBranchParent)
+ Updates.push_back({DominatorTree::Insert, OldBranchParent, TrueDest});
+ if (FalseDest != OldBranchParent)
+ Updates.push_back({DominatorTree::Insert, OldBranchParent, FalseDest});
+ // If both of the new successors are different from the old one, inform the
+ // DT that the edge was deleted.
+ if (OldBranchSucc != TrueDest && OldBranchSucc != FalseDest) {
+ Updates.push_back({DominatorTree::Delete, OldBranchParent, OldBranchSucc});
+ }
+
+ DT->applyUpdates(Updates);
+ }
+
+ // If either edge is critical, split it. This helps preserve LoopSimplify
+ // form for enclosing loops.
+ auto Options = CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA();
+ SplitCriticalEdge(BI, 0, Options);
+ SplitCriticalEdge(BI, 1, Options);
+}
+
+/// Given a loop that has a trivial unswitchable condition in it (a cond branch
+/// from its header block to its latch block, where the path through the loop
+/// that doesn't execute its body has no side-effects), unswitch it. This
+/// doesn't involve any code duplication, just moving the conditional branch
+/// outside of the loop and updating loop info.
+void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+ BasicBlock *ExitBlock,
+ TerminatorInst *TI) {
+ DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
+ << loopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function "
+ << L->getHeader()->getParent()->getName() << " on cond: " << *Val
+ << " == " << *Cond << "\n");
+
+ // First step, split the preheader, so that we know that there is a safe place
+ // to insert the conditional branch. We will change loopPreheader to have a
+ // conditional branch on Cond.
+ BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, DT, LI);
+
+ // Now that we have a place to insert the conditional branch, create a place
+ // to branch to: this is the exit block out of the loop that we should
+ // short-circuit to.
+
+ // Split this block now, so that the loop maintains its exit block, and so
+ // that the jump from the preheader can execute the contents of the exit block
+ // without actually branching to it (the exit block should be dominated by the
+ // loop header, not the preheader).
+ assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
+ BasicBlock *NewExit = SplitBlock(ExitBlock, &ExitBlock->front(), DT, LI);
+
+ // Okay, now we have a position to branch from and a position to branch to,
+ // insert the new conditional branch.
+ auto *OldBranch = dyn_cast<BranchInst>(loopPreheader->getTerminator());
+ assert(OldBranch && "Failed to split the preheader");
+ EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, OldBranch, TI);
+ LPM->deleteSimpleAnalysisValue(OldBranch, L);
+
+ // EmitPreheaderBranchOnCondition removed the OldBranch from the function.
+ // Delete it, as it is no longer needed.
+ delete OldBranch;
+
+ // We need to reprocess this loop, it could be unswitched again.
+ redoLoop = true;
+
+ // Now that we know that the loop is never entered when this condition is a
+ // particular value, rewrite the loop with this info. We know that this will
+ // at least eliminate the old branch.
+ RewriteLoopBodyWithConditionConstant(L, Cond, Val, false);
+ ++NumTrivial;
+}
+
+/// Check if the first non-constant condition starting from the loop header is
+/// a trivial unswitch condition: that is, a condition that controls whether or
+/// not the loop does anything at all. If it is a trivial condition,
+/// unswitching produces no code duplication (equivalently, it produces a
+/// simpler loop and a new empty loop, which gets deleted). Therefore we always
+/// unswitch a trivial condition.
+bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
+ BasicBlock *CurrentBB = currentLoop->getHeader();
+ TerminatorInst *CurrentTerm = CurrentBB->getTerminator();
+ LLVMContext &Context = CurrentBB->getContext();
+
+  // If the loop header has only one reachable successor (currently via an
+  // unconditional branch or a constant-foldable conditional branch; a
+  // constant-foldable switch could be handled in the future), we should keep
+  // looking for trivial condition candidates in that successor as well. An
+  // alternative is to constant-fold conditions and merge successors into the
+  // loop header (then we would only need to check the header's terminator).
+  // The reason for not doing this in the LoopUnswitch pass is that it could
+  // potentially break the LoopPassManager's invariants. Folding dead branches
+  // could either eliminate the current loop or make other loops unreachable.
+  // LCSSA form might also not be preserved after deleting branches. The
+  // following code keeps traversing the loop header's successors until it
+  // finds a trivial condition candidate (a condition that is not a constant).
+  // Since unswitching generates branches with constant conditions, this
+  // scenario is likely to be common in practice.
+ SmallSet<BasicBlock*, 8> Visited;
+
+ while (true) {
+    // If we exit the loop or reach a previously visited block, then we cannot
+    // reach any trivial condition candidates (unfoldable branch instructions
+    // or switch instructions), and no unswitch can happen. Exit and return
+    // false.
+ if (!currentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second)
+ return false;
+
+ // Check if this loop will execute any side-effecting instructions (e.g.
+ // stores, calls, volatile loads) in the part of the loop that the code
+ // *would* execute. Check the header first.
+ for (Instruction &I : *CurrentBB)
+ if (I.mayHaveSideEffects())
+ return false;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
+ if (BI->isUnconditional()) {
+ CurrentBB = BI->getSuccessor(0);
+ } else if (BI->getCondition() == ConstantInt::getTrue(Context)) {
+ CurrentBB = BI->getSuccessor(0);
+ } else if (BI->getCondition() == ConstantInt::getFalse(Context)) {
+ CurrentBB = BI->getSuccessor(1);
+ } else {
+ // Found a trivial condition candidate: non-foldable conditional branch.
+ break;
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+      // At this point, any constant-foldable instructions should probably have
+      // been folded.
+ ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
+ if (!Cond)
+ break;
+ // Find the target block we are definitely going to.
+ CurrentBB = SI->findCaseValue(Cond)->getCaseSuccessor();
+ } else {
+ // We do not understand these terminator instructions.
+ break;
+ }
+
+ CurrentTerm = CurrentBB->getTerminator();
+ }
+
+  // CondVal is the condition value for which the loop exits trivially.
+  // LoopExitBB is the basic block the loop exits to when the trivial condition
+  // is met.
+ Constant *CondVal = nullptr;
+ BasicBlock *LoopExitBB = nullptr;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
+ // If this isn't branching on an invariant condition, we can't unswitch it.
+ if (!BI->isConditional())
+ return false;
+
+ Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
+ currentLoop, Changed).first;
+
+    // Unswitch only if the trivial condition itself is an LIV (not a partial
+    // LIV, which could occur in an and/or chain).
+ if (!LoopCond || LoopCond != BI->getCondition())
+ return false;
+
+ // Check to see if a successor of the branch is guaranteed to
+ // exit through a unique exit block without having any
+ // side-effects. If so, determine the value of Cond that causes
+ // it to do this.
+ if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
+ BI->getSuccessor(0)))) {
+ CondVal = ConstantInt::getTrue(Context);
+ } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
+ BI->getSuccessor(1)))) {
+ CondVal = ConstantInt::getFalse(Context);
+ }
+
+ // If we didn't find a single unique LoopExit block, or if the loop exit
+ // block contains phi nodes, this isn't trivial.
+ if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+ return false; // Can't handle this.
+
+ if (EqualityPropUnSafe(*LoopCond))
+ return false;
+
+ UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
+ CurrentTerm);
+ ++NumBranches;
+ return true;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // If this isn't switching on an invariant condition, we can't unswitch it.
+ Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
+ currentLoop, Changed).first;
+
+    // Unswitch only if the trivial condition itself is an LIV (not a partial
+    // LIV, which could occur in an and/or chain).
+ if (!LoopCond || LoopCond != SI->getCondition())
+ return false;
+
+    // Check to see if a successor of the switch is guaranteed to go to the
+    // latch block or exit through a single exit block without having any
+    // side-effects. If so, determine the value of Cond that causes it to do
+    // this.
+ // Note that we can't trivially unswitch on the default case or
+ // on already unswitched cases.
+ for (auto Case : SI->cases()) {
+ BasicBlock *LoopExitCandidate;
+ if ((LoopExitCandidate =
+ isTrivialLoopExitBlock(currentLoop, Case.getCaseSuccessor()))) {
+ // Okay, we found a trivial case, remember the value that is trivial.
+ ConstantInt *CaseVal = Case.getCaseValue();
+
+        // Check that it was not unswitched before, since values that were
+        // already unswitched still look trivial.
+ if (BranchesInfo.isUnswitched(SI, CaseVal))
+ continue;
+ LoopExitBB = LoopExitCandidate;
+ CondVal = CaseVal;
+ break;
+ }
+ }
+
+ // If we didn't find a single unique LoopExit block, or if the loop exit
+ // block contains phi nodes, this isn't trivial.
+ if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+ return false; // Can't handle this.
+
+ UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
+ nullptr);
+
+ // We are only unswitching full LIV.
+ BranchesInfo.setUnswitched(SI, CondVal);
+ ++NumSwitches;
+ return true;
+ }
+ return false;
+}
+
+/// Split all of the edges from inside the loop to their exit blocks.
+/// Update the appropriate Phi nodes as we do so.
+void LoopUnswitch::SplitExitEdges(Loop *L,
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks){
+
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitBlock = ExitBlocks[i];
+ SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock),
+ pred_end(ExitBlock));
+
+ // Although SplitBlockPredecessors doesn't preserve loop-simplify in
+ // general, if we call it on all predecessors of all exits then it does.
+ SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", DT, LI,
+ /*PreserveLCSSA*/ true);
+ }
+}
+
+/// We determined that the loop is profitable to unswitch when LIC equals Val.
+/// Split it into loop versions and test the condition outside of either loop.
+/// The cloned loop is queued on the loop-processing worklist for further work.
+void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
+ Loop *L, TerminatorInst *TI) {
+ Function *F = loopHeader->getParent();
+ DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
+ << loopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function " << F->getName()
+ << " when '" << *Val << "' == " << *LIC << "\n");
+
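+  // Drop SCEV's cached facts about this loop (e.g. trip counts) before we
+  // restructure it.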
+ if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
+ SEWP->getSE().forgetLoop(L);
+
+ LoopBlocks.clear();
+ NewBlocks.clear();
+
+ // First step, split the preheader and exit blocks, and add these blocks to
+ // the LoopBlocks list.
+ BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, DT, LI);
+ LoopBlocks.push_back(NewPreheader);
+
+ // We want the loop to come after the preheader, but before the exit blocks.
+ LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());
+
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // Split all of the edges from inside the loop to their exit blocks. Update
+ // the appropriate Phi nodes as we do so.
+ SplitExitEdges(L, ExitBlocks);
+
+  // The exit blocks may have been changed due to edge splitting; recompute.
+ ExitBlocks.clear();
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // Add exit blocks to the loop blocks.
+ LoopBlocks.insert(LoopBlocks.end(), ExitBlocks.begin(), ExitBlocks.end());
+
+ // Next step, clone all of the basic blocks that make up the loop (including
+ // the loop preheader and exit blocks), keeping track of the mapping between
+ // the instructions and blocks.
+ NewBlocks.reserve(LoopBlocks.size());
+ ValueToValueMapTy VMap;
+ for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
+ BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".us", F);
+
+ NewBlocks.push_back(NewBB);
+ VMap[LoopBlocks[i]] = NewBB; // Keep the BB mapping.
+ LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L);
+ }
+
+ // Splice the newly inserted blocks into the function right before the
+ // original preheader.
+ F->getBasicBlockList().splice(NewPreheader->getIterator(),
+ F->getBasicBlockList(),
+ NewBlocks[0]->getIterator(), F->end());
+
+ // Now we create the new Loop object for the versioned loop.
+ Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
+
+  // Recalculate the unswitching quota, inherit the simplified-switch info for
+  // the new loop, and clone the remaining loop-unswitch related properties.
+ BranchesInfo.cloneData(NewLoop, L, VMap);
+
+ Loop *ParentLoop = L->getParentLoop();
+ if (ParentLoop) {
+ // Make sure to add the cloned preheader and exit blocks to the parent loop
+ // as well.
+ ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI);
+ }
+
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+ BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]);
+ // The new exit block should be in the same loop as the old one.
+ if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i]))
+ ExitBBLoop->addBasicBlockToLoop(NewExit, *LI);
+
+ assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
+ "Exit block should have been split to have one successor!");
+ BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0);
+
+ // If the successor of the exit block had PHI nodes, add an entry for
+ // NewExit.
+ for (PHINode &PN : ExitSucc->phis()) {
+ Value *V = PN.getIncomingValueForBlock(ExitBlocks[i]);
+ ValueToValueMapTy::iterator It = VMap.find(V);
+ if (It != VMap.end()) V = It->second;
+ PN.addIncoming(V, NewExit);
+ }
+
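+    // If the cloned exit block ends in a landingpad, ExitSucc now has several
+    // landing-pad predecessors. Merge their landingpad values through a PHI in
+    // ExitSucc and reroute all uses of the individual landingpads to that PHI.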
+ if (LandingPadInst *LPad = NewExit->getLandingPadInst()) {
+ PHINode *PN = PHINode::Create(LPad->getType(), 0, "",
+ &*ExitSucc->getFirstInsertionPt());
+
+ for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc);
+ I != E; ++I) {
+ BasicBlock *BB = *I;
+ LandingPadInst *LPI = BB->getLandingPadInst();
+ LPI->replaceAllUsesWith(PN);
+ PN->addIncoming(LPI, BB);
+ }
+ }
+ }
+
+ // Rewrite the code to refer to itself.
+ for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) {
+ for (Instruction &I : *NewBlocks[i]) {
+ RemapInstruction(&I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
+
+ // Rewrite the original preheader to select between versions of the loop.
+ BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator());
+ assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
+ "Preheader splitting did not work correctly!");
+
+ // Emit the new branch that selects between the two versions of this loop.
+ EmitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR,
+ TI);
+ LPM->deleteSimpleAnalysisValue(OldBR, L);
+
+ // The OldBr was replaced by a new one and removed (but not erased) by
+ // EmitPreheaderBranchOnCondition. It is no longer needed, so delete it.
+ delete OldBR;
+
+ LoopProcessWorklist.push_back(NewLoop);
+ redoLoop = true;
+
+  // Keep a WeakTrackingVH holding onto LIC. If the first call to
+  // RewriteLoopBodyWithConditionConstant deletes the instruction (for example
+  // by simplifying a PHI that feeds into the condition that we're unswitching
+  // on), we don't rewrite the second iteration.
+ WeakTrackingVH LICHandle(LIC);
+
+ // Now we rewrite the original code to know that the condition is true and the
+ // new code to know that the condition is false.
+ RewriteLoopBodyWithConditionConstant(L, LIC, Val, false);
+
+  // It's possible that simplifying one loop could cause the condition to be
+  // changed to another value or to a constant. If it is now a constant, don't
+  // simplify the other loop with it.
+ if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop &&
+ LICHandle && !isa<Constant>(LICHandle))
+ RewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, true);
+}
+
+/// Remove all instances of I from the worklist vector specified.
+static void RemoveFromWorklist(Instruction *I,
+ std::vector<Instruction*> &Worklist) {
+
+ Worklist.erase(std::remove(Worklist.begin(), Worklist.end(), I),
+ Worklist.end());
+}
+
+/// When we find that I really equals V, remove I from the program, replace all
+/// of its uses with V, and update the worklist.
+static void ReplaceUsesOfWith(Instruction *I, Value *V,
+ std::vector<Instruction*> &Worklist,
+ Loop *L, LPPassManager *LPM) {
+ DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
+
+ // Add uses to the worklist, which may be dead now.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+ Worklist.push_back(Use);
+
+ // Add users to the worklist which may be simplified now.
+ for (User *U : I->users())
+ Worklist.push_back(cast<Instruction>(U));
+ LPM->deleteSimpleAnalysisValue(I, L);
+ RemoveFromWorklist(I, Worklist);
+ I->replaceAllUsesWith(V);
+ if (!I->mayHaveSideEffects())
+ I->eraseFromParent();
+ ++NumSimplify;
+}
+
+/// We know either that the value LIC has the value specified by Val in the
+/// specified loop, or we know it does NOT have that value.
+/// Rewrite any uses of LIC or of properties correlated to it.
+void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+ Constant *Val,
+ bool IsEqual) {
+ assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
+
+ // FIXME: Support correlated properties, like:
+ // for (...)
+ // if (li1 < li2)
+ // ...
+ // if (li1 > li2)
+ // ...
+
+ // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches,
+ // selects, switches.
+ std::vector<Instruction*> Worklist;
+ LLVMContext &Context = Val->getContext();
+
+ // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC
+ // in the loop with the appropriate one directly.
+ if (IsEqual || (isa<ConstantInt>(Val) &&
+ Val->getType()->isIntegerTy(1))) {
+ Value *Replacement;
+ if (IsEqual)
+ Replacement = Val;
+ else
+ Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()),
+ !cast<ConstantInt>(Val)->getZExtValue());
+
+ for (User *U : LIC->users()) {
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || !L->contains(UI))
+ continue;
+ Worklist.push_back(UI);
+ }
+
+ for (Instruction *UI : Worklist)
+ UI->replaceUsesOfWith(LIC, Replacement);
+
+ SimplifyCode(Worklist, L);
+ return;
+ }
+
+ // Otherwise, we don't know the precise value of LIC, but we do know that it
+ // is certainly NOT "Val". As such, simplify any uses in the loop that we
+ // can. This case occurs when we unswitch switch statements.
+ for (User *U : LIC->users()) {
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || !L->contains(UI))
+ continue;
+
+    // At this point, we know LIC is definitely not Val. Try to use some simple
+    // logic to simplify the user with respect to this context.
+ if (Value *Replacement = SimplifyInstructionWithNotEqual(UI, LIC, Val)) {
+ if (LI->replacementPreservesLCSSAForm(UI, Replacement)) {
+        // This in-loop instruction has been simplified with respect to its
+        // context, i.e. LIC != Val; make sure we propagate its replacement
+        // value to all of its users.
+        //
+        // We cannot delete UI, the LIC user, here, because that would
+        // invalidate the LIC->users() iterator. However, we can make this
+        // instruction dead by replacing all of its users and pushing it onto
+        // the worklist, so that it can be properly deleted and its operands
+        // simplified.
+ UI->replaceAllUsesWith(Replacement);
+ }
+ }
+
+ // This is a LIC user, push it into the worklist so that SimplifyCode can
+ // attempt to simplify it.
+ Worklist.push_back(UI);
+
+ // If we know that LIC is not Val, use this info to simplify code.
+ SwitchInst *SI = dyn_cast<SwitchInst>(UI);
+ if (!SI || !isa<ConstantInt>(Val)) continue;
+
+    // NOTE: if a case value for the switch is unswitched out, we record it
+    // after the unswitch finishes. We cannot record it here as the switch
+    // is not a direct user of the partial LIV.
+ SwitchInst::CaseHandle DeadCase =
+ *SI->findCaseValue(cast<ConstantInt>(Val));
+ // Default case is live for multiple values.
+ if (DeadCase == *SI->case_default())
+ continue;
+
+    // Found a dead case value. Don't remove PHI nodes in the successor if
+    // they become single-entry; those PHI nodes may be in the Users list.
+
+ BasicBlock *Switch = SI->getParent();
+ BasicBlock *SISucc = DeadCase.getCaseSuccessor();
+ BasicBlock *Latch = L->getLoopLatch();
+
+ if (!SI->findCaseDest(SISucc)) continue; // Edge is critical.
+ // If the DeadCase successor dominates the loop latch, then the
+ // transformation isn't safe since it will delete the sole predecessor edge
+ // to the latch.
+ if (Latch && DT->dominates(SISucc, Latch))
+ continue;
+
+ // FIXME: This is a hack. We need to keep the successor around
+ // and hooked up so as to preserve the loop structure, because
+ // trying to update it is complicated. So instead we preserve the
+ // loop structure and put the block on a dead code path.
+ SplitEdge(Switch, SISucc, DT, LI);
+ // Compute the successors instead of relying on the return value
+ // of SplitEdge, since it may have split the switch successor
+ // after PHI nodes.
+ BasicBlock *NewSISucc = DeadCase.getCaseSuccessor();
+ BasicBlock *OldSISucc = *succ_begin(NewSISucc);
+ // Create an "unreachable" destination.
+ BasicBlock *Abort = BasicBlock::Create(Context, "us-unreachable",
+ Switch->getParent(),
+ OldSISucc);
+ new UnreachableInst(Context, Abort);
+ // Force the new case destination to branch to the "unreachable"
+ // block while maintaining a (dead) CFG edge to the old block.
+ NewSISucc->getTerminator()->eraseFromParent();
+ BranchInst::Create(Abort, OldSISucc,
+ ConstantInt::getTrue(Context), NewSISucc);
+ // Release the PHI operands for this edge.
+ for (PHINode &PN : NewSISucc->phis())
+ PN.setIncomingValue(PN.getBasicBlockIndex(Switch),
+ UndefValue::get(PN.getType()));
+ // Tell the domtree about the new block. We don't fully update the
+ // domtree here -- instead we force it to do a full recomputation
+ // after the pass is complete -- but we do need to inform it of
+ // new blocks.
+ DT->addNewBlock(Abort, NewSISucc);
+ }
+
+ SimplifyCode(Worklist, L);
+}
+
+/// Now that we have simplified some instructions in the loop, walk over it and
+/// constant prop, dce, and fold control flow where possible. Note that this is
+/// effectively a very simple loop-structure-aware optimizer. During processing
+/// of this loop, L could very well be deleted, so it must not be used.
+///
+/// FIXME: When the loop optimizer is more mature, separate this out to a new
+/// pass.
+///
+void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.back();
+ Worklist.pop_back();
+
+ // Simple DCE.
+ if (isInstructionTriviallyDead(I)) {
+ DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n");
+
+ // Add uses to the worklist, which may be dead now.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+ Worklist.push_back(Use);
+ LPM->deleteSimpleAnalysisValue(I, L);
+ RemoveFromWorklist(I, Worklist);
+ I->eraseFromParent();
+ ++NumSimplify;
+ continue;
+ }
+
+ // See if instruction simplification can hack this up. This is common for
+ // things like "select false, X, Y" after unswitching made the condition be
+ // 'false'. TODO: update the domtree properly so we can pass it here.
+ if (Value *V = SimplifyInstruction(I, DL))
+ if (LI->replacementPreservesLCSSAForm(I, V)) {
+ ReplaceUsesOfWith(I, V, Worklist, L, LPM);
+ continue;
+ }
+
+ // Special case hacks that appear commonly in unswitched code.
+ if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+ if (BI->isUnconditional()) {
+ // If BI's parent is the only pred of the successor, fold the two blocks
+ // together.
+ BasicBlock *Pred = BI->getParent();
+ BasicBlock *Succ = BI->getSuccessor(0);
+ BasicBlock *SinglePred = Succ->getSinglePredecessor();
+ if (!SinglePred) continue; // Nothing to do.
+ assert(SinglePred == Pred && "CFG broken");
+
+ DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- "
+ << Succ->getName() << "\n");
+
+ // Resolve any single entry PHI nodes in Succ.
+ while (PHINode *PN = dyn_cast<PHINode>(Succ->begin()))
+ ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM);
+
+ // If Succ has any successors with PHI nodes, update them to have
+ // entries coming from Pred instead of Succ.
+ Succ->replaceAllUsesWith(Pred);
+
+ // Move all of the successor contents from Succ to Pred.
+ Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(),
+ Succ->begin(), Succ->end());
+ LPM->deleteSimpleAnalysisValue(BI, L);
+ RemoveFromWorklist(BI, Worklist);
+ BI->eraseFromParent();
+
+ // Remove Succ from the loop tree.
+ LI->removeBlock(Succ);
+ LPM->deleteSimpleAnalysisValue(Succ, L);
+ Succ->eraseFromParent();
+ ++NumSimplify;
+ continue;
+ }
+
+ continue;
+ }
+ }
+}
+
+/// Simple simplifications we can do given the information that Cond is
+/// definitely not equal to Val.
+Value *LoopUnswitch::SimplifyInstructionWithNotEqual(Instruction *Inst,
+ Value *Invariant,
+ Constant *Val) {
+ // icmp eq cond, val -> false
+ ICmpInst *CI = dyn_cast<ICmpInst>(Inst);
+ if (CI && CI->isEquality()) {
+ Value *Op0 = CI->getOperand(0);
+ Value *Op1 = CI->getOperand(1);
+ if ((Op0 == Invariant && Op1 == Val) || (Op0 == Val && Op1 == Invariant)) {
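+      // Invariant is known to differ from Val here, so an equality compare of
+      // the two folds to false and an inequality compare folds to true.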
+ LLVMContext &Ctx = Inst->getContext();
+ if (CI->getPredicate() == CmpInst::ICMP_EQ)
+ return ConstantInt::getFalse(Ctx);
+ else
+ return ConstantInt::getTrue(Ctx);
+ }
+ }
+
+ // FIXME: there may be other opportunities, e.g. comparison with floating
+ // point, or Invariant - Val != 0, etc.
+ return nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
new file mode 100644
index 000000000000..53b25e688e82
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -0,0 +1,598 @@
+//===- LoopVersioningLICM.cpp - LICM Loop Versioning ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// When alias analysis is uncertain about the aliasing between any two accesses,
+// it will return MayAlias. This uncertainty from alias analysis prevents LICM
+// from proceeding further. In cases where alias analysis is uncertain we might
+// use loop versioning as an alternative.
+//
+// Loop Versioning will create a version of the loop with aggressive aliasing
+// assumptions in addition to the original with conservative (default) aliasing
+// assumptions. The version of the loop making aggressive aliasing assumptions
+// will have all the memory accesses marked as no-alias. These two versions of
+// the loop will be preceded by a memory runtime check. This runtime check
+// consists of bound checks for all unique memory accesses in the loop, and it
+// ensures the absence of memory aliasing. The result of the runtime check
+// determines which of the loop versions is executed: if the runtime check
+// detects any memory aliasing, then the original loop is executed; otherwise,
+// the version with aggressive aliasing assumptions is used.
+//
+// Following are the top level steps:
+//
+// a) Perform LoopVersioningLICM's feasibility check.
+// b) If the loop is a candidate for versioning, create a memory bound check
+//    by considering all the memory accesses in the loop body.
+// c) Clone the original loop and set all memory accesses as no-alias in the
+//    new loop.
+// d) Make the original loop and the versioned loop the branch targets of the
+//    runtime check's result.
+//
+// It transforms loop as shown below:
+//
+// +----------------+
+// |Runtime Memcheck|
+// +----------------+
+// |
+// +----------+----------------+----------+
+// | |
+// +---------+----------+ +-----------+----------+
+// |Orig Loop Preheader | |Cloned Loop Preheader |
+// +--------------------+ +----------------------+
+// | |
+// +--------------------+ +----------------------+
+// |Orig Loop Body | |Cloned Loop Body |
+// +--------------------+ +----------------------+
+// | |
+// +--------------------+ +----------------------+
+// |Orig Loop Exit Block| |Cloned Loop Exit Block|
+// +--------------------+ +-----------+----------+
+// | |
+// +----------+--------------+-----------+
+// |
+// +-----+----+
+// |Join Block|
+// +----------+
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include <cassert>
+#include <memory>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-versioning-licm"
+
+static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable";
+
+/// Minimum allowed percentage of possibly invariant
+/// instructions in a loop, below which versioning is not attempted.
+static cl::opt<float>
+ LVInvarThreshold("licm-versioning-invariant-threshold",
+ cl::desc("LoopVersioningLICM's minimum allowed percentage"
+ "of possible invariant instructions per loop"),
+ cl::init(25), cl::Hidden);
+
+/// Threshold for maximum allowed loop nest/depth
+static cl::opt<unsigned> LVLoopDepthThreshold(
+ "licm-versioning-max-depth-threshold",
+ cl::desc(
+ "LoopVersioningLICM's threshold for maximum allowed loop nest/depth"),
+ cl::init(2), cl::Hidden);
+
+/// \brief Create MDNode for input string.
+static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ Metadata *MDs[] = {
+ MDString::get(Context, Name),
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
+ return MDNode::get(Context, MDs);
+}
+
+/// \brief Set the given string into the loop metadata, keeping other values
+/// intact.
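+///
+/// As a rough sketch (assumed IR shape; node numbers are illustrative), after
+/// calling this with MDString = "llvm.loop.licm_versioning.disable" the loop
+/// latch branch carries metadata along the lines of:
+///
+///   br i1 %cond, label %header, label %exit, !llvm.loop !0
+///   !0 = distinct !{!0, !1}
+///   !1 = !{!"llvm.loop.licm_versioning.disable", i32 V}
+///
+/// where V is the integer value passed in.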
+void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
+ unsigned V) {
+ SmallVector<Metadata *, 4> MDs(1);
+ // If the loop already has metadata, retain it.
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (LoopID) {
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
+ MDs.push_back(Node);
+ }
+ }
+ // Add new metadata.
+ MDs.push_back(createStringMetadata(TheLoop, MDString, V));
+ // Replace current metadata node with new one.
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ TheLoop->setLoopID(NewLoopID);
+}
+
+namespace {
+
+struct LoopVersioningLICM : public LoopPass {
+ static char ID;
+
+ LoopVersioningLICM()
+ : LoopPass(ID), LoopDepthThreshold(LVLoopDepthThreshold),
+ InvariantThreshold(LVInvarThreshold) {
+ initializeLoopVersioningLICMPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequiredID(LCSSAID);
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ StringRef getPassName() const override { return "Loop Versioning for LICM"; }
+
+ void reset() {
+ AA = nullptr;
+ SE = nullptr;
+ LAA = nullptr;
+ CurLoop = nullptr;
+ LoadAndStoreCounter = 0;
+ InvariantCounter = 0;
+ IsReadOnlyLoop = true;
+ CurAST.reset();
+ }
+
+ class AutoResetter {
+ public:
+ AutoResetter(LoopVersioningLICM &LVLICM) : LVLICM(LVLICM) {}
+ ~AutoResetter() { LVLICM.reset(); }
+
+ private:
+ LoopVersioningLICM &LVLICM;
+ };
+
+private:
+ // Current AliasAnalysis information
+ AliasAnalysis *AA = nullptr;
+
+ // Current ScalarEvolution
+ ScalarEvolution *SE = nullptr;
+
+ // Current LoopAccessAnalysis
+ LoopAccessLegacyAnalysis *LAA = nullptr;
+
+ // Current Loop's LoopAccessInfo
+ const LoopAccessInfo *LAI = nullptr;
+
+ // The current loop we are working on.
+ Loop *CurLoop = nullptr;
+
+ // AliasSet information for the current loop.
+ std::unique_ptr<AliasSetTracker> CurAST;
+
+ // Maximum loop nest threshold
+ unsigned LoopDepthThreshold;
+
+ // Minimum invariant threshold
+ float InvariantThreshold;
+
+ // Counter to track num of load & store
+ unsigned LoadAndStoreCounter = 0;
+
+ // Counter to track num of invariant
+ unsigned InvariantCounter = 0;
+
+ // Read only loop marker.
+ bool IsReadOnlyLoop = true;
+
+ bool isLegalForVersioning();
+ bool legalLoopStructure();
+ bool legalLoopInstructions();
+ bool legalLoopMemoryAccesses();
+ bool isLoopAlreadyVisited();
+ void setNoAliasToLoop(Loop *VerLoop);
+ bool instructionSafeForVersioning(Instruction *I);
+};
+
+} // end anonymous namespace
+
+/// \brief Check the loop structure and confirm it is suitable for
+/// LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopStructure() {
+ // Loop must be in loop simplify form.
+ if (!CurLoop->isLoopSimplifyForm()) {
+ DEBUG(
+ dbgs() << " loop is not in loop-simplify form.\n");
+ return false;
+ }
+ // Loop should be innermost loop, if not return false.
+ if (!CurLoop->getSubLoops().empty()) {
+ DEBUG(dbgs() << " loop is not innermost\n");
+ return false;
+ }
+ // Loop should have a single backedge, if not return false.
+ if (CurLoop->getNumBackEdges() != 1) {
+ DEBUG(dbgs() << " loop has multiple backedges\n");
+ return false;
+ }
+ // Loop must have a single exiting block; if not, return false.
+ if (!CurLoop->getExitingBlock()) {
+ DEBUG(dbgs() << " loop has multiple exiting blocks\n");
+ return false;
+ }
+ // We only handle bottom-tested loops, i.e. loops in which the condition is
+ // checked at the end of each iteration. With that we can assume that all
+ // instructions in the loop are executed the same number of times.
+ if (CurLoop->getExitingBlock() != CurLoop->getLoopLatch()) {
+ DEBUG(dbgs() << " loop is not bottom tested\n");
+ return false;
+ }
+ // Parallel loops must not have aliasing loop-invariant memory accesses.
+ // Hence we don't need to version anything in this case.
+ if (CurLoop->isAnnotatedParallel()) {
+ DEBUG(dbgs() << " Parallel loop is not worth versioning\n");
+ return false;
+ }
+ // Loop depth greater than LoopDepthThreshold is not allowed.
+ if (CurLoop->getLoopDepth() > LoopDepthThreshold) {
+ DEBUG(dbgs() << " loop depth is greater than threshold\n");
+ return false;
+ }
+ // We need to be able to compute the loop trip count in order
+ // to generate the bound checks.
+ const SCEV *ExitCount = SE->getBackedgeTakenCount(CurLoop);
+ if (ExitCount == SE->getCouldNotCompute()) {
+ DEBUG(dbgs() << " loop does not has trip count\n");
+ return false;
+ }
+ return true;
+}
+
+/// \brief Check the memory accesses in the loop and confirm they are suitable
+/// for LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopMemoryAccesses() {
+ bool HasMayAlias = false;
+ bool TypeSafety = false;
+ bool HasMod = false;
+ // Memory check:
+ // The transform phase will generate a versioned loop and also a runtime check
+ // to ensure the pointers are independent and do not alias.
+ // In the versioned variant of the loop, alias metadata asserts that all
+ // accesses are mutually independent.
+ //
+ // Pointers aliasing within an alias domain are avoided because with multiple
+ // aliasing domains we may not be able to hoist a potential loop-invariant
+ // access out of the loop.
+ //
+ // Iterate over alias tracker sets, and confirm AliasSets doesn't have any
+ // must alias set.
+ for (const auto &I : *CurAST) {
+ const AliasSet &AS = I;
+ // Skip Forward Alias Sets, as this should be ignored as part of
+ // the AliasSetTracker object.
+ if (AS.isForwardingAliasSet())
+ continue;
+ // With MustAlias it's not worth adding a runtime bound check.
+ if (AS.isMustAlias())
+ return false;
+ Value *SomePtr = AS.begin()->getValue();
+ bool TypeCheck = true;
+ // Check for Mod & MayAlias
+ HasMayAlias |= AS.isMayAlias();
+ HasMod |= AS.isMod();
+ for (const auto &A : AS) {
+ Value *Ptr = A.getValue();
+ // The alias set should have pointers of the same data type.
+ TypeCheck = (TypeCheck && (SomePtr->getType() == Ptr->getType()));
+ }
+ // At least one alias set should have pointers of the same data type.
+ TypeSafety |= TypeCheck;
+ }
+ // Ensure at least one alias set had pointers of the same type.
+ if (!TypeSafety) {
+ DEBUG(dbgs() << " Alias tracker type safety failed!\n");
+ return false;
+ }
+ // Ensure the loop body is not read-only.
+ if (!HasMod) {
+ DEBUG(dbgs() << " No memory modified in loop body\n");
+ return false;
+ }
+ // Make sure at least one alias set has a may-alias case.
+ // If there is no memory aliasing ambiguity, return false.
+ if (!HasMayAlias) {
+ DEBUG(dbgs() << " No ambiguity in memory access.\n");
+ return false;
+ }
+ return true;
+}
+
+/// \brief Check whether a loop instruction is safe for loop versioning.
+/// It returns true if the instruction is safe, false otherwise.
+/// It considers the following:
+/// 1) All loads and stores in the loop body must be non-atomic & non-volatile.
+/// 2) Function calls must be safe, i.e. they must not access memory.
+/// 3) The loop body must not contain any instruction that may throw.
+bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
+ assert(I != nullptr && "Null instruction found!");
+ // Check function call safety
+ if (isa<CallInst>(I) && !AA->doesNotAccessMemory(CallSite(I))) {
+ DEBUG(dbgs() << " Unsafe call site found.\n");
+ return false;
+ }
+ // Avoid loops that may throw.
+ if (I->mayThrow()) {
+ DEBUG(dbgs() << " May throw instruction found in loop body\n");
+ return false;
+ }
+ // If the current instruction is a load instruction,
+ // make sure it's a simple load (non-atomic & non-volatile).
+ if (I->mayReadFromMemory()) {
+ LoadInst *Ld = dyn_cast<LoadInst>(I);
+ if (!Ld || !Ld->isSimple()) {
+ DEBUG(dbgs() << " Found a non-simple load.\n");
+ return false;
+ }
+ LoadAndStoreCounter++;
+ Value *Ptr = Ld->getPointerOperand();
+ // Check loop invariant.
+ if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
+ InvariantCounter++;
+ }
+ // If the current instruction is a store instruction,
+ // make sure it's a simple store (non-atomic & non-volatile).
+ else if (I->mayWriteToMemory()) {
+ StoreInst *St = dyn_cast<StoreInst>(I);
+ if (!St || !St->isSimple()) {
+ DEBUG(dbgs() << " Found a non-simple store.\n");
+ return false;
+ }
+ LoadAndStoreCounter++;
+ Value *Ptr = St->getPointerOperand();
+ // Check loop invariant.
+ if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
+ InvariantCounter++;
+
+ IsReadOnlyLoop = false;
+ }
+ return true;
+}
+
+/// \brief Check the loop instructions and confirm they are suitable for
+/// LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopInstructions() {
+ // Resetting counters.
+ LoadAndStoreCounter = 0;
+ InvariantCounter = 0;
+ IsReadOnlyLoop = true;
+ // Iterate over loop blocks and instructions of each block and check
+ // instruction safety.
+ for (auto *Block : CurLoop->getBlocks())
+ for (auto &Inst : *Block) {
+ // If instruction is unsafe just return false.
+ if (!instructionSafeForVersioning(&Inst))
+ return false;
+ }
+ // Get LoopAccessInfo from current loop.
+ LAI = &LAA->getInfo(CurLoop);
+ // Check LoopAccessInfo for need of runtime check.
+ if (LAI->getRuntimePointerChecking()->getChecks().empty()) {
+ DEBUG(dbgs() << " LAA: Runtime check not found !!\n");
+ return false;
+ }
+ // The number of runtime checks should be less than RuntimeMemoryCheckThreshold.
+ if (LAI->getNumRuntimePointerChecks() >
+ VectorizerParams::RuntimeMemoryCheckThreshold) {
+ DEBUG(dbgs() << " LAA: Runtime checks are more than threshold !!\n");
+ return false;
+ }
+ // Loop should have at least one invariant load or store instruction.
+ if (!InvariantCounter) {
+ DEBUG(dbgs() << " Invariant not found !!\n");
+ return false;
+ }
+ // Read only loop not allowed.
+ if (IsReadOnlyLoop) {
+ DEBUG(dbgs() << " Found a read-only loop!\n");
+ return false;
+ }
+ // Profitability check:
+ // Check that the invariant ratio meets the threshold.
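+ // A worked example with the default 25% threshold (numbers are
+ // illustrative): with LoadAndStoreCounter = 10, InvariantCounter = 2 gives
+ // 200 < 250 and the loop is rejected, while InvariantCounter = 3 gives
+ // 300 >= 250 and the check passes.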
+ if (InvariantCounter * 100 < InvariantThreshold * LoadAndStoreCounter) {
+ DEBUG(dbgs()
+ << " Invariant load & store are less then defined threshold\n");
+ DEBUG(dbgs() << " Invariant loads & stores: "
+ << ((InvariantCounter * 100) / LoadAndStoreCounter) << "%\n");
+ DEBUG(dbgs() << " Invariant loads & store threshold: "
+ << InvariantThreshold << "%\n");
+ return false;
+ }
+ return true;
+}
+
+/// \brief Check whether the loop has already been visited by looking at its
+/// loop metadata. Returns true if the loop has already been versioned,
+/// false otherwise.
+bool LoopVersioningLICM::isLoopAlreadyVisited() {
+ // Check for LoopVersioningLICM metadata on the loop.
+ if (findStringMetadataForLoop(CurLoop, LICMVersioningMetaData)) {
+ return true;
+ }
+ return false;
+}
+
+/// \brief Checks legality for LoopVersioningLICM by considering the following:
+/// a) loop structure legality b) loop instruction legality
+/// c) loop memory access legality.
+/// Returns true if legal, false otherwise.
+bool LoopVersioningLICM::isLegalForVersioning() {
+ DEBUG(dbgs() << "Loop: " << *CurLoop);
+ // Make sure we are not revisiting the same loop.
+ if (isLoopAlreadyVisited()) {
+ DEBUG(
+ dbgs() << " Revisiting loop in LoopVersioningLICM not allowed.\n\n");
+ return false;
+ }
+ // Check loop structure legality.
+ if (!legalLoopStructure()) {
+ DEBUG(
+ dbgs() << " Loop structure not suitable for LoopVersioningLICM\n\n");
+ return false;
+ }
+ // Check loop instruction legality.
+ if (!legalLoopInstructions()) {
+ DEBUG(dbgs()
+ << " Loop instructions not suitable for LoopVersioningLICM\n\n");
+ return false;
+ }
+ // Check loop memory access legality.
+ if (!legalLoopMemoryAccesses()) {
+ DEBUG(dbgs()
+ << " Loop memory access not suitable for LoopVersioningLICM\n\n");
+ return false;
+ }
+ // Loop versioning is feasible, return true.
+ DEBUG(dbgs() << " Loop Versioning found to be beneficial\n\n");
+ return true;
+}
+
+/// \brief Update the loop with aggressive aliasing assumptions.
+/// It marks all pairs of memory operations as no-alias, relying on the
+/// assumption that the loop has no must-alias memory access pairs:
+/// during the LoopVersioningLICM legality check, loops with must-aliasing
+/// memory accesses are rejected.
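+///
+/// A hedged sketch of the result (IR shape assumed; node numbers are
+/// illustrative): each load and store in the versioned loop ends up looking
+/// roughly like
+///
+///   %v = load i32, i32* %p, !alias.scope !2, !noalias !2
+///
+/// where !2 names the single "LVAliasScope" scope in the "LVDomain" domain,
+/// so scoped-alias AA treats the tagged accesses as pairwise non-aliasing.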
+void LoopVersioningLICM::setNoAliasToLoop(Loop *VerLoop) {
+ // Get latch terminator instruction.
+ Instruction *I = VerLoop->getLoopLatch()->getTerminator();
+ // Create alias scope domain.
+ MDBuilder MDB(I->getContext());
+ MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("LVDomain");
+ StringRef Name = "LVAliasScope";
+ SmallVector<Metadata *, 4> Scopes, NoAliases;
+ MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
+ // Iterate over each instruction of the loop and
+ // set no-alias metadata for all load & store instructions.
+ for (auto *Block : CurLoop->getBlocks()) {
+ for (auto &Inst : *Block) {
+ // Only interested in instructions that may modify or read memory.
+ if (!Inst.mayReadFromMemory() && !Inst.mayWriteToMemory())
+ continue;
+ Scopes.push_back(NewScope);
+ NoAliases.push_back(NewScope);
+ // Set no-alias for current instruction.
+ Inst.setMetadata(
+ LLVMContext::MD_noalias,
+ MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_noalias),
+ MDNode::get(Inst.getContext(), NoAliases)));
+ // set alias-scope for current instruction.
+ Inst.setMetadata(
+ LLVMContext::MD_alias_scope,
+ MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_alias_scope),
+ MDNode::get(Inst.getContext(), Scopes)));
+ }
+ }
+}
+
+bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) {
+ // This will automatically release all resources held by the current
+ // LoopVersioningLICM object.
+ AutoResetter Resetter(*this);
+
+ if (skipLoop(L))
+ return false;
+ // Get Analysis information.
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ LAI = nullptr;
+ // Set Current Loop
+ CurLoop = L;
+ CurAST.reset(new AliasSetTracker(*AA));
+
+ // Loop over the body of this loop, construct AST.
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ for (auto *Block : L->getBlocks()) {
+ if (LI->getLoopFor(Block) == L) // Ignore blocks in subloop.
+ CurAST->add(*Block); // Incorporate the specified basic block
+ }
+
+ bool Changed = false;
+
+ // Check the feasibility of LoopVersioningLICM.
+ // If versioning is found to be feasible and beneficial, proceed;
+ // otherwise simply return and let the AutoResetter clean up.
+ if (isLegalForVersioning()) {
+ // Do loop versioning.
+ // Create memcheck for memory accessed inside loop.
+ // Clone original loop, and set blocks properly.
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopVersioning LVer(*LAI, CurLoop, LI, DT, SE, true);
+ LVer.versionLoop();
+ // Set Loop Versioning metadata for the original loop.
+ addStringMetadataToLoop(LVer.getNonVersionedLoop(), LICMVersioningMetaData);
+ // Set Loop Versioning metadata for the versioned loop.
+ addStringMetadataToLoop(LVer.getVersionedLoop(), LICMVersioningMetaData);
+ // Set "llvm.mem.parallel_loop_access" metadata on the versioned loop.
+ addStringMetadataToLoop(LVer.getVersionedLoop(),
+ "llvm.mem.parallel_loop_access");
+ // Update the versioned loop with aggressive aliasing assumptions.
+ setNoAliasToLoop(LVer.getVersionedLoop());
+ Changed = true;
+ }
+ return Changed;
+}
+
+char LoopVersioningLICM::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopVersioningLICM, "loop-versioning-licm",
+ "Loop Versioning For LICM", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopVersioningLICM, "loop-versioning-licm",
+ "Loop Versioning For LICM", false, false)
+
+Pass *llvm::createLoopVersioningLICMPass() { return new LoopVersioningLICM(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
new file mode 100644
index 000000000000..c165c5ece95c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -0,0 +1,172 @@
+//===- LowerAtomic.cpp - Lower atomic intrinsics --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers atomic intrinsics to non-atomic form for use in a known
+// non-preemptible environment.
+//
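+// A hedged before/after sketch (IR is illustrative):
+//
+//   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new seq_cst seq_cst
+//
+// is lowered by this pass into plain, non-atomic IR along the lines of
+//
+//   %orig  = load i32, i32* %p
+//   %equal = icmp eq i32 %orig, %cmp
+//   %res   = select i1 %equal, i32 %new, i32 %orig
+//   store i32 %res, i32* %p
+//
+// with the {i32, i1} result pair rebuilt via insertvalue.
+//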
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerAtomic.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loweratomic"
+
+static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
+ IRBuilder<> Builder(CXI);
+ Value *Ptr = CXI->getPointerOperand();
+ Value *Cmp = CXI->getCompareOperand();
+ Value *Val = CXI->getNewValOperand();
+
+ LoadInst *Orig = Builder.CreateLoad(Ptr);
+ Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
+ Value *Res = Builder.CreateSelect(Equal, Val, Orig);
+ Builder.CreateStore(Res, Ptr);
+
+ Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0);
+ Res = Builder.CreateInsertValue(Res, Equal, 1);
+
+ CXI->replaceAllUsesWith(Res);
+ CXI->eraseFromParent();
+ return true;
+}
+
+static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) {
+ IRBuilder<> Builder(RMWI);
+ Value *Ptr = RMWI->getPointerOperand();
+ Value *Val = RMWI->getValOperand();
+
+ LoadInst *Orig = Builder.CreateLoad(Ptr);
+ Value *Res = nullptr;
+
+ switch (RMWI->getOperation()) {
+ default: llvm_unreachable("Unexpected RMW operation");
+ case AtomicRMWInst::Xchg:
+ Res = Val;
+ break;
+ case AtomicRMWInst::Add:
+ Res = Builder.CreateAdd(Orig, Val);
+ break;
+ case AtomicRMWInst::Sub:
+ Res = Builder.CreateSub(Orig, Val);
+ break;
+ case AtomicRMWInst::And:
+ Res = Builder.CreateAnd(Orig, Val);
+ break;
+ case AtomicRMWInst::Nand:
+ Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val));
+ break;
+ case AtomicRMWInst::Or:
+ Res = Builder.CreateOr(Orig, Val);
+ break;
+ case AtomicRMWInst::Xor:
+ Res = Builder.CreateXor(Orig, Val);
+ break;
+ case AtomicRMWInst::Max:
+ Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
+ Val, Orig);
+ break;
+ case AtomicRMWInst::Min:
+ Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
+ Orig, Val);
+ break;
+ case AtomicRMWInst::UMax:
+ Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
+ Val, Orig);
+ break;
+ case AtomicRMWInst::UMin:
+ Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
+ Orig, Val);
+ break;
+ }
+ Builder.CreateStore(Res, Ptr);
+ RMWI->replaceAllUsesWith(Orig);
+ RMWI->eraseFromParent();
+ return true;
+}
+
+static bool LowerFenceInst(FenceInst *FI) {
+ FI->eraseFromParent();
+ return true;
+}
+
+static bool LowerLoadInst(LoadInst *LI) {
+ LI->setAtomic(AtomicOrdering::NotAtomic);
+ return true;
+}
+
+static bool LowerStoreInst(StoreInst *SI) {
+ SI->setAtomic(AtomicOrdering::NotAtomic);
+ return true;
+}
+
+static bool runOnBasicBlock(BasicBlock &BB) {
+ bool Changed = false;
+ for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE;) {
+ Instruction *Inst = &*DI++;
+ if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+ Changed |= LowerFenceInst(FI);
+ else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst))
+ Changed |= LowerAtomicCmpXchgInst(CXI);
+ else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst))
+ Changed |= LowerAtomicRMWInst(RMWI);
+ else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ if (LI->isAtomic())
+ LowerLoadInst(LI);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (SI->isAtomic())
+ LowerStoreInst(SI);
+ }
+ }
+ return Changed;
+}
+
+static bool lowerAtomics(Function &F) {
+ bool Changed = false;
+ for (BasicBlock &BB : F) {
+ Changed |= runOnBasicBlock(BB);
+ }
+ return Changed;
+}
+
+PreservedAnalyses LowerAtomicPass::run(Function &F, FunctionAnalysisManager &) {
+ if (lowerAtomics(F))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+namespace {
+class LowerAtomicLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ LowerAtomicLegacyPass() : FunctionPass(ID) {
+ initializeLowerAtomicLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ // Don't skip optnone functions; atomics still need to be lowered.
+ FunctionAnalysisManager DummyFAM;
+ auto PA = Impl.run(F, DummyFAM);
+ return !PA.areAllPreserved();
+ }
+
+private:
+ LowerAtomicPass Impl;
+ };
+}
+
+char LowerAtomicLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerAtomicLegacyPass, "loweratomic",
+ "Lower atomic intrinsics to non-atomic form", false, false)
+
+Pass *llvm::createLowerAtomicPass() { return new LowerAtomicLegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
new file mode 100644
index 000000000000..46f8a3564265
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -0,0 +1,383 @@
+//===- LowerExpectIntrinsic.cpp - Lower expect intrinsic ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the 'expect' intrinsic to LLVM metadata.
+//
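+// A hedged sketch of the transformation (IR is illustrative):
+//
+//   %expval = call i1 @llvm.expect.i1(i1 %cmp, i1 1)
+//   br i1 %expval, label %if.then, label %if.end
+//
+// becomes a plain branch on %cmp that carries profile metadata, roughly
+//
+//   br i1 %cmp, label %if.then, label %if.end, !prof !0
+//   !0 = !{!"branch_weights", i32 2000, i32 1}
+//
+// using the default likely/unlikely weights defined below.
+//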
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lower-expect-intrinsic"
+
+STATISTIC(ExpectIntrinsicsHandled,
+ "Number of 'expect' intrinsic instructions handled");
+
+// These default values are chosen to represent an extremely skewed outcome for
+// a condition, but they leave some room for interpretation by later passes.
+//
+// If the documentation for __builtin_expect() were explicit that it should
+// only be used in extreme cases, we could make this ratio higher. As it stands,
+// programmers may be using __builtin_expect() / llvm.expect to annotate that a
+// branch is likely or unlikely to be taken.
+//
+// There is a known dependency on this ratio in CodeGenPrepare when transforming
+// 'select' instructions. It may be worthwhile to hoist these values to some
+// shared space, so they can be used directly by other passes.
+
+static cl::opt<uint32_t> LikelyBranchWeight(
+ "likely-branch-weight", cl::Hidden, cl::init(2000),
+ cl::desc("Weight of the branch likely to be taken (default = 2000)"));
+static cl::opt<uint32_t> UnlikelyBranchWeight(
+ "unlikely-branch-weight", cl::Hidden, cl::init(1),
+ cl::desc("Weight of the branch unlikely to be taken (default = 1)"));
+
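+// Propagate an llvm.expect on a switch condition into "branch_weights"
+// profile metadata. A hedged sketch (IR is illustrative):
+//
+//   %expval = call i64 @llvm.expect.i64(i64 %x, i64 3)
+//   switch i64 %expval, label %default [ i64 3, label %case3 ]
+//
+// becomes a switch on %x whose !prof weights mark the case for 3 as likely
+// and every other successor, including the default, as unlikely.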
+static bool handleSwitchExpect(SwitchInst &SI) {
+ CallInst *CI = dyn_cast<CallInst>(SI.getCondition());
+ if (!CI)
+ return false;
+
+ Function *Fn = CI->getCalledFunction();
+ if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+ return false;
+
+ Value *ArgValue = CI->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!ExpectedValue)
+ return false;
+
+ SwitchInst::CaseHandle Case = *SI.findCaseValue(ExpectedValue);
+ unsigned n = SI.getNumCases(); // +1 for default case.
+ SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight);
+
+ if (Case == *SI.case_default())
+ Weights[0] = LikelyBranchWeight;
+ else
+ Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight;
+
+ SI.setMetadata(LLVMContext::MD_prof,
+ MDBuilder(CI->getContext()).createBranchWeights(Weights));
+
+ SI.setCondition(ArgValue);
+ return true;
+}
+
+/// Handler for PHINodes that define the value argument to an
+/// @llvm.expect call.
+///
+/// If an operand of the phi has a constant value that 'contradicts'
+/// the expected value of the phi def, then the corresponding incoming
+/// edge of the phi is unlikely to be taken. Using that information,
+/// the branch probability info for the originating branch can be inferred.
+static void handlePhiDef(CallInst *Expect) {
+ Value &Arg = *Expect->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(Expect->getArgOperand(1));
+ if (!ExpectedValue)
+ return;
+ const APInt &ExpectedPhiValue = ExpectedValue->getValue();
+
+ // Walk backward up a chain of instructions that
+ // have 'copy' semantics, 'stripping' the copies
+ // until a PHI node or an instruction of unknown kind
+ // is reached. Negation via xor is also handled.
+ //
+ // C = PHI(...);
+ // B = C;
+ // A = B;
+ // D = __builtin_expect(A, 0);
+ //
+ Value *V = &Arg;
+ SmallVector<Instruction *, 4> Operations;
+ while (!isa<PHINode>(V)) {
+ if (ZExtInst *ZExt = dyn_cast<ZExtInst>(V)) {
+ V = ZExt->getOperand(0);
+ Operations.push_back(ZExt);
+ continue;
+ }
+
+ if (SExtInst *SExt = dyn_cast<SExtInst>(V)) {
+ V = SExt->getOperand(0);
+ Operations.push_back(SExt);
+ continue;
+ }
+
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
+ if (!BinOp || BinOp->getOpcode() != Instruction::Xor)
+ return;
+
+ ConstantInt *CInt = dyn_cast<ConstantInt>(BinOp->getOperand(1));
+ if (!CInt)
+ return;
+
+ V = BinOp->getOperand(0);
+ Operations.push_back(BinOp);
+ }
+
+ // Executes the recorded operations on input 'Value'.
+ auto ApplyOperations = [&](const APInt &Value) {
+ APInt Result = Value;
+ for (auto Op : llvm::reverse(Operations)) {
+ switch (Op->getOpcode()) {
+ case Instruction::Xor:
+ Result ^= cast<ConstantInt>(Op->getOperand(1))->getValue();
+ break;
+ case Instruction::ZExt:
+ Result = Result.zext(Op->getType()->getIntegerBitWidth());
+ break;
+ case Instruction::SExt:
+ Result = Result.sext(Op->getType()->getIntegerBitWidth());
+ break;
+ default:
+ llvm_unreachable("Unexpected operation");
+ }
+ }
+ return Result;
+ };
+
+ auto *PhiDef = dyn_cast<PHINode>(V);
+
+ // Get the first dominating conditional branch of the operand
+ // i's incoming block.
+ auto GetDomConditional = [&](unsigned i) -> BranchInst * {
+ BasicBlock *BB = PhiDef->getIncomingBlock(i);
+ BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (BI && BI->isConditional())
+ return BI;
+ BB = BB->getSinglePredecessor();
+ if (!BB)
+ return nullptr;
+ BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return nullptr;
+ return BI;
+ };
+
+ // Now walk through all Phi operands to find phi operands with values
+ // conflicting with the expected phi output value. Any such operand
+ // indicates that the incoming edge to that operand is unlikely.
+ for (unsigned i = 0, e = PhiDef->getNumIncomingValues(); i != e; ++i) {
+
+ Value *PhiOpnd = PhiDef->getIncomingValue(i);
+ ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
+ if (!CI)
+ continue;
+
+ // Not an interesting case -- we cannot infer
+ // anything useful when the operand value matches the expected phi
+ // output.
+ if (ExpectedPhiValue == ApplyOperations(CI->getValue()))
+ continue;
+
+ BranchInst *BI = GetDomConditional(i);
+ if (!BI)
+ continue;
+
+ MDBuilder MDB(PhiDef->getContext());
+
+ // There are two situations in which an operand of the PhiDef comes
+ // from a given successor of a branch instruction BI.
+ // 1) When the incoming block of the operand is the successor block;
+ // 2) When the incoming block is BI's enclosing block and the
+ // successor is the PhiDef's enclosing block.
+ //
+ // Returns true if the operand which comes from OpndIncomingBB
+ // comes from outgoing edge of BI that leads to Succ block.
+ auto *OpndIncomingBB = PhiDef->getIncomingBlock(i);
+ auto IsOpndComingFromSuccessor = [&](BasicBlock *Succ) {
+ if (OpndIncomingBB == Succ)
+ // If this successor is the incoming block for this
+ // Phi operand, then this successor does lead to the Phi.
+ return true;
+ if (OpndIncomingBB == BI->getParent() && Succ == PhiDef->getParent())
+ // Otherwise, if the edge is directly from the branch
+ // to the Phi, this successor is the one feeding this
+ // Phi operand.
+ return true;
+ return false;
+ };
+
+ if (IsOpndComingFromSuccessor(BI->getSuccessor(1)))
+ BI->setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight));
+ else if (IsOpndComingFromSuccessor(BI->getSuccessor(0)))
+ BI->setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight));
+ }
+}
+
+// Handle both BranchInst and SelectInst.
+template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
+
+ // Handle non-optimized IR code like:
+ // %expval = call i64 @llvm.expect.i64(i64 %conv1, i64 1)
+ // %tobool = icmp ne i64 %expval, 0
+ // br i1 %tobool, label %if.then, label %if.end
+ //
+ // Or the following simpler case:
+ // %expval = call i1 @llvm.expect.i1(i1 %cmp, i1 1)
+ // br i1 %expval, label %if.then, label %if.end
+
+ CallInst *CI;
+
+ ICmpInst *CmpI = dyn_cast<ICmpInst>(BSI.getCondition());
+ CmpInst::Predicate Predicate;
+ ConstantInt *CmpConstOperand = nullptr;
+ if (!CmpI) {
+ CI = dyn_cast<CallInst>(BSI.getCondition());
+ Predicate = CmpInst::ICMP_NE;
+ } else {
+ Predicate = CmpI->getPredicate();
+ if (Predicate != CmpInst::ICMP_NE && Predicate != CmpInst::ICMP_EQ)
+ return false;
+
+ CmpConstOperand = dyn_cast<ConstantInt>(CmpI->getOperand(1));
+ if (!CmpConstOperand)
+ return false;
+ CI = dyn_cast<CallInst>(CmpI->getOperand(0));
+ }
+
+ if (!CI)
+ return false;
+
+ uint64_t ValueComparedTo = 0;
+ if (CmpConstOperand) {
+ if (CmpConstOperand->getBitWidth() > 64)
+ return false;
+ ValueComparedTo = CmpConstOperand->getZExtValue();
+ }
+
+ Function *Fn = CI->getCalledFunction();
+ if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+ return false;
+
+ Value *ArgValue = CI->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!ExpectedValue)
+ return false;
+
+ MDBuilder MDB(CI->getContext());
+ MDNode *Node;
+
+ if ((ExpectedValue->getZExtValue() == ValueComparedTo) ==
+ (Predicate == CmpInst::ICMP_EQ))
+ Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight);
+ else
+ Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight);
+
+ BSI.setMetadata(LLVMContext::MD_prof, Node);
+
+ if (CmpI)
+ CmpI->setOperand(0, ArgValue);
+ else
+ BSI.setCondition(ArgValue);
+ return true;
+}
+
+static bool handleBranchExpect(BranchInst &BI) {
+ if (BI.isUnconditional())
+ return false;
+
+ return handleBrSelExpect<BranchInst>(BI);
+}
+
+static bool lowerExpectIntrinsic(Function &F) {
+ bool Changed = false;
+
+ for (BasicBlock &BB : F) {
+ // Create "block_weights" metadata.
+ if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
+ if (handleBranchExpect(*BI))
+ ExpectIntrinsicsHandled++;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) {
+ if (handleSwitchExpect(*SI))
+ ExpectIntrinsicsHandled++;
+ }
+
+ // Remove llvm.expect intrinsics. Iterate backwards in order
+ // to process select instructions before the intrinsic gets
+ // removed.
+ for (auto BI = BB.rbegin(), BE = BB.rend(); BI != BE;) {
+ Instruction *Inst = &*BI++;
+ CallInst *CI = dyn_cast<CallInst>(Inst);
+ if (!CI) {
+ if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
+ if (handleBrSelExpect(*SI))
+ ExpectIntrinsicsHandled++;
+ }
+ continue;
+ }
+
+ Function *Fn = CI->getCalledFunction();
+ if (Fn && Fn->getIntrinsicID() == Intrinsic::expect) {
+ // Before erasing the llvm.expect, walk backward to find the
+ // phi that defines llvm.expect's first argument, and
+ // infer branch probabilities:
+ handlePhiDef(CI);
+ Value *Exp = CI->getArgOperand(0);
+ CI->replaceAllUsesWith(Exp);
+ CI->eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F,
+ FunctionAnalysisManager &) {
+ if (lowerExpectIntrinsic(F))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+/// \brief Legacy pass for lowering expect intrinsics out of the IR.
+///
+/// When this pass is run over a function it uses expect intrinsics which feed
+/// branches and switches to provide branch weight metadata for those
+/// terminators. It then removes the expect intrinsics from the IR so the rest
+/// of the optimizer can ignore them.
+class LowerExpectIntrinsic : public FunctionPass {
+public:
+ static char ID;
+ LowerExpectIntrinsic() : FunctionPass(ID) {
+ initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override { return lowerExpectIntrinsic(F); }
+};
+}
+
+char LowerExpectIntrinsic::ID = 0;
+INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect",
+ "Lower 'expect' Intrinsics", false, false)
+
+FunctionPass *llvm::createLowerExpectIntrinsicPass() {
+ return new LowerExpectIntrinsic();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
new file mode 100644
index 000000000000..070114a84cc5
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
@@ -0,0 +1,137 @@
+//===- LowerGuardIntrinsic.cpp - Lower the guard intrinsic ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the llvm.experimental.guard intrinsic to a conditional call
+// to @llvm.experimental.deoptimize. Once this happens, the guard can no longer
+// be widened.
+//
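+// A hedged before/after sketch (IR is illustrative; intrinsic name mangling
+// suffixes are omitted):
+//
+//   call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+//
+// is rewritten into explicit control flow along the lines of
+//
+//     br i1 %cond, label %guarded, label %deopt, !prof !0
+//   deopt:
+//     call void (...) @llvm.experimental.deoptimize() [ "deopt"() ]
+//     ret void
+//   guarded:
+//     ...
+//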
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+static cl::opt<uint32_t> PredicatePassBranchWeight(
+ "guards-predicate-pass-branch-weight", cl::Hidden, cl::init(1 << 20),
+ cl::desc("The probability of a guard failing is assumed to be the "
+ "reciprocal of this value (default = 1 << 20)"));
+
+namespace {
+struct LowerGuardIntrinsicLegacyPass : public FunctionPass {
+ static char ID;
+ LowerGuardIntrinsicLegacyPass() : FunctionPass(ID) {
+ initializeLowerGuardIntrinsicLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+}
+
+static void MakeGuardControlFlowExplicit(Function *DeoptIntrinsic,
+ CallInst *CI) {
+ OperandBundleDef DeoptOB(*CI->getOperandBundle(LLVMContext::OB_deopt));
+ SmallVector<Value *, 4> Args(std::next(CI->arg_begin()), CI->arg_end());
+
+ auto *CheckBB = CI->getParent();
+ auto *DeoptBlockTerm =
+ SplitBlockAndInsertIfThen(CI->getArgOperand(0), CI, true);
+
+ auto *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
+
+ // SplitBlockAndInsertIfThen inserts control flow that branches to
+ // DeoptBlockTerm if the condition is true. We want the opposite.
+ CheckBI->swapSuccessors();
+
+ CheckBI->getSuccessor(0)->setName("guarded");
+ CheckBI->getSuccessor(1)->setName("deopt");
+
+ if (auto *MD = CI->getMetadata(LLVMContext::MD_make_implicit))
+ CheckBI->setMetadata(LLVMContext::MD_make_implicit, MD);
+
+ MDBuilder MDB(CI->getContext());
+ CheckBI->setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(PredicatePassBranchWeight, 1));
+
+ IRBuilder<> B(DeoptBlockTerm);
+ auto *DeoptCall = B.CreateCall(DeoptIntrinsic, Args, {DeoptOB}, "");
+
+ if (DeoptIntrinsic->getReturnType()->isVoidTy()) {
+ B.CreateRetVoid();
+ } else {
+ DeoptCall->setName("deoptcall");
+ B.CreateRet(DeoptCall);
+ }
+
+ DeoptCall->setCallingConv(CI->getCallingConv());
+ DeoptBlockTerm->eraseFromParent();
+}
+
+static bool lowerGuardIntrinsic(Function &F) {
+ // Check if we can cheaply rule out the possibility of not having any work to
+ // do.
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ if (!GuardDecl || GuardDecl->use_empty())
+ return false;
+
+ SmallVector<CallInst *, 8> ToLower;
+ for (auto &I : instructions(F))
+ if (auto *CI = dyn_cast<CallInst>(&I))
+ if (auto *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::experimental_guard)
+ ToLower.push_back(CI);
+
+ if (ToLower.empty())
+ return false;
+
+ auto *DeoptIntrinsic = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()});
+ DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv());
+
+ for (auto *CI : ToLower) {
+ MakeGuardControlFlowExplicit(DeoptIntrinsic, CI);
+ CI->eraseFromParent();
+ }
+
+ return true;
+}
+
+bool LowerGuardIntrinsicLegacyPass::runOnFunction(Function &F) {
+ return lowerGuardIntrinsic(F);
+}
+
+char LowerGuardIntrinsicLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerGuardIntrinsicLegacyPass, "lower-guard-intrinsic",
+ "Lower the guard intrinsic to normal control flow", false,
+ false)
+
+Pass *llvm::createLowerGuardIntrinsicPass() {
+ return new LowerGuardIntrinsicLegacyPass();
+}
+
+PreservedAnalyses LowerGuardIntrinsicPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (lowerGuardIntrinsic(F))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
new file mode 100644
index 000000000000..9c870b42a747
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -0,0 +1,1492 @@
+//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs various transformations related to eliminating memcpy
+// calls, or transforming sets of stores into memset's.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "memcpyopt"
+
+STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
+STATISTIC(NumMemSetInfer, "Number of memsets inferred");
+STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
+STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
+
+static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,
+ bool &VariableIdxFound,
+ const DataLayout &DL) {
+ // Skip over the first indices.
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (unsigned i = 1; i != Idx; ++i, ++GTI)
+ /*skip along*/;
+
+ // Compute the offset implied by the rest of the indices.
+ int64_t Offset = 0;
+ for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
+ ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i));
+ if (!OpC)
+ return VariableIdxFound = true;
+ if (OpC->isZero()) continue; // No offset.
+
+ // Handle struct indices, which add their field offset to the pointer.
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
+ continue;
+ }
+
+ // Otherwise, we have a sequential type like an array or vector. Multiply
+ // the index by the ElementSize.
+ uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
+ Offset += Size*OpC->getSExtValue();
+ }
+
+ return Offset;
+}
+
+/// Return true if Ptr2 is provably equal to Ptr1 plus a constant offset, and
+/// return that constant offset. For example, Ptr1 might be &A[42], and Ptr2
+/// might be &A[40]. In this case the offset would be -8.
+static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
+ const DataLayout &DL) {
+ Ptr1 = Ptr1->stripPointerCasts();
+ Ptr2 = Ptr2->stripPointerCasts();
+
+ // Handle the trivial case first.
+ if (Ptr1 == Ptr2) {
+ Offset = 0;
+ return true;
+ }
+
+ GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1);
+ GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2);
+
+ bool VariableIdxFound = false;
+
+ // If one pointer is a GEP and the other isn't, then see if the GEP is a
+ // constant offset from the base, as in "P" and "gep P, 1".
+ if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) {
+ Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, DL);
+ return !VariableIdxFound;
+ }
+
+ if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) {
+ Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, DL);
+ return !VariableIdxFound;
+ }
+
+ // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical
+ // base. After that base, they may have some number of common (and
+ // potentially variable) indices. After that, each may have some constant
+ // offset, which determines their offset from each other. At this point, we
+ // handle no other case.
+ if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0))
+ return false;
+
+ // Skip any common indices and track the GEP types.
+ unsigned Idx = 1;
+ for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx)
+ if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx))
+ break;
+
+ int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, DL);
+ int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, DL);
+ if (VariableIdxFound) return false;
+
+ Offset = Offset2-Offset1;
+ return true;
+}
+
+namespace {
+
+/// Represents a range of memset'd bytes with the ByteVal value.
+/// This allows us to analyze stores like:
+/// store 0 -> P+1
+/// store 0 -> P+0
+/// store 0 -> P+3
+/// store 0 -> P+2
+/// which sometimes happens with stores to arrays of structs etc. When we see
+/// the first store, we make a range [1, 2). The second store extends the range
+/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
+/// two ranges into [0, 3) which is memset'able.
+struct MemsetRange {
+ // Start/End - A half-open range that describes the span this range covers.
+ // The range is closed at the start and open at the end: [Start, End).
+ int64_t Start, End;
+
+ /// StartPtr - The getelementptr instruction that points to the start of the
+ /// range.
+ Value *StartPtr;
+
+ /// Alignment - The known alignment of the first store.
+ unsigned Alignment;
+
+ /// TheStores - The actual stores that make up this range.
+ SmallVector<Instruction*, 16> TheStores;
+
+ bool isProfitableToUseMemset(const DataLayout &DL) const;
+};
+
+} // end anonymous namespace
+
+bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
+ // If we found more than 4 stores to merge or 16 bytes, use memset.
+ if (TheStores.size() >= 4 || End-Start >= 16) return true;
+
+ // If there is nothing to merge, don't do anything.
+ if (TheStores.size() < 2) return false;
+
+ // If any of the stores are a memset, then it is always good to extend the
+ // memset.
+ for (Instruction *SI : TheStores)
+ if (!isa<StoreInst>(SI))
+ return true;
+
+ // Assume that the code generator is capable of merging pairs of stores
+ // together if it wants to.
+ if (TheStores.size() == 2) return false;
+
+ // If we have fewer than 8 stores, it can still be worthwhile to do this.
+ // For example, merging 4 i8 stores into an i32 store is useful almost always.
+ // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
+ // memset will be split into 2 32-bit stores anyway) and doing so can
+ // pessimize the llvm optimizer.
+ //
+ // Since we don't have perfect knowledge here, make some assumptions: assume
+ // the maximum GPR width is the same size as the largest legal integer
+ // size. If so, check to see whether we will end up actually reducing the
+ // number of stores used.
+ unsigned Bytes = unsigned(End-Start);
+ unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
+ if (MaxIntSize == 0)
+ MaxIntSize = 1;
+ unsigned NumPointerStores = Bytes / MaxIntSize;
+
+ // Assume the remaining bytes if any are done a byte at a time.
+ unsigned NumByteStores = Bytes % MaxIntSize;
+
+ // If we will reduce the # stores (according to this heuristic), do the
+ // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
+ // etc.
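+ //
+ // A worked example (illustrative numbers): on a target whose largest legal
+ // integer is 64 bits, three stores (i32 + i16 + i16) covering 8 contiguous
+ // bytes give NumPointerStores = 1 and NumByteStores = 0, and 3 > 1, so they
+ // are merged; three i32 stores covering 12 bytes give 1 + 4 = 5, and 3 > 5
+ // fails, so they are left alone.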
+ return TheStores.size() > NumPointerStores+NumByteStores;
+}
+
+namespace {
+
+class MemsetRanges {
+ using range_iterator = SmallVectorImpl<MemsetRange>::iterator;
+
+ /// A sorted list of the memset ranges.
+ SmallVector<MemsetRange, 8> Ranges;
+
+ const DataLayout &DL;
+
+public:
+ MemsetRanges(const DataLayout &DL) : DL(DL) {}
+
+ using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator;
+
+ const_iterator begin() const { return Ranges.begin(); }
+ const_iterator end() const { return Ranges.end(); }
+ bool empty() const { return Ranges.empty(); }
+
+ void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ addStore(OffsetFromFirst, SI);
+ else
+ addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
+ }
+
+ void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
+ int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
+
+ addRange(OffsetFromFirst, StoreSize,
+ SI->getPointerOperand(), SI->getAlignment(), SI);
+ }
+
+ void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
+ int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+ addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
+ }
+
+ void addRange(int64_t Start, int64_t Size, Value *Ptr,
+ unsigned Alignment, Instruction *Inst);
+};
+
+} // end anonymous namespace
+
+/// Add a new store to the MemsetRanges data structure. This adds a
+/// new range for the specified store at the specified offset, merging into
+/// existing ranges as appropriate.
+void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
+ unsigned Alignment, Instruction *Inst) {
+ int64_t End = Start+Size;
+
+ range_iterator I = std::lower_bound(Ranges.begin(), Ranges.end(), Start,
+ [](const MemsetRange &LHS, int64_t RHS) { return LHS.End < RHS; });
+
+ // We now know that I == E, in which case we didn't find anything to merge
+ // with, or that Start <= I->End. If End < I->Start or I == E, then we need
+ // to insert a new range. Handle this now.
+ if (I == Ranges.end() || End < I->Start) {
+ MemsetRange &R = *Ranges.insert(I, MemsetRange());
+ R.Start = Start;
+ R.End = End;
+ R.StartPtr = Ptr;
+ R.Alignment = Alignment;
+ R.TheStores.push_back(Inst);
+ return;
+ }
+
+ // This store overlaps with I, add it.
+ I->TheStores.push_back(Inst);
+
+ // At this point, we may have an interval that completely contains our store.
+ // If so, just add it to the interval and return.
+ if (I->Start <= Start && I->End >= End)
+ return;
+
+ // Now we know that Start <= I->End and End >= I->Start so the range overlaps
+ // but is not entirely contained within the range.
+
+ // See if this store extends the start of the range. In this case, it couldn't
+ // possibly cause the range to join the prior range, because otherwise we
+ // would have stopped on *it*.
+ if (Start < I->Start) {
+ I->Start = Start;
+ I->StartPtr = Ptr;
+ I->Alignment = Alignment;
+ }
+
+ // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
+ // is in or right at the end of I), and that End >= I->Start. Extend I out to
+ // End.
+ if (End > I->End) {
+ I->End = End;
+ range_iterator NextI = I;
+ while (++NextI != Ranges.end() && End >= NextI->Start) {
+ // Merge the range in.
+ I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
+ if (NextI->End > I->End)
+ I->End = NextI->End;
+ Ranges.erase(NextI);
+ NextI = I;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// MemCpyOptLegacyPass Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class MemCpyOptLegacyPass : public FunctionPass {
+ MemCpyOptPass Impl;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ MemCpyOptLegacyPass() : FunctionPass(ID) {
+ initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // This transformation requires dominator postdominator info
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char MemCpyOptLegacyPass::ID = 0;
+
+/// The public interface to this file...
+FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }
+
+INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
+ false, false)
+
+/// When scanning forward over instructions, we look for some other patterns to
+/// fold away. In particular, this looks for stores to neighboring locations of
+/// memory. If it sees enough consecutive ones, it attempts to merge them
+/// together into a memcpy/memset.
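+///
+/// A hedged sketch (IR is illustrative): four adjacent stores of the same
+/// splattable byte,
+///
+///   store i8 0, i8* %p      ; offset 0
+///   store i8 0, i8* %p1     ; offset 1
+///   store i8 0, i8* %p2     ; offset 2
+///   store i8 0, i8* %p3     ; offset 3
+///
+/// are collected into one MemsetRange and replaced by a single @llvm.memset
+/// covering the four bytes starting at %p.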
+Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
+ Value *StartPtr,
+ Value *ByteVal) {
+ const DataLayout &DL = StartInst->getModule()->getDataLayout();
+
+ // Okay, so we now have a single store whose value is splatable. Scan to find
+ // all subsequent stores of the same value at an offset from the same pointer.
+ // Join these together into ranges, so we can decide whether contiguous blocks
+ // are stored.
+ MemsetRanges Ranges(DL);
+
+ BasicBlock::iterator BI(StartInst);
+ for (++BI; !isa<TerminatorInst>(BI); ++BI) {
+ if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
+ // If the instruction is readnone, ignore it, otherwise bail out. We
+ // don't even allow readonly here because we don't want something like:
+ // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
+ if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
+ break;
+ continue;
+ }
+
+ if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
+ // If this is a store, see if we can merge it in.
+ if (!NextStore->isSimple()) break;
+
+ // Check to see if this stored value is of the same byte-splattable value.
+ if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
+ break;
+
+ // Check to see if this store is to a constant offset from the start ptr.
+ int64_t Offset;
+ if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset,
+ DL))
+ break;
+
+ Ranges.addStore(Offset, NextStore);
+ } else {
+ MemSetInst *MSI = cast<MemSetInst>(BI);
+
+ if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
+ !isa<ConstantInt>(MSI->getLength()))
+ break;
+
+ // Check to see if this store is to a constant offset from the start ptr.
+ int64_t Offset;
+ if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, DL))
+ break;
+
+ Ranges.addMemSet(Offset, MSI);
+ }
+ }
+
+ // If we have no ranges, then we just had a single store with nothing that
+ // could be merged in. This is a very common case of course.
+ if (Ranges.empty())
+ return nullptr;
+
+ // If we had at least one store that could be merged in, add the starting
+ // store as well. We try to avoid this unless there is at least something
+ // interesting as a small compile-time optimization.
+ Ranges.addInst(0, StartInst);
+
+ // If we create any memsets, we put it right before the first instruction that
+ // isn't part of the memset block. This ensures that the memset is dominated
+ // by any addressing instruction needed by the start of the block.
+ IRBuilder<> Builder(&*BI);
+
+ // Now that we have full information about ranges, loop over the ranges and
+ // emit memsets for anything big enough to be worthwhile.
+ Instruction *AMemSet = nullptr;
+ for (const MemsetRange &Range : Ranges) {
+ if (Range.TheStores.size() == 1) continue;
+
+ // If it is profitable to lower this range to memset, do so now.
+ if (!Range.isProfitableToUseMemset(DL))
+ continue;
+
+ // Otherwise, we do want to transform this! Create a new memset.
+ // Get the starting pointer of the block.
+ StartPtr = Range.StartPtr;
+
+ // Determine alignment
+ unsigned Alignment = Range.Alignment;
+ if (Alignment == 0) {
+ Type *EltType =
+ cast<PointerType>(StartPtr->getType())->getElementType();
+ Alignment = DL.getABITypeAlignment(EltType);
+ }
+
+ AMemSet =
+ Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
+
+ DEBUG(dbgs() << "Replace stores:\n";
+ for (Instruction *SI : Range.TheStores)
+ dbgs() << *SI << '\n';
+ dbgs() << "With: " << *AMemSet << '\n');
+
+ if (!Range.TheStores.empty())
+ AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
+
+ // Zap all the stores.
+ for (Instruction *SI : Range.TheStores) {
+ MD->removeInstruction(SI);
+ SI->eraseFromParent();
+ }
+ ++NumMemSetInfer;
+ }
+
+ return AMemSet;
+}
+
+static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
+ const LoadInst *LI) {
+ unsigned StoreAlign = SI->getAlignment();
+ if (!StoreAlign)
+ StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
+ unsigned LoadAlign = LI->getAlignment();
+ if (!LoadAlign)
+ LoadAlign = DL.getABITypeAlignment(LI->getType());
+
+ return std::min(StoreAlign, LoadAlign);
+}
+
+// This method tries to lift a store instruction before position P.
+// It lifts the store and its operands, as well as anything else that
+// may alias with them.
+// The method returns true if it was successful.
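+//
+// For illustration only (a sketch, not taken from this source): given
+//   call void @f()                          ; P, may clobber the store location
+//   %p = getelementptr i32, i32* %A, i64 1  ; feeds the store
+//   store i32 %v, i32* %p                   ; SI
+// lifting SI above the call also requires lifting %p, because SI uses it.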
+static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
+ const LoadInst *LI) {
+ // If the store aliases this position, bail out early.
+ MemoryLocation StoreLoc = MemoryLocation::get(SI);
+ if (isModOrRefSet(AA.getModRefInfo(P, StoreLoc)))
+ return false;
+
+ // Keep track of the arguments of all instructions we plan to lift
+ // so we can make sure to lift them as well if appropriate.
+ DenseSet<Instruction*> Args;
+ if (auto *Ptr = dyn_cast<Instruction>(SI->getPointerOperand()))
+ if (Ptr->getParent() == SI->getParent())
+ Args.insert(Ptr);
+
+ // Instructions to lift before P.
+ SmallVector<Instruction*, 8> ToLift;
+
+ // Memory locations of lifted instructions.
+ SmallVector<MemoryLocation, 8> MemLocs{StoreLoc};
+
+ // Lifted callsites.
+ SmallVector<ImmutableCallSite, 8> CallSites;
+
+ const MemoryLocation LoadLoc = MemoryLocation::get(LI);
+
+ for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
+ auto *C = &*I;
+
+ bool MayAlias = isModOrRefSet(AA.getModRefInfo(C, None));
+
+ bool NeedLift = false;
+ if (Args.erase(C))
+ NeedLift = true;
+ else if (MayAlias) {
+ NeedLift = llvm::any_of(MemLocs, [C, &AA](const MemoryLocation &ML) {
+ return isModOrRefSet(AA.getModRefInfo(C, ML));
+ });
+
+ if (!NeedLift)
+ NeedLift =
+ llvm::any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) {
+ return isModOrRefSet(AA.getModRefInfo(C, CS));
+ });
+ }
+
+ if (!NeedLift)
+ continue;
+
+ if (MayAlias) {
+ // Since LI is implicitly moved downwards past the lifted instructions,
+ // none of them may modify its source.
+ if (isModSet(AA.getModRefInfo(C, LoadLoc)))
+ return false;
+ else if (auto CS = ImmutableCallSite(C)) {
+ // If we can't lift this before P, it's game over.
+ if (isModOrRefSet(AA.getModRefInfo(P, CS)))
+ return false;
+
+ CallSites.push_back(CS);
+ } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) {
+ // If we can't lift this before P, it's game over.
+ auto ML = MemoryLocation::get(C);
+ if (isModOrRefSet(AA.getModRefInfo(P, ML)))
+ return false;
+
+ MemLocs.push_back(ML);
+ } else
+ // We don't know how to lift this instruction.
+ return false;
+ }
+
+ ToLift.push_back(C);
+ for (unsigned k = 0, e = C->getNumOperands(); k != e; ++k)
+ if (auto *A = dyn_cast<Instruction>(C->getOperand(k)))
+ if (A->getParent() == SI->getParent())
+ Args.insert(A);
+ }
+
+ // We made it; lift the collected instructions before P.
+ for (auto *I : llvm::reverse(ToLift)) {
+ DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
+ I->moveBefore(P);
+ }
+
+ return true;
+}
+
+bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
+ if (!SI->isSimple()) return false;
+
+ // Avoid merging nontemporal stores since the resulting
+ // memcpy/memset would not be able to preserve the nontemporal hint.
+ // In theory we could teach how to propagate the !nontemporal metadata to
+ // memset calls. However, that change would force the backend to
+ // conservatively expand !nontemporal memset calls back to sequences of
+ // store instructions (effectively undoing the merging).
+ if (SI->getMetadata(LLVMContext::MD_nontemporal))
+ return false;
+
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+
+ // Load to store forwarding can be interpreted as memcpy.
+ if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
+ if (LI->isSimple() && LI->hasOneUse() &&
+ LI->getParent() == SI->getParent()) {
+
+ auto *T = LI->getType();
+ if (T->isAggregateType()) {
+ AliasAnalysis &AA = LookupAliasAnalysis();
+ MemoryLocation LoadLoc = MemoryLocation::get(LI);
+
+ // We use alias analysis to check if an instruction may store to
+ // the memory we load from in between the load and the store. If
+ // such an instruction is found, we try to promote there instead
+ // of at the store position.
+ Instruction *P = SI;
+ for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
+ if (isModSet(AA.getModRefInfo(&I, LoadLoc))) {
+ P = &I;
+ break;
+ }
+ }
+
+ // We found an instruction that may write to the loaded memory.
+ // We can try to promote at this position instead of the store
+ // position if nothing aliases the store memory after this point and the
+ // store destination is not in the range.
+ if (P && P != SI) {
+ if (!moveUp(AA, SI, P, LI))
+ P = nullptr;
+ }
+
+ // If a valid insertion position is found, then we can promote
+ // the load/store pair to a memcpy.
+ if (P) {
+ // If we load from memory that may alias the memory we store to,
+ // memmove must be used to preserve semantics. If not, memcpy can
+ // be used.
+ bool UseMemMove = false;
+ if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc))
+ UseMemMove = true;
+
+ unsigned Align = findCommonAlignment(DL, SI, LI);
+ uint64_t Size = DL.getTypeStoreSize(T);
+
+ IRBuilder<> Builder(P);
+ Instruction *M;
+ if (UseMemMove)
+ M = Builder.CreateMemMove(SI->getPointerOperand(),
+ LI->getPointerOperand(), Size,
+ Align, SI->isVolatile());
+ else
+ M = Builder.CreateMemCpy(SI->getPointerOperand(),
+ LI->getPointerOperand(), Size,
+ Align, SI->isVolatile());
+
+ DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI
+ << " => " << *M << "\n");
+
+ MD->removeInstruction(SI);
+ SI->eraseFromParent();
+ MD->removeInstruction(LI);
+ LI->eraseFromParent();
+ ++NumMemCpyInstr;
+
+ // Make sure we do not invalidate the iterator.
+ BBI = M->getIterator();
+ return true;
+ }
+ }
+
+ // Detect cases where we're performing call slot forwarding, but
+ // happen to be using a load-store pair to implement it, rather than
+ // a memcpy.
+ MemDepResult ldep = MD->getDependency(LI);
+ CallInst *C = nullptr;
+ if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
+ C = dyn_cast<CallInst>(ldep.getInst());
+
+ if (C) {
+ // Check that nothing touches the dest of the "copy" between
+ // the call and the store.
+ Value *CpyDest = SI->getPointerOperand()->stripPointerCasts();
+ bool CpyDestIsLocal = isa<AllocaInst>(CpyDest);
+ AliasAnalysis &AA = LookupAliasAnalysis();
+ MemoryLocation StoreLoc = MemoryLocation::get(SI);
+ for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator();
+ I != E; --I) {
+ if (isModOrRefSet(AA.getModRefInfo(&*I, StoreLoc))) {
+ C = nullptr;
+ break;
+ }
+ // The store to dest may never happen if an exception can be thrown
+ // between the load and the store.
+ if (I->mayThrow() && !CpyDestIsLocal) {
+ C = nullptr;
+ break;
+ }
+ }
+ }
+
+ if (C) {
+ bool changed = performCallSlotOptzn(
+ LI, SI->getPointerOperand()->stripPointerCasts(),
+ LI->getPointerOperand()->stripPointerCasts(),
+ DL.getTypeStoreSize(SI->getOperand(0)->getType()),
+ findCommonAlignment(DL, SI, LI), C);
+ if (changed) {
+ MD->removeInstruction(SI);
+ SI->eraseFromParent();
+ MD->removeInstruction(LI);
+ LI->eraseFromParent();
+ ++NumMemCpyInstr;
+ return true;
+ }
+ }
+ }
+ }
+
+ // There are two cases that are interesting for this code to handle: memcpy
+ // and memset. Right now we only handle memset.
+
+ // Ensure that the value being stored is something that can be memset: a
+ // value that splats to a single byte, like "0" or "-1" of any width, as well
+ // as things like 0xA0A0A0A0 and 0.0.
+ auto *V = SI->getOperand(0);
+ if (Value *ByteVal = isBytewiseValue(V)) {
+ if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
+ ByteVal)) {
+ BBI = I->getIterator(); // Don't invalidate iterator.
+ return true;
+ }
+
+ // If we have an aggregate, we try to promote it to memset regardless
+ // of opportunity for merging as it can expose optimization opportunities
+ // in subsequent passes.
+ auto *T = V->getType();
+ if (T->isAggregateType()) {
+ uint64_t Size = DL.getTypeStoreSize(T);
+ unsigned Align = SI->getAlignment();
+ if (!Align)
+ Align = DL.getABITypeAlignment(T);
+ IRBuilder<> Builder(SI);
+ auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal,
+ Size, Align, SI->isVolatile());
+
+ DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
+
+ MD->removeInstruction(SI);
+ SI->eraseFromParent();
+ NumMemSetInfer++;
+
+ // Make sure we do not invalidate the iterator.
+ BBI = M->getIterator();
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
+ // See if there is another memset or store neighboring this memset which
+ // allows us to widen out the memset to do a single larger store.
+ if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
+ if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
+ MSI->getValue())) {
+ BBI = I->getIterator(); // Don't invalidate iterator.
+ return true;
+ }
+ return false;
+}
+
+/// Takes a memcpy and a call that it depends on,
+/// and checks for the possibility of a call slot optimization by having
+/// the call write its result directly into the destination of the memcpy.
+bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
+ Value *cpySrc, uint64_t cpyLen,
+ unsigned cpyAlign, CallInst *C) {
+ // The general transformation to keep in mind is
+ //
+ // call @func(..., src, ...)
+ // memcpy(dest, src, ...)
+ //
+ // ->
+ //
+ // memcpy(dest, src, ...)
+ // call @func(..., dest, ...)
+ //
+ // Since moving the memcpy is technically awkward, we additionally check that
+ // src only holds uninitialized values at the moment of the call, meaning that
+ // the memcpy can be discarded rather than moved.
+
+ // Lifetime marks shouldn't be operated on.
+ if (Function *F = C->getCalledFunction())
+ if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
+ return false;
+
+ // Deliberately get the source and destination with bitcasts stripped away,
+ // because we'll need to do type comparisons based on the underlying type.
+ CallSite CS(C);
+
+ // Require that src be an alloca. This simplifies the reasoning considerably.
+ AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
+ if (!srcAlloca)
+ return false;
+
+ ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
+ if (!srcArraySize)
+ return false;
+
+ const DataLayout &DL = cpy->getModule()->getDataLayout();
+ uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
+ srcArraySize->getZExtValue();
+
+ if (cpyLen < srcSize)
+ return false;
+
+ // Check that accessing the first srcSize bytes of dest will not cause a
+ // trap. Otherwise the transform is invalid since it might cause a trap
+ // to occur earlier than it otherwise would.
+ if (AllocaInst *A = dyn_cast<AllocaInst>(cpyDest)) {
+ // The destination is an alloca. Check that it is at least as large as srcSize.
+ ConstantInt *destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
+ if (!destArraySize)
+ return false;
+
+ uint64_t destSize = DL.getTypeAllocSize(A->getAllocatedType()) *
+ destArraySize->getZExtValue();
+
+ if (destSize < srcSize)
+ return false;
+ } else if (Argument *A = dyn_cast<Argument>(cpyDest)) {
+ // The store to dest may never happen if the call can throw.
+ if (C->mayThrow())
+ return false;
+
+ if (A->getDereferenceableBytes() < srcSize) {
+ // If the destination is an sret parameter then only accesses that are
+ // outside of the returned struct type can trap.
+ if (!A->hasStructRetAttr())
+ return false;
+
+ Type *StructTy = cast<PointerType>(A->getType())->getElementType();
+ if (!StructTy->isSized()) {
+ // The call may never return and hence the copy-instruction may never
+ // be executed, and therefore it's not safe to say "the destination
+ // has at least <cpyLen> bytes, as implied by the copy-instruction".
+ return false;
+ }
+
+ uint64_t destSize = DL.getTypeAllocSize(StructTy);
+ if (destSize < srcSize)
+ return false;
+ }
+ } else {
+ return false;
+ }
+
+ // Check that dest points to memory that is at least as aligned as src.
+ unsigned srcAlign = srcAlloca->getAlignment();
+ if (!srcAlign)
+ srcAlign = DL.getABITypeAlignment(srcAlloca->getAllocatedType());
+ bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
+ // If dest is not aligned enough and we can't increase its alignment then
+ // bail out.
+ if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
+ return false;
+
+ // Check that src is not accessed except via the call and the memcpy. This
+ // guarantees that it holds only undefined values when passed in (so the final
+ // memcpy can be dropped), that it is not read or written between the call and
+ // the memcpy, and that writing beyond the end of it is undefined.
+ SmallVector<User*, 8> srcUseList(srcAlloca->user_begin(),
+ srcAlloca->user_end());
+ while (!srcUseList.empty()) {
+ User *U = srcUseList.pop_back_val();
+
+ if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
+ for (User *UU : U->users())
+ srcUseList.push_back(UU);
+ continue;
+ }
+ if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
+ if (!G->hasAllZeroIndices())
+ return false;
+
+ for (User *UU : U->users())
+ srcUseList.push_back(UU);
+ continue;
+ }
+ if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
+ if (IT->getIntrinsicID() == Intrinsic::lifetime_start ||
+ IT->getIntrinsicID() == Intrinsic::lifetime_end)
+ continue;
+
+ if (U != C && U != cpy)
+ return false;
+ }
+
+ // Check that src isn't captured by the called function since the
+ // transformation can cause aliasing issues in that case.
+ for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
+ if (CS.getArgument(i) == cpySrc && !CS.doesNotCapture(i))
+ return false;
+
+ // Since we're changing the parameter to the callsite, we need to make sure
+ // that what would be the new parameter dominates the callsite.
+ DominatorTree &DT = LookupDomTree();
+ if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest))
+ if (!DT.dominates(cpyDestInst, C))
+ return false;
+
+ // In addition to knowing that the call does not access src in some
+ // unexpected manner, for example via a global, which we deduce from
+ // the use analysis, we also need to know that it does not sneakily
+ // access dest. We rely on AA to figure this out for us.
+ AliasAnalysis &AA = LookupAliasAnalysis();
+ ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize);
+ // If necessary, perform additional analysis.
+ if (isModOrRefSet(MR))
+ MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT);
+ if (isModOrRefSet(MR))
+ return false;
+
+ // We can't create address space casts here because we don't know if they're
+ // safe for the target.
+ if (cpySrc->getType()->getPointerAddressSpace() !=
+ cpyDest->getType()->getPointerAddressSpace())
+ return false;
+ for (unsigned i = 0; i < CS.arg_size(); ++i)
+ if (CS.getArgument(i)->stripPointerCasts() == cpySrc &&
+ cpySrc->getType()->getPointerAddressSpace() !=
+ CS.getArgument(i)->getType()->getPointerAddressSpace())
+ return false;
+
+ // All the checks have passed, so do the transformation.
+ bool changedArgument = false;
+ for (unsigned i = 0; i < CS.arg_size(); ++i)
+ if (CS.getArgument(i)->stripPointerCasts() == cpySrc) {
+ Value *Dest = cpySrc->getType() == cpyDest->getType() ? cpyDest
+ : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
+ cpyDest->getName(), C);
+ changedArgument = true;
+ if (CS.getArgument(i)->getType() == Dest->getType())
+ CS.setArgument(i, Dest);
+ else
+ CS.setArgument(i, CastInst::CreatePointerCast(Dest,
+ CS.getArgument(i)->getType(), Dest->getName(), C));
+ }
+
+ if (!changedArgument)
+ return false;
+
+ // If the destination wasn't sufficiently aligned then increase its alignment.
+ if (!isDestSufficientlyAligned) {
+ assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
+ cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
+ }
+
+ // Drop any cached information about the call, because we may have changed
+ // its dependence information by changing its parameter.
+ MD->removeInstruction(C);
+
+ // Update AA metadata
+ // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
+ // handled here, but combineMetadata doesn't support them yet
+ unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(C, cpy, KnownIDs);
+
+ // Remove the memcpy.
+ MD->removeInstruction(cpy);
+ ++NumMemCpyInstr;
+
+ return true;
+}
+
+/// We've found that the (upward scanning) memory dependence of memcpy 'M' is
+/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
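+///
+/// For illustration only (a sketch, not taken from this source):
+/// \code
+///   memcpy(b <- a)   ; MDep
+///   memcpy(c <- b)   ; M
+/// \endcode
+/// becomes
+/// \code
+///   memcpy(b <- a)   ; now possibly dead, left for DSE to remove
+///   memcpy(c <- a)
+/// \endcode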
+bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
+ MemCpyInst *MDep) {
+ // We can only transform memcpys where the dest of one is the source of the
+ // other.
+ if (M->getSource() != MDep->getDest() || MDep->isVolatile())
+ return false;
+
+ // If the dep instruction is reading from our current input, then it is a noop
+ // transfer and substituting the input won't change this instruction. Just
+ // ignore the input and let someone else zap MDep. This handles cases like:
+ // memcpy(a <- a)
+ // memcpy(b <- a)
+ if (M->getSource() == MDep->getSource())
+ return false;
+
+ // Second, the lengths of the memcpys must be the same, or the preceding one
+ // must be larger than the following one.
+ ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
+ ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
+ if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
+ return false;
+
+ AliasAnalysis &AA = LookupAliasAnalysis();
+
+ // Verify that the copied-from memory doesn't change in between the two
+ // transfers. For example, in:
+ // memcpy(a <- b)
+ // *b = 42;
+ // memcpy(c <- a)
+ // It would be invalid to transform the second memcpy into memcpy(c <- b).
+ //
+ // TODO: If the code between M and MDep is transparent to the destination "c",
+ // then we could still perform the xform by moving M up to the first memcpy.
+ //
+ // NOTE: This is conservative, it will stop on any read from the source loc,
+ // not just the defining memcpy.
+ MemDepResult SourceDep =
+ MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
+ M->getIterator(), M->getParent());
+ if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
+ return false;
+
+ // If the dest of the second might alias the source of the first, then the
+ // source and dest might overlap. We still want to eliminate the intermediate
+ // value, but we have to generate a memmove instead of memcpy.
+ bool UseMemMove = false;
+ if (!AA.isNoAlias(MemoryLocation::getForDest(M),
+ MemoryLocation::getForSource(MDep)))
+ UseMemMove = true;
+
+ // If all checks passed, then we can transform M.
+
+ // Make sure to use the lesser of the alignment of the source and the dest
+ // since we're changing where we're reading from, but don't want to increase
+ // the alignment past what can be read from or written to.
+ // TODO: Is this worth it if we're creating a less aligned memcpy? For
+ // example we could be moving from movaps -> movq on x86.
+ unsigned Align = std::min(MDep->getAlignment(), M->getAlignment());
+
+ IRBuilder<> Builder(M);
+ if (UseMemMove)
+ Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(),
+ Align, M->isVolatile());
+ else
+ Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(),
+ Align, M->isVolatile());
+
+ // Remove the instruction we're replacing.
+ MD->removeInstruction(M);
+ M->eraseFromParent();
+ ++NumMemCpyInstr;
+ return true;
+}
+
+/// We've found that the (upward scanning) memory dependence of \p MemCpy is
+/// \p MemSet. Try to simplify \p MemSet to only set the trailing bytes that
+/// weren't copied over by \p MemCpy.
+///
+/// In other words, transform:
+/// \code
+/// memset(dst, c, dst_size);
+/// memcpy(dst, src, src_size);
+/// \endcode
+/// into:
+/// \code
+/// memcpy(dst, src, src_size);
+/// memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
+/// \endcode
+bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
+ MemSetInst *MemSet) {
+ // We can only transform memset/memcpy with the same destination.
+ if (MemSet->getDest() != MemCpy->getDest())
+ return false;
+
+ // Check that there are no other dependencies on the memset destination.
+ MemDepResult DstDepInfo =
+ MD->getPointerDependencyFrom(MemoryLocation::getForDest(MemSet), false,
+ MemCpy->getIterator(), MemCpy->getParent());
+ if (DstDepInfo.getInst() != MemSet)
+ return false;
+
+ // Use the same i8* dest as the memcpy, killing the memset dest if different.
+ Value *Dest = MemCpy->getRawDest();
+ Value *DestSize = MemSet->getLength();
+ Value *SrcSize = MemCpy->getLength();
+
+ // By default, create an unaligned memset.
+ unsigned Align = 1;
+ // If Dest is aligned, and SrcSize is constant, use the minimum alignment
+ // of the new memset destination (Dest + SrcSize).
+ const unsigned DestAlign =
+ std::max(MemSet->getAlignment(), MemCpy->getAlignment());
+ if (DestAlign > 1)
+ if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
+ Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
+
+ IRBuilder<> Builder(MemCpy);
+
+ // If the sizes have different types, zext the smaller one.
+ if (DestSize->getType() != SrcSize->getType()) {
+ if (DestSize->getType()->getIntegerBitWidth() >
+ SrcSize->getType()->getIntegerBitWidth())
+ SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType());
+ else
+ DestSize = Builder.CreateZExt(DestSize, SrcSize->getType());
+ }
+
+ Value *Ule = Builder.CreateICmpULE(DestSize, SrcSize);
+ Value *SizeDiff = Builder.CreateSub(DestSize, SrcSize);
+ Value *MemsetLen = Builder.CreateSelect(
+ Ule, ConstantInt::getNullValue(DestSize->getType()), SizeDiff);
+ Builder.CreateMemSet(Builder.CreateGEP(Dest, SrcSize), MemSet->getOperand(1),
+ MemsetLen, Align);
+
+ MD->removeInstruction(MemSet);
+ MemSet->eraseFromParent();
+ return true;
+}
+
+/// Transform memcpy to memset when its source was just memset.
+/// In other words, turn:
+/// \code
+/// memset(dst1, c, dst1_size);
+/// memcpy(dst2, dst1, dst2_size);
+/// \endcode
+/// into:
+/// \code
+/// memset(dst1, c, dst1_size);
+/// memset(dst2, c, dst2_size);
+/// \endcode
+/// When dst2_size <= dst1_size.
+///
+/// The \p MemCpy must have a Constant length.
+bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
+ MemSetInst *MemSet) {
+ AliasAnalysis &AA = LookupAliasAnalysis();
+
+ // Make sure we have the pattern memcpy(..., memset(...), ...), i.e. that we
+ // are memcpying from the same address that was memset. Otherwise it is hard
+ // to reason about.
+ if (!AA.isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
+ return false;
+
+ ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
+ ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
+ // Make sure the memcpy doesn't read any more than what the memset wrote.
+ // Don't worry about sizes larger than i64.
+ if (!MemSetSize || CopySize->getZExtValue() > MemSetSize->getZExtValue())
+ return false;
+
+ IRBuilder<> Builder(MemCpy);
+ Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
+ CopySize, MemCpy->getAlignment());
+ return true;
+}
+
+/// Perform simplification of memcpy's. If we have memcpy A
+/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
+/// B to be a memcpy from X to Z (or potentially a memmove, depending on
+/// circumstances). This allows later passes to remove the first memcpy
+/// altogether.
+bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
+ // We can only optimize non-volatile memcpy's.
+ if (M->isVolatile()) return false;
+
+ // If the source and destination of the memcpy are the same, then zap it.
+ if (M->getSource() == M->getDest()) {
+ MD->removeInstruction(M);
+ M->eraseFromParent();
+ return false;
+ }
+
+ // If copying from a constant, try to turn the memcpy into a memset.
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer())
+ if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
+ IRBuilder<> Builder(M);
+ Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
+ M->getAlignment(), false);
+ MD->removeInstruction(M);
+ M->eraseFromParent();
+ ++NumCpyToSet;
+ return true;
+ }
+
+ MemDepResult DepInfo = MD->getDependency(M);
+
+ // Try to turn a partially redundant memset + memcpy into
+ // memcpy + smaller memset. We don't need the memcpy size for this.
+ if (DepInfo.isClobber())
+ if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
+ if (processMemSetMemCpyDependence(M, MDep))
+ return true;
+
+ // The optimizations after this point require the memcpy size.
+ ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
+ if (!CopySize) return false;
+
+ // There are four possible optimizations we can do for memcpy:
+ // a) memcpy-memcpy xform which exposes redundancy for DSE.
+ // b) call-memcpy xform for return slot optimization.
+ // c) memcpy from freshly alloca'd space or space that has just started its
+ // lifetime copies undefined data, and we can therefore eliminate the
+ // memcpy in favor of the data that was already at the destination.
+ // d) memcpy from a just-memset'd source can be turned into memset.
+ if (DepInfo.isClobber()) {
+ if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
+ if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
+ CopySize->getZExtValue(), M->getAlignment(),
+ C)) {
+ MD->removeInstruction(M);
+ M->eraseFromParent();
+ return true;
+ }
+ }
+ }
+
+ MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
+ MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
+ SrcLoc, true, M->getIterator(), M->getParent());
+
+ if (SrcDepInfo.isClobber()) {
+ if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
+ return processMemCpyMemCpyDependence(M, MDep);
+ } else if (SrcDepInfo.isDef()) {
+ Instruction *I = SrcDepInfo.getInst();
+ bool hasUndefContents = false;
+
+ if (isa<AllocaInst>(I)) {
+ hasUndefContents = true;
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
+ if (LTSize->getZExtValue() >= CopySize->getZExtValue())
+ hasUndefContents = true;
+ }
+
+ if (hasUndefContents) {
+ MD->removeInstruction(M);
+ M->eraseFromParent();
+ ++NumMemCpyInstr;
+ return true;
+ }
+ }
+
+ if (SrcDepInfo.isClobber())
+ if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
+ if (performMemCpyToMemSetOptzn(M, MDep)) {
+ MD->removeInstruction(M);
+ M->eraseFromParent();
+ ++NumCpyToSet;
+ return true;
+ }
+
+ return false;
+}
+
+/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
+/// not to alias.
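+///
+/// For illustration only (a sketch, not taken from this source): when alias
+/// analysis proves that dst and src cannot overlap,
+/// \code
+///   memmove(dst <- src)
+/// \endcode
+/// can be rewritten in place as
+/// \code
+///   memcpy(dst <- src)
+/// \endcode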
+bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
+ AliasAnalysis &AA = LookupAliasAnalysis();
+
+ if (!TLI->has(LibFunc_memmove))
+ return false;
+
+ // See if the pointers alias.
+ if (!AA.isNoAlias(MemoryLocation::getForDest(M),
+ MemoryLocation::getForSource(M)))
+ return false;
+
+ DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
+ << "\n");
+
+ // If not, then we know we can transform this.
+ Type *ArgTys[3] = { M->getRawDest()->getType(),
+ M->getRawSource()->getType(),
+ M->getLength()->getType() };
+ M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
+ Intrinsic::memcpy, ArgTys));
+
+ // MemDep may have overly conservative information about this instruction;
+ // just conservatively flush it from the cache.
+ MD->removeInstruction(M);
+
+ ++NumMoveToCpy;
+ return true;
+}
+
+/// This is called on every byval argument in call sites.
+bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
+ const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout();
+ // Find out what feeds this byval argument.
+ Value *ByValArg = CS.getArgument(ArgNo);
+ Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
+ uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
+ MemDepResult DepInfo = MD->getPointerDependencyFrom(
+ MemoryLocation(ByValArg, ByValSize), true,
+ CS.getInstruction()->getIterator(), CS.getInstruction()->getParent());
+ if (!DepInfo.isClobber())
+ return false;
+
+ // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by
+ // a memcpy, see if we can use the source of the memcpy as the byval argument
+ // instead of the result.
+ MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
+ if (!MDep || MDep->isVolatile() ||
+ ByValArg->stripPointerCasts() != MDep->getDest())
+ return false;
+
+ // The length of the memcpy must be larger than or equal to the size of the byval.
+ ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
+ if (!C1 || C1->getValue().getZExtValue() < ByValSize)
+ return false;
+
+ // Get the alignment of the byval. If the call doesn't specify the alignment,
+ // then it is some target-specific value that we can't know.
+ unsigned ByValAlign = CS.getParamAlignment(ArgNo);
+ if (ByValAlign == 0) return false;
+
+ // If it is greater than the memcpy's alignment, then we check to see if we
+ // can force the source of the memcpy to the alignment we need. If we fail,
+ // we bail out.
+ AssumptionCache &AC = LookupAssumptionCache();
+ DominatorTree &DT = LookupDomTree();
+ if (MDep->getAlignment() < ByValAlign &&
+ getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL,
+ CS.getInstruction(), &AC, &DT) < ByValAlign)
+ return false;
+
+ // The address space of the memcpy source must match the byval argument's.
+ if (MDep->getSource()->getType()->getPointerAddressSpace() !=
+ ByValArg->getType()->getPointerAddressSpace())
+ return false;
+
+ // Verify that the copied-from memory doesn't change in between the memcpy and
+ // the byval call.
+ // memcpy(a <- b)
+ // *b = 42;
+ // foo(*a)
+ // It would be invalid to transform the call into foo(*b).
+ //
+ // NOTE: This is conservative, it will stop on any read from the source loc,
+ // not just the defining memcpy.
+ MemDepResult SourceDep = MD->getPointerDependencyFrom(
+ MemoryLocation::getForSource(MDep), false,
+ CS.getInstruction()->getIterator(), MDep->getParent());
+ if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
+ return false;
+
+ Value *TmpCast = MDep->getSource();
+ if (MDep->getSource()->getType() != ByValArg->getType())
+ TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
+ "tmpcast", CS.getInstruction());
+
+ DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
+ << " " << *MDep << "\n"
+ << " " << *CS.getInstruction() << "\n");
+
+ // Otherwise we're good! Update the byval argument.
+ CS.setArgument(ArgNo, TmpCast);
+ ++NumMemCpyInstr;
+ return true;
+}
+
+/// Executes one iteration of MemCpyOptPass.
+bool MemCpyOptPass::iterateOnFunction(Function &F) {
+ bool MadeChange = false;
+
+ // Walk all instructions in the function.
+ for (BasicBlock &BB : F) {
+ for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
+ // Avoid invalidating the iterator.
+ Instruction *I = &*BI++;
+
+ bool RepeatInstruction = false;
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ MadeChange |= processStore(SI, BI);
+ else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
+ RepeatInstruction = processMemSet(M, BI);
+ else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
+ RepeatInstruction = processMemCpy(M);
+ else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
+ RepeatInstruction = processMemMove(M);
+ else if (auto CS = CallSite(I)) {
+ for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
+ if (CS.isByValArgument(i))
+ MadeChange |= processByValArgument(CS, i);
+ }
+
+ // Reprocess the instruction if desired.
+ if (RepeatInstruction) {
+ if (BI != BB.begin())
+ --BI;
+ MadeChange = true;
+ }
+ }
+ }
+
+ return MadeChange;
+}
+
+PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &MD = AM.getResult<MemoryDependenceAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+ auto LookupAliasAnalysis = [&]() -> AliasAnalysis & {
+ return AM.getResult<AAManager>(F);
+ };
+ auto LookupAssumptionCache = [&]() -> AssumptionCache & {
+ return AM.getResult<AssumptionAnalysis>(F);
+ };
+ auto LookupDomTree = [&]() -> DominatorTree & {
+ return AM.getResult<DominatorTreeAnalysis>(F);
+ };
+
+ bool MadeChange = runImpl(F, &MD, &TLI, LookupAliasAnalysis,
+ LookupAssumptionCache, LookupDomTree);
+ if (!MadeChange)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<MemoryDependenceAnalysis>();
+ return PA;
+}
+
+bool MemCpyOptPass::runImpl(
+ Function &F, MemoryDependenceResults *MD_, TargetLibraryInfo *TLI_,
+ std::function<AliasAnalysis &()> LookupAliasAnalysis_,
+ std::function<AssumptionCache &()> LookupAssumptionCache_,
+ std::function<DominatorTree &()> LookupDomTree_) {
+ bool MadeChange = false;
+ MD = MD_;
+ TLI = TLI_;
+ LookupAliasAnalysis = std::move(LookupAliasAnalysis_);
+ LookupAssumptionCache = std::move(LookupAssumptionCache_);
+ LookupDomTree = std::move(LookupDomTree_);
+
+ // If we don't have at least memset and memcpy, there is little point in doing
+ // anything here. These are required by a freestanding implementation, so if
+ // even they are disabled, there is no point in trying hard.
+ if (!TLI->has(LibFunc_memset) || !TLI->has(LibFunc_memcpy))
+ return false;
+
+ while (true) {
+ if (!iterateOnFunction(F))
+ break;
+ MadeChange = true;
+ }
+
+ MD = nullptr;
+ return MadeChange;
+}
+
+/// This is the main transformation entry point for a function.
+bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ auto LookupAliasAnalysis = [this]() -> AliasAnalysis & {
+ return getAnalysis<AAResultsWrapperPass>().getAAResults();
+ };
+ auto LookupAssumptionCache = [this, &F]() -> AssumptionCache & {
+ return getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ };
+ auto LookupDomTree = [this]() -> DominatorTree & {
+ return getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ };
+
+ return Impl.runImpl(F, MD, TLI, LookupAliasAnalysis, LookupAssumptionCache,
+ LookupDomTree);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp
new file mode 100644
index 000000000000..9869a3fb96fa
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp
@@ -0,0 +1,650 @@
+//===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass turns chains of integer comparisons into memcmp (the memcmp is
+// later typically inlined as a chain of efficient hardware comparisons). This
+// typically benefits C++ member or non-member operator==(); see the sketch
+// below.
+//
+// The basic idea is to replace a larger chain of integer comparisons loaded
+// from contiguous memory locations with a smaller chain of such integer
+// comparisons. The benefits are twofold:
+//  - There are fewer jumps, and therefore fewer opportunities for
+//    mispredictions and I-cache misses.
+//  - Code size is smaller, both because jumps are removed and because the
+//    encoding of a 2*n byte compare is smaller than that of two n-byte
+//    compares.
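+//
+// For illustration only (a sketch of the kind of source that benefits, not
+// taken from this file):
+//
+//   struct S { int a; int b; int c; };
+//   bool operator==(const S &lhs, const S &rhs) {
+//     // Three contiguous 4-byte compares can become a single 12-byte memcmp
+//     // once this pass (and later memcmp expansion) has run.
+//     return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c;
+//   }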
+
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <numeric>
+#include <utility>
+#include <vector>
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+
+using namespace llvm;
+
+namespace {
+
+#define DEBUG_TYPE "mergeicmps"
+
+// A BCE atom.
+struct BCEAtom {
+ BCEAtom() : GEP(nullptr), LoadI(nullptr), Offset() {}
+
+ const Value *Base() const { return GEP ? GEP->getPointerOperand() : nullptr; }
+
+ bool operator<(const BCEAtom &O) const {
+ assert(Base() && "invalid atom");
+ assert(O.Base() && "invalid atom");
+ // Ordering by (Base(), Offset) alone would be sufficient; however, that
+ // ordering would depend on the addresses of the base values, which are not
+ // reproducible from run to run. To guarantee stability, we use the names of
+ // the values if they exist; we sort by:
+ // (Base.getName(), Base(), Offset).
+ const int NameCmp = Base()->getName().compare(O.Base()->getName());
+ if (NameCmp == 0) {
+ if (Base() == O.Base()) {
+ return Offset.slt(O.Offset);
+ }
+ return Base() < O.Base();
+ }
+ return NameCmp < 0;
+ }
+
+ GetElementPtrInst *GEP;
+ LoadInst *LoadI;
+ APInt Offset;
+};
+
+// If this value is a load from a constant offset w.r.t. a base address, and
+// there are no other users of the load or address, returns the base address and
+// the offset.
+BCEAtom visitICmpLoadOperand(Value *const Val) {
+ BCEAtom Result;
+ if (auto *const LoadI = dyn_cast<LoadInst>(Val)) {
+ DEBUG(dbgs() << "load\n");
+ if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
+ DEBUG(dbgs() << "used outside of block\n");
+ return {};
+ }
+ if (LoadI->isVolatile()) {
+ DEBUG(dbgs() << "volatile\n");
+ return {};
+ }
+ Value *const Addr = LoadI->getOperand(0);
+ if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) {
+ DEBUG(dbgs() << "GEP\n");
+ if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
+ DEBUG(dbgs() << "used outside of block\n");
+ return {};
+ }
+ const auto &DL = GEP->getModule()->getDataLayout();
+ if (!isDereferenceablePointer(GEP, DL)) {
+ DEBUG(dbgs() << "not dereferenceable\n");
+ // We need to make sure that we can do the comparisons in any order, so we
+ // require the memory to be unconditionally dereferenceable.
+ return {};
+ }
+ Result.Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
+ if (GEP->accumulateConstantOffset(DL, Result.Offset)) {
+ Result.GEP = GEP;
+ Result.LoadI = LoadI;
+ }
+ }
+ }
+ return Result;
+}
+
+// A basic block with a comparison between two BCE atoms.
+// Note: the terminology is misleading: the comparison is symmetric, so there
+// is no real {l/r}hs. What we want though is to have the same base on the
+// left (resp. right), so that we can detect consecutive loads. To ensure this
+// we put the smallest atom on the left.
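+//
+// For illustration only (not from this source): in a chain comparing two
+// structs %a and %b field by field, the first block compares the atoms
+// (%a, offset 0) and (%b, offset 0) and the next compares (%a, offset 4) and
+// (%b, offset 4). Keeping the %a-based atom on the same side in every block
+// is what lets IsContiguous() recognize the consecutive loads later on.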
+class BCECmpBlock {
+ public:
+ BCECmpBlock() {}
+
+ BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits)
+ : Lhs_(L), Rhs_(R), SizeBits_(SizeBits) {
+ if (Rhs_ < Lhs_) std::swap(Rhs_, Lhs_);
+ }
+
+ bool IsValid() const {
+ return Lhs_.Base() != nullptr && Rhs_.Base() != nullptr;
+ }
+
+ // Assert that the block is consistent: if valid, it should also have
+ // non-null members besides Lhs_ and Rhs_.
+ void AssertConsistent() const {
+ if (IsValid()) {
+ assert(BB);
+ assert(CmpI);
+ assert(BranchI);
+ }
+ }
+
+ const BCEAtom &Lhs() const { return Lhs_; }
+ const BCEAtom &Rhs() const { return Rhs_; }
+ int SizeBits() const { return SizeBits_; }
+
+ // Returns true if the block does other work besides the comparison.
+ bool doesOtherWork() const;
+
+ // The basic block where this comparison happens.
+ BasicBlock *BB = nullptr;
+ // The ICMP for this comparison.
+ ICmpInst *CmpI = nullptr;
+ // The terminating branch.
+ BranchInst *BranchI = nullptr;
+
+ private:
+ BCEAtom Lhs_;
+ BCEAtom Rhs_;
+ int SizeBits_ = 0;
+};
+
+bool BCECmpBlock::doesOtherWork() const {
+ AssertConsistent();
+ // TODO(courbet): Can we allow some other things? This is very conservative.
+ // We might be able to get away with anything that does not have any side
+ // effects outside of the basic block.
+ // Note: The GEPs and/or loads are not necessarily in the same block.
+ for (const Instruction &Inst : *BB) {
+ if (const auto *const GEP = dyn_cast<GetElementPtrInst>(&Inst)) {
+ if (!(Lhs_.GEP == GEP || Rhs_.GEP == GEP)) return true;
+ } else if (const auto *const L = dyn_cast<LoadInst>(&Inst)) {
+ if (!(Lhs_.LoadI == L || Rhs_.LoadI == L)) return true;
+ } else if (const auto *const C = dyn_cast<ICmpInst>(&Inst)) {
+ if (C != CmpI) return true;
+ } else if (const auto *const Br = dyn_cast<BranchInst>(&Inst)) {
+ if (Br != BranchI) return true;
+ } else {
+ return true;
+ }
+ }
+ return false;
+}
+
+// Visit the given comparison. If this is a comparison between two valid
+// BCE atoms, returns the comparison.
+BCECmpBlock visitICmp(const ICmpInst *const CmpI,
+ const ICmpInst::Predicate ExpectedPredicate) {
+ if (CmpI->getPredicate() == ExpectedPredicate) {
+ DEBUG(dbgs() << "cmp "
+ << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
+ << "\n");
+ auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0));
+ if (!Lhs.Base()) return {};
+ auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1));
+ if (!Rhs.Base()) return {};
+ return BCECmpBlock(std::move(Lhs), std::move(Rhs),
+ CmpI->getOperand(0)->getType()->getScalarSizeInBits());
+ }
+ return {};
+}
+
+// Visit the given comparison block. If this is a comparison between two valid
+// BCE atoms, returns the comparison.
+BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
+ const BasicBlock *const PhiBlock) {
+ if (Block->empty()) return {};
+ auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
+ if (!BranchI) return {};
+ DEBUG(dbgs() << "branch\n");
+ if (BranchI->isUnconditional()) {
+ // In this case, we expect an incoming value which is the result of the
+ // comparison. This is the last link in the chain of comparisons (note
+ // that this does not mean that this is the last incoming value, blocks
+ // can be reordered).
+ auto *const CmpI = dyn_cast<ICmpInst>(Val);
+ if (!CmpI) return {};
+ DEBUG(dbgs() << "icmp\n");
+ auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ);
+ Result.CmpI = CmpI;
+ Result.BranchI = BranchI;
+ return Result;
+ } else {
+ // In this case, we expect a constant incoming value (the comparison is
+ // chained).
+ const auto *const Const = dyn_cast<ConstantInt>(Val);
+ DEBUG(dbgs() << "const\n");
+ if (!Const->isZero()) return {};
+ DEBUG(dbgs() << "false\n");
+ auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
+ if (!CmpI) return {};
+ DEBUG(dbgs() << "icmp\n");
+ assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
+ BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
+ auto Result = visitICmp(
+ CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE);
+ Result.CmpI = CmpI;
+ Result.BranchI = BranchI;
+ return Result;
+ }
+ return {};
+}
+
+// A chain of comparisons.
+class BCECmpChain {
+ public:
+ BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi);
+
+ int size() const { return Comparisons_.size(); }
+
+#ifdef MERGEICMPS_DOT_ON
+ void dump() const;
+#endif // MERGEICMPS_DOT_ON
+
+ bool simplify(const TargetLibraryInfo *const TLI);
+
+ private:
+ static bool IsContiguous(const BCECmpBlock &First,
+ const BCECmpBlock &Second) {
+ return First.Lhs().Base() == Second.Lhs().Base() &&
+ First.Rhs().Base() == Second.Rhs().Base() &&
+ First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
+ First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
+ }
+
+ // Merges the given comparison blocks into one memcmp block and updates
+ // branches. Comparisons are assumed to be contiguous. If NextBBInChain is
+ // null, the merged block will link to the phi block.
+ static void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+ BasicBlock *const NextBBInChain, PHINode &Phi,
+ const TargetLibraryInfo *const TLI);
+
+ PHINode &Phi_;
+ std::vector<BCECmpBlock> Comparisons_;
+ // The original entry block (before sorting).
+ BasicBlock *EntryBlock_;
+};
+
+BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
+ : Phi_(Phi) {
+ // Now look inside blocks to check for BCE comparisons.
+ std::vector<BCECmpBlock> Comparisons;
+ for (BasicBlock *Block : Blocks) {
+ BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
+ Block, Phi.getParent());
+ Comparison.BB = Block;
+ if (!Comparison.IsValid()) {
+ DEBUG(dbgs() << "skip: not a valid BCECmpBlock\n");
+ return;
+ }
+ if (Comparison.doesOtherWork()) {
+ DEBUG(dbgs() << "block does extra work besides compare\n");
+ if (Comparisons.empty()) { // First block.
+ // TODO(courbet): The first block can do other things, and we should
+ // split them apart in a separate block before the comparison chain.
+ // Right now we just discard it and make the chain shorter.
+ DEBUG(dbgs()
+ << "ignoring first block that does extra work besides compare\n");
+ continue;
+ }
+ // TODO(courbet): Right now we abort the whole chain. We could be
+ // merging only the blocks that don't do other work and resume the
+ // chain from there. For example:
+ // if (a[0] == b[0]) { // bb1
+ // if (a[1] == b[1]) { // bb2
+ // some_value = 3; //bb3
+ // if (a[2] == b[2]) { //bb3
+ // do a ton of stuff //bb4
+ // }
+ // }
+ // }
+ //
+ // This is:
+ //
+ // bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+
+ // \ \ \ \
+ // ne ne ne \
+ // \ \ \ v
+ // +------------+-----------+----------> bb_phi
+ //
+ // We can only merge the first two comparisons, because bb3* does
+ // "other work" (setting some_value to 3).
+ // We could still merge bb1 and bb2 though.
+ return;
+ }
+ DEBUG(dbgs() << "*Found cmp of " << Comparison.SizeBits()
+ << " bits between " << Comparison.Lhs().Base() << " + "
+ << Comparison.Lhs().Offset << " and "
+ << Comparison.Rhs().Base() << " + " << Comparison.Rhs().Offset
+ << "\n");
+ DEBUG(dbgs() << "\n");
+ Comparisons.push_back(Comparison);
+ }
+ EntryBlock_ = Comparisons[0].BB;
+ Comparisons_ = std::move(Comparisons);
+#ifdef MERGEICMPS_DOT_ON
+ errs() << "BEFORE REORDERING:\n\n";
+ dump();
+#endif // MERGEICMPS_DOT_ON
+ // Reorder blocks by LHS. We can do that without changing the
+ // semantics because we are only accessing dereferenceable memory.
+ std::sort(Comparisons_.begin(), Comparisons_.end(),
+ [](const BCECmpBlock &a, const BCECmpBlock &b) {
+ return a.Lhs() < b.Lhs();
+ });
+#ifdef MERGEICMPS_DOT_ON
+ errs() << "AFTER REORDERING:\n\n";
+ dump();
+#endif // MERGEICMPS_DOT_ON
+}
+
+#ifdef MERGEICMPS_DOT_ON
+void BCECmpChain::dump() const {
+ errs() << "digraph dag {\n";
+ errs() << " graph [bgcolor=transparent];\n";
+ errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n";
+ errs() << " edge [color=black];\n";
+ for (size_t I = 0; I < Comparisons_.size(); ++I) {
+ const auto &Comparison = Comparisons_[I];
+ errs() << " \"" << I << "\" [label=\"%"
+ << Comparison.Lhs().Base()->getName() << " + "
+ << Comparison.Lhs().Offset << " == %"
+ << Comparison.Rhs().Base()->getName() << " + "
+ << Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8)
+ << " bytes)\"];\n";
+ const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB);
+ if (I > 0) errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n";
+ errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n";
+ }
+ errs() << " \"Phi\" [label=\"Phi\"];\n";
+ errs() << "}\n\n";
+}
+#endif // MERGEICMPS_DOT_ON
+
+bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) {
+ // First pass to check if there is at least one merge. If not, we don't do
+ // anything and we keep analysis passes intact.
+ {
+ bool AtLeastOneMerged = false;
+ for (size_t I = 1; I < Comparisons_.size(); ++I) {
+ if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) {
+ AtLeastOneMerged = true;
+ break;
+ }
+ }
+ if (!AtLeastOneMerged) return false;
+ }
+
+ // Remove phi references to comparison blocks; they will be rebuilt as we
+ // merge the blocks.
+ for (const auto &Comparison : Comparisons_) {
+ Phi_.removeIncomingValue(Comparison.BB, false);
+ }
+
+ // Point the predecessors of the chain to the first comparison block (which is
+ // the new entry point).
+ if (EntryBlock_ != Comparisons_[0].BB)
+ EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB);
+
+ // Effectively merge blocks.
+ int NumMerged = 1;
+ for (size_t I = 1; I < Comparisons_.size(); ++I) {
+ if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) {
+ ++NumMerged;
+ } else {
+ // Merge all previous comparisons and start a new merge block.
+ mergeComparisons(
+ makeArrayRef(Comparisons_).slice(I - NumMerged, NumMerged),
+ Comparisons_[I].BB, Phi_, TLI);
+ NumMerged = 1;
+ }
+ }
+ mergeComparisons(makeArrayRef(Comparisons_)
+ .slice(Comparisons_.size() - NumMerged, NumMerged),
+ nullptr, Phi_, TLI);
+
+ return true;
+}
+
+void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+ BasicBlock *const NextBBInChain,
+ PHINode &Phi,
+ const TargetLibraryInfo *const TLI) {
+ assert(!Comparisons.empty());
+ const auto &FirstComparison = *Comparisons.begin();
+ BasicBlock *const BB = FirstComparison.BB;
+ LLVMContext &Context = BB->getContext();
+
+ if (Comparisons.size() >= 2) {
+ DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
+ const auto TotalSize =
+ std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
+ [](int Size, const BCECmpBlock &C) {
+ return Size + C.SizeBits();
+ }) /
+ 8;
+
+ // Incoming edges do not need to be updated, and both GEPs are already
+ // computing the right address; we just need to:
+ // - replace the two loads and the icmp with the memcmp
+ // - update the branch
+ // - update the incoming values in the phi.
+ FirstComparison.BranchI->eraseFromParent();
+ FirstComparison.CmpI->eraseFromParent();
+ FirstComparison.Lhs().LoadI->eraseFromParent();
+ FirstComparison.Rhs().LoadI->eraseFromParent();
+
+ IRBuilder<> Builder(BB);
+ const auto &DL = Phi.getModule()->getDataLayout();
+ Value *const MemCmpCall = emitMemCmp(
+ FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP, ConstantInt::get(DL.getIntPtrType(Context), TotalSize),
+ Builder, DL, TLI);
+ Value *const MemCmpIsZero = Builder.CreateICmpEQ(
+ MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));
+
+ // Add a branch to the next basic block in the chain.
+ if (NextBBInChain) {
+ Builder.CreateCondBr(MemCmpIsZero, NextBBInChain, Phi.getParent());
+ Phi.addIncoming(ConstantInt::getFalse(Context), BB);
+ } else {
+ Builder.CreateBr(Phi.getParent());
+ Phi.addIncoming(MemCmpIsZero, BB);
+ }
+
+ // Delete merged blocks.
+ for (size_t I = 1; I < Comparisons.size(); ++I) {
+ BasicBlock *CBB = Comparisons[I].BB;
+ CBB->replaceAllUsesWith(BB);
+ CBB->eraseFromParent();
+ }
+ } else {
+ assert(Comparisons.size() == 1);
+ // There are no blocks to merge, but we still need to update the branches.
+ DEBUG(dbgs() << "Only one comparison, updating branches\n");
+ if (NextBBInChain) {
+ if (FirstComparison.BranchI->isConditional()) {
+ DEBUG(dbgs() << "conditional -> conditional\n");
+ // Just update the "true" target; the "false" target should already be
+ // the phi block.
+ assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent());
+ FirstComparison.BranchI->setSuccessor(0, NextBBInChain);
+ Phi.addIncoming(ConstantInt::getFalse(Context), BB);
+ } else {
+ DEBUG(dbgs() << "unconditional -> conditional\n");
+ // Replace the unconditional branch by a conditional one.
+ FirstComparison.BranchI->eraseFromParent();
+ IRBuilder<> Builder(BB);
+ Builder.CreateCondBr(FirstComparison.CmpI, NextBBInChain,
+ Phi.getParent());
+ Phi.addIncoming(FirstComparison.CmpI, BB);
+ }
+ } else {
+ if (FirstComparison.BranchI->isConditional()) {
+ DEBUG(dbgs() << "conditional -> unconditional\n");
+ // Replace the conditional branch by an unconditional one.
+ FirstComparison.BranchI->eraseFromParent();
+ IRBuilder<> Builder(BB);
+ Builder.CreateBr(Phi.getParent());
+ Phi.addIncoming(FirstComparison.CmpI, BB);
+ } else {
+ DEBUG(dbgs() << "unconditional -> unconditional\n");
+ Phi.addIncoming(FirstComparison.CmpI, BB);
+ }
+ }
+ }
+}
+
+std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
+ BasicBlock *const LastBlock,
+ int NumBlocks) {
+ // Walk up from the last block to find other blocks.
+ std::vector<BasicBlock *> Blocks(NumBlocks);
+ BasicBlock *CurBlock = LastBlock;
+ for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
+ if (CurBlock->hasAddressTaken()) {
+ // Somebody is jumping to the block through an address; all bets are
+ // off.
+ DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " has its address taken\n");
+ return {};
+ }
+ Blocks[BlockIndex] = CurBlock;
+ auto *SinglePredecessor = CurBlock->getSinglePredecessor();
+ if (!SinglePredecessor) {
+ // The block has two or more predecessors.
+ DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " has two or more predecessors\n");
+ return {};
+ }
+ if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
+ // The block does not link back to the phi.
+ DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " does not link back to the phi\n");
+ return {};
+ }
+ CurBlock = SinglePredecessor;
+ }
+ Blocks[0] = CurBlock;
+ return Blocks;
+}
+
+bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
+ DEBUG(dbgs() << "processPhi()\n");
+ if (Phi.getNumIncomingValues() <= 1) {
+ DEBUG(dbgs() << "skip: only one incoming value in phi\n");
+ return false;
+ }
+ // We are looking for something that has the following structure:
+ // bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
+ // \ \ \ \
+ // ne ne ne \
+ // \ \ \ v
+ // +------------+-----------+----------> bb_phi
+ //
+ // - The last basic block (bb4 here) must branch unconditionally to bb_phi.
+ // It's the only block that contributes a non-constant value to the Phi.
+// - All other blocks (bb1, bb2, bb3) must have exactly two successors, one of
+// them being the phi block.
+ // - All intermediate blocks (bb2, bb3) must have only one predecessor.
+ // - Blocks cannot do other work besides the comparison, see doesOtherWork()
+
+ // The blocks are not necessarily ordered in the phi, so we start from the
+ // last block and reconstruct the order.
+ BasicBlock *LastBlock = nullptr;
+ for (unsigned I = 0; I < Phi.getNumIncomingValues(); ++I) {
+ if (isa<ConstantInt>(Phi.getIncomingValue(I))) continue;
+ if (LastBlock) {
+ // There are several non-constant values.
+ DEBUG(dbgs() << "skip: several non-constant values\n");
+ return false;
+ }
+ LastBlock = Phi.getIncomingBlock(I);
+ }
+ if (!LastBlock) {
+ // There is no non-constant block.
+ DEBUG(dbgs() << "skip: no non-constant block\n");
+ return false;
+ }
+ if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
+ DEBUG(dbgs() << "skip: last block non-phi successor\n");
+ return false;
+ }
+
+ const auto Blocks =
+ getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues());
+ if (Blocks.empty()) return false;
+ BCECmpChain CmpChain(Blocks, Phi);
+
+ if (CmpChain.size() < 2) {
+ DEBUG(dbgs() << "skip: only one compare block\n");
+ return false;
+ }
+
+ return CmpChain.simplify(TLI);
+}
+
+class MergeICmps : public FunctionPass {
+ public:
+ static char ID;
+
+ MergeICmps() : FunctionPass(ID) {
+ initializeMergeICmpsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F)) return false;
+ const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto PA = runImpl(F, &TLI, &TTI);
+ return !PA.areAllPreserved();
+ }
+
+ private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI);
+};
+
+PreservedAnalyses MergeICmps::runImpl(Function &F, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI) {
+ DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");
+
+ // We only try merging comparisons if the target wants to expand memcmp later.
+ // The rationale is to avoid turning small chains into memcmp calls.
+ if (!TTI->enableMemCmpExpansion(true)) return PreservedAnalyses::all();
+
+ bool MadeChange = false;
+
+ for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
+ // A Phi operation is always first in a basic block.
+ if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))
+ MadeChange |= processPhi(*Phi, TLI);
+ }
+
+ if (MadeChange) return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+} // namespace
+
+char MergeICmps::ID = 0;
+INITIALIZE_PASS_BEGIN(MergeICmps, "mergeicmps",
+ "Merge contiguous icmps into a memcmp", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(MergeICmps, "mergeicmps",
+ "Merge contiguous icmps into a memcmp", false, false)
+
+Pass *llvm::createMergeICmpsPass() { return new MergeICmps(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
new file mode 100644
index 000000000000..f2f615cb9b0f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -0,0 +1,429 @@
+//===- MergedLoadStoreMotion.cpp - merge and hoist/sink load/stores -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//! \file
+//! \brief This pass performs merges of loads and stores on both sides of a
+// diamond (hammock). It hoists the loads and sinks the stores.
+//
+// The algorithm iteratively hoists two loads to the same address out of a
+// diamond (hammock) and merges them into a single load in the header.
+// Similarly, it sinks and merges two stores to the tail block (footer). The
+// algorithm iterates over the instructions of one side of the diamond and
+// attempts to find a matching load/store on the other side. It hoists / sinks
+// when it thinks it is safe to do so. This optimization helps with, e.g.,
+// hiding load latencies, triggering if-conversion, and reducing static code
+// size.
+//
+// NOTE: This code no longer performs load hoisting, it is subsumed by GVNHoist.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// Example:
+// Diamond shaped code before merge:
+//
+// header:
+// br %cond, label %if.then, label %if.else
+// + +
+// + +
+// + +
+// if.then: if.else:
+// %lt = load %addr_l %le = load %addr_l
+// <use %lt> <use %le>
+// <...> <...>
+// store %st, %addr_s store %se, %addr_s
+// br label %if.end br label %if.end
+// + +
+// + +
+// + +
+// if.end ("footer"):
+// <...>
+//
+// Diamond shaped code after merge:
+//
+// header:
+// %l = load %addr_l
+// br %cond, label %if.then, label %if.else
+// + +
+// + +
+// + +
+// if.then: if.else:
+// <use %l> <use %l>
+// <...> <...>
+// br label %if.end br label %if.end
+// + +
+// + +
+// + +
+// if.end ("footer"):
+// %s.sink = phi [%st, if.then], [%se, if.else]
+// <...>
+// store %s.sink, %addr_s
+// <...>
+//
+//
+//===----------------------- TODO -----------------------------------------===//
+//
+// 1) Generalize to regions other than diamonds
+// 2) Be more aggressive merging memory operations
+// Note that both changes require register pressure control
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mldst-motion"
+
+namespace {
+//===----------------------------------------------------------------------===//
+// MergedLoadStoreMotion Pass
+//===----------------------------------------------------------------------===//
+class MergedLoadStoreMotion {
+ MemoryDependenceResults *MD = nullptr;
+ AliasAnalysis *AA = nullptr;
+
+ // The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
+ // where Size0 and Size1 are the #instructions on the two sides of
+ // the diamond. The constant chosen here is arbitrary. Compile-time control
+ // is enforced by the check Size0 * Size1 < MagicCompileTimeControl.
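+ //
+ // For example, with roughly 16 instructions on each side of the diamond,
+ // 16 * 16 = 256 already exceeds the limit and the matching scan is cut
+ // short, while two sides of about 15 instructions (225) are still fully
+ // scanned.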
+ const int MagicCompileTimeControl = 250;
+
+public:
+ bool run(Function &F, MemoryDependenceResults *MD, AliasAnalysis &AA);
+
+private:
+ ///
+ /// \brief Remove instruction from parent and update memory dependence
+ /// analysis.
+ ///
+ void removeInstruction(Instruction *Inst);
+ BasicBlock *getDiamondTail(BasicBlock *BB);
+ bool isDiamondHead(BasicBlock *BB);
+ // Routines for sinking stores
+ StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
+ PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
+ bool isStoreSinkBarrierInRange(const Instruction &Start,
+ const Instruction &End, MemoryLocation Loc);
+ bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst);
+ bool mergeStores(BasicBlock *BB);
+};
+} // end anonymous namespace
+
+///
+/// \brief Remove instruction from parent and update memory dependence analysis.
+///
+void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) {
+ // Notify the memory dependence analysis.
+ if (MD) {
+ MD->removeInstruction(Inst);
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ MD->invalidateCachedPointerInfo(LI->getPointerOperand());
+ if (Inst->getType()->isPtrOrPtrVectorTy()) {
+ MD->invalidateCachedPointerInfo(Inst);
+ }
+ }
+ Inst->eraseFromParent();
+}
+
+///
+/// \brief Return tail block of a diamond.
+///
+BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
+ assert(isDiamondHead(BB) && "Basic block is not head of a diamond");
+ return BB->getTerminator()->getSuccessor(0)->getSingleSuccessor();
+}
+
+///
+/// \brief True when BB is the head of a diamond (hammock)
+///
+bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
+ if (!BB)
+ return false;
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
+ return false;
+
+ BasicBlock *Succ0 = BI->getSuccessor(0);
+ BasicBlock *Succ1 = BI->getSuccessor(1);
+
+ if (!Succ0->getSinglePredecessor())
+ return false;
+ if (!Succ1->getSinglePredecessor())
+ return false;
+
+ BasicBlock *Succ0Succ = Succ0->getSingleSuccessor();
+ BasicBlock *Succ1Succ = Succ1->getSingleSuccessor();
+ // Ignore triangles.
+ if (!Succ0Succ || !Succ1Succ || Succ0Succ != Succ1Succ)
+ return false;
+ return true;
+}
+
+
+///
+/// \brief True when instruction is a sink barrier for a store
+/// located in Loc
+///
+/// Whenever an instruction could possibly read or modify the
+/// value being stored, or could prevent the store from
+/// happening (e.g. by throwing), it is considered a sink barrier.
+///
+bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start,
+ const Instruction &End,
+ MemoryLocation Loc) {
+ for (const Instruction &Inst :
+ make_range(Start.getIterator(), End.getIterator()))
+ if (Inst.mayThrow())
+ return true;
+ return AA->canInstructionRangeModRef(Start, End, Loc, ModRefInfo::ModRef);
+}
+
+///
+/// \brief Check if \p BB1 contains a store to the same address as \p Store0.
+///
+/// \return The store in \p BB1 when it is safe to sink. Otherwise return null.
+///
+StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
+ StoreInst *Store0) {
+ DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
+ BasicBlock *BB0 = Store0->getParent();
+ for (Instruction &Inst : reverse(*BB1)) {
+ auto *Store1 = dyn_cast<StoreInst>(&Inst);
+ if (!Store1)
+ continue;
+
+ MemoryLocation Loc0 = MemoryLocation::get(Store0);
+ MemoryLocation Loc1 = MemoryLocation::get(Store1);
+ if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) &&
+ !isStoreSinkBarrierInRange(*Store1->getNextNode(), BB1->back(), Loc1) &&
+ !isStoreSinkBarrierInRange(*Store0->getNextNode(), BB0->back(), Loc0)) {
+ return Store1;
+ }
+ }
+ return nullptr;
+}
+
+///
+/// \brief Create a PHI node in BB for the operands of S0 and S1
+///
+PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
+ StoreInst *S1) {
+ // Create a phi if the values mismatch.
+ Value *Opd1 = S0->getValueOperand();
+ Value *Opd2 = S1->getValueOperand();
+ if (Opd1 == Opd2)
+ return nullptr;
+
+ auto *NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink",
+ &BB->front());
+ NewPN->addIncoming(Opd1, S0->getParent());
+ NewPN->addIncoming(Opd2, S1->getParent());
+ if (MD && NewPN->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(NewPN);
+ return NewPN;
+}
+
+///
+/// \brief Merge two stores to same address and sink into \p BB
+///
+/// Also sinks the GEP instruction computing the store address.
+///
+bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
+ StoreInst *S1) {
+ // Only one definition?
+ auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
+ auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
+ if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
+ (A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
+ (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) {
+ DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
+ dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
+ dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
+ // Sink the store (and its address computation) to the join point.
+ BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
+ // Intersect optional metadata.
+ S0->andIRFlags(S1);
+ S0->dropUnknownNonDebugMetadata();
+
+ // Create the new store to be inserted at the join point.
+ StoreInst *SNew = cast<StoreInst>(S0->clone());
+ Instruction *ANew = A0->clone();
+ SNew->insertBefore(&*InsertPt);
+ ANew->insertBefore(SNew);
+
+ assert(S0->getParent() == A0->getParent());
+ assert(S1->getParent() == A1->getParent());
+
+ // New PHI operand? Use it.
+ if (PHINode *NewPN = getPHIOperand(BB, S0, S1))
+ SNew->setOperand(0, NewPN);
+ removeInstruction(S0);
+ removeInstruction(S1);
+ A0->replaceAllUsesWith(ANew);
+ removeInstruction(A0);
+ A1->replaceAllUsesWith(ANew);
+ removeInstruction(A1);
+ return true;
+ }
+ return false;
+}
+
+///
+/// \brief True when two stores are equivalent and can sink into the footer
+///
+/// Starting from a diamond tail block, iterate over the instructions in one
+/// predecessor block and try to match a store in the second predecessor.
+///
+bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
+
+ bool MergedStores = false;
+ assert(T && "Footer of a diamond cannot be empty");
+
+ pred_iterator PI = pred_begin(T), E = pred_end(T);
+ assert(PI != E);
+ BasicBlock *Pred0 = *PI;
+ ++PI;
+ BasicBlock *Pred1 = *PI;
+ ++PI;
+ // tail block of a diamond/hammock?
+ if (Pred0 == Pred1)
+ return false; // No.
+ if (PI != E)
+ return false; // No. More than 2 predecessors.
+
+ // #Instructions in Pred1 for compile-time control.
+ int Size1 = Pred1->size();
+ int NStores = 0;
+
+ for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend();
+ RBI != RBE;) {
+
+ Instruction *I = &*RBI;
+ ++RBI;
+
+ // Don't sink non-simple (atomic, volatile) stores.
+ auto *S0 = dyn_cast<StoreInst>(I);
+ if (!S0 || !S0->isSimple())
+ continue;
+
+ ++NStores;
+ if (NStores * Size1 >= MagicCompileTimeControl)
+ break;
+ if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) {
+ bool Res = sinkStore(T, S0, S1);
+ MergedStores |= Res;
+ // Don't attempt to sink below stores that had to stick around.
+ // But after removal of a store and some of its feeding
+ // instructions, search again from the beginning since the iterator
+ // is likely stale at this point.
+ if (!Res)
+ break;
+ RBI = Pred0->rbegin();
+ RBE = Pred0->rend();
+ DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
+ }
+ }
+ return MergedStores;
+}
+
+bool MergedLoadStoreMotion::run(Function &F, MemoryDependenceResults *MD,
+ AliasAnalysis &AA) {
+ this->MD = MD;
+ this->AA = &AA;
+
+ bool Changed = false;
+ DEBUG(dbgs() << "Instruction Merger\n");
+
+ // Merging memory operations across diamonds allows later passes to catch
+ // more optimization opportunities.
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) {
+ BasicBlock *BB = &*FI++;
+
+ // Hoist equivalent loads and sink stores
+ // outside diamonds when possible
+ if (isDiamondHead(BB)) {
+ Changed |= mergeStores(getDiamondTail(BB));
+ }
+ }
+ return Changed;
+}
+
+namespace {
+class MergedLoadStoreMotionLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ MergedLoadStoreMotionLegacyPass() : FunctionPass(ID) {
+ initializeMergedLoadStoreMotionLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ ///
+ /// \brief Run the transformation for each function
+ ///
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ MergedLoadStoreMotion Impl;
+ auto *MDWP = getAnalysisIfAvailable<MemoryDependenceWrapperPass>();
+ return Impl.run(F, MDWP ? &MDWP->getMemDep() : nullptr,
+ getAnalysis<AAResultsWrapperPass>().getAAResults());
+ }
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ }
+};
+
+char MergedLoadStoreMotionLegacyPass::ID = 0;
+} // anonymous namespace
+
+///
+/// \brief createMergedLoadStoreMotionPass - The public interface to this file.
+///
+FunctionPass *llvm::createMergedLoadStoreMotionPass() {
+ return new MergedLoadStoreMotionLegacyPass();
+}
+
+INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion",
+ "MergedLoadStoreMotion", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion",
+ "MergedLoadStoreMotion", false, false)
+
+PreservedAnalyses
+MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
+ MergedLoadStoreMotion Impl;
+ auto *MD = AM.getCachedResult<MemoryDependenceAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ if (!Impl.run(F, MD, AA))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<MemoryDependenceAnalysis>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
new file mode 100644
index 000000000000..b026c8d692c3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -0,0 +1,538 @@
+//===- NaryReassociate.cpp - Reassociate n-ary expressions ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates n-ary add expressions and eliminates the redundancy
+// exposed by the reassociation.
+//
+// A motivating example:
+//
+// void foo(int a, int b) {
+// bar(a + b);
+// bar((a + 2) + b);
+// }
+//
+// An ideal compiler should reassociate (a + 2) + b to (a + b) + 2 and simplify
+// the above code to
+//
+// int t = a + b;
+// bar(t);
+// bar(t + 2);
+//
+// However, the Reassociate pass is unable to do that because it processes each
+// instruction individually and believes (a + 2) + b is the best form according
+// to its rank system.
+//
+// To address this limitation, NaryReassociate reassociates an expression in a
+// form that reuses existing instructions. As a result, NaryReassociate can
+// reassociate (a + 2) + b in the example to (a + b) + 2 because it detects that
+// (a + b) is computed before.
+//
+// NaryReassociate works as follows. For every instruction in the form of (a +
+// b) + c, it checks whether a + c or b + c is already computed by a dominating
+// instruction. If so, it then reassociates (a + b) + c into (a + c) + b or (b +
+// c) + a and removes the redundancy accordingly. To efficiently look up whether
+// an expression is computed before, we store each instruction seen and its SCEV
+// into an SCEV-to-instruction map.
+//
+// Although the algorithm pattern-matches only ternary additions, it
+// automatically handles many >3-ary expressions by walking through the function
+// in the depth-first order. For example, given
+//
+// (a + c) + d
+// ((a + b) + c) + d
+//
+// NaryReassociate first rewrites (a + b) + c to (a + c) + b, and then rewrites
+// ((a + c) + b) + d into ((a + c) + d) + b.
+//
+// Finally, the above dominator-based algorithm may need to run for multiple
+// iterations before emitting optimal code. One reason is that we only split
+// an operand when it has a single use. The above algorithm can eliminate an
+// instruction and decrease the usage count of its operands. As a
+// result, an instruction that previously had multiple uses may become a
+// single-use instruction and thus eligible for split consideration. For
+// example,
+//
+// ac = a + c
+// ab = a + b
+// abc = ab + c
+// ab2 = ab + b
+// ab2c = ab2 + c
+//
+// In the first iteration, we cannot reassociate abc to ac+b because ab is used
+// twice. However, we can reassociate ab2c to abc+b in the first iteration. As a
+// result, ab2 becomes dead and ab will be used only once in the second
+// iteration.
+//
+// Limitations and TODO items:
+//
+// 1) We only consider n-ary adds and muls for now. This should be extended
+// and generalized.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/NaryReassociate.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "nary-reassociate"
+
+namespace {
+
+class NaryReassociateLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ NaryReassociateLegacyPass() : FunctionPass(ID) {
+ initializeNaryReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Module &M) override {
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+private:
+ NaryReassociatePass Impl;
+};
+
+} // end anonymous namespace
+
+char NaryReassociateLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(NaryReassociateLegacyPass, "nary-reassociate",
+ "Nary reassociation", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(NaryReassociateLegacyPass, "nary-reassociate",
+ "Nary reassociation", false, false)
+
+FunctionPass *llvm::createNaryReassociatePass() {
+ return new NaryReassociateLegacyPass();
+}
+
+bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ return Impl.runImpl(F, AC, DT, SE, TLI, TTI);
+}
+
+PreservedAnalyses NaryReassociatePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto *AC = &AM.getResult<AssumptionAnalysis>(F);
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ if (!runImpl(F, AC, DT, SE, TLI, TTI))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ return PA;
+}
+
+bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_,
+ DominatorTree *DT_, ScalarEvolution *SE_,
+ TargetLibraryInfo *TLI_,
+ TargetTransformInfo *TTI_) {
+ AC = AC_;
+ DT = DT_;
+ SE = SE_;
+ TLI = TLI_;
+ TTI = TTI_;
+ DL = &F.getParent()->getDataLayout();
+
+ bool Changed = false, ChangedInThisIteration;
+ do {
+ ChangedInThisIteration = doOneIteration(F);
+ Changed |= ChangedInThisIteration;
+ } while (ChangedInThisIteration);
+ return Changed;
+}
+
+// Whitelist the instruction types NaryReassociate handles for now.
+static bool isPotentiallyNaryReassociable(Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::GetElementPtr:
+ case Instruction::Mul:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool NaryReassociatePass::doOneIteration(Function &F) {
+ bool Changed = false;
+ SeenExprs.clear();
+ // Process the basic blocks in a depth first traversal of the dominator
+ // tree. This order ensures that all bases of a candidate have already been
+ // seen (and recorded in SeenExprs) when we process it.
+ for (const auto Node : depth_first(DT)) {
+ BasicBlock *BB = Node->getBlock();
+ for (auto I = BB->begin(); I != BB->end(); ++I) {
+ if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(&*I)) {
+ const SCEV *OldSCEV = SE->getSCEV(&*I);
+ if (Instruction *NewI = tryReassociate(&*I)) {
+ Changed = true;
+ SE->forgetValue(&*I);
+ I->replaceAllUsesWith(NewI);
+ // If SeenExprs contains I's WeakTrackingVH, that entry will be
+ // replaced with nullptr.
+ RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI);
+ I = NewI->getIterator();
+ }
+ // Add the rewritten instruction to SeenExprs; the original instruction
+ // is deleted.
+ const SCEV *NewSCEV = SE->getSCEV(&*I);
+ SeenExprs[NewSCEV].push_back(WeakTrackingVH(&*I));
+ // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I)
+ // is equivalent to I. However, ScalarEvolution::getSCEV may
+ // weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose
+ // we reassociate
+ // I = &a[sext(i +nsw j)] // assuming sizeof(a[0]) = 4
+ // to
+ // NewI = &a[sext(i)] + sext(j).
+ //
+ // ScalarEvolution computes
+ // getSCEV(I) = a + 4 * sext(i + j)
+ // getSCEV(newI) = a + 4 * sext(i) + 4 * sext(j)
+ // which are different SCEVs.
+ //
+ // To alleviate this issue of ScalarEvolution not always capturing
+ // equivalence, we add I to SeenExprs[OldSCEV] as well so that we can
+ // map both SCEV before and after tryReassociate(I) to I.
+ //
+ // This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll.
+ if (NewSCEV != OldSCEV)
+ SeenExprs[OldSCEV].push_back(WeakTrackingVH(&*I));
+ }
+ }
+ }
+ return Changed;
+}
+
+Instruction *NaryReassociatePass::tryReassociate(Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ return tryReassociateBinaryOp(cast<BinaryOperator>(I));
+ case Instruction::GetElementPtr:
+ return tryReassociateGEP(cast<GetElementPtrInst>(I));
+ default:
+ llvm_unreachable("should be filtered out by isPotentiallyNaryReassociable");
+ }
+}
+
+static bool isGEPFoldable(GetElementPtrInst *GEP,
+ const TargetTransformInfo *TTI) {
+ SmallVector<const Value*, 4> Indices;
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
+ Indices.push_back(*I);
+ return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
+ Indices) == TargetTransformInfo::TCC_Free;
+}
+
+Instruction *NaryReassociatePass::tryReassociateGEP(GetElementPtrInst *GEP) {
+ // Not worth reassociating GEP if it is foldable.
+ if (isGEPFoldable(GEP, TTI))
+ return nullptr;
+
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I - 1,
+ GTI.getIndexedType())) {
+ return NewGEP;
+ }
+ }
+ }
+ return nullptr;
+}
+
+bool NaryReassociatePass::requiresSignExtension(Value *Index,
+ GetElementPtrInst *GEP) {
+ unsigned PointerSizeInBits =
+ DL->getPointerSizeInBits(GEP->getType()->getPointerAddressSpace());
+ return cast<IntegerType>(Index->getType())->getBitWidth() < PointerSizeInBits;
+}
+
+GetElementPtrInst *
+NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
+ unsigned I, Type *IndexedType) {
+ Value *IndexToSplit = GEP->getOperand(I + 1);
+ if (SExtInst *SExt = dyn_cast<SExtInst>(IndexToSplit)) {
+ IndexToSplit = SExt->getOperand(0);
+ } else if (ZExtInst *ZExt = dyn_cast<ZExtInst>(IndexToSplit)) {
+ // zext can be treated as sext if the source is non-negative.
+ if (isKnownNonNegative(ZExt->getOperand(0), *DL, 0, AC, GEP, DT))
+ IndexToSplit = ZExt->getOperand(0);
+ }
+
+ if (AddOperator *AO = dyn_cast<AddOperator>(IndexToSplit)) {
+ // If the I-th index needs sext and the underlying add is not equipped with
+ // nsw, we cannot split the add because
+ // sext(LHS + RHS) != sext(LHS) + sext(RHS).
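+ // For instance, with i8 operands LHS = RHS = 100, LHS + RHS wraps to -56,
+ // so sext(LHS + RHS) = -56 while sext(LHS) + sext(RHS) = 200.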
+ if (requiresSignExtension(IndexToSplit, GEP) &&
+ computeOverflowForSignedAdd(AO, *DL, AC, GEP, DT) !=
+ OverflowResult::NeverOverflows)
+ return nullptr;
+
+ Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1);
+ // IndexToSplit = LHS + RHS.
+ if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+ return NewGEP;
+ // Symmetrically, try IndexToSplit = RHS + LHS.
+ if (LHS != RHS) {
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+ return NewGEP;
+ }
+ }
+ return nullptr;
+}
+
+GetElementPtrInst *
+NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
+ unsigned I, Value *LHS,
+ Value *RHS, Type *IndexedType) {
+ // Look for GEP's closest dominator that has the same SCEV as GEP except that
+ // the I-th index is replaced with LHS.
+ SmallVector<const SCEV *, 4> IndexExprs;
+ for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index)
+ IndexExprs.push_back(SE->getSCEV(*Index));
+ // Replace the I-th index with LHS.
+ IndexExprs[I] = SE->getSCEV(LHS);
+ if (isKnownNonNegative(LHS, *DL, 0, AC, GEP, DT) &&
+ DL->getTypeSizeInBits(LHS->getType()) <
+ DL->getTypeSizeInBits(GEP->getOperand(I)->getType())) {
+ // Zero-extend LHS if it is non-negative. InstCombine canonicalizes sext to
+ // zext if the source operand is proved non-negative. We should do that
+ // consistently so that CandidateExpr more likely appears before. See
+ // @reassociate_gep_assume for an example of this canonicalization.
+ IndexExprs[I] =
+ SE->getZeroExtendExpr(IndexExprs[I], GEP->getOperand(I)->getType());
+ }
+ const SCEV *CandidateExpr = SE->getGEPExpr(cast<GEPOperator>(GEP),
+ IndexExprs);
+
+ Value *Candidate = findClosestMatchingDominator(CandidateExpr, GEP);
+ if (Candidate == nullptr)
+ return nullptr;
+
+ IRBuilder<> Builder(GEP);
+ // Candidate does not necessarily have the same pointer type as GEP. Use
+ // bitcast or pointer cast to make sure they have the same type, so that the
+ // later RAUW doesn't complain.
+ Candidate = Builder.CreateBitOrPointerCast(Candidate, GEP->getType());
+ assert(Candidate->getType() == GEP->getType());
+
+ // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType)
+ uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType);
+ Type *ElementType = GEP->getResultElementType();
+ uint64_t ElementSize = DL->getTypeAllocSize(ElementType);
+ // Note that because I is not necessarily the last index of the GEP, the
+ // size of the type at the I-th index (IndexedSize) is not necessarily
+ // divisible by ElementSize. For example,
+ //
+ // #pragma pack(1)
+ // struct S {
+ // int a[3];
+ // int64 b[8];
+ // };
+ // #pragma pack()
+ //
+ // sizeof(S) = 100 is indivisible by sizeof(int64) = 8.
+ //
+ // TODO: bail out on this case for now. We could emit uglygep.
+ if (IndexedSize % ElementSize != 0)
+ return nullptr;
+
+ // NewGEP = &Candidate[RHS * (sizeof(IndexedType) / sizeof(Candidate[0]))];
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ if (RHS->getType() != IntPtrTy)
+ RHS = Builder.CreateSExtOrTrunc(RHS, IntPtrTy);
+ if (IndexedSize != ElementSize) {
+ RHS = Builder.CreateMul(
+ RHS, ConstantInt::get(IntPtrTy, IndexedSize / ElementSize));
+ }
+ GetElementPtrInst *NewGEP =
+ cast<GetElementPtrInst>(Builder.CreateGEP(Candidate, RHS));
+ NewGEP->setIsInBounds(GEP->isInBounds());
+ NewGEP->takeName(GEP);
+ return NewGEP;
+}
+
+Instruction *NaryReassociatePass::tryReassociateBinaryOp(BinaryOperator *I) {
+ Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I))
+ return NewI;
+ if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I))
+ return NewI;
+ return nullptr;
+}
+
+Instruction *NaryReassociatePass::tryReassociateBinaryOp(Value *LHS, Value *RHS,
+ BinaryOperator *I) {
+ Value *A = nullptr, *B = nullptr;
+ // To be conservative, we reassociate I only when it is the only user of
+ // (A op B).
+ if (LHS->hasOneUse() && matchTernaryOp(I, LHS, A, B)) {
+ // I = (A op B) op RHS
+ // = (A op RHS) op B or (B op RHS) op A
+ const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B);
+ const SCEV *RHSExpr = SE->getSCEV(RHS);
+ if (BExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+ return NewI;
+ }
+ if (AExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+ return NewI;
+ }
+ }
+ return nullptr;
+}
+
+Instruction *NaryReassociatePass::tryReassociatedBinaryOp(const SCEV *LHSExpr,
+ Value *RHS,
+ BinaryOperator *I) {
+ // Look for the closest dominator LHS of I that computes LHSExpr, and replace
+ // I with LHS op RHS.
+ auto *LHS = findClosestMatchingDominator(LHSExpr, I);
+ if (LHS == nullptr)
+ return nullptr;
+
+ Instruction *NewI = nullptr;
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I);
+ break;
+ case Instruction::Mul:
+ NewI = BinaryOperator::CreateMul(LHS, RHS, "", I);
+ break;
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ }
+ NewI->takeName(I);
+ return NewI;
+}
+
+bool NaryReassociatePass::matchTernaryOp(BinaryOperator *I, Value *V,
+ Value *&Op1, Value *&Op2) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ return match(V, m_Add(m_Value(Op1), m_Value(Op2)));
+ case Instruction::Mul:
+ return match(V, m_Mul(m_Value(Op1), m_Value(Op2)));
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ }
+ return false;
+}
+
+const SCEV *NaryReassociatePass::getBinarySCEV(BinaryOperator *I,
+ const SCEV *LHS,
+ const SCEV *RHS) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ return SE->getAddExpr(LHS, RHS);
+ case Instruction::Mul:
+ return SE->getMulExpr(LHS, RHS);
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ }
+ return nullptr;
+}
+
+Instruction *
+NaryReassociatePass::findClosestMatchingDominator(const SCEV *CandidateExpr,
+ Instruction *Dominatee) {
+ auto Pos = SeenExprs.find(CandidateExpr);
+ if (Pos == SeenExprs.end())
+ return nullptr;
+
+ auto &Candidates = Pos->second;
+ // Because we process the basic blocks in pre-order of the dominator tree, a
+ // candidate that doesn't dominate the current instruction won't dominate any
+ // future instruction either. Therefore, we pop it out of the stack. This
+ // optimization makes the algorithm O(n).
+ while (!Candidates.empty()) {
+ // Candidates stores WeakTrackingVHs, so a candidate can be nullptr if it's
+ // removed during rewriting.
+ if (Value *Candidate = Candidates.back()) {
+ Instruction *CandidateInstruction = cast<Instruction>(Candidate);
+ if (DT->dominates(CandidateInstruction, Dominatee))
+ return CandidateInstruction;
+ }
+ Candidates.pop_back();
+ }
+ return nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
new file mode 100644
index 000000000000..9ebf2d769356
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -0,0 +1,4252 @@
+//===- NewGVN.cpp - Global Value Numbering Pass ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements the new LLVM's Global Value Numbering pass.
+/// GVN partitions values computed by a function into congruence classes.
+/// Values ending up in the same congruence class are guaranteed to be the same
+/// for every execution of the program. In that respect, congruency is a
+/// compile-time approximation of equivalence of values at runtime.
+/// The algorithm implemented here uses a sparse formulation and it's based
+/// on the ideas described in the paper:
+/// "A Sparse Algorithm for Predicated Global Value Numbering" from
+/// Karthik Gargi.
+///
+/// A brief overview of the algorithm: The algorithm is essentially the same as
+/// the standard RPO value numbering algorithm (a good reference is the paper
+/// "SCC based value numbering" by L. Taylor Simpson) with one major difference:
+/// The RPO algorithm proceeds, on every iteration, to process every reachable
+/// block and every instruction in that block. This is because the standard RPO
+/// algorithm does not track what things have the same value number, it only
+/// tracks what the value number of a given operation is (the mapping is
+/// operation -> value number). Thus, when a value number of an operation
+/// changes, it must reprocess everything to ensure all uses of a value number
+/// get updated properly. In contrast, the sparse algorithm we use *also*
+/// tracks what operations have a given value number (IE it also tracks the
+/// reverse mapping from value number -> operations with that value number), so
+/// that it only needs to reprocess the instructions that are affected when
+/// something's value number changes. The vast majority of complexity and code
+/// in this file is devoted to tracking what value numbers could change for what
+/// instructions when various things happen. The rest of the algorithm is
+/// devoted to performing symbolic evaluation, forward propagation, and
+/// simplification of operations based on the value numbers deduced so far
+///
+/// In order to make the GVN mostly-complete, we use a technique derived from
+/// "Detection of Redundant Expressions: A Complete and Polynomial-time
+/// Algorithm in SSA" by R.R. Pai. The source of incompleteness in most SSA
+/// based GVN algorithms is related to their inability to detect equivalence
+/// between phi of ops (IE phi(a+b, c+d)) and op of phis (phi(a,c) + phi(b, d)).
+/// We resolve this issue by generating the equivalent "phi of ops" form for
+/// each op of phis we see, in a way that only takes polynomial time to resolve.
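+///
+/// As a small illustration of the idea (a sketch; the value names below are
+/// made up and do not come from this file), an op of phis such as
+///   %p = phi i32 [ %a, %bb1 ], [ %c, %bb2 ]
+///   %q = phi i32 [ %b, %bb1 ], [ %d, %bb2 ]
+///   %op = add i32 %p, %q
+/// is recognized as equivalent to the corresponding phi of ops
+///   phi i32 [ %a.plus.b, %bb1 ], [ %c.plus.d, %bb2 ]
+/// whenever the per-predecessor sums (or their value numbers) are available.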
+///
+/// We also do not perform elimination by using any published algorithm. All
+/// published algorithms are O(Instructions). Instead, we use a technique that
+/// is O(number of operations with the same value number), enabling us to skip
+/// trying to eliminate things that have unique value numbers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/NewGVN.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFGPrinter.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/ArrayRecycler.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/PointerLikeTypeTraits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVNExpression.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::GVNExpression;
+using namespace llvm::VNCoercion;
+
+#define DEBUG_TYPE "newgvn"
+
+STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
+STATISTIC(NumGVNBlocksDeleted, "Number of blocks deleted");
+STATISTIC(NumGVNOpsSimplified, "Number of Expressions simplified");
+STATISTIC(NumGVNPhisAllSame, "Number of PHIs whose arguments are all the same");
+STATISTIC(NumGVNMaxIterations,
+ "Maximum Number of iterations it took to converge GVN");
+STATISTIC(NumGVNLeaderChanges, "Number of leader changes");
+STATISTIC(NumGVNSortedLeaderChanges, "Number of sorted leader changes");
+STATISTIC(NumGVNAvoidedSortedLeaderChanges,
+ "Number of avoided sorted leader changes");
+STATISTIC(NumGVNDeadStores, "Number of redundant/dead stores eliminated");
+STATISTIC(NumGVNPHIOfOpsCreated, "Number of PHI of ops created");
+STATISTIC(NumGVNPHIOfOpsEliminations,
+ "Number of things eliminated using PHI of ops");
+DEBUG_COUNTER(VNCounter, "newgvn-vn",
+ "Controls which instructions are value numbered");
+DEBUG_COUNTER(PHIOfOpsCounter, "newgvn-phi",
+ "Controls which instructions we create phi of ops for");
+// Currently store defining access refinement is too slow due to basicaa being
+// egregiously slow. This flag lets us keep it working while we work on this
+// issue.
+static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
+ cl::init(false), cl::Hidden);
+
+/// Currently, the generation of "phi of ops" can result in correctness issues.
+static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true),
+ cl::Hidden);
+
+//===----------------------------------------------------------------------===//
+// GVN Pass
+//===----------------------------------------------------------------------===//
+
+// Anchor methods.
+namespace llvm {
+namespace GVNExpression {
+
+Expression::~Expression() = default;
+BasicExpression::~BasicExpression() = default;
+CallExpression::~CallExpression() = default;
+LoadExpression::~LoadExpression() = default;
+StoreExpression::~StoreExpression() = default;
+AggregateValueExpression::~AggregateValueExpression() = default;
+PHIExpression::~PHIExpression() = default;
+
+} // end namespace GVNExpression
+} // end namespace llvm
+
+namespace {
+
+// Tarjan's SCC finding algorithm with Nuutila's improvements
+// SCCIterator is actually fairly complex for the simple thing we want.
+// It also wants to hand us SCC's that are unrelated to the phi node we ask
+// about, and have us process them there or risk redoing work.
+// Graph traits over a filter iterator also doesn't work that well here.
+// This SCC finder is specialized to walk use-def chains, and only follows
+// instructions, not generic values (arguments, etc).
+struct TarjanSCC {
+ TarjanSCC() : Components(1) {}
+
+ void Start(const Instruction *Start) {
+ if (Root.lookup(Start) == 0)
+ FindSCC(Start);
+ }
+
+ const SmallPtrSetImpl<const Value *> &getComponentFor(const Value *V) const {
+ unsigned ComponentID = ValueToComponent.lookup(V);
+
+ assert(ComponentID > 0 &&
+ "Asking for a component for a value we never processed");
+ return Components[ComponentID];
+ }
+
+private:
+ void FindSCC(const Instruction *I) {
+ Root[I] = ++DFSNum;
+ // Store the DFS Number we had before it possibly gets incremented.
+ unsigned int OurDFS = DFSNum;
+ for (auto &Op : I->operands()) {
+ if (auto *InstOp = dyn_cast<Instruction>(Op)) {
+ if (Root.lookup(Op) == 0)
+ FindSCC(InstOp);
+ if (!InComponent.count(Op))
+ Root[I] = std::min(Root.lookup(I), Root.lookup(Op));
+ }
+ }
+ // See if we really were the root of a component, by seeing if we still have
+ // our DFSNumber. If we do, we are the root of the component, and we have
+ // completed a component. If we do not, we are not the root of a component,
+ // and belong on the component stack.
+ if (Root.lookup(I) == OurDFS) {
+ unsigned ComponentID = Components.size();
+ Components.resize(Components.size() + 1);
+ auto &Component = Components.back();
+ Component.insert(I);
+ DEBUG(dbgs() << "Component root is " << *I << "\n");
+ InComponent.insert(I);
+ ValueToComponent[I] = ComponentID;
+ // Pop a component off the stack and label it.
+ while (!Stack.empty() && Root.lookup(Stack.back()) >= OurDFS) {
+ auto *Member = Stack.back();
+ DEBUG(dbgs() << "Component member is " << *Member << "\n");
+ Component.insert(Member);
+ InComponent.insert(Member);
+ ValueToComponent[Member] = ComponentID;
+ Stack.pop_back();
+ }
+ } else {
+ // Part of a component, push to stack
+ Stack.push_back(I);
+ }
+ }
+
+ unsigned int DFSNum = 1;
+ SmallPtrSet<const Value *, 8> InComponent;
+ DenseMap<const Value *, unsigned int> Root;
+ SmallVector<const Value *, 8> Stack;
+
+ // Store the components as vector of ptr sets, because we need the topo order
+ // of SCC's, but not individual member order
+ SmallVector<SmallPtrSet<const Value *, 8>, 8> Components;
+
+ DenseMap<const Value *, unsigned> ValueToComponent;
+};
+
+// Congruence classes represent the set of expressions/instructions
+// that are all the same *during some scope in the function*.
+// That is, because of the way we perform equality propagation, and
+// because of memory value numbering, it is not correct to assume
+// you can willy-nilly replace any member with any other at any
+// point in the function.
+//
+// For any Value in the Member set, it is valid to replace any dominated member
+// with that Value.
+//
+// Every congruence class has a leader, and the leader is used to symbolize
+// instructions in a canonical way (IE every operand of an instruction that is a
+// member of the same congruence class will always be replaced with leader
+// during symbolization). To simplify symbolization, we keep the leader as a
+// constant if class can be proved to be a constant value. Otherwise, the
+// leader is the member of the value set with the smallest DFS number. Each
+// congruence class also has a defining expression, though the expression may be
+// null. If it exists, it can be used for forward propagation and reassociation
+// of values.
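+//
+// For example (an illustrative sketch, not taken from this file), if
+//   %x = add i32 %a, %b
+//   %y = add i32 %a, %b
+// are both reachable, they end up in the same congruence class; its leader is
+// the member with the smallest DFS number (here %x), and dominated members
+// such as %y can be replaced by that leader during elimination.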
+
+// For memory, we also track a representative MemoryAccess, and a set of memory
+// members for MemoryPhis (which have no real instructions). Note that for
+// memory, it seems tempting to try to split the memory members into a
+// MemoryCongruenceClass or something. Unfortunately, this does not work
+// easily. The value numbering of a given memory expression depends on the
+// leader of the memory congruence class, and the leader of memory congruence
+// class depends on the value numbering of a given memory expression. This
+// leads to wasted propagation, and in some cases, missed optimization. For
+// example: If we had value numbered two stores together before, but now do not,
+// we move them to a new value congruence class. This in turn will move at
+// least one of the memorydefs to a new memory congruence class, which in
+// turn affects
+// the value numbering of the stores we just value numbered (because the memory
+// congruence class is part of the value number). So while theoretically
+// possible to split them up, it turns out to be *incredibly* complicated to get
+// it to work right, because of the interdependency. While structurally
+// slightly messier, it is algorithmically much simpler and faster to do what we
+// do here, and track them both at once in the same class.
+// Note: The default iterators for this class iterate over values
+class CongruenceClass {
+public:
+ using MemberType = Value;
+ using MemberSet = SmallPtrSet<MemberType *, 4>;
+ using MemoryMemberType = MemoryPhi;
+ using MemoryMemberSet = SmallPtrSet<const MemoryMemberType *, 2>;
+
+ explicit CongruenceClass(unsigned ID) : ID(ID) {}
+ CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
+ : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
+
+ unsigned getID() const { return ID; }
+
+ // True if this class has no members left. This is mainly used for assertion
+ // purposes, and for skipping empty classes.
+ bool isDead() const {
+ // If it's both dead from a value perspective, and dead from a memory
+ // perspective, it's really dead.
+ return empty() && memory_empty();
+ }
+
+ // Leader functions
+ Value *getLeader() const { return RepLeader; }
+ void setLeader(Value *Leader) { RepLeader = Leader; }
+ const std::pair<Value *, unsigned int> &getNextLeader() const {
+ return NextLeader;
+ }
+ void resetNextLeader() { NextLeader = {nullptr, ~0}; }
+ void addPossibleNextLeader(std::pair<Value *, unsigned int> LeaderPair) {
+ if (LeaderPair.second < NextLeader.second)
+ NextLeader = LeaderPair;
+ }
+
+ Value *getStoredValue() const { return RepStoredValue; }
+ void setStoredValue(Value *Leader) { RepStoredValue = Leader; }
+ const MemoryAccess *getMemoryLeader() const { return RepMemoryAccess; }
+ void setMemoryLeader(const MemoryAccess *Leader) { RepMemoryAccess = Leader; }
+
+ // Forward propagation info
+ const Expression *getDefiningExpr() const { return DefiningExpr; }
+
+ // Value member set
+ bool empty() const { return Members.empty(); }
+ unsigned size() const { return Members.size(); }
+ MemberSet::const_iterator begin() const { return Members.begin(); }
+ MemberSet::const_iterator end() const { return Members.end(); }
+ void insert(MemberType *M) { Members.insert(M); }
+ void erase(MemberType *M) { Members.erase(M); }
+ void swap(MemberSet &Other) { Members.swap(Other); }
+
+ // Memory member set
+ bool memory_empty() const { return MemoryMembers.empty(); }
+ unsigned memory_size() const { return MemoryMembers.size(); }
+ MemoryMemberSet::const_iterator memory_begin() const {
+ return MemoryMembers.begin();
+ }
+ MemoryMemberSet::const_iterator memory_end() const {
+ return MemoryMembers.end();
+ }
+ iterator_range<MemoryMemberSet::const_iterator> memory() const {
+ return make_range(memory_begin(), memory_end());
+ }
+
+ void memory_insert(const MemoryMemberType *M) { MemoryMembers.insert(M); }
+ void memory_erase(const MemoryMemberType *M) { MemoryMembers.erase(M); }
+
+ // Store count
+ unsigned getStoreCount() const { return StoreCount; }
+ void incStoreCount() { ++StoreCount; }
+ void decStoreCount() {
+ assert(StoreCount != 0 && "Store count went negative");
+ --StoreCount;
+ }
+
+ // True if this class has no memory members.
+ bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); }
+
+ // Return true if two congruence classes are equivalent to each other. This
+ // means that every field but the ID number and the dead field is equivalent.
+ bool isEquivalentTo(const CongruenceClass *Other) const {
+ if (!Other)
+ return false;
+ if (this == Other)
+ return true;
+
+ if (std::tie(StoreCount, RepLeader, RepStoredValue, RepMemoryAccess) !=
+ std::tie(Other->StoreCount, Other->RepLeader, Other->RepStoredValue,
+ Other->RepMemoryAccess))
+ return false;
+ if (DefiningExpr != Other->DefiningExpr)
+ if (!DefiningExpr || !Other->DefiningExpr ||
+ *DefiningExpr != *Other->DefiningExpr)
+ return false;
+ // We need some ordered set
+ std::set<Value *> AMembers(Members.begin(), Members.end());
+ std::set<Value *> BMembers(Other->Members.begin(), Other->Members.end());
+ return AMembers == BMembers;
+ }
+
+private:
+ unsigned ID;
+
+ // Representative leader.
+ Value *RepLeader = nullptr;
+
+ // The most dominating leader after our current leader (kept because the
+ // member set is not sorted and is expensive to keep sorted all the time).
+ std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
+
+ // If this is represented by a store, the value of the store.
+ Value *RepStoredValue = nullptr;
+
+ // If this class contains MemoryDefs or MemoryPhis, this is the leading memory
+ // access.
+ const MemoryAccess *RepMemoryAccess = nullptr;
+
+ // Defining Expression.
+ const Expression *DefiningExpr = nullptr;
+
+ // Actual members of this class.
+ MemberSet Members;
+
+ // This is the set of MemoryPhis that exist in the class. MemoryDefs and
+ // MemoryUses have real instructions representing them, so we only need to
+ // track MemoryPhis here.
+ MemoryMemberSet MemoryMembers;
+
+ // Number of stores in this congruence class.
+ // This is used so we can detect store equivalence changes properly.
+ int StoreCount = 0;
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+struct ExactEqualsExpression {
+ const Expression &E;
+
+ explicit ExactEqualsExpression(const Expression &E) : E(E) {}
+
+ hash_code getComputedHash() const { return E.getComputedHash(); }
+
+ bool operator==(const Expression &Other) const {
+ return E.exactlyEquals(Other);
+ }
+};
+
+template <> struct DenseMapInfo<const Expression *> {
+ static const Expression *getEmptyKey() {
+ auto Val = static_cast<uintptr_t>(-1);
+ Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
+ return reinterpret_cast<const Expression *>(Val);
+ }
+
+ static const Expression *getTombstoneKey() {
+ auto Val = static_cast<uintptr_t>(~1U);
+ Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
+ return reinterpret_cast<const Expression *>(Val);
+ }
+
+ static unsigned getHashValue(const Expression *E) {
+ return E->getComputedHash();
+ }
+
+ static unsigned getHashValue(const ExactEqualsExpression &E) {
+ return E.getComputedHash();
+ }
+
+ static bool isEqual(const ExactEqualsExpression &LHS, const Expression *RHS) {
+ if (RHS == getTombstoneKey() || RHS == getEmptyKey())
+ return false;
+ return LHS == *RHS;
+ }
+
+ static bool isEqual(const Expression *LHS, const Expression *RHS) {
+ if (LHS == RHS)
+ return true;
+ if (LHS == getTombstoneKey() || RHS == getTombstoneKey() ||
+ LHS == getEmptyKey() || RHS == getEmptyKey())
+ return false;
+ // Compare hashes before equality. This is *not* what the hashtable does,
+ // since it is computing it modulo the number of buckets, whereas we are
+ // using the full hash keyspace. Since the hashes are precomputed, this
+ // check is *much* faster than equality.
+ if (LHS->getComputedHash() != RHS->getComputedHash())
+ return false;
+ return *LHS == *RHS;
+ }
+};
+
+} // end namespace llvm
+
+namespace {
+
+class NewGVN {
+ Function &F;
+ DominatorTree *DT;
+ const TargetLibraryInfo *TLI;
+ AliasAnalysis *AA;
+ MemorySSA *MSSA;
+ MemorySSAWalker *MSSAWalker;
+ const DataLayout &DL;
+ std::unique_ptr<PredicateInfo> PredInfo;
+
+ // These are the only two things the create* functions should have
+ // side-effects on due to allocating memory.
+ mutable BumpPtrAllocator ExpressionAllocator;
+ mutable ArrayRecycler<Value *> ArgRecycler;
+ mutable TarjanSCC SCCFinder;
+ const SimplifyQuery SQ;
+
+ // Number of function arguments, used by ranking
+ unsigned int NumFuncArgs;
+
+ // RPOOrdering of basic blocks
+ DenseMap<const DomTreeNode *, unsigned> RPOOrdering;
+
+ // Congruence class info.
+
+ // This class is called INITIAL in the paper. It is the class everything
+ // starts out in, and represents any value. Being an optimistic analysis,
+ // anything in the TOP class has the value TOP, which is indeterminate and
+ // equivalent to everything.
+ CongruenceClass *TOPClass;
+ std::vector<CongruenceClass *> CongruenceClasses;
+ unsigned NextCongruenceNum;
+
+ // Value Mappings.
+ DenseMap<Value *, CongruenceClass *> ValueToClass;
+ DenseMap<Value *, const Expression *> ValueToExpression;
+
+ // Value PHI handling, used to make equivalence between phi(op, op) and
+ // op(phi, phi).
+ // These mappings just store various data that would normally be part of the
+ // IR.
+ SmallPtrSet<const Instruction *, 8> PHINodeUses;
+
+ DenseMap<const Value *, bool> OpSafeForPHIOfOps;
+
+ // Map a temporary instruction we created to a parent block.
+ DenseMap<const Value *, BasicBlock *> TempToBlock;
+
+ // Map between the already in-program instructions and the temporary phis we
+ // created that they are known equivalent to.
+ DenseMap<const Value *, PHINode *> RealToTemp;
+
+ // In order to know when we should re-process instructions that have
+ // phi-of-ops, we track the set of expressions that they needed as
+ // leaders. When we discover new leaders for those expressions, we process the
+ // associated phi-of-op instructions again in case they have changed. The
+ // other way they may change is if they had leaders, and those leaders
+ // disappear. However, at the point they have leaders, there are uses of the
+ // relevant operands in the created phi node, and so they will get reprocessed
+ // through the normal user marking we perform.
+ mutable DenseMap<const Value *, SmallPtrSet<Value *, 2>> AdditionalUsers;
+ DenseMap<const Expression *, SmallPtrSet<Instruction *, 2>>
+ ExpressionToPhiOfOps;
+
+ // Map from temporary operation to MemoryAccess.
+ DenseMap<const Instruction *, MemoryUseOrDef *> TempToMemory;
+
+ // Set of all temporary instructions we created.
+ // Note: This will include instructions that were just created during value
+ // numbering. The way to test if something is using them is to check
+ // RealToTemp.
+ DenseSet<Instruction *> AllTempInstructions;
+
+ // This is the set of instructions to revisit on a reachability change. At
+ // the end of the main iteration loop it will contain at least all the phi of
+ // ops instructions that will be changed to phis, as well as regular phis.
+ // During the iteration loop, it may contain other things, such as phi of ops
+ // instructions that used edge reachability to reach a result, and so need to
+ // be revisited when the edge changes, independent of whether the phi they
+ // depended on changes.
+ DenseMap<BasicBlock *, SparseBitVector<>> RevisitOnReachabilityChange;
+
+ // Mapping from predicate info we used to the instructions we used it with.
+ // In order to correctly ensure propagation, we must keep track of what
+ // comparisons we used, so that when the values of the comparisons change, we
+ // propagate the information to the places we used the comparison.
+ mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>>
+ PredicateToUsers;
+
+ // For the same reason as PredicateToUsers, we track which MemoryAccesses
+ // use a given MemoryAccess. When we skip MemoryAccesses for stores, we can
+ // no longer rely solely on the def-use chains of MemorySSA.
+ mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>>
+ MemoryToUsers;
+
+ // A table storing which memorydefs/phis represent a memory state provably
+ // equivalent to another memory state.
+ // We could use the congruence class machinery, but the MemoryAccesses are
+ // abstract memory states, so they can only ever be equivalent to each other,
+ // and not to constants, etc.
+ DenseMap<const MemoryAccess *, CongruenceClass *> MemoryAccessToClass;
+
+ // We could, if we wanted, build MemoryPhiExpressions and
+ // MemoryVariableExpressions, etc, and value number them the same way we value
+ // number phi expressions. For the moment, this seems like overkill. They
+ // can only exist in one of three states: they can be TOP (equal to
+ // everything), Equivalent to something else, or unique. Because we do not
+ // create expressions for them, we need to simulate leader change not just
+ // when they change class, but when they change state. Note: We can do the
+ // same thing for phis, and avoid having phi expressions if we wanted to. We
+ // should eventually unify in one direction or the other, so this is a little
+ // bit of an experiment in which direction turns out easier to maintain.
+ enum MemoryPhiState { MPS_Invalid, MPS_TOP, MPS_Equivalent, MPS_Unique };
+ DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
+
+ enum InstCycleState { ICS_Unknown, ICS_CycleFree, ICS_Cycle };
+ mutable DenseMap<const Instruction *, InstCycleState> InstCycleState;
+
+ // Expression to class mapping.
+ using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
+ ExpressionClassMap ExpressionToClass;
+
+ // We keep a single DeadExpression that represents all currently dead values.
+ // For dead expressions we can prove will stay dead, we mark them with
+ // DFS number zero. However, it's possible in the case of phi nodes
+ // for us to assume/prove all arguments are dead during fixpointing.
+ // We use DeadExpression for that case.
+ DeadExpression *SingletonDeadExpression = nullptr;
+
+ // Which values have changed as a result of leader changes.
+ SmallPtrSet<Value *, 8> LeaderChanges;
+
+ // Reachability info.
+ using BlockEdge = BasicBlockEdge;
+ DenseSet<BlockEdge> ReachableEdges;
+ SmallPtrSet<const BasicBlock *, 8> ReachableBlocks;
+
+ // This is a bitvector because, on larger functions, we may have
+ // thousands of touched instructions at once (entire blocks,
+ // instructions with hundreds of uses, etc). Even with optimization
+ // for when we mark whole blocks as touched, when this was a
+ // SmallPtrSet or DenseSet, for some functions, we spent >20% of all
+ // the time in GVN just managing this list. The bitvector, on the
+ // other hand, efficiently supports test/set/clear of both
+ // individual elements and ranges, as well as "find next element". This
+ // enables us to use it as a worklist with essentially zero cost.
+ BitVector TouchedInstructions;
+
+ DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
+
+#ifndef NDEBUG
+ // Debugging for how many times each block and instruction got processed.
+ DenseMap<const Value *, unsigned> ProcessedCount;
+#endif
+
+ // DFS info.
+ // This contains a mapping from Instructions to DFS numbers.
+ // The numbering starts at 1. An instruction with DFS number zero
+ // means that the instruction is dead.
+ DenseMap<const Value *, unsigned> InstrDFS;
+
+ // This contains the mapping from DFS numbers to instructions.
+ SmallVector<Value *, 32> DFSToInstr;
+
+ // Deletion info.
+ SmallPtrSet<Instruction *, 8> InstructionsToErase;
+
+public:
+ NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
+ TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
+ const DataLayout &DL)
+ : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL),
+ PredInfo(make_unique<PredicateInfo>(F, *DT, *AC)), SQ(DL, TLI, DT, AC) {
+ }
+
+ bool runGVN();
+
+private:
+ // Expression handling.
+ const Expression *createExpression(Instruction *) const;
+ const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *,
+ Instruction *) const;
+
+ // Our canonical form for phi arguments is a pair of incoming value, incoming
+ // basic block.
+ using ValPair = std::pair<Value *, BasicBlock *>;
+
+ PHIExpression *createPHIExpression(ArrayRef<ValPair>, const Instruction *,
+ BasicBlock *, bool &HasBackEdge,
+ bool &OriginalOpsConstant) const;
+ const DeadExpression *createDeadExpression() const;
+ const VariableExpression *createVariableExpression(Value *) const;
+ const ConstantExpression *createConstantExpression(Constant *) const;
+ const Expression *createVariableOrConstant(Value *V) const;
+ const UnknownExpression *createUnknownExpression(Instruction *) const;
+ const StoreExpression *createStoreExpression(StoreInst *,
+ const MemoryAccess *) const;
+ LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
+ const MemoryAccess *) const;
+ const CallExpression *createCallExpression(CallInst *,
+ const MemoryAccess *) const;
+ const AggregateValueExpression *
+ createAggregateValueExpression(Instruction *) const;
+ bool setBasicExpressionInfo(Instruction *, BasicExpression *) const;
+
+ // Congruence class handling.
+ CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
+ auto *result = new CongruenceClass(NextCongruenceNum++, Leader, E);
+ CongruenceClasses.emplace_back(result);
+ return result;
+ }
+
+ CongruenceClass *createMemoryClass(MemoryAccess *MA) {
+ auto *CC = createCongruenceClass(nullptr, nullptr);
+ CC->setMemoryLeader(MA);
+ return CC;
+ }
+
+ CongruenceClass *ensureLeaderOfMemoryClass(MemoryAccess *MA) {
+ auto *CC = getMemoryClass(MA);
+ if (CC->getMemoryLeader() != MA)
+ CC = createMemoryClass(MA);
+ return CC;
+ }
+
+ CongruenceClass *createSingletonCongruenceClass(Value *Member) {
+ CongruenceClass *CClass = createCongruenceClass(Member, nullptr);
+ CClass->insert(Member);
+ ValueToClass[Member] = CClass;
+ return CClass;
+ }
+
+ void initializeCongruenceClasses(Function &F);
+ const Expression *makePossiblePHIOfOps(Instruction *,
+ SmallPtrSetImpl<Value *> &);
+ Value *findLeaderForInst(Instruction *ValueOp,
+ SmallPtrSetImpl<Value *> &Visited,
+ MemoryAccess *MemAccess, Instruction *OrigInst,
+ BasicBlock *PredBB);
+ bool OpIsSafeForPHIOfOpsHelper(Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited,
+ SmallVectorImpl<Instruction *> &Worklist);
+ bool OpIsSafeForPHIOfOps(Value *Op, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &);
+ void addPhiOfOps(PHINode *Op, BasicBlock *BB, Instruction *ExistingValue);
+ void removePhiOfOps(Instruction *I, PHINode *PHITemp);
+
+ // Value number an Instruction or MemoryPhi.
+ void valueNumberMemoryPhi(MemoryPhi *);
+ void valueNumberInstruction(Instruction *);
+
+ // Symbolic evaluation.
+ const Expression *checkSimplificationResults(Expression *, Instruction *,
+ Value *) const;
+ const Expression *performSymbolicEvaluation(Value *,
+ SmallPtrSetImpl<Value *> &) const;
+ const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
+ Instruction *,
+ MemoryAccess *) const;
+ const Expression *performSymbolicLoadEvaluation(Instruction *) const;
+ const Expression *performSymbolicStoreEvaluation(Instruction *) const;
+ const Expression *performSymbolicCallEvaluation(Instruction *) const;
+ void sortPHIOps(MutableArrayRef<ValPair> Ops) const;
+ const Expression *performSymbolicPHIEvaluation(ArrayRef<ValPair>,
+ Instruction *I,
+ BasicBlock *PHIBlock) const;
+ const Expression *performSymbolicAggrValueEvaluation(Instruction *) const;
+ const Expression *performSymbolicCmpEvaluation(Instruction *) const;
+ const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const;
+
+ // Congruence finding.
+ bool someEquivalentDominates(const Instruction *, const Instruction *) const;
+ Value *lookupOperandLeader(Value *) const;
+ CongruenceClass *getClassForExpression(const Expression *E) const;
+ void performCongruenceFinding(Instruction *, const Expression *);
+ void moveValueToNewCongruenceClass(Instruction *, const Expression *,
+ CongruenceClass *, CongruenceClass *);
+ void moveMemoryToNewCongruenceClass(Instruction *, MemoryAccess *,
+ CongruenceClass *, CongruenceClass *);
+ Value *getNextValueLeader(CongruenceClass *) const;
+ const MemoryAccess *getNextMemoryLeader(CongruenceClass *) const;
+ bool setMemoryClass(const MemoryAccess *From, CongruenceClass *To);
+ CongruenceClass *getMemoryClass(const MemoryAccess *MA) const;
+ const MemoryAccess *lookupMemoryLeader(const MemoryAccess *) const;
+ bool isMemoryAccessTOP(const MemoryAccess *) const;
+
+ // Ranking
+ unsigned int getRank(const Value *) const;
+ bool shouldSwapOperands(const Value *, const Value *) const;
+
+ // Reachability handling.
+ void updateReachableEdge(BasicBlock *, BasicBlock *);
+ void processOutgoingEdges(TerminatorInst *, BasicBlock *);
+ Value *findConditionEquivalence(Value *) const;
+
+ // Elimination.
+ struct ValueDFS;
+ void convertClassToDFSOrdered(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &,
+ DenseMap<const Value *, unsigned int> &,
+ SmallPtrSetImpl<Instruction *> &) const;
+ void convertClassToLoadsAndStores(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &) const;
+
+ bool eliminateInstructions(Function &);
+ void replaceInstruction(Instruction *, Value *);
+ void markInstructionForDeletion(Instruction *);
+ void deleteInstructionsInBlock(BasicBlock *);
+ Value *findPHIOfOpsLeader(const Expression *, const Instruction *,
+ const BasicBlock *) const;
+
+ // New instruction creation.
+ void handleNewInstruction(Instruction *) {}
+
+ // Various instruction touch utilities
+ template <typename Map, typename KeyType, typename Func>
+ void for_each_found(Map &, const KeyType &, Func);
+ template <typename Map, typename KeyType>
+ void touchAndErase(Map &, const KeyType &);
+ void markUsersTouched(Value *);
+ void markMemoryUsersTouched(const MemoryAccess *);
+ void markMemoryDefTouched(const MemoryAccess *);
+ void markPredicateUsersTouched(Instruction *);
+ void markValueLeaderChangeTouched(CongruenceClass *CC);
+ void markMemoryLeaderChangeTouched(CongruenceClass *CC);
+ void markPhiOfOpsChanged(const Expression *E);
+ void addPredicateUsers(const PredicateBase *, Instruction *) const;
+ void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const;
+ void addAdditionalUsers(Value *To, Value *User) const;
+
+ // Main loop of value numbering
+ void iterateTouchedInstructions();
+
+ // Utilities.
+ void cleanupTables();
+ std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
+ void updateProcessedCount(const Value *V);
+ void verifyMemoryCongruency() const;
+ void verifyIterationSettled(Function &F);
+ void verifyStoreExpressions() const;
+ bool singleReachablePHIPath(SmallPtrSet<const MemoryAccess *, 8> &,
+ const MemoryAccess *, const MemoryAccess *) const;
+ BasicBlock *getBlockForValue(Value *V) const;
+ void deleteExpression(const Expression *E) const;
+ MemoryUseOrDef *getMemoryAccess(const Instruction *) const;
+ MemoryAccess *getDefiningAccess(const MemoryAccess *) const;
+ MemoryPhi *getMemoryAccess(const BasicBlock *) const;
+ template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
+
+ unsigned InstrToDFSNum(const Value *V) const {
+ assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
+ return InstrDFS.lookup(V);
+ }
+
+ unsigned InstrToDFSNum(const MemoryAccess *MA) const {
+ return MemoryToDFSNum(MA);
+ }
+
+ Value *InstrFromDFSNum(unsigned DFSNum) { return DFSToInstr[DFSNum]; }
+
+ // Given a MemoryAccess, return the relevant instruction DFS number. Note:
+ // This deliberately takes a value so it can be used with Use's, which will
+ // auto-convert to Value's but not to MemoryAccess's.
+ unsigned MemoryToDFSNum(const Value *MA) const {
+ assert(isa<MemoryAccess>(MA) &&
+ "This should not be used with instructions");
+ return isa<MemoryUseOrDef>(MA)
+ ? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
+ : InstrDFS.lookup(MA);
+ }
+
+ bool isCycleFree(const Instruction *) const;
+ bool isBackedge(BasicBlock *From, BasicBlock *To) const;
+
+ // Debug counter info. When verifying, we have to reset the value numbering
+ // debug counter to the same state it started in to get the same results.
+ std::pair<int, int> StartingVNCounter;
+};
+
+} // end anonymous namespace
+
+template <typename T>
+static bool equalsLoadStoreHelper(const T &LHS, const Expression &RHS) {
+ if (!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS))
+ return false;
+ return LHS.MemoryExpression::equals(RHS);
+}
+
+bool LoadExpression::equals(const Expression &Other) const {
+ return equalsLoadStoreHelper(*this, Other);
+}
+
+bool StoreExpression::equals(const Expression &Other) const {
+ if (!equalsLoadStoreHelper(*this, Other))
+ return false;
+ // Make sure that store vs store includes the value operand.
+ if (const auto *S = dyn_cast<StoreExpression>(&Other))
+ if (getStoredValue() != S->getStoredValue())
+ return false;
+ return true;
+}
+
+// Determine if the edge From->To is a backedge
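+// Illustrative example (block names are hypothetical): for a loop with header
+// H and latch L, H comes before L in the RPO walk, so the RPO number of L is
+// >= that of H and the edge L->H is classified as a backedge; the From == To
+// check additionally treats self-edges as backedges.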
+bool NewGVN::isBackedge(BasicBlock *From, BasicBlock *To) const {
+ return From == To ||
+ RPOOrdering.lookup(DT->getNode(From)) >=
+ RPOOrdering.lookup(DT->getNode(To));
+}
+
+#ifndef NDEBUG
+static std::string getBlockName(const BasicBlock *B) {
+ return DOTGraphTraits<const Function *>::getSimpleNodeLabel(B, nullptr);
+}
+#endif
+
+// Get a MemoryAccess for an instruction, fake or real.
+MemoryUseOrDef *NewGVN::getMemoryAccess(const Instruction *I) const {
+ auto *Result = MSSA->getMemoryAccess(I);
+ return Result ? Result : TempToMemory.lookup(I);
+}
+
+// Get a MemoryPhi for a basic block. These are all real.
+MemoryPhi *NewGVN::getMemoryAccess(const BasicBlock *BB) const {
+ return MSSA->getMemoryAccess(BB);
+}
+
+// Get the basic block from an instruction/memory value.
+BasicBlock *NewGVN::getBlockForValue(Value *V) const {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ auto *Parent = I->getParent();
+ if (Parent)
+ return Parent;
+ Parent = TempToBlock.lookup(V);
+ assert(Parent && "Every fake instruction should have a block");
+ return Parent;
+ }
+
+ auto *MP = dyn_cast<MemoryPhi>(V);
+ assert(MP && "Should have been an instruction or a MemoryPhi");
+ return MP->getBlock();
+}
+
+// Delete a definitely dead expression, so it can be reused by the expression
+// allocator. Some of these are not in creation functions, so we have to accept
+// const versions.
+void NewGVN::deleteExpression(const Expression *E) const {
+ assert(isa<BasicExpression>(E));
+ auto *BE = cast<BasicExpression>(E);
+ const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
+ ExpressionAllocator.Deallocate(E);
+}
+
+// If V is a predicateinfo copy, get the thing it is a copy of.
+static Value *getCopyOf(const Value *V) {
+ if (auto *II = dyn_cast<IntrinsicInst>(V))
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy)
+ return II->getOperand(0);
+ return nullptr;
+}
+
+// Return true if V is really PN, even accounting for predicateinfo copies.
+static bool isCopyOfPHI(const Value *V, const PHINode *PN) {
+ return V == PN || getCopyOf(V) == PN;
+}
+
+static bool isCopyOfAPHI(const Value *V) {
+ auto *CO = getCopyOf(V);
+ return CO && isa<PHINode>(CO);
+}
+
+// Sort PHI Operands into a canonical order. What we use here is an RPO
+// order. The BlockInstRange numbers are generated in an RPO walk of the basic
+// blocks.
+void NewGVN::sortPHIOps(MutableArrayRef<ValPair> Ops) const {
+ std::sort(Ops.begin(), Ops.end(), [&](const ValPair &P1, const ValPair &P2) {
+ return BlockInstRange.lookup(P1.second).first <
+ BlockInstRange.lookup(P2.second).first;
+ });
+}
+
+// Return true if V is a value that will always be available (IE can
+// be placed anywhere) in the function. We don't do globals here
+// because they are often worse to put in place.
+static bool alwaysAvailable(Value *V) {
+ return isa<Constant>(V) || isa<Argument>(V);
+}
+
+// Create a PHIExpression from an array of {incoming edge, value} pairs. I is
+// the original instruction we are creating a PHIExpression for (but may not be
+// a phi node). We require, as an invariant, that all the PHIOperands in the
+// same block are sorted the same way. sortPHIOps will sort them into a
+// canonical order.
+PHIExpression *NewGVN::createPHIExpression(ArrayRef<ValPair> PHIOperands,
+ const Instruction *I,
+ BasicBlock *PHIBlock,
+ bool &HasBackedge,
+ bool &OriginalOpsConstant) const {
+ unsigned NumOps = PHIOperands.size();
+ auto *E = new (ExpressionAllocator) PHIExpression(NumOps, PHIBlock);
+
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ E->setType(PHIOperands.begin()->first->getType());
+ E->setOpcode(Instruction::PHI);
+
+ // Filter out unreachable phi operands, operands equivalent to the phi
+ // itself, and operands whose class is TOP (and thus equivalent to everything).
+ auto Filtered = make_filter_range(PHIOperands, [&](const ValPair &P) {
+ auto *BB = P.second;
+ if (auto *PHIOp = dyn_cast<PHINode>(I))
+ if (isCopyOfPHI(P.first, PHIOp))
+ return false;
+ if (!ReachableEdges.count({BB, PHIBlock}))
+ return false;
+ // Things in TOPClass are equivalent to everything.
+ if (ValueToClass.lookup(P.first) == TOPClass)
+ return false;
+ OriginalOpsConstant = OriginalOpsConstant && isa<Constant>(P.first);
+ HasBackedge = HasBackedge || isBackedge(BB, PHIBlock);
+ return lookupOperandLeader(P.first) != I;
+ });
+ std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
+ [&](const ValPair &P) -> Value * {
+ return lookupOperandLeader(P.first);
+ });
+ return E;
+}
+
+// Set basic expression info (Arguments, type, opcode) for Expression
+// E from Instruction I in block B.
+bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const {
+ bool AllConstant = true;
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ E->setType(GEP->getSourceElementType());
+ else
+ E->setType(I->getType());
+ E->setOpcode(I->getOpcode());
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+
+ // Transform the operand array into an operand leader array, and keep track of
+ // whether all members are constant.
+ std::transform(I->op_begin(), I->op_end(), op_inserter(E), [&](Value *O) {
+ auto Operand = lookupOperandLeader(O);
+ AllConstant = AllConstant && isa<Constant>(Operand);
+ return Operand;
+ });
+
+ return AllConstant;
+}
+
+const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
+ Value *Arg1, Value *Arg2,
+ Instruction *I) const {
+ auto *E = new (ExpressionAllocator) BasicExpression(2);
+
+ E->setType(T);
+ E->setOpcode(Opcode);
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ if (Instruction::isCommutative(Opcode)) {
+ // Ensure that commutative instructions that only differ by a permutation
+ // of their operands get the same value number by sorting the operand value
+ // numbers. Since all commutative instructions have two operands it is more
+ // efficient to sort by hand rather than using, say, std::sort.
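+ // For example (hypothetical operands), add(%a, %b) and add(%b, %a) are put
+ // into the same canonical operand order here, so both map to a single
+ // expression and hence a single value number.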
+ if (shouldSwapOperands(Arg1, Arg2))
+ std::swap(Arg1, Arg2);
+ }
+ E->op_push_back(lookupOperandLeader(Arg1));
+ E->op_push_back(lookupOperandLeader(Arg2));
+
+ Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ return E;
+}
+
+// Take a Value returned by simplification of Expression E/Instruction
+// I, and see if it resulted in a simpler expression. If so, return
+// that expression.
+const Expression *NewGVN::checkSimplificationResults(Expression *E,
+ Instruction *I,
+ Value *V) const {
+ if (!V)
+ return nullptr;
+ if (auto *C = dyn_cast<Constant>(V)) {
+ if (I)
+ DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " constant " << *C << "\n");
+ NumGVNOpsSimplified++;
+ assert(isa<BasicExpression>(E) &&
+ "We should always have had a basic expression here");
+ deleteExpression(E);
+ return createConstantExpression(C);
+ } else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
+ if (I)
+ DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " variable " << *V << "\n");
+ deleteExpression(E);
+ return createVariableExpression(V);
+ }
+
+ CongruenceClass *CC = ValueToClass.lookup(V);
+ if (CC) {
+ if (CC->getLeader() && CC->getLeader() != I) {
+ // Don't add temporary instructions to the user lists.
+ if (!AllTempInstructions.count(I))
+ addAdditionalUsers(V, I);
+ return createVariableOrConstant(CC->getLeader());
+ }
+ if (CC->getDefiningExpr()) {
+ // If we simplified to something else, we need to communicate
+ // that we're users of the value we simplified to.
+ if (I != V) {
+ // Don't add temporary instructions to the user lists.
+ if (!AllTempInstructions.count(I))
+ addAdditionalUsers(V, I);
+ }
+
+ if (I)
+ DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " expression " << *CC->getDefiningExpr() << "\n");
+ NumGVNOpsSimplified++;
+ deleteExpression(E);
+ return CC->getDefiningExpr();
+ }
+ }
+
+ return nullptr;
+}
+
+// Create a value expression from the instruction I, replacing operands with
+// their leaders.
+
+const Expression *NewGVN::createExpression(Instruction *I) const {
+ auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
+
+ bool AllConstant = setBasicExpressionInfo(I, E);
+
+ if (I->isCommutative()) {
+ // Ensure that commutative instructions that only differ by a permutation
+ // of their operands get the same value number by sorting the operand value
+ // numbers. Since all commutative instructions have two operands it is more
+ // efficient to sort by hand rather than using, say, std::sort.
+ assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1)))
+ E->swapOperands(0, 1);
+ }
+ // Perform simplification.
+ if (auto *CI = dyn_cast<CmpInst>(I)) {
+ // Sort the operand value numbers so x<y and y>x get the same value
+ // number.
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1))) {
+ E->swapOperands(0, 1);
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ }
+ E->setOpcode((CI->getOpcode() << 8) | Predicate);
+ // TODO: 25% of our time is spent in SimplifyCmpInst with pointer operands
+ assert(I->getOperand(0)->getType() == I->getOperand(1)->getType() &&
+ "Wrong types on cmp instruction");
+ assert((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
+ E->getOperand(1)->getType() == I->getOperand(1)->getType()));
+ Value *V =
+ SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (isa<SelectInst>(I)) {
+ if (isa<Constant>(E->getOperand(0)) ||
+ E->getOperand(1) == E->getOperand(2)) {
+ assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
+ E->getOperand(2)->getType() == I->getOperand(2)->getType());
+ Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1),
+ E->getOperand(2), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ }
+ } else if (I->isBinaryOp()) {
+ Value *V =
+ SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (auto *BI = dyn_cast<BitCastInst>(I)) {
+ Value *V =
+ SimplifyCastInst(BI->getOpcode(), BI->getOperand(0), BI->getType(), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (isa<GetElementPtrInst>(I)) {
+ Value *V = SimplifyGEPInst(
+ E->getType(), ArrayRef<Value *>(E->op_begin(), E->op_end()), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (AllConstant) {
+ // We don't bother trying to simplify unless all of the operands
+ // were constant.
+ // TODO: There are a lot of Simplify*'s we could call here, if we
+ // wanted to. The original motivating case for this code was a
+ // zext i1 false to i8, which we don't have an interface to
+ // simplify (IE there is no SimplifyZExt).
+
+ SmallVector<Constant *, 8> C;
+ for (Value *Arg : E->operands())
+ C.emplace_back(cast<Constant>(Arg));
+
+ if (Value *V = ConstantFoldInstOperands(I, C, DL, TLI))
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ }
+ return E;
+}
+
+const AggregateValueExpression *
+NewGVN::createAggregateValueExpression(Instruction *I) const {
+ if (auto *II = dyn_cast<InsertValueInst>(I)) {
+ auto *E = new (ExpressionAllocator)
+ AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
+ setBasicExpressionInfo(I, E);
+ E->allocateIntOperands(ExpressionAllocator);
+ std::copy(II->idx_begin(), II->idx_end(), int_op_inserter(E));
+ return E;
+ } else if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
+ auto *E = new (ExpressionAllocator)
+ AggregateValueExpression(I->getNumOperands(), EI->getNumIndices());
+ setBasicExpressionInfo(EI, E);
+ E->allocateIntOperands(ExpressionAllocator);
+ std::copy(EI->idx_begin(), EI->idx_end(), int_op_inserter(E));
+ return E;
+ }
+ llvm_unreachable("Unhandled type of aggregate value operation");
+}
+
+const DeadExpression *NewGVN::createDeadExpression() const {
+ // DeadExpression has no arguments and all DeadExpression's are the same,
+ // so we only need one of them.
+ return SingletonDeadExpression;
+}
+
+const VariableExpression *NewGVN::createVariableExpression(Value *V) const {
+ auto *E = new (ExpressionAllocator) VariableExpression(V);
+ E->setOpcode(V->getValueID());
+ return E;
+}
+
+const Expression *NewGVN::createVariableOrConstant(Value *V) const {
+ if (auto *C = dyn_cast<Constant>(V))
+ return createConstantExpression(C);
+ return createVariableExpression(V);
+}
+
+const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const {
+ auto *E = new (ExpressionAllocator) ConstantExpression(C);
+ E->setOpcode(C->getValueID());
+ return E;
+}
+
+const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const {
+ auto *E = new (ExpressionAllocator) UnknownExpression(I);
+ E->setOpcode(I->getOpcode());
+ return E;
+}
+
+const CallExpression *
+NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const {
+ // FIXME: Add operand bundles for calls.
+ auto *E =
+ new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
+ setBasicExpressionInfo(CI, E);
+ return E;
+}
+
+// Return true if some equivalent of instruction Inst dominates instruction U.
+bool NewGVN::someEquivalentDominates(const Instruction *Inst,
+ const Instruction *U) const {
+ auto *CC = ValueToClass.lookup(Inst);
+ // This must be an instruction because we are only called from phi nodes
+ // in the case that the value it needs to check against is an instruction.
+
+ // The most likely candidates for dominance are the leader and the next leader.
+ // The leader or next leader will dominate in all cases where there is an
+ // equivalent that is higher up in the dom tree.
+ // We can't *only* check them, however, because the
+ // dominator tree could have an infinite number of non-dominating siblings
+ // with instructions that are in the right congruence class.
+ // A
+ // B C D E F G
+ // |
+ // H
+ // Instruction U could be in H, with equivalents in every other sibling.
+ // Depending on the rpo order picked, the leader could be the equivalent in
+ // any of these siblings.
+ if (!CC)
+ return false;
+ if (alwaysAvailable(CC->getLeader()))
+ return true;
+ if (DT->dominates(cast<Instruction>(CC->getLeader()), U))
+ return true;
+ if (CC->getNextLeader().first &&
+ DT->dominates(cast<Instruction>(CC->getNextLeader().first), U))
+ return true;
+ return llvm::any_of(*CC, [&](const Value *Member) {
+ return Member != CC->getLeader() &&
+ DT->dominates(cast<Instruction>(Member), U);
+ });
+}
+
+// See if we have a congruence class and leader for this operand, and if so,
+// return it. Otherwise, return the operand itself.
+Value *NewGVN::lookupOperandLeader(Value *V) const {
+ CongruenceClass *CC = ValueToClass.lookup(V);
+ if (CC) {
+ // Everything in TOP is represented by undef, as it can be any value.
+ // We do have to make sure we get the type right though, so we can't set the
+ // RepLeader to undef.
+ if (CC == TOPClass)
+ return UndefValue::get(V->getType());
+ return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ }
+
+ return V;
+}
+
+const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
+ auto *CC = getMemoryClass(MA);
+ assert(CC->getMemoryLeader() &&
+ "Every MemoryAccess should be mapped to a congruence class with a "
+ "representative memory access");
+ return CC->getMemoryLeader();
+}
+
+// Return true if the MemoryAccess is really equivalent to everything. This is
+// equivalent to the lattice value "TOP" in most lattices. This is the initial
+// state of all MemoryAccesses.
+bool NewGVN::isMemoryAccessTOP(const MemoryAccess *MA) const {
+ return getMemoryClass(MA) == TOPClass;
+}
+
+LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
+ LoadInst *LI,
+ const MemoryAccess *MA) const {
+ auto *E =
+ new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ E->setType(LoadType);
+
+ // Give stores and loads the same opcode so they value number together.
+ E->setOpcode(0);
+ E->op_push_back(PointerOp);
+ if (LI)
+ E->setAlignment(LI->getAlignment());
+
+ // TODO: Value number heap versions. We may be able to discover
+ // things alias analysis can't on its own (IE that a store and a
+ // load have the same value, and thus, it isn't clobbering the load).
+ return E;
+}
+
+const StoreExpression *
+NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const {
+ auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());
+ auto *E = new (ExpressionAllocator)
+ StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA);
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ E->setType(SI->getValueOperand()->getType());
+
+ // Give stores and loads the same opcode so they value number together.
+ E->setOpcode(0);
+ E->op_push_back(lookupOperandLeader(SI->getPointerOperand()));
+
+ // TODO: Value number heap versions. We may be able to discover
+ // things alias analysis can't on its own (IE that a store and a
+ // load have the same value, and thus, it isn't clobbering the load).
+ return E;
+}
+
+const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
+ // Unlike loads, we never try to eliminate stores, so we do not check if they
+ // are simple and avoid value numbering them.
+ auto *SI = cast<StoreInst>(I);
+ auto *StoreAccess = getMemoryAccess(SI);
+ // Get the expression, if any, for the RHS of the MemoryDef.
+ const MemoryAccess *StoreRHS = StoreAccess->getDefiningAccess();
+ if (EnableStoreRefinement)
+ StoreRHS = MSSAWalker->getClobberingMemoryAccess(StoreAccess);
+ // If we bypassed the use-def chains, make sure we add a use.
+ StoreRHS = lookupMemoryLeader(StoreRHS);
+ if (StoreRHS != StoreAccess->getDefiningAccess())
+ addMemoryUsers(StoreRHS, StoreAccess);
+ // If we are defined by ourselves, use the live on entry def.
+ if (StoreRHS == StoreAccess)
+ StoreRHS = MSSA->getLiveOnEntryDef();
+
+ if (SI->isSimple()) {
+ // See if we are defined by a previous store expression that already has a
+ // value, and it's the same value as our current store. FIXME: Right now, we
+ // only do this for simple stores; we should expand to cover memcpys, etc.
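+ // For example (illustrative IR, assuming no intervening clobber):
+ //   store i32 %v, i32* %p
+ //   store i32 %v, i32* %p
+ // The second store gets the same StoreExpression as the first and is
+ // therefore redundant.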
+ const auto *LastStore = createStoreExpression(SI, StoreRHS);
+ const auto *LastCC = ExpressionToClass.lookup(LastStore);
+ // We really want to check whether the expression we matched was a store. There
+ // is no easy way to do that. However, we can check that the class we found has a
+ // store, which, assuming the value numbering state is not corrupt, is
+ // sufficient, because we must also be equivalent to that store's expression
+ // for it to be in the same class as the load.
+ if (LastCC && LastCC->getStoredValue() == LastStore->getStoredValue())
+ return LastStore;
+ // Also check if our value operand is defined by a load of the same memory
+ // location, and the memory state is the same as it was then (otherwise, it
+ // could have been overwritten later; see test32 in
+ // transforms/DeadStoreElimination/simple.ll).
+ if (auto *LI = dyn_cast<LoadInst>(LastStore->getStoredValue()))
+ if ((lookupOperandLeader(LI->getPointerOperand()) ==
+ LastStore->getOperand(0)) &&
+ (lookupMemoryLeader(getMemoryAccess(LI)->getDefiningAccess()) ==
+ StoreRHS))
+ return LastStore;
+ deleteExpression(LastStore);
+ }
+
+ // If the store is not equivalent to anything, value number it as a store that
+ // produces a unique memory state (instead of using its MemoryUse, we use
+ // its MemoryDef).
+ return createStoreExpression(SI, StoreAccess);
+}
+
+// See if we can extract the value of a loaded pointer from a load, a store, or
+// a memory instruction.
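+// For example (illustrative IR), a 'load i8' from a location into which a
+// known constant was stored as an i32 can be coerced to the corresponding
+// byte of that constant, and a load whose address is the result of a
+// calloc-like call (which zero-initializes) evaluates to zero.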
+const Expression *
+NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
+ LoadInst *LI, Instruction *DepInst,
+ MemoryAccess *DefiningAccess) const {
+ assert((!LI || LI->isSimple()) && "Not a simple load");
+ if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ // Also, we don't need to coerce if they are the same type; we will just
+ // propagate.
+ if (LI->isAtomic() > DepSI->isAtomic() ||
+ LoadType == DepSI->getValueOperand()->getType())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingStore(LoadType, LoadPtr, DepSI, DL);
+ if (Offset >= 0) {
+ if (auto *C = dyn_cast<Constant>(
+ lookupOperandLeader(DepSI->getValueOperand()))) {
+ DEBUG(dbgs() << "Coercing load from store " << *DepSI << " to constant "
+ << *C << "\n");
+ return createConstantExpression(
+ getConstantStoreValueForLoad(C, Offset, LoadType, DL));
+ }
+ }
+ } else if (auto *DepLI = dyn_cast<LoadInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (LI->isAtomic() > DepLI->isAtomic())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL);
+ if (Offset >= 0) {
+ // We can coerce a constant load into a load.
+ if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
+ if (auto *PossibleConstant =
+ getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
+ DEBUG(dbgs() << "Coercing load from load " << *LI << " to constant "
+ << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+ } else if (auto *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
+ int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL);
+ if (Offset >= 0) {
+ if (auto *PossibleConstant =
+ getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) {
+ DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
+ << " to constant " << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+ }
+
+ // All of the below are only true if the loaded pointer is produced
+ // by the dependent instruction.
+ if (LoadPtr != lookupOperandLeader(DepInst) &&
+ !AA->isMustAlias(LoadPtr, DepInst))
+ return nullptr;
+ // If this load really doesn't depend on anything, then we must be loading an
+ // undef value. This can happen when loading from a fresh allocation with no
+ // intervening stores, for example. Note that this is only true in the case
+ // that the result of the allocation is pointer equal to the load ptr.
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+ // If this load occurs right after a lifetime begin,
+ // then the loaded value is undefined.
+ else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+ // If this load follows a calloc (which zero initializes memory),
+ // then the loaded value is zero
+ else if (isCallocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(Constant::getNullValue(LoadType));
+ }
+
+ return nullptr;
+}
+
+const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
+ auto *LI = cast<LoadInst>(I);
+
+ // We can eliminate in favor of non-simple loads, but we won't be able to
+ // eliminate the loads themselves.
+ if (!LI->isSimple())
+ return nullptr;
+
+ Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand());
+ // Load of undef is undef.
+ if (isa<UndefValue>(LoadAddressLeader))
+ return createConstantExpression(UndefValue::get(LI->getType()));
+ MemoryAccess *OriginalAccess = getMemoryAccess(I);
+ MemoryAccess *DefiningAccess =
+ MSSAWalker->getClobberingMemoryAccess(OriginalAccess);
+
+ if (!MSSA->isLiveOnEntryDef(DefiningAccess)) {
+ if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) {
+ Instruction *DefiningInst = MD->getMemoryInst();
+ // If the defining instruction is not reachable, replace with undef.
+ if (!ReachableBlocks.count(DefiningInst->getParent()))
+ return createConstantExpression(UndefValue::get(LI->getType()));
+ // This will handle stores and memory insts. We only do this if the
+ // defining access has a different type, or it is a pointer produced by
+ // certain memory operations that cause the memory to have a fixed value
+ // (IE things like calloc).
+ if (const auto *CoercionResult =
+ performSymbolicLoadCoercion(LI->getType(), LoadAddressLeader, LI,
+ DefiningInst, DefiningAccess))
+ return CoercionResult;
+ }
+ }
+
+ const auto *LE = createLoadExpression(LI->getType(), LoadAddressLeader, LI,
+ DefiningAccess);
+ // If our MemoryLeader is not our defining access, add a use to the
+ // MemoryLeader, so that we get reprocessed when it changes.
+ if (LE->getMemoryLeader() != DefiningAccess)
+ addMemoryUsers(LE->getMemoryLeader(), OriginalAccess);
+ return LE;
+}
+
+const Expression *
+NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
+ auto *PI = PredInfo->getPredicateInfoFor(I);
+ if (!PI)
+ return nullptr;
+
+ DEBUG(dbgs() << "Found predicate info from instruction !\n");
+
+ auto *PWC = dyn_cast<PredicateWithCondition>(PI);
+ if (!PWC)
+ return nullptr;
+
+ auto *CopyOf = I->getOperand(0);
+ auto *Cond = PWC->Condition;
+
+ // If this is a copy of the condition, it must be either true or false depending
+ // on the predicate info type and edge.
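+ // For example (illustrative IR): for 'br i1 %c, label %t, label %f', an
+ // ssa.copy of %c that predicateinfo inserts along the %t edge evaluates to
+ // true here, and one along the %f edge evaluates to false.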
+ if (CopyOf == Cond) {
+ // We should not need to add predicate users because the predicate info is
+ // already a use of this operand.
+ if (isa<PredicateAssume>(PI))
+ return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
+ if (auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
+ if (PBranch->TrueEdge)
+ return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
+ return createConstantExpression(ConstantInt::getFalse(Cond->getType()));
+ }
+ if (auto *PSwitch = dyn_cast<PredicateSwitch>(PI))
+ return createConstantExpression(cast<Constant>(PSwitch->CaseValue));
+ }
+
+ // Not a copy of the condition, so see what the predicates tell us about this
+ // value. First, though, we check to make sure the value is actually a copy
+ // of one of the condition operands. It's possible, in certain cases, for it
+ // to be a copy of a predicateinfo copy. In particular, if two branch
+ // operations use the same condition, and one branch dominates the other, we
+ // will end up with a copy of a copy. This is currently a small deficiency in
+ // predicateinfo. What will end up happening here is that we will value
+ // number both copies the same anyway.
+
+ // Everything below relies on the condition being a comparison.
+ auto *Cmp = dyn_cast<CmpInst>(Cond);
+ if (!Cmp)
+ return nullptr;
+
+ if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) {
+ DEBUG(dbgs() << "Copy is not of any condition operands!\n");
+ return nullptr;
+ }
+ Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
+ Value *SecondOp = lookupOperandLeader(Cmp->getOperand(1));
+ bool SwappedOps = false;
+ // Sort the ops.
+ if (shouldSwapOperands(FirstOp, SecondOp)) {
+ std::swap(FirstOp, SecondOp);
+ SwappedOps = true;
+ }
+ CmpInst::Predicate Predicate =
+ SwappedOps ? Cmp->getSwappedPredicate() : Cmp->getPredicate();
+
+ if (isa<PredicateAssume>(PI)) {
+ // If the comparison is true when the operands are equal, then we know the
+ // operands are equal, because assumes must always be true.
+ if (CmpInst::isTrueWhenEqual(Predicate)) {
+ addPredicateUsers(PI, I);
+ addAdditionalUsers(Cmp->getOperand(0), I);
+ return createVariableOrConstant(FirstOp);
+ }
+ }
+ if (const auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
+ // If we are *not* a copy of the comparison, we may be equal to the other
+ // operand when the predicate implies something about equality of
+ // operations. In particular, if the comparison is true/false when the
+ // operands are equal, and we are on the right edge, we know this operation
+ // is equal to something.
+ if ((PBranch->TrueEdge && Predicate == CmpInst::ICMP_EQ) ||
+ (!PBranch->TrueEdge && Predicate == CmpInst::ICMP_NE)) {
+ addPredicateUsers(PI, I);
+ addAdditionalUsers(SwappedOps ? Cmp->getOperand(1) : Cmp->getOperand(0),
+ I);
+ return createVariableOrConstant(FirstOp);
+ }
+ // Handle the special case of floating point.
+ if (((PBranch->TrueEdge && Predicate == CmpInst::FCMP_OEQ) ||
+ (!PBranch->TrueEdge && Predicate == CmpInst::FCMP_UNE)) &&
+ isa<ConstantFP>(FirstOp) && !cast<ConstantFP>(FirstOp)->isZero()) {
+ addPredicateUsers(PI, I);
+ addAdditionalUsers(SwappedOps ? Cmp->getOperand(1) : Cmp->getOperand(0),
+ I);
+ return createConstantExpression(cast<Constant>(FirstOp));
+ }
+ }
+ return nullptr;
+}
+
+// Evaluate read only and pure calls, and create an expression result.
+const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
+ auto *CI = cast<CallInst>(I);
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ // Intrinsics with the returned attribute are copies of arguments.
+ if (auto *ReturnedValue = II->getReturnedArgOperand()) {
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy)
+ if (const auto *Result = performSymbolicPredicateInfoEvaluation(I))
+ return Result;
+ return createVariableOrConstant(ReturnedValue);
+ }
+ }
+ if (AA->doesNotAccessMemory(CI)) {
+ return createCallExpression(CI, TOPClass->getMemoryLeader());
+ } else if (AA->onlyReadsMemory(CI)) {
+ MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(CI);
+ return createCallExpression(CI, DefiningAccess);
+ }
+ return nullptr;
+}
+
+// Retrieve the memory class for a given MemoryAccess.
+CongruenceClass *NewGVN::getMemoryClass(const MemoryAccess *MA) const {
+ auto *Result = MemoryAccessToClass.lookup(MA);
+ assert(Result && "Should have found memory class");
+ return Result;
+}
+
+// Update the MemoryAccess equivalence table to say that From is equal to To,
+// and return true if this is different from what already existed in the table.
+bool NewGVN::setMemoryClass(const MemoryAccess *From,
+ CongruenceClass *NewClass) {
+ assert(NewClass &&
+ "Every MemoryAccess should be getting mapped to a non-null class");
+ DEBUG(dbgs() << "Setting " << *From);
+ DEBUG(dbgs() << " equivalent to congruence class ");
+ DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader ");
+ DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
+
+ auto LookupResult = MemoryAccessToClass.find(From);
+ bool Changed = false;
+ // If it's already in the table, see if the value changed.
+ if (LookupResult != MemoryAccessToClass.end()) {
+ auto *OldClass = LookupResult->second;
+ if (OldClass != NewClass) {
+ // If this is a phi, we have to handle memory member updates.
+ if (auto *MP = dyn_cast<MemoryPhi>(From)) {
+ OldClass->memory_erase(MP);
+ NewClass->memory_insert(MP);
+ // This may have killed the class if it had no non-memory members
+ if (OldClass->getMemoryLeader() == From) {
+ if (OldClass->definesNoMemory()) {
+ OldClass->setMemoryLeader(nullptr);
+ } else {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of a memory member " << *From
+ << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ }
+ }
+ }
+ // It wasn't equivalent before, and now it is.
+ LookupResult->second = NewClass;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+// Determine if an instruction is cycle-free. That means the values in the
+// instruction don't depend on any expressions that can change value as a result
+// of the instruction. For example, a non-cycle free instruction would be v =
+// phi(0, v+1).
+bool NewGVN::isCycleFree(const Instruction *I) const {
+ // In order to compute cycle-freeness, we do SCC finding on the instruction,
+ // and see what kind of SCC it ends up in. If it is a singleton, it is
+ // cycle-free. If it is not in a singleton, it is only cycle free if the
+ // other members are all phi nodes (as they do not compute anything, they are
+ // copies).
+ auto ICS = InstCycleState.lookup(I);
+ if (ICS == ICS_Unknown) {
+ SCCFinder.Start(I);
+ auto &SCC = SCCFinder.getComponentFor(I);
+ // It's cycle-free if its size is 1, or if the SCC consists of *only* phi nodes.
+ if (SCC.size() == 1)
+ InstCycleState.insert({I, ICS_CycleFree});
+ else {
+ bool AllPhis = llvm::all_of(SCC, [](const Value *V) {
+ return isa<PHINode>(V) || isCopyOfAPHI(V);
+ });
+ ICS = AllPhis ? ICS_CycleFree : ICS_Cycle;
+ for (auto *Member : SCC)
+ if (auto *MemberPhi = dyn_cast<PHINode>(Member))
+ InstCycleState.insert({MemberPhi, ICS});
+ }
+ }
+ if (ICS == ICS_Cycle)
+ return false;
+ return true;
+}
+
+// Evaluate PHI nodes symbolically and create an expression result.
+const Expression *
+NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
+ Instruction *I,
+ BasicBlock *PHIBlock) const {
+ // True if one of the incoming phi edges is a backedge.
+ bool HasBackedge = false;
+ // All constant tracks the state of whether all the *original* phi operands
+ // were constants. This is really shorthand for "this phi cannot cycle due to
+ // forward propagation", as any change in value of the phi is guaranteed not
+ // to later change the value of the phi. IE it can't be v = phi(undef, v+1).
+ bool OriginalOpsConstant = true;
+ auto *E = cast<PHIExpression>(createPHIExpression(
+ PHIOps, I, PHIBlock, HasBackedge, OriginalOpsConstant));
+ // We match the semantics of SimplifyPhiNode from InstructionSimplify here.
+ // See if all arguments are the same.
+ // We track if any were undef because they need special handling.
+ bool HasUndef = false;
+ auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) {
+ if (isa<UndefValue>(Arg)) {
+ HasUndef = true;
+ return false;
+ }
+ return true;
+ });
+ // If we are left with no operands, it's dead.
+ if (Filtered.begin() == Filtered.end()) {
+ // If it has undef at this point, it means there are no non-undef arguments,
+ // and thus, the value of the phi node must be undef.
+ if (HasUndef) {
+ DEBUG(dbgs() << "PHI Node " << *I
+ << " has no non-undef arguments, valuing it as undef\n");
+ return createConstantExpression(UndefValue::get(I->getType()));
+ }
+
+ DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
+ deleteExpression(E);
+ return createDeadExpression();
+ }
+ Value *AllSameValue = *(Filtered.begin());
+ ++Filtered.begin();
+ // Can't use std::equal here, sadly, because filter.begin moves.
+ if (llvm::all_of(Filtered, [&](Value *Arg) { return Arg == AllSameValue; })) {
+ // In LLVM's non-standard representation of phi nodes, it's possible to have
+ // phi nodes with cycles (IE dependent on other phis that are .... dependent
+ // on the original phi node), especially in weird CFG's where some arguments
+ // are unreachable, or uninitialized along certain paths. This can cause
+ // infinite loops during evaluation. We work around this by not trying to
+ // really evaluate them independently, but instead using a variable
+ // expression to say if one is equivalent to the other.
+ // We also special case undef, so that if we have an undef, we can't use the
+ // common value unless it dominates the phi block.
+ if (HasUndef) {
+ // If we have undef and at least one other value, this is really a
+ // multivalued phi, and we need to know if it's cycle free in order to
+ // evaluate whether we can ignore the undef. The other parts of this are
+ // just shortcuts. If there is no backedge, or all operands are
+ // constants, it also must be cycle free.
+ if (HasBackedge && !OriginalOpsConstant &&
+ !isa<UndefValue>(AllSameValue) && !isCycleFree(I))
+ return E;
+
+ // Only have to check for instructions
+ if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue))
+ if (!someEquivalentDominates(AllSameInst, I))
+ return E;
+ }
+ // Can't simplify to something that comes later in the iteration.
+ // Otherwise, when and if it changes congruence class, we will never catch
+ // up. We will always be a class behind it.
+ if (isa<Instruction>(AllSameValue) &&
+ InstrToDFSNum(AllSameValue) > InstrToDFSNum(I))
+ return E;
+ NumGVNPhisAllSame++;
+ DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
+ << "\n");
+ deleteExpression(E);
+ return createVariableOrConstant(AllSameValue);
+ }
+ return E;
+}
+
+const Expression *
+NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const {
+ if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
+ auto *II = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
+ if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
+ unsigned Opcode = 0;
+ // EI might be an extract from one of our recognised intrinsics. If it
+ // is, we'll synthesize a semantically equivalent expression instead of
+ // an extract value expression.
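+ // For example (illustrative IR), extracting index 0 from a call to
+ // llvm.sadd.with.overflow(%a, %b) is value numbered as if it were
+ // 'add %a, %b'.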
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ Opcode = Instruction::Add;
+ break;
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ Opcode = Instruction::Sub;
+ break;
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ Opcode = Instruction::Mul;
+ break;
+ default:
+ break;
+ }
+
+ if (Opcode != 0) {
+ // Intrinsic recognized. Grab its args to finish building the
+ // expression.
+ assert(II->getNumArgOperands() == 2 &&
+ "Expect two args for recognised intrinsics.");
+ return createBinaryExpression(Opcode, EI->getType(),
+ II->getArgOperand(0),
+ II->getArgOperand(1), I);
+ }
+ }
+ }
+
+ return createAggregateValueExpression(I);
+}
+
+const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
+ assert(isa<CmpInst>(I) && "Expected a cmp instruction.");
+
+ auto *CI = cast<CmpInst>(I);
+ // See if our operands are equal to those of a previous predicate, and if so,
+ // if it implies true or false.
+ auto Op0 = lookupOperandLeader(CI->getOperand(0));
+ auto Op1 = lookupOperandLeader(CI->getOperand(1));
+ auto OurPredicate = CI->getPredicate();
+ if (shouldSwapOperands(Op0, Op1)) {
+ std::swap(Op0, Op1);
+ OurPredicate = CI->getSwappedPredicate();
+ }
+
+ // Avoid processing the same info twice.
+ const PredicateBase *LastPredInfo = nullptr;
+ // See if we know something about the comparison itself, like it is the target
+ // of an assume.
+ auto *CmpPI = PredInfo->getPredicateInfoFor(I);
+ if (dyn_cast_or_null<PredicateAssume>(CmpPI))
+ return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+
+ if (Op0 == Op1) {
+ // This condition does not depend on predicates, no need to add users
+ if (CI->isTrueWhenEqual())
+ return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+ else if (CI->isFalseWhenEqual())
+ return createConstantExpression(ConstantInt::getFalse(CI->getType()));
+ }
+
+ // NOTE: Because we are comparing both operands here and below, and using
+ // previous comparisons, we rely on the fact that predicateinfo knows to mark
+ // comparisons that use renamed operands as users of the earlier comparisons.
+ // It is *not* enough to just mark predicateinfo renamed operands as users of
+ // the earlier comparisons, because the *other* operand may have changed in a
+ // previous iteration.
+ // Example:
+ // icmp slt %a, %b
+ // %b.0 = ssa.copy(%b)
+ // false branch:
+ // icmp slt %c, %b.0
+
+ // %c and %a may start out equal, and thus, the code below will say the
+ // second icmp is false. %c may become equal to something else, and in that
+ // case the second icmp *must* be reexamined, but would not be if only the
+ // renamed operands are considered users of the icmp.
+
+ // *Currently* we only check one level of comparisons back, and only mark one
+ // level back as touched when changes happen. If you modify this code to look
+ // back farther through comparisons, you *must* mark the appropriate
+ // comparisons as users in PredicateInfo.cpp, or you will cause bugs. See if
+ // we know something just from the operands themselves.
+
+ // See if our operands have predicate info, so that we may be able to derive
+ // something from a previous comparison.
+ for (const auto &Op : CI->operands()) {
+ auto *PI = PredInfo->getPredicateInfoFor(Op);
+ if (const auto *PBranch = dyn_cast_or_null<PredicateBranch>(PI)) {
+ if (PI == LastPredInfo)
+ continue;
+ LastPredInfo = PI;
+ // In phi of ops cases, we may have predicate info that we are evaluating
+ // in a different context.
+ if (!DT->dominates(PBranch->To, getBlockForValue(I)))
+ continue;
+ // TODO: Along the false edge, we may know more things too, like
+ // icmp of
+ // same operands is false.
+ // TODO: We only handle actual comparison conditions below, not
+ // and/or.
+ auto *BranchCond = dyn_cast<CmpInst>(PBranch->Condition);
+ if (!BranchCond)
+ continue;
+ auto *BranchOp0 = lookupOperandLeader(BranchCond->getOperand(0));
+ auto *BranchOp1 = lookupOperandLeader(BranchCond->getOperand(1));
+ auto BranchPredicate = BranchCond->getPredicate();
+ if (shouldSwapOperands(BranchOp0, BranchOp1)) {
+ std::swap(BranchOp0, BranchOp1);
+ BranchPredicate = BranchCond->getSwappedPredicate();
+ }
+ if (BranchOp0 == Op0 && BranchOp1 == Op1) {
+ if (PBranch->TrueEdge) {
+ // If we know the previous predicate is true and we are in the true
+ // edge then we may be implied true or false.
+ if (CmpInst::isImpliedTrueByMatchingCmp(BranchPredicate,
+ OurPredicate)) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(
+ ConstantInt::getTrue(CI->getType()));
+ }
+
+ if (CmpInst::isImpliedFalseByMatchingCmp(BranchPredicate,
+ OurPredicate)) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(
+ ConstantInt::getFalse(CI->getType()));
+ }
+ } else {
+ // Just handle the ne and eq cases, where if we have the same
+ // operands, we may know something.
+ if (BranchPredicate == OurPredicate) {
+ addPredicateUsers(PI, I);
+ // Same predicate, same ops, we know it was false, so this is false.
+ return createConstantExpression(
+ ConstantInt::getFalse(CI->getType()));
+ } else if (BranchPredicate ==
+ CmpInst::getInversePredicate(OurPredicate)) {
+ addPredicateUsers(PI, I);
+ // Inverse predicate, we know the other was false, so this is true.
+ return createConstantExpression(
+ ConstantInt::getTrue(CI->getType()));
+ }
+ }
+ }
+ }
+ }
+ // createExpression will take care of SimplifyCmpInst.
+ return createExpression(I);
+}
+
+// Substitute and symbolize the value before value numbering.
+const Expression *
+NewGVN::performSymbolicEvaluation(Value *V,
+ SmallPtrSetImpl<Value *> &Visited) const {
+ const Expression *E = nullptr;
+ if (auto *C = dyn_cast<Constant>(V))
+ E = createConstantExpression(C);
+ else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
+ E = createVariableExpression(V);
+ } else {
+ // TODO: memory intrinsics.
+ // TODO: Some day, we should do the forward propagation and reassociation
+ // parts of the algorithm.
+ auto *I = cast<Instruction>(V);
+ switch (I->getOpcode()) {
+ case Instruction::ExtractValue:
+ case Instruction::InsertValue:
+ E = performSymbolicAggrValueEvaluation(I);
+ break;
+ case Instruction::PHI: {
+ SmallVector<ValPair, 3> Ops;
+ auto *PN = cast<PHINode>(I);
+ for (unsigned i = 0; i < PN->getNumOperands(); ++i)
+ Ops.push_back({PN->getIncomingValue(i), PN->getIncomingBlock(i)});
+ // Sort to ensure the invariant createPHIExpression requires is met.
+ sortPHIOps(Ops);
+ E = performSymbolicPHIEvaluation(Ops, I, getBlockForValue(I));
+ } break;
+ case Instruction::Call:
+ E = performSymbolicCallEvaluation(I);
+ break;
+ case Instruction::Store:
+ E = performSymbolicStoreEvaluation(I);
+ break;
+ case Instruction::Load:
+ E = performSymbolicLoadEvaluation(I);
+ break;
+ case Instruction::BitCast:
+ E = createExpression(I);
+ break;
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ E = performSymbolicCmpEvaluation(I);
+ break;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::Select:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::ShuffleVector:
+ case Instruction::GetElementPtr:
+ E = createExpression(I);
+ break;
+ default:
+ return nullptr;
+ }
+ }
+ return E;
+}
+
+// Look up a container in a map, and then call a function for each thing in the
+// found container.
+template <typename Map, typename KeyType, typename Func>
+void NewGVN::for_each_found(Map &M, const KeyType &Key, Func F) {
+ const auto Result = M.find_as(Key);
+ if (Result != M.end())
+ for (typename Map::mapped_type::value_type Mapped : Result->second)
+ F(Mapped);
+}
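+// A minimal usage sketch (not from this file; the map, key, and lambda below
+// are illustrative assumptions): touch every phi-of-ops instruction recorded
+// for an expression without erasing the map entry, e.g.
+//   for_each_found(ExpressionToPhiOfOps, E, [&](Instruction *Dep) {
+//     TouchedInstructions.set(InstrToDFSNum(Dep));
+//   });
+// touchAndErase below performs the same walk but also removes the key
+// afterwards.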
+
+// Look up a container of values/instructions in a map, and touch all the
+// instructions in the container. Then erase the value from the map.
+template <typename Map, typename KeyType>
+void NewGVN::touchAndErase(Map &M, const KeyType &Key) {
+ const auto Result = M.find_as(Key);
+ if (Result != M.end()) {
+ for (const typename Map::mapped_type::value_type Mapped : Result->second)
+ TouchedInstructions.set(InstrToDFSNum(Mapped));
+ M.erase(Result);
+ }
+}
+
+void NewGVN::addAdditionalUsers(Value *To, Value *User) const {
+ assert(User && To != User);
+ if (isa<Instruction>(To))
+ AdditionalUsers[To].insert(User);
+}
+
+void NewGVN::markUsersTouched(Value *V) {
+ // Now mark the users as touched.
+ for (auto *User : V->users()) {
+ assert(isa<Instruction>(User) && "Use of value not within an instruction?");
+ TouchedInstructions.set(InstrToDFSNum(User));
+ }
+ touchAndErase(AdditionalUsers, V);
+}
+
+void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
+ DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
+ MemoryToUsers[To].insert(U);
+}
+
+void NewGVN::markMemoryDefTouched(const MemoryAccess *MA) {
+ TouchedInstructions.set(MemoryToDFSNum(MA));
+}
+
+void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
+ if (isa<MemoryUse>(MA))
+ return;
+ for (auto U : MA->users())
+ TouchedInstructions.set(MemoryToDFSNum(U));
+ touchAndErase(MemoryToUsers, MA);
+}
+
+// Add I to the set of users of a given predicate.
+void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const {
+ // Don't add temporary instructions to the user lists.
+ if (AllTempInstructions.count(I))
+ return;
+
+ if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
+ PredicateToUsers[PBranch->Condition].insert(I);
+ else if (auto *PAssume = dyn_cast<PredicateAssume>(PB))
+ PredicateToUsers[PAssume->Condition].insert(I);
+}
+
+// Touch all the predicates that depend on this instruction.
+void NewGVN::markPredicateUsersTouched(Instruction *I) {
+ touchAndErase(PredicateToUsers, I);
+}
+
+// Mark users affected by a memory leader change.
+void NewGVN::markMemoryLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : CC->memory())
+ markMemoryDefTouched(M);
+}
+
+// Touch the instructions that need to be updated after a congruence class has a
+// leader change, and mark changed values.
+void NewGVN::markValueLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : *CC) {
+ if (auto *I = dyn_cast<Instruction>(M))
+ TouchedInstructions.set(InstrToDFSNum(I));
+ LeaderChanges.insert(M);
+ }
+}
+
+// Given a range of things that have instruction DFS numbers, this will return
+// the member of the range with the smallest DFS number.
+template <class T, class Range>
+T *NewGVN::getMinDFSOfRange(const Range &R) const {
+ std::pair<T *, unsigned> MinDFS = {nullptr, ~0U};
+ for (const auto X : R) {
+ auto DFSNum = InstrToDFSNum(X);
+ if (DFSNum < MinDFS.second)
+ MinDFS = {X, DFSNum};
+ }
+ return MinDFS.first;
+}
+
+// This function returns the MemoryAccess that should be the next leader of
+// congruence class CC, under the assumption that the current leader is going to
+// disappear.
+const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
+ // TODO: If this ends up being too slow, we can maintain a next memory leader like we
+ // do for regular leaders.
+ // Make sure there will be a leader to find.
+ assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
+ if (CC->getStoreCount() > 0) {
+ if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
+ return getMemoryAccess(NL);
+ // Find the store with the minimum DFS number.
+ auto *V = getMinDFSOfRange<Value>(make_filter_range(
+ *CC, [&](const Value *V) { return isa<StoreInst>(V); }));
+ return getMemoryAccess(cast<StoreInst>(V));
+ }
+ assert(CC->getStoreCount() == 0);
+
+ // Given our assertion, hitting this part must mean
+ // !CC->memory_empty().
+ if (CC->memory_size() == 1)
+ return *CC->memory_begin();
+ return getMinDFSOfRange<const MemoryPhi>(CC->memory());
+}
+
+// This function returns the next value leader of a congruence class, under the
+// assumption that the current leader is going away. This should end up being
+// the next most dominating member.
+Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const {
+ // We don't need to sort members if there is only 1, and we don't care about
+ // sorting the TOP class because everything either gets out of it or is
+ // unreachable.
+
+ if (CC->size() == 1 || CC == TOPClass) {
+ return *(CC->begin());
+ } else if (CC->getNextLeader().first) {
+ ++NumGVNAvoidedSortedLeaderChanges;
+ return CC->getNextLeader().first;
+ } else {
+ ++NumGVNSortedLeaderChanges;
+ // NOTE: If this ends up being too slow, we can maintain a dual structure for
+ // member testing/insertion, or keep things mostly sorted, and sort only
+ // here, or use SparseBitVector or ....
+ return getMinDFSOfRange<Value>(*CC);
+ }
+}
+
+// Move a MemoryAccess, currently in OldClass, to NewClass, including updates to
+// the memory members, etc for the move.
+//
+// The invariants of this function are:
+//
+// - I must be moving to NewClass from OldClass
+// - The StoreCount of OldClass and NewClass is expected to have been updated
+// for I already if it is a store.
+// - The OldClass memory leader has not been updated yet if I was the leader.
+void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
+ MemoryAccess *InstMA,
+ CongruenceClass *OldClass,
+ CongruenceClass *NewClass) {
+ // If the leader is I, and we had a representative MemoryAccess, it should
+ // be the MemoryAccess of OldClass.
+ assert((!InstMA || !OldClass->getMemoryLeader() ||
+ OldClass->getLeader() != I ||
+ MemoryAccessToClass.lookup(OldClass->getMemoryLeader()) ==
+ MemoryAccessToClass.lookup(InstMA)) &&
+ "Representative MemoryAccess mismatch");
+ // First, see what happens to the new class
+ if (!NewClass->getMemoryLeader()) {
+ // Should be a new class, or a store becoming a leader of a new class.
+ assert(NewClass->size() == 1 ||
+ (isa<StoreInst>(I) && NewClass->getStoreCount() == 1));
+ NewClass->setMemoryLeader(InstMA);
+ // Mark it touched if we didn't just create a singleton
+ DEBUG(dbgs() << "Memory class leader change for class " << NewClass->getID()
+ << " due to new memory instruction becoming leader\n");
+ markMemoryLeaderChangeTouched(NewClass);
+ }
+ setMemoryClass(InstMA, NewClass);
+ // Now, fixup the old class if necessary
+ if (OldClass->getMemoryLeader() == InstMA) {
+ if (!OldClass->definesNoMemory()) {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of old leader " << *InstMA << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ } else
+ OldClass->setMemoryLeader(nullptr);
+ }
+}
+
+// Move a value, currently in OldClass, to be part of NewClass
+// Update OldClass and NewClass for the move (including changing leaders, etc).
+void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
+ CongruenceClass *OldClass,
+ CongruenceClass *NewClass) {
+ if (I == OldClass->getNextLeader().first)
+ OldClass->resetNextLeader();
+
+ OldClass->erase(I);
+ NewClass->insert(I);
+
+ if (NewClass->getLeader() != I)
+ NewClass->addPossibleNextLeader({I, InstrToDFSNum(I)});
+ // Handle our special casing of stores.
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ OldClass->decStoreCount();
+ // Okay, so when do we want to make a store a leader of a class?
+ // If we have a store defined by an earlier load, we want the earlier load
+ // to lead the class.
+ // If we have a store defined by something else, we want the store to lead
+ // the class so everything else gets the "something else" as a value.
+ // If we have a store as the single member of the class, we want the store
+ // as the leader
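+ // Illustrative sketch of the first rule above (the IR names are made up, not
+ // taken from the source):
+ //   %v = load i32, i32* %p
+ //   store i32 %v, i32* %p    ; store defined by the earlier load
+ // Here the load should lead the class, so the store (which writes back what
+ // is already in memory) can be value numbered to it and later eliminated.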
+ if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {
+ // If it's a store expression we are using, it means we are not equivalent
+ // to something earlier.
+ if (auto *SE = dyn_cast<StoreExpression>(E)) {
+ NewClass->setStoredValue(SE->getStoredValue());
+ markValueLeaderChangeTouched(NewClass);
+ // Shift the new class leader to be the store
+ DEBUG(dbgs() << "Changing leader of congruence class "
+ << NewClass->getID() << " from " << *NewClass->getLeader()
+ << " to " << *SI << " because store joined class\n");
+ // If we changed the leader, we have to mark it changed because we don't
+ // know what it will do to symbolic evaluation.
+ NewClass->setLeader(SI);
+ }
+ // We rely on the code below handling the MemoryAccess change.
+ }
+ NewClass->incStoreCount();
+ }
+ // True if there are no memory instructions left in a class that had memory
+ // instructions before.
+
+ // If it's not a memory use, set the MemoryAccess equivalence
+ auto *InstMA = dyn_cast_or_null<MemoryDef>(getMemoryAccess(I));
+ if (InstMA)
+ moveMemoryToNewCongruenceClass(I, InstMA, OldClass, NewClass);
+ ValueToClass[I] = NewClass;
+ // See if we destroyed the class or need to swap leaders.
+ if (OldClass->empty() && OldClass != TOPClass) {
+ if (OldClass->getDefiningExpr()) {
+ DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
+ << " from table\n");
+ // We erase it as an exact expression to make sure we don't just erase an
+ // equivalent one.
+ auto Iter = ExpressionToClass.find_as(
+ ExactEqualsExpression(*OldClass->getDefiningExpr()));
+ if (Iter != ExpressionToClass.end())
+ ExpressionToClass.erase(Iter);
+#ifdef EXPENSIVE_CHECKS
+ assert(
+ (*OldClass->getDefiningExpr() != *E || ExpressionToClass.lookup(E)) &&
+ "We erased the expression we just inserted, which should not happen");
+#endif
+ }
+ } else if (OldClass->getLeader() == I) {
+ // When the leader changes, the value numbering of
+ // everything may change due to symbolization changes, so we need to
+ // reprocess.
+ DEBUG(dbgs() << "Value class leader change for class " << OldClass->getID()
+ << "\n");
+ ++NumGVNLeaderChanges;
+ // Destroy the stored value if there are no more stores to represent it.
+ // Note that this is basically clean up for the expression removal that
+ // happens below. If we remove stores from a class, we may leave it as a
+ // class of equivalent memory phis.
+ if (OldClass->getStoreCount() == 0) {
+ if (OldClass->getStoredValue())
+ OldClass->setStoredValue(nullptr);
+ }
+ OldClass->setLeader(getNextValueLeader(OldClass));
+ OldClass->resetNextLeader();
+ markValueLeaderChangeTouched(OldClass);
+ }
+}
+
+// For a given expression, mark the phi of ops instructions that could have
+// changed as a result.
+void NewGVN::markPhiOfOpsChanged(const Expression *E) {
+ touchAndErase(ExpressionToPhiOfOps, E);
+}
+
+// Perform congruence finding on a given value numbering expression.
+void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
+ // This is guaranteed to return something, since it will at least find
+ // TOP.
+
+ CongruenceClass *IClass = ValueToClass.lookup(I);
+ assert(IClass && "Should have found an IClass");
+ // Dead classes should have been eliminated from the mapping.
+ assert(!IClass->isDead() && "Found a dead class");
+
+ CongruenceClass *EClass = nullptr;
+ if (const auto *VE = dyn_cast<VariableExpression>(E)) {
+ EClass = ValueToClass.lookup(VE->getVariableValue());
+ } else if (isa<DeadExpression>(E)) {
+ EClass = TOPClass;
+ }
+ if (!EClass) {
+ auto lookupResult = ExpressionToClass.insert({E, nullptr});
+
+ // If it's not in the value table, create a new congruence class.
+ if (lookupResult.second) {
+ CongruenceClass *NewClass = createCongruenceClass(nullptr, E);
+ auto place = lookupResult.first;
+ place->second = NewClass;
+
+ // Constants and variables should always be made the leader.
+ if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
+ NewClass->setLeader(CE->getConstantValue());
+ } else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
+ StoreInst *SI = SE->getStoreInst();
+ NewClass->setLeader(SI);
+ NewClass->setStoredValue(SE->getStoredValue());
+ // The memory leader will be filled in properly by the
+ // moveValueToNewCongruenceClass call.
+ } else {
+ NewClass->setLeader(I);
+ }
+ assert(!isa<VariableExpression>(E) &&
+ "VariableExpression should have been handled already");
+
+ EClass = NewClass;
+ DEBUG(dbgs() << "Created new congruence class for " << *I
+ << " using expression " << *E << " at " << NewClass->getID()
+ << " and leader " << *(NewClass->getLeader()));
+ if (NewClass->getStoredValue())
+ DEBUG(dbgs() << " and stored value " << *(NewClass->getStoredValue()));
+ DEBUG(dbgs() << "\n");
+ } else {
+ EClass = lookupResult.first->second;
+ if (isa<ConstantExpression>(E))
+ assert((isa<Constant>(EClass->getLeader()) ||
+ (EClass->getStoredValue() &&
+ isa<Constant>(EClass->getStoredValue()))) &&
+ "Any class with a constant expression should have a "
+ "constant leader");
+
+ assert(EClass && "Somehow don't have an eclass");
+
+ assert(!EClass->isDead() && "We accidentally looked up a dead class");
+ }
+ }
+ bool ClassChanged = IClass != EClass;
+ bool LeaderChanged = LeaderChanges.erase(I);
+ if (ClassChanged || LeaderChanged) {
+ DEBUG(dbgs() << "New class " << EClass->getID() << " for expression " << *E
+ << "\n");
+ if (ClassChanged) {
+ moveValueToNewCongruenceClass(I, E, IClass, EClass);
+ markPhiOfOpsChanged(E);
+ }
+
+ markUsersTouched(I);
+ if (MemoryAccess *MA = getMemoryAccess(I))
+ markMemoryUsersTouched(MA);
+ if (auto *CI = dyn_cast<CmpInst>(I))
+ markPredicateUsersTouched(CI);
+ }
+ // If we changed the class of the store, we want to ensure nothing finds the
+ // old store expression. In particular, loads do not compare against the
+ // stored value, so they will find old store expressions (and associated class
+ // mappings) if we leave them in the table.
+ if (ClassChanged && isa<StoreInst>(I)) {
+ auto *OldE = ValueToExpression.lookup(I);
+ // It could just be that the old class died. We don't want to erase it if we
+ // just moved classes.
+ if (OldE && isa<StoreExpression>(OldE) && *E != *OldE) {
+ // Erase this as an exact expression to ensure we don't erase expressions
+ // equivalent to it.
+ auto Iter = ExpressionToClass.find_as(ExactEqualsExpression(*OldE));
+ if (Iter != ExpressionToClass.end())
+ ExpressionToClass.erase(Iter);
+ }
+ }
+ ValueToExpression[I] = E;
+}
+
+// Process the fact that Edge (from, to) is reachable, including marking
+// any newly reachable blocks and instructions for processing.
+void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
+ // Check if the Edge was reachable before.
+ if (ReachableEdges.insert({From, To}).second) {
+ // If this block wasn't reachable before, all instructions are touched.
+ if (ReachableBlocks.insert(To).second) {
+ DEBUG(dbgs() << "Block " << getBlockName(To) << " marked reachable\n");
+ const auto &InstRange = BlockInstRange.lookup(To);
+ TouchedInstructions.set(InstRange.first, InstRange.second);
+ } else {
+ DEBUG(dbgs() << "Block " << getBlockName(To)
+ << " was reachable, but new edge {" << getBlockName(From)
+ << "," << getBlockName(To) << "} to it found\n");
+
+ // We've made an edge reachable to an existing block, which may
+ // impact predicates. Beyond that, we only mark the phi nodes as touched,
+ // as they are the only things that depend on new edges. Anything using
+ // their values will get propagated to if necessary.
+ if (MemoryAccess *MemPhi = getMemoryAccess(To))
+ TouchedInstructions.set(InstrToDFSNum(MemPhi));
+
+ // FIXME: We should just add a union op on a Bitvector and
+ // SparseBitVector. We can do it word by word faster than we are doing it
+ // here.
+ for (auto InstNum : RevisitOnReachabilityChange[To])
+ TouchedInstructions.set(InstNum);
+ }
+ }
+}
+
+// Given a predicate condition (from a switch, cmp, or whatever) and a block,
+// see if we know some constant value for it already.
+Value *NewGVN::findConditionEquivalence(Value *Cond) const {
+ auto Result = lookupOperandLeader(Cond);
+ return isa<Constant>(Result) ? Result : nullptr;
+}
+
+// Process the outgoing edges of a block for reachability.
+void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
+ // Evaluate reachability of terminator instruction.
+ BranchInst *BR;
+ if ((BR = dyn_cast<BranchInst>(TI)) && BR->isConditional()) {
+ Value *Cond = BR->getCondition();
+ Value *CondEvaluated = findConditionEquivalence(Cond);
+ if (!CondEvaluated) {
+ if (auto *I = dyn_cast<Instruction>(Cond)) {
+ const Expression *E = createExpression(I);
+ if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
+ CondEvaluated = CE->getConstantValue();
+ }
+ } else if (isa<ConstantInt>(Cond)) {
+ CondEvaluated = Cond;
+ }
+ }
+ ConstantInt *CI;
+ BasicBlock *TrueSucc = BR->getSuccessor(0);
+ BasicBlock *FalseSucc = BR->getSuccessor(1);
+ if (CondEvaluated && (CI = dyn_cast<ConstantInt>(CondEvaluated))) {
+ if (CI->isOne()) {
+ DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to true\n");
+ updateReachableEdge(B, TrueSucc);
+ } else if (CI->isZero()) {
+ DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to false\n");
+ updateReachableEdge(B, FalseSucc);
+ }
+ } else {
+ updateReachableEdge(B, TrueSucc);
+ updateReachableEdge(B, FalseSucc);
+ }
+ } else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
+ // For switches, propagate the case values into the case
+ // destinations.
+
+ // Remember how many outgoing edges there are to every successor.
+ SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+
+ Value *SwitchCond = SI->getCondition();
+ Value *CondEvaluated = findConditionEquivalence(SwitchCond);
+ // See if we were able to turn this switch statement into a constant.
+ if (CondEvaluated && isa<ConstantInt>(CondEvaluated)) {
+ auto *CondVal = cast<ConstantInt>(CondEvaluated);
+ // We should be able to get case value for this.
+ auto Case = *SI->findCaseValue(CondVal);
+ if (Case.getCaseSuccessor() == SI->getDefaultDest()) {
+ // We proved the value is outside of the range of the case.
+ // We can't do anything other than mark the default dest as reachable,
+ // and go home.
+ updateReachableEdge(B, SI->getDefaultDest());
+ return;
+ }
+ // Now get where it goes and mark it reachable.
+ BasicBlock *TargetBlock = Case.getCaseSuccessor();
+ updateReachableEdge(B, TargetBlock);
+ } else {
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *TargetBlock = SI->getSuccessor(i);
+ ++SwitchEdges[TargetBlock];
+ updateReachableEdge(B, TargetBlock);
+ }
+ }
+ } else {
+ // Otherwise this is either unconditional, or a type we have no
+ // idea about. Just mark successors as reachable.
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *TargetBlock = TI->getSuccessor(i);
+ updateReachableEdge(B, TargetBlock);
+ }
+
+ // This also may be a memory defining terminator, in which case, set it
+ // equivalent only to itself.
+ //
+ auto *MA = getMemoryAccess(TI);
+ if (MA && !isa<MemoryUse>(MA)) {
+ auto *CC = ensureLeaderOfMemoryClass(MA);
+ if (setMemoryClass(MA, CC))
+ markMemoryUsersTouched(MA);
+ }
+ }
+}
+
+// Remove the PHI of Ops PHI for I
+void NewGVN::removePhiOfOps(Instruction *I, PHINode *PHITemp) {
+ InstrDFS.erase(PHITemp);
+ // It's still a temp instruction. We keep it in the array so it gets erased.
+ // However, it's no longer used by I, or in the block
+ TempToBlock.erase(PHITemp);
+ RealToTemp.erase(I);
+ // We don't remove the users from the phi node uses. This wastes a little
+ // time, but such is life. We could use two sets to track which were there
+ // at the start of NewGVN, and which were added, but right now the cost of
+ // tracking is more than the cost of checking for more phi of ops.
+}
+
+// Add PHI Op in BB as a PHI of operations version of ExistingValue.
+void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB,
+ Instruction *ExistingValue) {
+ InstrDFS[Op] = InstrToDFSNum(ExistingValue);
+ AllTempInstructions.insert(Op);
+ TempToBlock[Op] = BB;
+ RealToTemp[ExistingValue] = Op;
+ // Add all users to PHINodeUses, as they are now uses of the phi of ops phis
+ // and may themselves be phi of ops.
+ for (auto *U : ExistingValue->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ PHINodeUses.insert(UI);
+}
+
+static bool okayForPHIOfOps(const Instruction *I) {
+ if (!EnablePhiOfOps)
+ return false;
+ return isa<BinaryOperator>(I) || isa<SelectInst>(I) || isa<CmpInst>(I) ||
+ isa<LoadInst>(I);
+}
+
+bool NewGVN::OpIsSafeForPHIOfOpsHelper(
+ Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited,
+ SmallVectorImpl<Instruction *> &Worklist) {
+
+ if (!isa<Instruction>(V))
+ return true;
+ auto OISIt = OpSafeForPHIOfOps.find(V);
+ if (OISIt != OpSafeForPHIOfOps.end())
+ return OISIt->second;
+
+ // Keep walking until we either dominate the phi block, or hit a phi, or run
+ // out of things to check.
+ if (DT->properlyDominates(getBlockForValue(V), PHIBlock)) {
+ OpSafeForPHIOfOps.insert({V, true});
+ return true;
+ }
+ // PHI in the same block.
+ if (isa<PHINode>(V) && getBlockForValue(V) == PHIBlock) {
+ OpSafeForPHIOfOps.insert({V, false});
+ return false;
+ }
+
+ auto *OrigI = cast<Instruction>(V);
+ for (auto *Op : OrigI->operand_values()) {
+ if (!isa<Instruction>(Op))
+ continue;
+ // Stop now if we find an unsafe operand.
+ auto OISIt = OpSafeForPHIOfOps.find(OrigI);
+ if (OISIt != OpSafeForPHIOfOps.end()) {
+ if (!OISIt->second) {
+ OpSafeForPHIOfOps.insert({V, false});
+ return false;
+ }
+ continue;
+ }
+ if (!Visited.insert(Op).second)
+ continue;
+ Worklist.push_back(cast<Instruction>(Op));
+ }
+ return true;
+}
+
+// Return true if this operand will be safe to use for phi of ops.
+//
+// The reason some operands are unsafe is that we are not trying to recursively
+// translate everything back through phi nodes. We actually expect some lookups
+// of expressions to fail; in particular, lookups where the expression cannot
+// exist in the predecessor. This is true even if the expression, as written,
+// could be determined to be constant.
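+// A hedged illustration (the block and value names are hypothetical):
+//   pred:
+//     br label %merge
+//   merge:
+//     %p = phi i32 [ 0, %pred ], [ %q, %other ]
+//     %y = add i32 %p, 1
+// Translating %y into %pred asks for a leader of `add i32 0, 1`. If no such
+// expression was ever value numbered in %pred, the lookup fails, even though
+// the translated expression obviously folds to the constant 1.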
+bool NewGVN::OpIsSafeForPHIOfOps(Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited) {
+ SmallVector<Instruction *, 4> Worklist;
+ if (!OpIsSafeForPHIOfOpsHelper(V, PHIBlock, Visited, Worklist))
+ return false;
+ while (!Worklist.empty()) {
+ auto *I = Worklist.pop_back_val();
+ if (!OpIsSafeForPHIOfOpsHelper(I, PHIBlock, Visited, Worklist))
+ return false;
+ }
+ OpSafeForPHIOfOps.insert({V, true});
+ return true;
+}
+
+// Try to find a leader for instruction TransInst, which is a phi translated
+// version of something in our original program. Visited is used to ensure we
+// don't infinite loop during translations of cycles. OrigInst is the
+// instruction in the original program, and PredBB is the predecessor we
+// translated it through.
+Value *NewGVN::findLeaderForInst(Instruction *TransInst,
+ SmallPtrSetImpl<Value *> &Visited,
+ MemoryAccess *MemAccess, Instruction *OrigInst,
+ BasicBlock *PredBB) {
+ unsigned IDFSNum = InstrToDFSNum(OrigInst);
+ // Make sure it's marked as a temporary instruction.
+ AllTempInstructions.insert(TransInst);
+ // and make sure anything that tries to add its DFS number is
+ // redirected to the instruction we are making a phi of ops
+ // for.
+ TempToBlock.insert({TransInst, PredBB});
+ InstrDFS.insert({TransInst, IDFSNum});
+
+ const Expression *E = performSymbolicEvaluation(TransInst, Visited);
+ InstrDFS.erase(TransInst);
+ AllTempInstructions.erase(TransInst);
+ TempToBlock.erase(TransInst);
+ if (MemAccess)
+ TempToMemory.erase(TransInst);
+ if (!E)
+ return nullptr;
+ auto *FoundVal = findPHIOfOpsLeader(E, OrigInst, PredBB);
+ if (!FoundVal) {
+ ExpressionToPhiOfOps[E].insert(OrigInst);
+ DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst
+ << " in block " << getBlockName(PredBB) << "\n");
+ return nullptr;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(FoundVal))
+ FoundVal = SI->getValueOperand();
+ return FoundVal;
+}
+
+// When we see an instruction that is an op of phis, generate the equivalent phi
+// of ops form.
+const Expression *
+NewGVN::makePossiblePHIOfOps(Instruction *I,
+ SmallPtrSetImpl<Value *> &Visited) {
+ if (!okayForPHIOfOps(I))
+ return nullptr;
+
+ if (!Visited.insert(I).second)
+ return nullptr;
+ // For now, we require the instruction be cycle free because we don't
+ // *always* create a phi of ops for instructions that could be done as phi
+ // of ops; we only do it if we think it is useful. If we did it all the
+ // time, we could remove the cycle-free check.
+ if (!isCycleFree(I))
+ return nullptr;
+
+ SmallPtrSet<const Value *, 8> ProcessedPHIs;
+ // TODO: We don't do phi translation on memory accesses because it's
+ // complicated. For a load, we'd need to be able to simulate a new memoryuse,
+ // which we don't have a good way of doing ATM.
+ auto *MemAccess = getMemoryAccess(I);
+ // If the memory operation is defined by a memory operation in this block that
+ // isn't a MemoryPhi, transforming the pointer backwards through a scalar phi
+ // can't help, as it would still be killed by that memory operation.
+ if (MemAccess && !isa<MemoryPhi>(MemAccess->getDefiningAccess()) &&
+ MemAccess->getDefiningAccess()->getBlock() == I->getParent())
+ return nullptr;
+
+ SmallPtrSet<const Value *, 10> VisitedOps;
+ // Convert op of phis to phi of ops
+ for (auto *Op : I->operand_values()) {
+ if (!isa<PHINode>(Op)) {
+ auto *ValuePHI = RealToTemp.lookup(Op);
+ if (!ValuePHI)
+ continue;
+ DEBUG(dbgs() << "Found possible dependent phi of ops\n");
+ Op = ValuePHI;
+ }
+ auto *OpPHI = cast<PHINode>(Op);
+ // No point in doing this for one-operand phis.
+ if (OpPHI->getNumOperands() == 1)
+ continue;
+ if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
+ return nullptr;
+ SmallVector<ValPair, 4> Ops;
+ SmallPtrSet<Value *, 4> Deps;
+ auto *PHIBlock = getBlockForValue(OpPHI);
+ RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I));
+ for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) {
+ auto *PredBB = OpPHI->getIncomingBlock(PredNum);
+ Value *FoundVal = nullptr;
+ // We could just skip unreachable edges entirely but it's tricky to do
+ // with rewriting existing phi nodes.
+ if (ReachableEdges.count({PredBB, PHIBlock})) {
+ // Clone the instruction, create an expression from it that is
+ // translated back into the predecessor, and see if we have a leader.
+ Instruction *ValueOp = I->clone();
+ if (MemAccess)
+ TempToMemory.insert({ValueOp, MemAccess});
+ bool SafeForPHIOfOps = true;
+ VisitedOps.clear();
+ for (auto &Op : ValueOp->operands()) {
+ auto *OrigOp = &*Op;
+ // When these operands change, it could change whether there is a
+ // leader for us or not, so we have to add additional users.
+ if (isa<PHINode>(Op)) {
+ Op = Op->DoPHITranslation(PHIBlock, PredBB);
+ if (Op != OrigOp && Op != I)
+ Deps.insert(Op);
+ } else if (auto *ValuePHI = RealToTemp.lookup(Op)) {
+ if (getBlockForValue(ValuePHI) == PHIBlock)
+ Op = ValuePHI->getIncomingValueForBlock(PredBB);
+ }
+ // If we phi-translated the op, it must be safe.
+ SafeForPHIOfOps =
+ SafeForPHIOfOps &&
+ (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps));
+ }
+ // FIXME: For those things that are not safe we could generate
+ // expressions all the way down, and see if this comes out to a
+ // constant. For anything where that is true, and unsafe, we should
+ // have made a phi-of-ops (or value numbered it equivalent to something)
+ // for the pieces already.
+ FoundVal = !SafeForPHIOfOps ? nullptr
+ : findLeaderForInst(ValueOp, Visited,
+ MemAccess, I, PredBB);
+ ValueOp->deleteValue();
+ if (!FoundVal)
+ return nullptr;
+ } else {
+ DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
+ << getBlockName(PredBB)
+ << " because the block is unreachable\n");
+ FoundVal = UndefValue::get(I->getType());
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
+ }
+
+ Ops.push_back({FoundVal, PredBB});
+ DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
+ << getBlockName(PredBB) << "\n");
+ }
+ for (auto Dep : Deps)
+ addAdditionalUsers(Dep, I);
+ sortPHIOps(Ops);
+ auto *E = performSymbolicPHIEvaluation(Ops, I, PHIBlock);
+ if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) {
+ DEBUG(dbgs()
+ << "Not creating real PHI of ops because it simplified to existing "
+ "value or constant\n");
+ return E;
+ }
+ auto *ValuePHI = RealToTemp.lookup(I);
+ bool NewPHI = false;
+ if (!ValuePHI) {
+ ValuePHI =
+ PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops");
+ addPhiOfOps(ValuePHI, PHIBlock, I);
+ NewPHI = true;
+ NumGVNPHIOfOpsCreated++;
+ }
+ if (NewPHI) {
+ for (auto PHIOp : Ops)
+ ValuePHI->addIncoming(PHIOp.first, PHIOp.second);
+ } else {
+ unsigned int i = 0;
+ for (auto PHIOp : Ops) {
+ ValuePHI->setIncomingValue(i, PHIOp.first);
+ ValuePHI->setIncomingBlock(i, PHIOp.second);
+ ++i;
+ }
+ }
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
+ DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
+ << "\n");
+
+ return E;
+ }
+ return nullptr;
+}
+
+// The algorithm initially places the values of the routine in the TOP
+// congruence class. The leader of TOP is the undetermined value `undef`.
+// When the algorithm has finished, values still in TOP are unreachable.
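+// A small hedged sketch of why the optimistic TOP start matters (the IR below
+// is illustrative, not from the source):
+//   %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+//   %j = phi i32 [ 0, %entry ], [ %j.next, %loop ]
+//   %i.next = add i32 %i, 1
+//   %j.next = add i32 %j, 1
+// Starting everything in TOP lets the algorithm assume %i == %j and
+// %i.next == %j.next until proven otherwise, and that assumption is
+// self-consistent, so the two cycles end up congruent. A pessimistic start
+// (each value in its own class) can never discover this, because proving
+// either equivalence requires already believing the other.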
+void NewGVN::initializeCongruenceClasses(Function &F) {
+ NextCongruenceNum = 0;
+
+ // Note that even though we use the live on entry def as a representative
+ // MemoryAccess, it is *not* the same as the actual live on entry def. We
+ // have no real equivalent to undef for MemoryAccesses, and so we really
+ // should be checking whether the MemoryAccess is TOP if we want to know if it
+ // is equivalent to everything. Otherwise, what this really signifies is that
+ // the access "reaches all the way back to the beginning of the function".
+
+ // Initialize all other instructions to be in TOP class.
+ TOPClass = createCongruenceClass(nullptr, nullptr);
+ TOPClass->setMemoryLeader(MSSA->getLiveOnEntryDef());
+ // The live on entry def gets put into its own class.
+ MemoryAccessToClass[MSSA->getLiveOnEntryDef()] =
+ createMemoryClass(MSSA->getLiveOnEntryDef());
+
+ for (auto DTN : nodes(DT)) {
+ BasicBlock *BB = DTN->getBlock();
+ // All MemoryAccesses are equivalent to live on entry to start. They must
+ // be initialized to something so that initial changes are noticed. For
+ // the maximal answer, we initialize them all to be the same as
+ // liveOnEntry.
+ auto *MemoryBlockDefs = MSSA->getBlockDefs(BB);
+ if (MemoryBlockDefs)
+ for (const auto &Def : *MemoryBlockDefs) {
+ MemoryAccessToClass[&Def] = TOPClass;
+ auto *MD = dyn_cast<MemoryDef>(&Def);
+ // Insert the memory phis into the member list.
+ if (!MD) {
+ const MemoryPhi *MP = cast<MemoryPhi>(&Def);
+ TOPClass->memory_insert(MP);
+ MemoryPhiState.insert({MP, MPS_TOP});
+ }
+
+ if (MD && isa<StoreInst>(MD->getMemoryInst()))
+ TOPClass->incStoreCount();
+ }
+
+ // FIXME: This is trying to discover which instructions are uses of phi
+ // nodes. We should move this into one of the myriad of places that walk
+ // all the operands already.
+ for (auto &I : *BB) {
+ if (isa<PHINode>(&I))
+ for (auto *U : I.users())
+ if (auto *UInst = dyn_cast<Instruction>(U))
+ if (InstrToDFSNum(UInst) != 0 && okayForPHIOfOps(UInst))
+ PHINodeUses.insert(UInst);
+ // Don't insert void terminators into the class. We don't value number
+ // them, and they just end up sitting in TOP.
+ if (isa<TerminatorInst>(I) && I.getType()->isVoidTy())
+ continue;
+ TOPClass->insert(&I);
+ ValueToClass[&I] = TOPClass;
+ }
+ }
+
+ // Initialize arguments to be in their own unique congruence classes
+ for (auto &FA : F.args())
+ createSingletonCongruenceClass(&FA);
+}
+
+void NewGVN::cleanupTables() {
+ for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) {
+ DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
+ << " has " << CongruenceClasses[i]->size() << " members\n");
+ // Make sure we delete the congruence class (probably worth switching to
+ // a unique_ptr at some point).
+ delete CongruenceClasses[i];
+ CongruenceClasses[i] = nullptr;
+ }
+
+ // Destroy the value expressions
+ SmallVector<Instruction *, 8> TempInst(AllTempInstructions.begin(),
+ AllTempInstructions.end());
+ AllTempInstructions.clear();
+
+ // We have to drop all references for everything first, so there are no uses
+ // left as we delete them.
+ for (auto *I : TempInst) {
+ I->dropAllReferences();
+ }
+
+ while (!TempInst.empty()) {
+ auto *I = TempInst.back();
+ TempInst.pop_back();
+ I->deleteValue();
+ }
+
+ ValueToClass.clear();
+ ArgRecycler.clear(ExpressionAllocator);
+ ExpressionAllocator.Reset();
+ CongruenceClasses.clear();
+ ExpressionToClass.clear();
+ ValueToExpression.clear();
+ RealToTemp.clear();
+ AdditionalUsers.clear();
+ ExpressionToPhiOfOps.clear();
+ TempToBlock.clear();
+ TempToMemory.clear();
+ PHINodeUses.clear();
+ OpSafeForPHIOfOps.clear();
+ ReachableBlocks.clear();
+ ReachableEdges.clear();
+#ifndef NDEBUG
+ ProcessedCount.clear();
+#endif
+ InstrDFS.clear();
+ InstructionsToErase.clear();
+ DFSToInstr.clear();
+ BlockInstRange.clear();
+ TouchedInstructions.clear();
+ MemoryAccessToClass.clear();
+ PredicateToUsers.clear();
+ MemoryToUsers.clear();
+ RevisitOnReachabilityChange.clear();
+}
+
+// Assign local DFS number mapping to instructions, and leave space for Value
+// PHI's.
+std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
+ unsigned Start) {
+ unsigned End = Start;
+ if (MemoryAccess *MemPhi = getMemoryAccess(B)) {
+ InstrDFS[MemPhi] = End++;
+ DFSToInstr.emplace_back(MemPhi);
+ }
+
+ // Then the real block goes next.
+ for (auto &I : *B) {
+ // There's no need to call isInstructionTriviallyDead more than once on
+ // an instruction. Therefore, once we know that an instruction is dead
+ // we change its DFS number so that it doesn't get value numbered.
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ InstrDFS[&I] = 0;
+ DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
+ markInstructionForDeletion(&I);
+ continue;
+ }
+ if (isa<PHINode>(&I))
+ RevisitOnReachabilityChange[B].set(End);
+ InstrDFS[&I] = End++;
+ DFSToInstr.emplace_back(&I);
+ }
+
+ // All of the range functions take half-open ranges (open on the end side).
+ // So we do not subtract one from End, because at this point it is one
+ // greater than the number of the last instruction.
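+ // For example (the numbers are purely illustrative): with Start == 5, a
+ // MemoryPhi and three live instructions get DFS numbers 5, 6, 7 and 8, and
+ // the returned half-open range is {5, 9}.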
+ return std::make_pair(Start, End);
+}
+
+void NewGVN::updateProcessedCount(const Value *V) {
+#ifndef NDEBUG
+ if (ProcessedCount.count(V) == 0) {
+ ProcessedCount.insert({V, 1});
+ } else {
+ ++ProcessedCount[V];
+ assert(ProcessedCount[V] < 100 &&
+ "Seem to have processed the same Value a lot");
+ }
+#endif
+}
+
+// Evaluate MemoryPhi nodes symbolically, just like PHI nodes
+void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
+ // If all the arguments are the same, the MemoryPhi has the same value as the
+ // argument. Filter out unreachable blocks and self phis from our operands.
+ // TODO: We could do cycle-checking on the memory phis to allow valueizing for
+ // self-phi checking.
+ const BasicBlock *PHIBlock = MP->getBlock();
+ auto Filtered = make_filter_range(MP->operands(), [&](const Use &U) {
+ return cast<MemoryAccess>(U) != MP &&
+ !isMemoryAccessTOP(cast<MemoryAccess>(U)) &&
+ ReachableEdges.count({MP->getIncomingBlock(U), PHIBlock});
+ });
+ // If all that is left is nothing, our MemoryPhi is undef. We keep it in
+ // TOPClass. Note: the only case where this should happen is if we have at
+ // least one self-argument.
+ if (Filtered.begin() == Filtered.end()) {
+ if (setMemoryClass(MP, TOPClass))
+ markMemoryUsersTouched(MP);
+ return;
+ }
+
+ // Transform the remaining operands into operand leaders.
+ // FIXME: mapped_iterator should have a range version.
+ auto LookupFunc = [&](const Use &U) {
+ return lookupMemoryLeader(cast<MemoryAccess>(U));
+ };
+ auto MappedBegin = map_iterator(Filtered.begin(), LookupFunc);
+ auto MappedEnd = map_iterator(Filtered.end(), LookupFunc);
+
+ // And now check if all the elements are equal.
+ // Sadly, we can't use std::equal since these are random access iterators.
+ const auto *AllSameValue = *MappedBegin;
+ ++MappedBegin;
+ bool AllEqual = std::all_of(
+ MappedBegin, MappedEnd,
+ [&AllSameValue](const MemoryAccess *V) { return V == AllSameValue; });
+
+ if (AllEqual)
+ DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue << "\n");
+ else
+ DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
+ // If it's equal to something, it's in that class. Otherwise, it has to be in
+ // a class where it is the leader (other things may be equivalent to it, but
+ // it needs to start off in its own class, which means it must have been the
+ // leader, and it can't have stopped being the leader because it was never
+ // removed).
+ CongruenceClass *CC =
+ AllEqual ? getMemoryClass(AllSameValue) : ensureLeaderOfMemoryClass(MP);
+ auto OldState = MemoryPhiState.lookup(MP);
+ assert(OldState != MPS_Invalid && "Invalid memory phi state");
+ auto NewState = AllEqual ? MPS_Equivalent : MPS_Unique;
+ MemoryPhiState[MP] = NewState;
+ if (setMemoryClass(MP, CC) || OldState != NewState)
+ markMemoryUsersTouched(MP);
+}
+
+// Value number a single instruction, symbolically evaluating, performing
+// congruence finding, and updating mappings.
+void NewGVN::valueNumberInstruction(Instruction *I) {
+ DEBUG(dbgs() << "Processing instruction " << *I << "\n");
+ if (!I->isTerminator()) {
+ const Expression *Symbolized = nullptr;
+ SmallPtrSet<Value *, 2> Visited;
+ if (DebugCounter::shouldExecute(VNCounter)) {
+ Symbolized = performSymbolicEvaluation(I, Visited);
+ // Make a phi of ops if necessary
+ if (Symbolized && !isa<ConstantExpression>(Symbolized) &&
+ !isa<VariableExpression>(Symbolized) && PHINodeUses.count(I)) {
+ auto *PHIE = makePossiblePHIOfOps(I, Visited);
+ // If we created a phi of ops, use it.
+ // If we couldn't create one, make sure we don't leave one lying around
+ if (PHIE) {
+ Symbolized = PHIE;
+ } else if (auto *Op = RealToTemp.lookup(I)) {
+ removePhiOfOps(I, Op);
+ }
+ }
+ } else {
+ // Mark the instruction as unused so we don't value number it again.
+ InstrDFS[I] = 0;
+ }
+ // If we couldn't come up with a symbolic expression, use the unknown
+ // expression
+ if (Symbolized == nullptr)
+ Symbolized = createUnknownExpression(I);
+ performCongruenceFinding(I, Symbolized);
+ } else {
+ // Handle terminators that return values. All of them produce values we
+ // don't currently understand. We don't place non-value producing
+ // terminators in a class.
+ if (!I->getType()->isVoidTy()) {
+ auto *Symbolized = createUnknownExpression(I);
+ performCongruenceFinding(I, Symbolized);
+ }
+ processOutgoingEdges(dyn_cast<TerminatorInst>(I), I->getParent());
+ }
+}
+
+// Check if there is a path, using single or equal argument phi nodes, from
+// First to Second.
+bool NewGVN::singleReachablePHIPath(
+ SmallPtrSet<const MemoryAccess *, 8> &Visited, const MemoryAccess *First,
+ const MemoryAccess *Second) const {
+ if (First == Second)
+ return true;
+ if (MSSA->isLiveOnEntryDef(First))
+ return false;
+
+ // This is not perfect, but as we're just verifying here, we can live with
+ // the loss of precision. The real solution would be to do strongly
+ // connected component finding in this routine, and it's probably not worth
+ // the complexity for the time being. So, we just keep a set of visited
+ // MemoryAccesses and return true when we hit a cycle.
+ if (Visited.count(First))
+ return true;
+ Visited.insert(First);
+
+ const auto *EndDef = First;
+ for (auto *ChainDef : optimized_def_chain(First)) {
+ if (ChainDef == Second)
+ return true;
+ if (MSSA->isLiveOnEntryDef(ChainDef))
+ return false;
+ EndDef = ChainDef;
+ }
+ auto *MP = cast<MemoryPhi>(EndDef);
+ auto ReachableOperandPred = [&](const Use &U) {
+ return ReachableEdges.count({MP->getIncomingBlock(U), MP->getBlock()});
+ };
+ auto FilteredPhiArgs =
+ make_filter_range(MP->operands(), ReachableOperandPred);
+ SmallVector<const Value *, 32> OperandList;
+ std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
+ std::back_inserter(OperandList));
+ bool Okay = OperandList.size() == 1;
+ if (!Okay)
+ Okay =
+ std::equal(OperandList.begin(), OperandList.end(), OperandList.begin());
+ if (Okay)
+ return singleReachablePHIPath(Visited, cast<MemoryAccess>(OperandList[0]),
+ Second);
+ return false;
+}
+
+// Verify that the memory equivalence table makes sense relative to the
+// congruence classes. Note that this checking is not perfect, and is currently
+// subject to very rare false negatives. It is only useful for
+// testing/debugging.
+void NewGVN::verifyMemoryCongruency() const {
+#ifndef NDEBUG
+ // Verify that the memory table equivalence and memory member set match
+ for (const auto *CC : CongruenceClasses) {
+ if (CC == TOPClass || CC->isDead())
+ continue;
+ if (CC->getStoreCount() != 0) {
+ assert((CC->getStoredValue() || !isa<StoreInst>(CC->getLeader())) &&
+ "Any class with a store as a leader should have a "
+ "representative stored value");
+ assert(CC->getMemoryLeader() &&
+ "Any congruence class with a store should have a "
+ "representative access");
+ }
+
+ if (CC->getMemoryLeader())
+ assert(MemoryAccessToClass.lookup(CC->getMemoryLeader()) == CC &&
+ "Representative MemoryAccess does not appear to be reverse "
+ "mapped properly");
+ for (auto M : CC->memory())
+ assert(MemoryAccessToClass.lookup(M) == CC &&
+ "Memory member does not appear to be reverse mapped properly");
+ }
+
+ // Anything equivalent in the MemoryAccess table should be in the same
+ // congruence class.
+
+ // Filter out the unreachable and trivially dead entries, because they may
+ // never have been updated if the instructions were not processed.
+ auto ReachableAccessPred =
+ [&](const std::pair<const MemoryAccess *, CongruenceClass *> Pair) {
+ bool Result = ReachableBlocks.count(Pair.first->getBlock());
+ if (!Result || MSSA->isLiveOnEntryDef(Pair.first) ||
+ MemoryToDFSNum(Pair.first) == 0)
+ return false;
+ if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
+ return !isInstructionTriviallyDead(MemDef->getMemoryInst());
+
+ // We could have phi nodes whose operands are all trivially dead,
+ // so we don't process them.
+ if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) {
+ for (auto &U : MemPHI->incoming_values()) {
+ if (auto *I = dyn_cast<Instruction>(&*U)) {
+ if (!isInstructionTriviallyDead(I))
+ return true;
+ }
+ }
+ return false;
+ }
+
+ return true;
+ };
+
+ auto Filtered = make_filter_range(MemoryAccessToClass, ReachableAccessPred);
+ for (auto KV : Filtered) {
+ if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
+ auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second->getMemoryLeader());
+ if (FirstMUD && SecondMUD) {
+ SmallPtrSet<const MemoryAccess *, 8> VisitedMAS;
+ assert((singleReachablePHIPath(VisitedMAS, FirstMUD, SecondMUD) ||
+ ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
+ ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
+ "The instructions for these memory operations should have "
+ "been in the same congruence class or reachable through"
+ "a single argument phi");
+ }
+ } else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
+ // We can only sanely verify that MemoryDefs in the operand list all have
+ // the same class.
+ auto ReachableOperandPred = [&](const Use &U) {
+ return ReachableEdges.count(
+ {FirstMP->getIncomingBlock(U), FirstMP->getBlock()}) &&
+ isa<MemoryDef>(U);
+
+ };
+ // All arguments should be in the same class, ignoring unreachable arguments.
+ auto FilteredPhiArgs =
+ make_filter_range(FirstMP->operands(), ReachableOperandPred);
+ SmallVector<const CongruenceClass *, 16> PhiOpClasses;
+ std::transform(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
+ std::back_inserter(PhiOpClasses), [&](const Use &U) {
+ const MemoryDef *MD = cast<MemoryDef>(U);
+ return ValueToClass.lookup(MD->getMemoryInst());
+ });
+ assert(std::equal(PhiOpClasses.begin(), PhiOpClasses.end(),
+ PhiOpClasses.begin()) &&
+ "All MemoryPhi arguments should be in the same class");
+ }
+ }
+#endif
+}
+
+// Verify that the sparse propagation we did actually found the maximal fixpoint.
+// We do this by storing the value to class mapping, touching all instructions,
+// and redoing the iteration to see if anything changed.
+void NewGVN::verifyIterationSettled(Function &F) {
+#ifndef NDEBUG
+ DEBUG(dbgs() << "Beginning iteration verification\n");
+ if (DebugCounter::isCounterSet(VNCounter))
+ DebugCounter::setCounterValue(VNCounter, StartingVNCounter);
+
+ // Note that we have to store the actual classes, as we may change existing
+ // classes during iteration. This is because our memory iteration propagation
+ // is not perfect, and so may waste a little work. But it should generate
+ // exactly the same congruence classes we have now, with different IDs.
+ std::map<const Value *, CongruenceClass> BeforeIteration;
+
+ for (auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ BeforeIteration.insert({KV.first, *KV.second});
+ }
+
+ TouchedInstructions.set();
+ TouchedInstructions.reset(0);
+ iterateTouchedInstructions();
+ DenseSet<std::pair<const CongruenceClass *, const CongruenceClass *>>
+ EqualClasses;
+ for (const auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ // We could sink these uses, but I think this adds a bit of clarity here as
+ // to what we are comparing.
+ auto *BeforeCC = &BeforeIteration.find(KV.first)->second;
+ auto *AfterCC = KV.second;
+ // Note that the classes can't change at this point, so we memoize the set
+ // that are equal.
+ if (!EqualClasses.count({BeforeCC, AfterCC})) {
+ assert(BeforeCC->isEquivalentTo(AfterCC) &&
+ "Value number changed after main loop completed!");
+ EqualClasses.insert({BeforeCC, AfterCC});
+ }
+ }
+#endif
+}
+
+// Verify that for each store expression in the expression to class mapping,
+// only the latest one appears (and stale duplicates do not).
+// Because loads do not use the stored value when doing equality with stores,
+// if we don't erase the old store expressions from the table, a load can find
+// a no-longer valid StoreExpression.
+void NewGVN::verifyStoreExpressions() const {
+#ifndef NDEBUG
+ // This is the only use of this, and it's not worth defining a complicated
+ // DenseMapInfo hash/equality function for it.
+ std::set<
+ std::pair<const Value *,
+ std::tuple<const Value *, const CongruenceClass *, Value *>>>
+ StoreExpressionSet;
+ for (const auto &KV : ExpressionToClass) {
+ if (auto *SE = dyn_cast<StoreExpression>(KV.first)) {
+ // Make sure a version that will conflict with loads is not already there
+ auto Res = StoreExpressionSet.insert(
+ {SE->getOperand(0), std::make_tuple(SE->getMemoryLeader(), KV.second,
+ SE->getStoredValue())});
+ bool Okay = Res.second;
+ // It's okay to have the same expression already in there if it is
+ // identical in nature.
+ // This can happen when the leader of the stored value changes over time.
+ if (!Okay)
+ Okay = (std::get<1>(Res.first->second) == KV.second) &&
+ (lookupOperandLeader(std::get<2>(Res.first->second)) ==
+ lookupOperandLeader(SE->getStoredValue()));
+ assert(Okay && "Stored expression conflict exists in expression table");
+ auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst());
+ assert(ValueExpr && ValueExpr->equals(*SE) &&
+ "StoreExpression in ExpressionToClass is not latest "
+ "StoreExpression for value");
+ }
+ }
+#endif
+}
+
+// This is the main value numbering loop; it iterates over the initial touched
+// instruction set, propagating value numbers, marking things touched, etc,
+// until the set of touched instructions is completely empty.
+void NewGVN::iterateTouchedInstructions() {
+ unsigned int Iterations = 0;
+ // Figure out where TouchedInstructions starts.
+ int FirstInstr = TouchedInstructions.find_first();
+ // Nothing set, nothing to iterate, just return.
+ if (FirstInstr == -1)
+ return;
+ const BasicBlock *LastBlock = getBlockForValue(InstrFromDFSNum(FirstInstr));
+ while (TouchedInstructions.any()) {
+ ++Iterations;
+ // Walk through all the instructions in all the blocks in RPO.
+ // TODO: As we hit a new block, we should push and pop equalities into a
+ // table lookupOperandLeader can use, to catch things PredicateInfo
+ // might miss, like edge-only equivalences.
+ for (unsigned InstrNum : TouchedInstructions.set_bits()) {
+
+ // This instruction was found to be dead. We don't bother looking
+ // at it again.
+ if (InstrNum == 0) {
+ TouchedInstructions.reset(InstrNum);
+ continue;
+ }
+
+ Value *V = InstrFromDFSNum(InstrNum);
+ const BasicBlock *CurrBlock = getBlockForValue(V);
+
+ // If we hit a new block, do reachability processing.
+ if (CurrBlock != LastBlock) {
+ LastBlock = CurrBlock;
+ bool BlockReachable = ReachableBlocks.count(CurrBlock);
+ const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
+
+ // If it's not reachable, erase any touched instructions and move on.
+ if (!BlockReachable) {
+ TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
+ DEBUG(dbgs() << "Skipping instructions in block "
+ << getBlockName(CurrBlock)
+ << " because it is unreachable\n");
+ continue;
+ }
+ updateProcessedCount(CurrBlock);
+ }
+ // Reset after processing (because we may mark ourselves as touched when
+ // we propagate equalities).
+ TouchedInstructions.reset(InstrNum);
+
+ if (auto *MP = dyn_cast<MemoryPhi>(V)) {
+ DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
+ valueNumberMemoryPhi(MP);
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ valueNumberInstruction(I);
+ } else {
+ llvm_unreachable("Should have been a MemoryPhi or Instruction");
+ }
+ updateProcessedCount(V);
+ }
+ }
+ NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
+}
+
+// This is the main transformation entry point.
+bool NewGVN::runGVN() {
+ if (DebugCounter::isCounterSet(VNCounter))
+ StartingVNCounter = DebugCounter::getCounterValue(VNCounter);
+ bool Changed = false;
+ NumFuncArgs = F.arg_size();
+ MSSAWalker = MSSA->getWalker();
+ SingletonDeadExpression = new (ExpressionAllocator) DeadExpression();
+
+ // Count the number of instructions for sizing of hash tables, and come
+ // up with a global DFS numbering for instructions.
+ unsigned ICount = 1;
+ // Add an empty entry to account for the fact that we start at 1.
+ DFSToInstr.emplace_back(nullptr);
+ // Note: We want ideal RPO traversal of the blocks, which is not quite the
+ // same as dominator tree order, particularly with regard to whether backedges
+ // get visited first or second, given a block with multiple successors.
+ // If we visit in the wrong order, we will end up performing N times as many
+ // iterations.
+ // The dominator tree does guarantee that, for a given dom tree node, its
+ // parent must occur before it in the RPO ordering. Thus, we only need to sort
+ // the siblings.
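+ // Hedged illustration: in a diamond A -> {B, C} -> D, all of B, C and D are
+ // dom tree children of A, in no guaranteed order. If the depth-first domtree
+ // walk below visited D before B and C, D's phis would be value numbered
+ // before their operands and would have to be revisited; sorting A's children
+ // by RPO number makes B and C come first.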
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ unsigned Counter = 0;
+ for (auto &B : RPOT) {
+ auto *Node = DT->getNode(B);
+ assert(Node && "RPO and Dominator tree should have same reachability");
+ RPOOrdering[Node] = ++Counter;
+ }
+ // Sort dominator tree children arrays into RPO.
+ for (auto &B : RPOT) {
+ auto *Node = DT->getNode(B);
+ if (Node->getChildren().size() > 1)
+ std::sort(Node->begin(), Node->end(),
+ [&](const DomTreeNode *A, const DomTreeNode *B) {
+ return RPOOrdering[A] < RPOOrdering[B];
+ });
+ }
+
+ // Now a standard depth first ordering of the domtree is equivalent to RPO.
+ for (auto DTN : depth_first(DT->getRootNode())) {
+ BasicBlock *B = DTN->getBlock();
+ const auto &BlockRange = assignDFSNumbers(B, ICount);
+ BlockInstRange.insert({B, BlockRange});
+ ICount += BlockRange.second - BlockRange.first;
+ }
+ initializeCongruenceClasses(F);
+
+ TouchedInstructions.resize(ICount);
+ // Ensure we don't end up resizing the ExpressionToClass map, as
+ // that can be quite expensive. At most, we have one expression per
+ // instruction.
+ ExpressionToClass.reserve(ICount);
+
+ // Initialize the touched instructions to include the entry block.
+ const auto &InstRange = BlockInstRange.lookup(&F.getEntryBlock());
+ TouchedInstructions.set(InstRange.first, InstRange.second);
+ DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock())
+ << " marked reachable\n");
+ ReachableBlocks.insert(&F.getEntryBlock());
+
+ iterateTouchedInstructions();
+ verifyMemoryCongruency();
+ verifyIterationSettled(F);
+ verifyStoreExpressions();
+
+ Changed |= eliminateInstructions(F);
+
+ // Delete all instructions marked for deletion.
+ for (Instruction *ToErase : InstructionsToErase) {
+ if (!ToErase->use_empty())
+ ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType()));
+
+ if (ToErase->getParent())
+ ToErase->eraseFromParent();
+ }
+
+ // Delete all unreachable blocks.
+ auto UnreachableBlockPred = [&](const BasicBlock &BB) {
+ return !ReachableBlocks.count(&BB);
+ };
+
+ for (auto &BB : make_filter_range(F, UnreachableBlockPred)) {
+ DEBUG(dbgs() << "We believe block " << getBlockName(&BB)
+ << " is unreachable\n");
+ deleteInstructionsInBlock(&BB);
+ Changed = true;
+ }
+
+ cleanupTables();
+ return Changed;
+}
+
+struct NewGVN::ValueDFS {
+ int DFSIn = 0;
+ int DFSOut = 0;
+ int LocalNum = 0;
+
+ // Only one of Def and U will be set.
+ // The bool in the Def tells us whether the Def is the stored value of a
+ // store.
+ PointerIntPair<Value *, 1, bool> Def;
+ Use *U = nullptr;
+
+ bool operator<(const ValueDFS &Other) const {
+ // It's not enough that any given field be less than - we have sets
+ // of fields that need to be evaluated together to give a proper ordering.
+ // For example, if you have:
+ // DFS (1, 3)
+ // Val 0
+ // DFS (1, 2)
+ // Val 50
+ // We want the second to be less than the first, but if we just go field
+ // by field, we will get to Val 0 < Val 50 and say the first is less than
+ // the second. We only want it to be less than if the DFS orders are equal.
+ //
+ // Each LLVM instruction only produces one value, and thus the lowest-level
+ // differentiator that really matters for the stack (and what we use as a
+ // replacement) is the local dfs number.
+ // Everything else in the structure is instruction level, and only affects
+ // the order in which we will replace operands of a given instruction.
+ //
+ // For a given instruction (IE things with equal dfsin, dfsout, localnum),
+ // the order of replacement of uses does not matter.
+ // IE given,
+ // a = 5
+ // b = a + a
+ // When you hit b, you will have two valuedfs with the same dfsin, out, and
+ // localnum.
+ // The .val will be the same as well.
+ // The .u's will be different.
+ // You will replace both, and it does not matter what order you replace them
+ // in (IE whether you replace operand 2, then operand 1, or operand 1, then
+ // operand 2).
+ // Similarly for the case of same dfsin, dfsout, localnum, but different
+ // .val's
+ // a = 5
+ // b = 6
+ // c = a + b
+ // in c, we will have a valuedfs for a, and one for b, with everything the same
+ // but .val and .u.
+ // It does not matter what order we replace these operands in.
+ // You will always end up with the same IR, and this is guaranteed.
+ return std::tie(DFSIn, DFSOut, LocalNum, Def, U) <
+ std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Def,
+ Other.U);
+ }
+};
+
+// This function converts the set of members for a congruence class from
+// values to sets of defs and uses with associated DFS info. The total number
+// of reachable uses for each value is stored in UseCounts, and instructions
+// that seem dead (have no non-dead uses) are stored in ProbablyDead.
+void NewGVN::convertClassToDFSOrdered(
+ const CongruenceClass &Dense, SmallVectorImpl<ValueDFS> &DFSOrderedSet,
+ DenseMap<const Value *, unsigned int> &UseCounts,
+ SmallPtrSetImpl<Instruction *> &ProbablyDead) const {
+ for (auto D : Dense) {
+ // First add the value.
+ BasicBlock *BB = getBlockForValue(D);
+ // Constants are handled prior to ever calling this function, so
+ // we should only be left with instructions as members.
+ assert(BB && "Should have figured out a basic block for value");
+ ValueDFS VDDef;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VDDef.DFSIn = DomNode->getDFSNumIn();
+ VDDef.DFSOut = DomNode->getDFSNumOut();
+ // If it's a store, use the leader of the value operand, if it's always
+ // available, or the value operand. TODO: We could do dominance checks to
+ // find a dominating leader, but not worth it ATM.
+ if (auto *SI = dyn_cast<StoreInst>(D)) {
+ auto Leader = lookupOperandLeader(SI->getValueOperand());
+ if (alwaysAvailable(Leader)) {
+ VDDef.Def.setPointer(Leader);
+ } else {
+ VDDef.Def.setPointer(SI->getValueOperand());
+ VDDef.Def.setInt(true);
+ }
+ } else {
+ VDDef.Def.setPointer(D);
+ }
+ assert(isa<Instruction>(D) &&
+ "The dense set member should always be an instruction");
+ Instruction *Def = cast<Instruction>(D);
+ VDDef.LocalNum = InstrToDFSNum(D);
+ DFSOrderedSet.push_back(VDDef);
+ // If there is a phi node equivalent, add it
+ if (auto *PN = RealToTemp.lookup(Def)) {
+ auto *PHIE =
+ dyn_cast_or_null<PHIExpression>(ValueToExpression.lookup(Def));
+ if (PHIE) {
+ VDDef.Def.setInt(false);
+ VDDef.Def.setPointer(PN);
+ VDDef.LocalNum = 0;
+ DFSOrderedSet.push_back(VDDef);
+ }
+ }
+
+ unsigned int UseCount = 0;
+ // Now add the uses.
+ for (auto &U : Def->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ // Don't try to replace into dead uses
+ if (InstructionsToErase.count(I))
+ continue;
+ ValueDFS VDUse;
+ // Put the phi node uses in the incoming block.
+ BasicBlock *IBlock;
+ if (auto *P = dyn_cast<PHINode>(I)) {
+ IBlock = P->getIncomingBlock(U);
+ // Make phi node users appear last in the incoming block
+ // they are from.
+ VDUse.LocalNum = InstrDFS.size() + 1;
+ } else {
+ IBlock = getBlockForValue(I);
+ VDUse.LocalNum = InstrToDFSNum(I);
+ }
+
+ // Skip uses in unreachable blocks, as we're going
+ // to delete them.
+ if (ReachableBlocks.count(IBlock) == 0)
+ continue;
+
+ DomTreeNode *DomNode = DT->getNode(IBlock);
+ VDUse.DFSIn = DomNode->getDFSNumIn();
+ VDUse.DFSOut = DomNode->getDFSNumOut();
+ VDUse.U = &U;
+ ++UseCount;
+ DFSOrderedSet.emplace_back(VDUse);
+ }
+ }
+
+ // If there are no uses, it's probably dead (but it may have side-effects,
+ // so it is not definitely dead). Otherwise, store the number of uses so we
+ // can track if it becomes dead later.
+ if (UseCount == 0)
+ ProbablyDead.insert(Def);
+ else
+ UseCounts[Def] = UseCount;
+ }
+}
+
+// This function converts the set of members for a congruence class from
+// values to the set of defs for loads and stores, with associated DFS info.
+void NewGVN::convertClassToLoadsAndStores(
+ const CongruenceClass &Dense,
+ SmallVectorImpl<ValueDFS> &LoadsAndStores) const {
+ for (auto D : Dense) {
+ if (!isa<LoadInst>(D) && !isa<StoreInst>(D))
+ continue;
+
+ BasicBlock *BB = getBlockForValue(D);
+ ValueDFS VD;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.Def.setPointer(D);
+
+ // If it's an instruction, use the real local dfs number.
+ if (auto *I = dyn_cast<Instruction>(D))
+ VD.LocalNum = InstrToDFSNum(I);
+ else
+ llvm_unreachable("Should have been an instruction");
+
+ LoadsAndStores.emplace_back(VD);
+ }
+}
+
+static void patchReplacementInstruction(Instruction *I, Value *Repl) {
+ auto *ReplInst = dyn_cast<Instruction>(Repl);
+ if (!ReplInst)
+ return;
+
+ // Patch the replacement so that it is not more restrictive than the value
+ // being replaced.
+ // Note that if 'I' is a load being replaced by some operation,
+ // for example, by an arithmetic operation, then andIRFlags()
+ // would just erase all math flags from the original arithmetic
+ // operation, which is clearly not wanted and not needed.
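+ // For illustration (hypothetical IR): if %l = load i32, i32* %p were being
+ // replaced by %a = add nsw i32 %x, %y, andIRFlags would clear nsw on %a
+ // even though the load carries no math flags, so loads are skipped here.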
+ if (!isa<LoadInst>(I))
+ ReplInst->andIRFlags(I);
+
+ // FIXME: If both the original and replacement value are part of the
+ // same control-flow region (meaning that the execution of one
+ // guarantees the execution of the other), then we can combine the
+ // noalias scopes here and do better than the general conservative
+ // answer used in combineMetadata().
+
+ // In general, GVN unifies expressions over different control-flow
+ // regions, and so we need a conservative combination of the noalias
+ // scopes.
+ static const unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(ReplInst, I, KnownIDs);
+}
+
+static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
+ patchReplacementInstruction(I, Repl);
+ I->replaceAllUsesWith(Repl);
+}
+
+void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
+ DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
+ ++NumGVNBlocksDeleted;
+
+ // Delete the instructions backwards, as this reduces the likelihood of having
+ // to update as many def-use and use-def chains. Start after the terminator.
+ auto StartPoint = BB->rbegin();
+ ++StartPoint;
+ // Note that we explicitly recalculate BB->rend() on each iteration,
+ // as it may change when we remove the first instruction.
+ for (BasicBlock::reverse_iterator I(StartPoint); I != BB->rend();) {
+ Instruction &Inst = *I++;
+ if (!Inst.use_empty())
+ Inst.replaceAllUsesWith(UndefValue::get(Inst.getType()));
+ if (isa<LandingPadInst>(Inst))
+ continue;
+
+ Inst.eraseFromParent();
+ ++NumGVNInstrDeleted;
+ }
+ // Now insert something that simplifycfg will turn into an unreachable.
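+ // (A store to a null pointer is undefined behavior; the expectation, not
+ // verified here, is that SimplifyCFG and friends will turn the block into an
+ // unreachable terminator.)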
+ Type *Int8Ty = Type::getInt8Ty(BB->getContext());
+ new StoreInst(UndefValue::get(Int8Ty),
+ Constant::getNullValue(Int8Ty->getPointerTo()),
+ BB->getTerminator());
+}
+
+void NewGVN::markInstructionForDeletion(Instruction *I) {
+ DEBUG(dbgs() << "Marking " << *I << " for deletion\n");
+ InstructionsToErase.insert(I);
+}
+
+void NewGVN::replaceInstruction(Instruction *I, Value *V) {
+ DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n");
+ patchAndReplaceAllUsesWith(I, V);
+ // We save the actual erasing to avoid invalidating memory
+ // dependencies until we are done with everything.
+ markInstructionForDeletion(I);
+}
+
+namespace {
+
+// This is a stack that contains both the value and dfs info of where
+// that value is valid.
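+// For illustration with hypothetical DFS numbers: a value pushed with scope
+// (2, 9) stays on the stack for any block whose (DFSIn, DFSOut) interval is
+// contained in [2, 9], e.g. (4, 5), and is popped once we reach a block such
+// as (10, 11) that lies outside that interval.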
+class ValueDFSStack {
+public:
+ Value *back() const { return ValueStack.back(); }
+ std::pair<int, int> dfs_back() const { return DFSStack.back(); }
+
+ void push_back(Value *V, int DFSIn, int DFSOut) {
+ ValueStack.emplace_back(V);
+ DFSStack.emplace_back(DFSIn, DFSOut);
+ }
+
+ bool empty() const { return DFSStack.empty(); }
+
+ bool isInScope(int DFSIn, int DFSOut) const {
+ if (empty())
+ return false;
+ return DFSIn >= DFSStack.back().first && DFSOut <= DFSStack.back().second;
+ }
+
+ void popUntilDFSScope(int DFSIn, int DFSOut) {
+
+ // These two should always be in sync at this point.
+ assert(ValueStack.size() == DFSStack.size() &&
+ "Mismatch between ValueStack and DFSStack");
+ while (
+ !DFSStack.empty() &&
+ !(DFSIn >= DFSStack.back().first && DFSOut <= DFSStack.back().second)) {
+ DFSStack.pop_back();
+ ValueStack.pop_back();
+ }
+ }
+
+private:
+ SmallVector<Value *, 8> ValueStack;
+ SmallVector<std::pair<int, int>, 8> DFSStack;
+};
+
+} // end anonymous namespace
+
+// Given an expression, get the congruence class for it.
+CongruenceClass *NewGVN::getClassForExpression(const Expression *E) const {
+ if (auto *VE = dyn_cast<VariableExpression>(E))
+ return ValueToClass.lookup(VE->getVariableValue());
+ else if (isa<DeadExpression>(E))
+ return TOPClass;
+ return ExpressionToClass.lookup(E);
+}
+
+// Given an expression and a basic block we are trying to make its value
+// available in, see if the value has a leader available in that block.
+Value *NewGVN::findPHIOfOpsLeader(const Expression *E,
+ const Instruction *OrigInst,
+ const BasicBlock *BB) const {
+ // It would already be constant if we could make it constant
+ if (auto *CE = dyn_cast<ConstantExpression>(E))
+ return CE->getConstantValue();
+ if (auto *VE = dyn_cast<VariableExpression>(E)) {
+ auto *V = VE->getVariableValue();
+ if (alwaysAvailable(V) || DT->dominates(getBlockForValue(V), BB))
+ return VE->getVariableValue();
+ }
+
+ auto *CC = getClassForExpression(E);
+ if (!CC)
+ return nullptr;
+ if (alwaysAvailable(CC->getLeader()))
+ return CC->getLeader();
+
+ for (auto Member : *CC) {
+ auto *MemberInst = dyn_cast<Instruction>(Member);
+ if (MemberInst == OrigInst)
+ continue;
+ // Anything that isn't an instruction is always available.
+ if (!MemberInst)
+ return Member;
+ if (DT->dominates(getBlockForValue(MemberInst), BB))
+ return Member;
+ }
+ return nullptr;
+}
+
+bool NewGVN::eliminateInstructions(Function &F) {
+ // This is a non-standard eliminator. The normal way to eliminate is
+ // to walk the dominator tree in order, keeping track of available
+ // values, and eliminating them. However, this is mildly
+ // pointless. It requires doing lookups on every instruction,
+ // regardless of whether we will ever eliminate it. For
+ // instructions that are part of singleton congruence classes (most of them
+ // are), we know we will never eliminate them.
+
+ // Instead, this eliminator looks at the congruence classes directly, sorts
+ // them into a DFS ordering of the dominator tree, and then we just
+ // perform elimination straight on the sets by walking the congruence
+ // class member uses in order, and eliminate the ones dominated by the
+ // last member. This is worst case O(E log E) where E = number of
+ // instructions in a single congruence class. In theory, this is all
+ // instructions. In practice, it is much faster, as most instructions are
+ // either in singleton congruence classes or can't possibly be eliminated
+ // anyway (if there are no overlapping DFS ranges in class).
+ // When we find something not dominated, it becomes the new leader
+ // for elimination purposes.
+ // TODO: If we wanted to be faster, we could remove any members with no
+ // overlapping ranges while sorting, as we will never eliminate anything
+ // with those members, as they don't dominate anything else in our set.
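+ // For illustration (hypothetical IR): if %a in block B1 and %b in block B2
+ // are congruent and B1 dominates B2, the DFS-sorted walk sees %a first,
+ // keeps it on the elimination stack as the leader, rewrites the uses of %b
+ // to %a, and marks %b dead.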
+
+ bool AnythingReplaced = false;
+
+ // Since we are going to walk the domtree anyway, and we can't guarantee the
+ // DFS numbers are updated, we compute some ourselves.
+ DT->updateDFSNumbers();
+
+ // Go through all of our phi nodes, and kill the arguments associated with
+ // unreachable edges.
+ auto ReplaceUnreachablePHIArgs = [&](PHINode *PHI, BasicBlock *BB) {
+ for (auto &Operand : PHI->incoming_values())
+ if (!ReachableEdges.count({PHI->getIncomingBlock(Operand), BB})) {
+ DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block "
+ << getBlockName(PHI->getIncomingBlock(Operand))
+ << " with undef due to it being unreachable\n");
+ Operand.set(UndefValue::get(PHI->getType()));
+ }
+ };
+ // Replace unreachable phi arguments.
+ // At this point, RevisitOnReachabilityChange only contains:
+ //
+ // 1. PHIs
+ // 2. Temporaries that will convert to PHIs
+ // 3. Operations that are affected by an unreachable edge but do not fit into
+ // 1 or 2 (rare).
+ // So it is a slight overshoot of what we want. We could make it exact by
+ // using two SparseBitVectors per block.
+ DenseMap<const BasicBlock *, unsigned> ReachablePredCount;
+ for (auto &KV : ReachableEdges)
+ ReachablePredCount[KV.getEnd()]++;
+ for (auto &BBPair : RevisitOnReachabilityChange) {
+ for (auto InstNum : BBPair.second) {
+ auto *Inst = InstrFromDFSNum(InstNum);
+ auto *PHI = dyn_cast<PHINode>(Inst);
+ PHI = PHI ? PHI : dyn_cast_or_null<PHINode>(RealToTemp.lookup(Inst));
+ if (!PHI)
+ continue;
+ auto *BB = BBPair.first;
+ if (ReachablePredCount.lookup(BB) != PHI->getNumIncomingValues())
+ ReplaceUnreachablePHIArgs(PHI, BB);
+ }
+ }
+
+ // Map to store the use counts
+ DenseMap<const Value *, unsigned int> UseCounts;
+ for (auto *CC : reverse(CongruenceClasses)) {
+ DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() << "\n");
+ // Track the equivalent store info so we can decide whether to try
+ // dead store elimination.
+ SmallVector<ValueDFS, 8> PossibleDeadStores;
+ SmallPtrSet<Instruction *, 8> ProbablyDead;
+ if (CC->isDead() || CC->empty())
+ continue;
+ // Everything still in the TOP class is unreachable or dead.
+ if (CC == TOPClass) {
+ for (auto M : *CC) {
+ auto *VTE = ValueToExpression.lookup(M);
+ if (VTE && isa<DeadExpression>(VTE))
+ markInstructionForDeletion(cast<Instruction>(M));
+ assert((!ReachableBlocks.count(cast<Instruction>(M)->getParent()) ||
+ InstructionsToErase.count(cast<Instruction>(M))) &&
+ "Everything in TOP should be unreachable or dead at this "
+ "point");
+ }
+ continue;
+ }
+
+ assert(CC->getLeader() && "We should have had a leader");
+ // If this is a leader that is always available, and it's a
+ // constant or has no equivalences, just replace everything with
+ // it. We then update the congruence class with whatever members
+ // are left.
+ Value *Leader =
+ CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ if (alwaysAvailable(Leader)) {
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto M : *CC) {
+ Value *Member = M;
+ // Void things have no uses we can replace.
+ if (Member == Leader || !isa<Instruction>(Member) ||
+ Member->getType()->isVoidTy()) {
+ MembersLeft.insert(Member);
+ continue;
+ }
+ DEBUG(dbgs() << "Found replacement " << *(Leader) << " for " << *Member
+ << "\n");
+ auto *I = cast<Instruction>(Member);
+ assert(Leader != I && "About to accidentally remove our leader");
+ replaceInstruction(I, Leader);
+ AnythingReplaced = true;
+ }
+ CC->swap(MembersLeft);
+ } else {
+ // If this is a singleton, we can skip it.
+ if (CC->size() != 1 || RealToTemp.count(Leader)) {
+ // This is a stack because equality replacement/etc may place
+ // constants in the middle of the member list, and we want to use
+ // those constant values in preference to the current leader, over
+ // the scope of those constants.
+ ValueDFSStack EliminationStack;
+
+ // Convert the members to DFS ordered sets and then merge them.
+ SmallVector<ValueDFS, 8> DFSOrderedSet;
+ convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead);
+
+ // Sort the whole thing.
+ std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
+ for (auto &VD : DFSOrderedSet) {
+ int MemberDFSIn = VD.DFSIn;
+ int MemberDFSOut = VD.DFSOut;
+ Value *Def = VD.Def.getPointer();
+ bool FromStore = VD.Def.getInt();
+ Use *U = VD.U;
+ // We ignore void things because we can't get a value from them.
+ if (Def && Def->getType()->isVoidTy())
+ continue;
+ auto *DefInst = dyn_cast_or_null<Instruction>(Def);
+ if (DefInst && AllTempInstructions.count(DefInst)) {
+ auto *PN = cast<PHINode>(DefInst);
+
+ // If this is a value phi and that's the expression we used, insert
+ // it into the program and remove it from the temp instruction list.
+ AllTempInstructions.erase(PN);
+ auto *DefBlock = getBlockForValue(Def);
+ DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def
+ << " into block "
+ << getBlockName(getBlockForValue(Def)) << "\n");
+ PN->insertBefore(&DefBlock->front());
+ Def = PN;
+ NumGVNPHIOfOpsEliminations++;
+ }
+
+ if (EliminationStack.empty()) {
+ DEBUG(dbgs() << "Elimination Stack is empty\n");
+ } else {
+ DEBUG(dbgs() << "Elimination Stack Top DFS numbers are ("
+ << EliminationStack.dfs_back().first << ","
+ << EliminationStack.dfs_back().second << ")\n");
+ }
+
+ DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << ","
+ << MemberDFSOut << ")\n");
+ // First, we see if we are out of scope or empty. If so,
+ // and there are equivalences, we try to replace the top of the
+ // stack with equivalences (if it's on the stack, it must
+ // not have been eliminated yet).
+ // Then we synchronize to our current scope, by
+ // popping until we are back within a DFS scope that
+ // dominates the current member.
+ // Then, what happens depends on a few factors:
+ // If the stack is now empty, we need to push.
+ // If we have a constant or a local equivalence we want to
+ // start using, we also push.
+ // Otherwise, we walk along, processing members who are
+ // dominated by this scope, and eliminate them.
+ bool ShouldPush = Def && EliminationStack.empty();
+ bool OutOfScope =
+ !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut);
+
+ if (OutOfScope || ShouldPush) {
+ // Sync to our current scope.
+ EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
+ bool ShouldPush = Def && EliminationStack.empty();
+ if (ShouldPush) {
+ EliminationStack.push_back(Def, MemberDFSIn, MemberDFSOut);
+ }
+ }
+
+ // Skip the Defs; we only want to eliminate on their uses. But mark
+ // dominated defs as dead.
+ if (Def) {
+ // For anything in this case, what and how we value number
+ // guarantees that any side-effects that would have occurred (i.e.
+ // throwing, etc.) can be proven to either still occur (because it's
+ // dominated by something that has the same side-effects), or never
+ // occur. Otherwise, we would not have been able to prove it value
+ // equivalent to something else. For these things, we can just mark
+ // it all dead. Note that this is different from the "ProbablyDead"
+ // set, which may not be dominated by anything, and thus, are only
+ // easy to prove dead if they are also side-effect free. Note that
+ // because stores are put in terms of the stored value, we skip
+ // stored values here. If the stored value is really dead, it will
+ // still be marked for deletion when we process it in its own class.
+ if (!EliminationStack.empty() && Def != EliminationStack.back() &&
+ isa<Instruction>(Def) && !FromStore)
+ markInstructionForDeletion(cast<Instruction>(Def));
+ continue;
+ }
+ // At this point, we know it is a Use we are trying to possibly
+ // replace.
+
+ assert(isa<Instruction>(U->get()) &&
+ "Current def should have been an instruction");
+ assert(isa<Instruction>(U->getUser()) &&
+ "Current user should have been an instruction");
+
+ // If the thing we are replacing into is already marked to be dead,
+ // this use is dead. Note that this is true regardless of whether
+ // we have anything dominating the use or not. We do this here
+ // because we are already walking all the uses anyway.
+ Instruction *InstUse = cast<Instruction>(U->getUser());
+ if (InstructionsToErase.count(InstUse)) {
+ auto &UseCount = UseCounts[U->get()];
+ if (--UseCount == 0) {
+ ProbablyDead.insert(cast<Instruction>(U->get()));
+ }
+ }
+
+ // If we get to this point, and the stack is empty we must have a use
+ // with nothing we can use to eliminate this use, so just skip it.
+ if (EliminationStack.empty())
+ continue;
+
+ Value *DominatingLeader = EliminationStack.back();
+
+ auto *II = dyn_cast<IntrinsicInst>(DominatingLeader);
+ if (II && II->getIntrinsicID() == Intrinsic::ssa_copy)
+ DominatingLeader = II->getOperand(0);
+
+ // Don't replace our existing users with ourselves.
+ if (U->get() == DominatingLeader)
+ continue;
+ DEBUG(dbgs() << "Found replacement " << *DominatingLeader << " for "
+ << *U->get() << " in " << *(U->getUser()) << "\n");
+
+ // If we replaced something in an instruction, handle the patching of
+ // metadata. Skip this if we are replacing predicateinfo with its
+ // original operand, as we already know we can just drop it.
+ auto *ReplacedInst = cast<Instruction>(U->get());
+ auto *PI = PredInfo->getPredicateInfoFor(ReplacedInst);
+ if (!PI || DominatingLeader != PI->OriginalOp)
+ patchReplacementInstruction(ReplacedInst, DominatingLeader);
+ U->set(DominatingLeader);
+ // This is now a use of the dominating leader, which means if the
+ // dominating leader was dead, it's now live!
+ auto &LeaderUseCount = UseCounts[DominatingLeader];
+ // It's about to be alive again.
+ if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
+ ProbablyDead.erase(cast<Instruction>(DominatingLeader));
+ if (LeaderUseCount == 0 && II)
+ ProbablyDead.insert(II);
+ ++LeaderUseCount;
+ AnythingReplaced = true;
+ }
+ }
+ }
+
+ // At this point, anything still in the ProbablyDead set is actually dead if
+ // it would be trivially dead.
+ for (auto *I : ProbablyDead)
+ if (wouldInstructionBeTriviallyDead(I))
+ markInstructionForDeletion(I);
+
+ // Cleanup the congruence class.
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto *Member : *CC)
+ if (!isa<Instruction>(Member) ||
+ !InstructionsToErase.count(cast<Instruction>(Member)))
+ MembersLeft.insert(Member);
+ CC->swap(MembersLeft);
+
+ // If we have possible dead stores to look at, try to eliminate them.
+ if (CC->getStoreCount() > 0) {
+ convertClassToLoadsAndStores(*CC, PossibleDeadStores);
+ std::sort(PossibleDeadStores.begin(), PossibleDeadStores.end());
+ ValueDFSStack EliminationStack;
+ for (auto &VD : PossibleDeadStores) {
+ int MemberDFSIn = VD.DFSIn;
+ int MemberDFSOut = VD.DFSOut;
+ Instruction *Member = cast<Instruction>(VD.Def.getPointer());
+ if (EliminationStack.empty() ||
+ !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut)) {
+ // Sync to our current scope.
+ EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
+ if (EliminationStack.empty()) {
+ EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
+ continue;
+ }
+ }
+ // We already did load elimination, so nothing to do here.
+ if (isa<LoadInst>(Member))
+ continue;
+ assert(!EliminationStack.empty());
+ Instruction *Leader = cast<Instruction>(EliminationStack.back());
+ (void)Leader;
+ assert(DT->dominates(Leader->getParent(), Member->getParent()));
+ // Member is dominated by Leader, and thus dead
+ DEBUG(dbgs() << "Marking dead store " << *Member
+ << " that is dominated by " << *Leader << "\n");
+ markInstructionForDeletion(Member);
+ CC->erase(Member);
+ ++NumGVNDeadStores;
+ }
+ }
+ }
+ return AnythingReplaced;
+}
+
+// This function provides global ranking of operations so that we can place them
+// in a canonical order. Note that rank alone is not necessarily enough for a
+// complete ordering, as constants all have the same rank. However, generally,
+// we will simplify an operation with all constants so that it doesn't matter
+// what order they appear in.
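+// For example, in a hypothetical function with two arguments, plain constants
+// rank 0, undef ranks 1, constant expressions rank 2, the arguments rank 3
+// and 4, and instructions rank 4 + NumFuncArgs + their DFS number.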
+unsigned int NewGVN::getRank(const Value *V) const {
+ // Prefer constants to undef to anything else.
+ // Undef is a constant, so we have to check it first.
+ // Prefer smaller constants to constantexprs.
+ if (isa<ConstantExpr>(V))
+ return 2;
+ if (isa<UndefValue>(V))
+ return 1;
+ if (isa<Constant>(V))
+ return 0;
+ else if (auto *A = dyn_cast<Argument>(V))
+ return 3 + A->getArgNo();
+
+ // Need to shift the instruction DFS by number of arguments + 3 to account for
+ // the constant and argument ranking above.
+ unsigned Result = InstrToDFSNum(V);
+ if (Result > 0)
+ return 4 + NumFuncArgs + Result;
+ // Unreachable or something else, just return a really large number.
+ return ~0;
+}
+
+// This is a function that says whether the two operands of a commutative
+// operation should have their order swapped when canonicalizing.
+bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
+ // Because we only care about a total ordering, and don't rewrite expressions
+ // in this order, we order by rank, which will give a strict weak ordering to
+ // everything but constants, and then we order by pointer address.
+ return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
+}
+
+namespace {
+
+class NewGVNLegacyPass : public FunctionPass {
+public:
+ // Pass identification, replacement for typeid.
+ static char ID;
+
+ NewGVNLegacyPass() : FunctionPass(ID) {
+ initializeNewGVNLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+bool NewGVNLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<MemorySSAWrapperPass>().getMSSA(),
+ F.getParent()->getDataLayout())
+ .runGVN();
+}
+
+char NewGVNLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(NewGVNLegacyPass, "newgvn", "Global Value Numbering",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false,
+ false)
+
+// createGVNPass - The public interface to this file.
+FunctionPass *llvm::createNewGVNPass() { return new NewGVNLegacyPass(); }
+
+PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
+ // Apparently the order in which we get these results matters for
+ // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
+ // the same order here, just in case.
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ bool Changed =
+ NewGVN(F, &DT, &AC, &TLI, &AA, &MSSA, F.getParent()->getDataLayout())
+ .runGVN();
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
new file mode 100644
index 000000000000..1748815c5941
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -0,0 +1,180 @@
+//===--- PartiallyInlineLibCalls.cpp - Partially inline libcalls ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to partially inline the fast path of well-known library
+// functions, such as using square-root instructions for cases where sqrt()
+// does not need to set errno.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "partially-inline-libcalls"
+
+
+static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
+ BasicBlock &CurrBB, Function::iterator &BB,
+ const TargetTransformInfo *TTI) {
+ // There is no need to change the IR, since the backend will emit a sqrt
+ // instruction if the call has already been marked read-only.
+ if (Call->onlyReadsMemory())
+ return false;
+
+ // Do the following transformation:
+ //
+ // (before)
+ // dst = sqrt(src)
+ //
+ // (after)
+ // v0 = sqrt_noreadmem(src) # native sqrt instruction.
+ // [if (v0 is a NaN) || if (src < 0)]
+ // v1 = sqrt(src) # library call.
+ // dst = phi(v0, v1)
+ //
+
+ // Move all instructions following Call to newly created block JoinBB.
+ // Create phi and replace all uses.
+ BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode());
+ IRBuilder<> Builder(JoinBB, JoinBB->begin());
+ Type *Ty = Call->getType();
+ PHINode *Phi = Builder.CreatePHI(Ty, 2);
+ Call->replaceAllUsesWith(Phi);
+
+ // Create basic block LibCallBB and insert a call to library function sqrt.
+ BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt",
+ CurrBB.getParent(), JoinBB);
+ Builder.SetInsertPoint(LibCallBB);
+ Instruction *LibCall = Call->clone();
+ Builder.Insert(LibCall);
+ Builder.CreateBr(JoinBB);
+
+ // Add attribute "readnone" so that the backend can use a native sqrt
+ // instruction for this call. Insert an FP compare instruction and a
+ // conditional branch at the end of CurrBB.
+ Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
+ CurrBB.getTerminator()->eraseFromParent();
+ Builder.SetInsertPoint(&CurrBB);
+ Value *FCmp = TTI->isFCmpOrdCheaperThanFCmpZero(Ty)
+ ? Builder.CreateFCmpORD(Call, Call)
+ : Builder.CreateFCmpOGE(Call->getOperand(0),
+ ConstantFP::get(Ty, 0.0));
+ Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
+
+ // Add phi operands.
+ Phi->addIncoming(Call, &CurrBB);
+ Phi->addIncoming(LibCall, LibCallBB);
+
+ BB = JoinBB->getIterator();
+ return true;
+}
+
+static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI) {
+ bool Changed = false;
+
+ Function::iterator CurrBB;
+ for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
+ CurrBB = BB++;
+
+ for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end();
+ II != IE; ++II) {
+ CallInst *Call = dyn_cast<CallInst>(&*II);
+ Function *CalledFunc;
+
+ if (!Call || !(CalledFunc = Call->getCalledFunction()))
+ continue;
+
+ if (Call->isNoBuiltin())
+ continue;
+
+ // Skip if function either has local linkage or is not a known library
+ // function.
+ LibFunc LF;
+ if (CalledFunc->hasLocalLinkage() ||
+ !TLI->getLibFunc(*CalledFunc, LF) || !TLI->has(LF))
+ continue;
+
+ switch (LF) {
+ case LibFunc_sqrtf:
+ case LibFunc_sqrt:
+ if (TTI->haveFastSqrt(Call->getType()) &&
+ optimizeSQRT(Call, CalledFunc, *CurrBB, BB, TTI))
+ break;
+ continue;
+ default:
+ continue;
+ }
+
+ Changed = true;
+ break;
+ }
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses
+PartiallyInlineLibCallsPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ if (!runPartiallyInlineLibCalls(F, &TLI, &TTI))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+class PartiallyInlineLibCallsLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ PartiallyInlineLibCallsLegacyPass() : FunctionPass(ID) {
+ initializePartiallyInlineLibCallsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return runPartiallyInlineLibCalls(F, TLI, TTI);
+ }
+};
+}
+
+char PartiallyInlineLibCallsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PartiallyInlineLibCallsLegacyPass,
+ "partially-inline-libcalls",
+ "Partially inline calls to library functions", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(PartiallyInlineLibCallsLegacyPass,
+ "partially-inline-libcalls",
+ "Partially inline calls to library functions", false, false)
+
+FunctionPass *llvm::createPartiallyInlineLibCallsPass() {
+ return new PartiallyInlineLibCallsLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
new file mode 100644
index 000000000000..2d0cb6fbf211
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -0,0 +1,691 @@
+//===- PlaceSafepoints.cpp - Place GC Safepoints --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Place garbage collection safepoints at appropriate locations in the IR. This
+// does not make relocation semantics or variable liveness explicit. That's
+// done by RewriteStatepointsForGC.
+//
+// Terminology:
+// - A call is said to be "parseable" if there is a stack map generated for the
+// return PC of the call. A runtime can determine where values listed in the
+// deopt arguments and (after RewriteStatepointsForGC) gc arguments are located
+// on the stack when the code is suspended inside such a call. Every parse
+// point is represented by a call wrapped in an gc.statepoint intrinsic.
+// - A "poll" is an explicit check in the generated code to determine if the
+// runtime needs the generated code to cooperate by calling a helper routine
+// and thus suspending its execution at a known state. The call to the helper
+// routine will be parseable. The (gc & runtime specific) logic of a poll is
+// assumed to be provided in a function of the name "gc.safepoint_poll".
+//
+// We aim to insert polls such that running code can quickly be brought to a
+// well defined state for inspection by the collector. In the current
+// implementation, this is done via the insertion of poll sites at method entry
+// and the backedge of most loops. We try to avoid inserting more polls than
+// are necessary to ensure a finite period between poll sites. This is not
+// because the poll itself is expensive in the generated code; it's not. Polls
+// do tend to impact the optimizer itself in negative ways; we'd like to avoid
+// perturbing the optimization of the method as much as we can.
+//
+// We also need to make most call sites parseable. The callee might execute a
+// poll (or otherwise be inspected by the GC). If so, the entire stack
+// (including the suspended frame of the current method) must be parseable.
+//
+// This pass will insert:
+// - Call parse points ("call safepoints") for any call which may need to
+// reach a safepoint during the execution of the callee function.
+// - Backedge safepoint polls and entry safepoint polls to ensure that
+// executing code reaches a safepoint poll in a finite amount of time.
+//
+// We do not currently support return statepoints, but adding them would not
+// be hard. They are not required for correctness - entry safepoints are an
+// alternative - but some GCs may prefer them. Patches welcome.
+//
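+// For illustration, a minimal gc.safepoint_poll implementation (assumed to be
+// supplied by the frontend; the details are GC and runtime specific, and the
+// names below are hypothetical) might look like:
+//   define void @gc.safepoint_poll() {
+//   entry:
+//     %flag = load volatile i32, i32* @poll_requested
+//     %need.poll = icmp ne i32 %flag, 0
+//     br i1 %need.poll, label %slow, label %done
+//   slow:
+//     call void @do_safepoint()          ; the parseable slow-path call
+//     br label %done
+//   done:
+//     ret void
+//   }
+//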
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#define DEBUG_TYPE "safepoint-placement"
+
+STATISTIC(NumEntrySafepoints, "Number of entry safepoints inserted");
+STATISTIC(NumBackedgeSafepoints, "Number of backedge safepoints inserted");
+
+STATISTIC(CallInLoop,
+ "Number of loops without safepoints due to calls in loop");
+STATISTIC(FiniteExecution,
+ "Number of loops without safepoints finite execution");
+
+using namespace llvm;
+
+// Ignore opportunities to avoid placing safepoints on backedges, useful for
+// validation
+static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden,
+ cl::init(false));
+
+/// How narrow does the trip count of a loop have to be to be considered
+/// "counted"? Counted loops do not get safepoints at backedges.
+static cl::opt<int> CountedLoopTripWidth("spp-counted-loop-trip-width",
+ cl::Hidden, cl::init(32));
+
+// If true, split the backedge of a loop when placing the safepoint, otherwise
+// split the latch block itself. Both are useful to support for
+// experimentation, but in practice, it looks like splitting the backedge
+// optimizes better.
+static cl::opt<bool> SplitBackedge("spp-split-backedge", cl::Hidden,
+ cl::init(false));
+
+namespace {
+
+/// An analysis pass whose purpose is to identify each of the backedges in
+/// the function which require a safepoint poll to be inserted.
+struct PlaceBackedgeSafepointsImpl : public FunctionPass {
+ static char ID;
+
+ /// The output of the pass - gives a list of each backedge (described by
+ /// pointing at the branch) which need a poll inserted.
+ std::vector<TerminatorInst *> PollLocations;
+
+ /// True unless we're running spp-no-calls in which case we need to disable
+ /// the call-dependent placement opts.
+ bool CallSafepointsEnabled;
+
+ ScalarEvolution *SE = nullptr;
+ DominatorTree *DT = nullptr;
+ LoopInfo *LI = nullptr;
+ TargetLibraryInfo *TLI = nullptr;
+
+ PlaceBackedgeSafepointsImpl(bool CallSafepoints = false)
+ : FunctionPass(ID), CallSafepointsEnabled(CallSafepoints) {
+ initializePlaceBackedgeSafepointsImplPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *);
+ void runOnLoopAndSubLoops(Loop *L) {
+ // Visit all the subloops
+ for (Loop *I : *L)
+ runOnLoopAndSubLoops(I);
+ runOnLoop(L);
+ }
+
+ bool runOnFunction(Function &F) override {
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ for (Loop *I : *LI) {
+ runOnLoopAndSubLoops(I);
+ }
+ return false;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ // We no longer modify the IR at all in this pass. Thus all
+ // analyses are preserved.
+ AU.setPreservesAll();
+ }
+};
+}
+
+static cl::opt<bool> NoEntry("spp-no-entry", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoCall("spp-no-call", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoBackedge("spp-no-backedge", cl::Hidden, cl::init(false));
+
+namespace {
+struct PlaceSafepoints : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ PlaceSafepoints() : FunctionPass(ID) {
+ initializePlaceSafepointsPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // We modify the graph wholesale (inlining, block insertion, etc). We
+ // preserve nothing at the moment. We could potentially preserve dom tree
+ // if that was worth doing
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+}
+
+// Insert a safepoint poll immediately before the given instruction. Does
+// not handle the parsability of state at the runtime call, that's the
+// callers job.
+static void
+InsertSafepointPoll(Instruction *InsertBefore,
+ std::vector<CallSite> &ParsePointsNeeded /*rval*/,
+ const TargetLibraryInfo &TLI);
+
+static bool needsStatepoint(const CallSite &CS, const TargetLibraryInfo &TLI) {
+ if (callsGCLeafFunction(CS, TLI))
+ return false;
+ if (CS.isCall()) {
+ CallInst *call = cast<CallInst>(CS.getInstruction());
+ if (call->isInlineAsm())
+ return false;
+ }
+
+ return !(isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS));
+}
+
+/// Returns true if this loop is known to contain a call safepoint which
+/// must unconditionally execute on any iteration of the loop which returns
+/// to the loop header via an edge from Pred. Returns a conservatively correct
+/// answer; i.e. false is always valid.
+static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,
+ BasicBlock *Pred,
+ DominatorTree &DT,
+ const TargetLibraryInfo &TLI) {
+ // In general, we're looking for any cut of the graph which ensures
+ // there's a call safepoint along every edge between Header and Pred.
+ // For the moment, we look only for the 'cuts' that consist of a single call
+ // instruction in a block which is dominated by the Header and dominates the
+ // loop latch (Pred) block. Somewhat surprisingly, walking the entire chain
+ // of such dominating blocks gets substantially more occurrences than just
+ // checking the Pred and Header blocks themselves. This may be due to the
+ // density of loop exit conditions caused by range and null checks.
+ // TODO: structure this as an analysis pass, cache the result for subloops,
+ // avoid dom tree recalculations
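+ // For example, if some block D containing a call that needs a statepoint is
+ // dominated by Header and dominates Pred, every path from Header back to
+ // Pred passes through D, so the backedge already reaches a safepoint and no
+ // extra poll is required.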
+ assert(DT.dominates(Header, Pred) && "loop latch not dominated by header?");
+
+ BasicBlock *Current = Pred;
+ while (true) {
+ for (Instruction &I : *Current) {
+ if (auto CS = CallSite(&I))
+ // Note: Technically, needing a safepoint isn't quite the right
+ // condition here. We should instead be checking if the target method
+ // has an unconditional poll. In practice, this is only a theoretical
+ // concern since we don't have any methods with conditional-only
+ // safepoint polls.
+ if (needsStatepoint(CS, TLI))
+ return true;
+ }
+
+ if (Current == Header)
+ break;
+ Current = DT.getNode(Current)->getIDom()->getBlock();
+ }
+
+ return false;
+}
+
+/// Returns true if this loop is known to terminate in a finite number of
+/// iterations. Note that this function may return false for a loop which
+/// does actually terminate in a finite constant number of iterations due to
+/// conservatism in the analysis.
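+/// For example, a loop whose backedge-taken count is computable and fits in
+/// CountedLoopTripWidth bits (32 by default), such as a simple i32 counted
+/// loop, is treated as finite and gets no backedge poll.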
+static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,
+ BasicBlock *Pred) {
+ // A conservative bound on the loop as a whole.
+ const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L);
+ if (MaxTrips != SE->getCouldNotCompute() &&
+ SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(
+ CountedLoopTripWidth))
+ return true;
+
+ // If this is a conditional branch to the header with the alternate path
+ // being outside the loop, we can ask questions about the execution frequency
+ // of the exit block.
+ if (L->isLoopExiting(Pred)) {
+ // This returns an exact expression only. TODO: We really only need an
+ // upper bound here, but SE doesn't expose that.
+ const SCEV *MaxExec = SE->getExitCount(L, Pred);
+ if (MaxExec != SE->getCouldNotCompute() &&
+ SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(
+ CountedLoopTripWidth))
+ return true;
+ }
+
+ return /* not finite */ false;
+}
+
+static void scanOneBB(Instruction *Start, Instruction *End,
+ std::vector<CallInst *> &Calls,
+ DenseSet<BasicBlock *> &Seen,
+ std::vector<BasicBlock *> &Worklist) {
+ for (BasicBlock::iterator BBI(Start), BBE0 = Start->getParent()->end(),
+ BBE1 = BasicBlock::iterator(End);
+ BBI != BBE0 && BBI != BBE1; BBI++) {
+ if (CallInst *CI = dyn_cast<CallInst>(&*BBI))
+ Calls.push_back(CI);
+
+ // FIXME: This code does not handle invokes
+ assert(!isa<InvokeInst>(&*BBI) &&
+ "support for invokes in poll code needed");
+
+ // Only add the successor blocks if we reach the terminator instruction
+ // without encountering end first
+ if (BBI->isTerminator()) {
+ BasicBlock *BB = BBI->getParent();
+ for (BasicBlock *Succ : successors(BB)) {
+ if (Seen.insert(Succ).second) {
+ Worklist.push_back(Succ);
+ }
+ }
+ }
+ }
+}
+
+static void scanInlinedCode(Instruction *Start, Instruction *End,
+ std::vector<CallInst *> &Calls,
+ DenseSet<BasicBlock *> &Seen) {
+ Calls.clear();
+ std::vector<BasicBlock *> Worklist;
+ Seen.insert(Start->getParent());
+ scanOneBB(Start, End, Calls, Seen, Worklist);
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.back();
+ Worklist.pop_back();
+ scanOneBB(&*BB->begin(), End, Calls, Seen, Worklist);
+ }
+}
+
+bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
+ // Loop through all loop latches (branches controlling backedges). We need
+ // to place a safepoint on every backedge (potentially).
+ // Note: In common usage, there will be only one edge due to LoopSimplify
+ // having run sometime earlier in the pipeline, but this code must be correct
+ // w.r.t. loops with multiple backedges.
+ BasicBlock *Header = L->getHeader();
+ SmallVector<BasicBlock*, 16> LoopLatches;
+ L->getLoopLatches(LoopLatches);
+ for (BasicBlock *Pred : LoopLatches) {
+ assert(L->contains(Pred));
+
+ // Make a policy decision about whether this loop needs a safepoint or
+ // not. Note that this is about unburdening the optimizer in loops, not
+ // avoiding the runtime cost of the actual safepoint.
+ if (!AllBackedges) {
+ if (mustBeFiniteCountedLoop(L, SE, Pred)) {
+ DEBUG(dbgs() << "skipping safepoint placement in finite loop\n");
+ FiniteExecution++;
+ continue;
+ }
+ if (CallSafepointsEnabled &&
+ containsUnconditionalCallSafepoint(L, Header, Pred, *DT, *TLI)) {
+ // Note: This is only semantically legal since we won't do any further
+ // IPO or inlining before the actual call insertion. If we hadn't, we
+ // might later lose this call safepoint.
+ DEBUG(dbgs() << "skipping safepoint placement due to unconditional call\n");
+ CallInLoop++;
+ continue;
+ }
+ }
+
+ // TODO: We can create an inner loop which runs a finite number of
+ // iterations with an outer loop which contains a safepoint. This would
+ // not help runtime performance that much, but it might help our ability to
+ // optimize the inner loop.
+
+ // Safepoint insertion would involve creating a new basic block (as the
+ // target of the current backedge) which does the safepoint (of all live
+ // variables) and branches to the true header
+ TerminatorInst *Term = Pred->getTerminator();
+
+ DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term);
+
+ PollLocations.push_back(Term);
+ }
+
+ return false;
+}
+
+/// Returns true if an entry safepoint is not required before this callsite in
+/// the caller function.
+static bool doesNotRequireEntrySafepointBefore(const CallSite &CS) {
+ Instruction *Inst = CS.getInstruction();
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::experimental_gc_statepoint:
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+ // These can wrap an actual call which may grow the stack by an unbounded
+ // amount or run forever.
+ return false;
+ default:
+ // Most LLVM intrinsics are things which do not expand to actual calls, or
+ // at least if they do, are leaf functions that cause only finite stack
+ // growth. In particular, the optimizer likes to form things like memsets
+ // out of stores in the original IR. Another important example is
+ // llvm.localescape which must occur in the entry block. Inserting a
+ // safepoint before it is not legal since it could push the localescape
+ // out of the entry block.
+ return true;
+ }
+ }
+ return false;
+}
+
+static Instruction *findLocationForEntrySafepoint(Function &F,
+ DominatorTree &DT) {
+
+ // Conceptually, this poll needs to be on method entry, but in
+ // practice, we place it as late in the entry block as possible. We
+ // can place it as late as we want as long as it dominates all calls
+ // that can grow the stack. This, combined with backedge polls,
+ // give us all the progress guarantees we need.
+
+ // hasNextInstruction and nextInstruction are used to iterate
+ // through a "straight line" execution sequence.
+
+ auto HasNextInstruction = [](Instruction *I) {
+ if (!I->isTerminator())
+ return true;
+
+ BasicBlock *nextBB = I->getParent()->getUniqueSuccessor();
+ return nextBB && (nextBB->getUniquePredecessor() != nullptr);
+ };
+
+ auto NextInstruction = [&](Instruction *I) {
+ assert(HasNextInstruction(I) &&
+ "first check if there is a next instruction!");
+
+ if (I->isTerminator())
+ return &I->getParent()->getUniqueSuccessor()->front();
+ return &*++I->getIterator();
+ };
+
+ Instruction *Cursor = nullptr;
+ for (Cursor = &F.getEntryBlock().front(); HasNextInstruction(Cursor);
+ Cursor = NextInstruction(Cursor)) {
+
+ // We need to ensure a safepoint poll occurs before any 'real' call. The
+ // easiest way to ensure finite execution between safepoints in the face of
+ // recursive and mutually recursive functions is to enforce that each take
+ // a safepoint. Additionally, we need to ensure a poll before any call
+ // which can grow the stack by an unbounded amount. This isn't required
+ // for GC semantics per se, but is a common requirement for languages
+ // which detect stack overflow via guard pages and then throw exceptions.
+ if (auto CS = CallSite(Cursor)) {
+ if (doesNotRequireEntrySafepointBefore(CS))
+ continue;
+ break;
+ }
+ }
+
+ assert((HasNextInstruction(Cursor) || Cursor->isTerminator()) &&
+ "either we stopped because of a call, or because of terminator");
+
+ return Cursor;
+}
+
+static const char *const GCSafepointPollName = "gc.safepoint_poll";
+
+static bool isGCSafepointPoll(Function &F) {
+ return F.getName().equals(GCSafepointPollName);
+}
+
+/// Returns true if this function should be rewritten to include safepoint
+/// polls and parseable call sites. The main point of this function is to be
+/// an extension point for custom logic.
+static bool shouldRewriteFunction(Function &F) {
+ // TODO: This should check the GCStrategy
+ if (F.hasGC()) {
+ const auto &FunctionGCName = F.getGC();
+ const StringRef StatepointExampleName("statepoint-example");
+ const StringRef CoreCLRName("coreclr");
+ return (StatepointExampleName == FunctionGCName) ||
+ (CoreCLRName == FunctionGCName);
+ } else
+ return false;
+}
+
+// TODO: These should become properties of the GCStrategy, possibly with
+// command line overrides.
+static bool enableEntrySafepoints(Function &F) { return !NoEntry; }
+static bool enableBackedgeSafepoints(Function &F) { return !NoBackedge; }
+static bool enableCallSafepoints(Function &F) { return !NoCall; }
+
+bool PlaceSafepoints::runOnFunction(Function &F) {
+ if (F.isDeclaration() || F.empty()) {
+ // This is a declaration, nothing to do. Must exit early to avoid crash in
+ // dom tree calculation
+ return false;
+ }
+
+ if (isGCSafepointPoll(F)) {
+ // Given we're inlining this inside of safepoint poll insertion, this
+ // doesn't make any sense. Note that we do make any contained calls
+ // parseable after we inline a poll.
+ return false;
+ }
+
+ if (!shouldRewriteFunction(F))
+ return false;
+
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ bool Modified = false;
+
+ // In various bits below, we rely on the fact that uses are reachable from
+ // defs. When there are basic blocks unreachable from the entry, dominance
+ // and reachability queries return nonsensical results. Thus, we preprocess
+ // the function to ensure these properties hold.
+ Modified |= removeUnreachableBlocks(F);
+
+ // STEP 1 - Insert the safepoint polling locations. We do not need to
+ // actually insert parse points yet. That will be done for all polls and
+ // calls in a single pass.
+
+ DominatorTree DT;
+ DT.recalculate(F);
+
+ SmallVector<Instruction *, 16> PollsNeeded;
+ std::vector<CallSite> ParsePointNeeded;
+
+ if (enableBackedgeSafepoints(F)) {
+ // Construct a pass manager to run the LoopPass backedge logic. We
+ // need the pass manager to handle scheduling all the loop passes
+ // appropriately. Doing this by hand is painful and just not worth messing
+ // with for the moment.
+ legacy::FunctionPassManager FPM(F.getParent());
+ bool CanAssumeCallSafepoints = enableCallSafepoints(F);
+ auto *PBS = new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints);
+ FPM.add(PBS);
+ FPM.run(F);
+
+ // We preserve dominance information when inserting the poll, otherwise
+ // we'd have to recalculate this on every insert
+ DT.recalculate(F);
+
+ auto &PollLocations = PBS->PollLocations;
+
+ auto OrderByBBName = [](Instruction *a, Instruction *b) {
+ return a->getParent()->getName() < b->getParent()->getName();
+ };
+ // We need the order of the list to be stable so that naming ends up stable
+ // when we split edges. This makes test cases much easier to write.
+ std::sort(PollLocations.begin(), PollLocations.end(), OrderByBBName);
+
+ // We can sometimes end up with duplicate poll locations. This happens if
+ // a single loop is visited more than once. The fact this happens seems
+ // wrong, but it does happen for the split-backedge.ll test case.
+ PollLocations.erase(std::unique(PollLocations.begin(),
+ PollLocations.end()),
+ PollLocations.end());
+
+ // Insert a poll at each point the analysis pass identified
+ // The poll location must be the terminator of a loop latch block.
+ for (TerminatorInst *Term : PollLocations) {
+ // We are inserting a poll, the function is modified
+ Modified = true;
+
+ if (SplitBackedge) {
+ // Split the backedge of the loop and insert the poll within that new
+ // basic block. This creates a loop with two latches per original
+ // latch (which is non-ideal), but this appears to be easier to
+ // optimize in practice than inserting the poll immediately before the
+ // latch test.
+
+ // Since this is a latch, at least one of the successors must dominate
+ // it. It's possible that we have a) duplicate edges to the same header
+ // and b) edges to distinct loop headers. We need to insert polls on
+ // each.
+ SetVector<BasicBlock *> Headers;
+ for (unsigned i = 0; i < Term->getNumSuccessors(); i++) {
+ BasicBlock *Succ = Term->getSuccessor(i);
+ if (DT.dominates(Succ, Term->getParent())) {
+ Headers.insert(Succ);
+ }
+ }
+ assert(!Headers.empty() && "poll location is not a loop latch?");
+
+ // The split loop structure here is so that we only need to recalculate
+ // the dominator tree once. Alternatively, we could just keep it up to
+ // date and use a more natural merged loop.
+ SetVector<BasicBlock *> SplitBackedges;
+ for (BasicBlock *Header : Headers) {
+ BasicBlock *NewBB = SplitEdge(Term->getParent(), Header, &DT);
+ PollsNeeded.push_back(NewBB->getTerminator());
+ NumBackedgeSafepoints++;
+ }
+ } else {
+ // Split the latch block itself, right before the terminator.
+ PollsNeeded.push_back(Term);
+ NumBackedgeSafepoints++;
+ }
+ }
+ }
+
+ if (enableEntrySafepoints(F)) {
+ if (Instruction *Location = findLocationForEntrySafepoint(F, DT)) {
+ PollsNeeded.push_back(Location);
+ Modified = true;
+ NumEntrySafepoints++;
+ }
+ // TODO: else we should assert that there was, in fact, a policy choice to
+ // not insert an entry safepoint poll.
+ }
+
+ // Now that we've identified all the needed safepoint poll locations, insert
+ // safepoint polls themselves.
+ for (Instruction *PollLocation : PollsNeeded) {
+ std::vector<CallSite> RuntimeCalls;
+ InsertSafepointPoll(PollLocation, RuntimeCalls, TLI);
+ ParsePointNeeded.insert(ParsePointNeeded.end(), RuntimeCalls.begin(),
+ RuntimeCalls.end());
+ }
+
+ return Modified;
+}
+
+char PlaceBackedgeSafepointsImpl::ID = 0;
+char PlaceSafepoints::ID = 0;
+
+FunctionPass *llvm::createPlaceSafepointsPass() {
+ return new PlaceSafepoints();
+}
+
+INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl,
+ "place-backedge-safepoints-impl",
+ "Place Backedge Safepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl,
+ "place-backedge-safepoints-impl",
+ "Place Backedge Safepoints", false, false)
+
+INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+ false, false)
+INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+ false, false)
+
+static void
+InsertSafepointPoll(Instruction *InsertBefore,
+ std::vector<CallSite> &ParsePointsNeeded /*rval*/,
+ const TargetLibraryInfo &TLI) {
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Module *M = InsertBefore->getModule();
+ assert(M && "must be part of a module");
+
+ // Inline the safepoint poll implementation - this will get all the branch,
+ // control flow, etc. Most importantly, it will introduce the actual slow
+ // path call - where we need to insert a safepoint (parsepoint).
+
+ auto *F = M->getFunction(GCSafepointPollName);
+ assert(F && "gc.safepoint_poll function is missing");
+ assert(F->getValueType() ==
+ FunctionType::get(Type::getVoidTy(M->getContext()), false) &&
+ "gc.safepoint_poll declared with wrong type");
+ assert(!F->empty() && "gc.safepoint_poll must be a non-empty function");
+ CallInst *PollCall = CallInst::Create(F, "", InsertBefore);
+
+ // Record some information about the call site we're replacing
+ BasicBlock::iterator Before(PollCall), After(PollCall);
+ bool IsBegin = false;
+ if (Before == OrigBB->begin())
+ IsBegin = true;
+ else
+ Before--;
+
+ After++;
+ assert(After != OrigBB->end() && "must have successor");
+
+ // Do the actual inlining
+ InlineFunctionInfo IFI;
+ bool InlineStatus = InlineFunction(PollCall, IFI);
+ assert(InlineStatus && "inline must succeed");
+ (void)InlineStatus; // suppress warning in release-asserts
+
+ // Check post-conditions
+ assert(IFI.StaticAllocas.empty() && "can't have allocs");
+
+ std::vector<CallInst *> Calls; // new calls
+ DenseSet<BasicBlock *> BBs; // new BBs + insertee
+
+ // Include only the newly inserted instructions. Note: begin may not be valid
+ // if we inserted at the beginning of the basic block.
+ BasicBlock::iterator Start = IsBegin ? OrigBB->begin() : std::next(Before);
+
+ // If your poll function includes an unreachable at the end, that's not
+ // valid. Bugpoint likes to create this, so check for it.
+ assert(isPotentiallyReachable(&*Start, &*After) &&
+ "malformed poll function");
+
+ scanInlinedCode(&*Start, &*After, Calls, BBs);
+ assert(!Calls.empty() && "slow path not found for safepoint poll");
+
+ // Record the fact we need a parsable state at the runtime call contained in
+ // the poll function. This is required so that the runtime knows how to
+ // parse the last frame when we actually take the safepoint (i.e. execute
+ // the slow path)
+ assert(ParsePointsNeeded.empty());
+ for (auto *CI : Calls) {
+ // No safepoint needed or wanted
+ if (!needsStatepoint(CI, TLI))
+ continue;
+
+ // These are likely runtime calls. Should we assert that via calling
+ // convention or something?
+ ParsePointsNeeded.push_back(CallSite(CI));
+ }
+ assert(ParsePointsNeeded.size() <= Calls.size());
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
new file mode 100644
index 000000000000..88dcaf0f8a36
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -0,0 +1,2401 @@
+//===- Reassociate.cpp - Reassociate binary expressions -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates commutative expressions in an order that is designed
+// to promote better constant propagation, GCSE, LICM, PRE, etc.
+//
+// For example: 4 + (x + 5) -> x + (4 + 5)
+//
+// In the implementation of this algorithm, constants are assigned rank = 0,
+// function arguments are rank = 1, and other values are assigned ranks
+// corresponding to the reverse post order traversal of current function
+// (starting at 2), which effectively gives values in deep loops higher rank
+// than values not in loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/Reassociate.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+using namespace reassociate;
+
+#define DEBUG_TYPE "reassociate"
+
+STATISTIC(NumChanged, "Number of insts reassociated");
+STATISTIC(NumAnnihil, "Number of expr tree annihilated");
+STATISTIC(NumFactor , "Number of multiplies factored");
+
+#ifndef NDEBUG
+/// Print out the expression identified in the Ops list.
+static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
+ Module *M = I->getModule();
+ dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " "
+ << *Ops[0].Op->getType() << '\t';
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ dbgs() << "[ ";
+ Ops[i].Op->printAsOperand(dbgs(), false, M);
+ dbgs() << ", #" << Ops[i].Rank << "] ";
+ }
+}
+#endif
+
+/// Utility class representing a non-constant Xor-operand. We classify
+/// non-constant Xor-Operands into two categories:
+/// C1) The operand is in the form "X & C", where C is a constant and C != ~0
+/// C2)
+/// C2.1) The operand is in the form of "X | C", where C is a non-zero
+/// constant.
+/// C2.2) Any operand E which doesn't fall into C1 or C2.1; we view such an
+/// operand as "E | 0"
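+///
+/// Illustrative classification (operand names are hypothetical):
+///   "A & 0x0F" -> C1:   SymbolicPart = A, ConstPart = 0x0F, isOr = false
+///   "B | 0x30" -> C2.1: SymbolicPart = B, ConstPart = 0x30, isOr = true
+///   plain "E"  -> C2.2: viewed as "E | 0", i.e. ConstPart = 0, isOr = true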
+class llvm::reassociate::XorOpnd {
+public:
+ XorOpnd(Value *V);
+
+ bool isInvalid() const { return SymbolicPart == nullptr; }
+ bool isOrExpr() const { return isOr; }
+ Value *getValue() const { return OrigVal; }
+ Value *getSymbolicPart() const { return SymbolicPart; }
+ unsigned getSymbolicRank() const { return SymbolicRank; }
+ const APInt &getConstPart() const { return ConstPart; }
+
+ void Invalidate() { SymbolicPart = OrigVal = nullptr; }
+ void setSymbolicRank(unsigned R) { SymbolicRank = R; }
+
+private:
+ Value *OrigVal;
+ Value *SymbolicPart;
+ APInt ConstPart;
+ unsigned SymbolicRank;
+ bool isOr;
+};
+
+XorOpnd::XorOpnd(Value *V) {
+ assert(!isa<ConstantInt>(V) && "No ConstantInt");
+ OrigVal = V;
+ Instruction *I = dyn_cast<Instruction>(V);
+ SymbolicRank = 0;
+
+ if (I && (I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::And)) {
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ const APInt *C;
+ if (match(V0, PatternMatch::m_APInt(C)))
+ std::swap(V0, V1);
+
+ if (match(V1, PatternMatch::m_APInt(C))) {
+ ConstPart = *C;
+ SymbolicPart = V0;
+ isOr = (I->getOpcode() == Instruction::Or);
+ return;
+ }
+ }
+
+ // view the operand as "V | 0"
+ SymbolicPart = V;
+ ConstPart = APInt::getNullValue(V->getType()->getScalarSizeInBits());
+ isOr = true;
+}
+
+/// If V is an instruction of the specified opcode and it only has one use,
+/// return it as a BinaryOperator; otherwise return nullptr.
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (I && I->hasOneUse() && I->getOpcode() == Opcode)
+ if (!isa<FPMathOperator>(I) || I->isFast())
+ return cast<BinaryOperator>(I);
+ return nullptr;
+}
+
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
+ unsigned Opcode2) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (I && I->hasOneUse() &&
+ (I->getOpcode() == Opcode1 || I->getOpcode() == Opcode2))
+ if (!isa<FPMathOperator>(I) || I->isFast())
+ return cast<BinaryOperator>(I);
+ return nullptr;
+}
+
+void ReassociatePass::BuildRankMap(Function &F,
+ ReversePostOrderTraversal<Function*> &RPOT) {
+ unsigned Rank = 2;
+
+ // Assign distinct ranks to function arguments.
+ for (auto &Arg : F.args()) {
+ ValueRankMap[&Arg] = ++Rank;
+ DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank
+ << "\n");
+ }
+
+ // Traverse basic blocks in ReversePostOrder
+ for (BasicBlock *BB : RPOT) {
+ unsigned BBRank = RankMap[BB] = ++Rank << 16;
+
+ // Walk the basic block, adding precomputed ranks for any instructions that
+ // we cannot move. This ensures that the ranks for these instructions are
+ // all different in the block.
+ for (Instruction &I : *BB)
+ if (mayBeMemoryDependent(I))
+ ValueRankMap[&I] = ++BBRank;
+ }
+}
+
+unsigned ReassociatePass::getRank(Value *V) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument.
+ return 0; // Otherwise it's a global or constant, rank 0.
+ }
+
+ if (unsigned Rank = ValueRankMap[I])
+ return Rank; // Rank already known?
+
+ // If this is an expression, return 1+MAX(rank(LHS), rank(RHS)) so that
+ // we can reassociate expressions for code motion! Since we do not recurse
+ // for PHI nodes, we cannot have infinite recursion here, because there
+ // cannot be loops in the value graph that do not go through PHI nodes.
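+ //
+ // Illustrative example (hypothetical ranks, ignoring the per-block cap):
+ // with rank(A) = 3 and rank(B) = 5, "A + B" gets rank 1 + max(3, 5) = 6,
+ // while "~A" keeps rank 3.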
+ unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
+ for (unsigned i = 0, e = I->getNumOperands();
+ i != e && Rank != MaxRank; ++i)
+ Rank = std::max(Rank, getRank(I->getOperand(i)));
+
+ // If this is a not or neg instruction, do not count it for rank. This
+ // assures us that X and ~X will have the same rank.
+ if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
+ !BinaryOperator::isFNeg(I))
+ ++Rank;
+
+ DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n");
+
+ return ValueRankMap[I] = Rank;
+}
+
+// Canonicalize constants to RHS. Otherwise, sort the operands by rank.
+void ReassociatePass::canonicalizeOperands(Instruction *I) {
+ assert(isa<BinaryOperator>(I) && "Expected binary operator.");
+ assert(I->isCommutative() && "Expected commutative operator.");
+
+ Value *LHS = I->getOperand(0);
+ Value *RHS = I->getOperand(1);
+ if (LHS == RHS || isa<Constant>(RHS))
+ return;
+ if (isa<Constant>(LHS) || getRank(RHS) < getRank(LHS))
+ cast<BinaryOperator>(I)->swapOperands();
+}
+
+static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
+ if (S1->getType()->isIntOrIntVectorTy())
+ return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore);
+ else {
+ BinaryOperator *Res =
+ BinaryOperator::CreateFAdd(S1, S2, Name, InsertBefore);
+ Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+ return Res;
+ }
+}
+
+static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
+ if (S1->getType()->isIntOrIntVectorTy())
+ return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore);
+ else {
+ BinaryOperator *Res =
+ BinaryOperator::CreateFMul(S1, S2, Name, InsertBefore);
+ Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+ return Res;
+ }
+}
+
+static BinaryOperator *CreateNeg(Value *S1, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
+ if (S1->getType()->isIntOrIntVectorTy())
+ return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
+ else {
+ BinaryOperator *Res = BinaryOperator::CreateFNeg(S1, Name, InsertBefore);
+ Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+ return Res;
+ }
+}
+
+/// Replace 0-X with X*-1.
+static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
+ Type *Ty = Neg->getType();
+ Constant *NegOne = Ty->isIntOrIntVectorTy() ?
+ ConstantInt::getAllOnesValue(Ty) : ConstantFP::get(Ty, -1.0);
+
+ BinaryOperator *Res = CreateMul(Neg->getOperand(1), NegOne, "", Neg, Neg);
+ Neg->setOperand(1, Constant::getNullValue(Ty)); // Drop use of op.
+ Res->takeName(Neg);
+ Neg->replaceAllUsesWith(Res);
+ Res->setDebugLoc(Neg->getDebugLoc());
+ return Res;
+}
+
+/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael
+/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for
+/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
+/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
+/// even x in Bitwidth-bit arithmetic.
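+///
+/// For example, with Bitwidth = 8 this returns 6: lambda(2^8) = 2^6 = 64,
+/// so x^64 == 1 (mod 256) for every odd 8-bit value x.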
+static unsigned CarmichaelShift(unsigned Bitwidth) {
+ if (Bitwidth < 3)
+ return Bitwidth - 1;
+ return Bitwidth - 2;
+}
+
+/// Add the extra weight 'RHS' to the existing weight 'LHS',
+/// reducing the combined weight using any special properties of the operation.
+/// The existing weight LHS represents the computation X op X op ... op X where
+/// X occurs LHS times. The combined weight represents X op X op ... op X with
+/// X occurring LHS + RHS times. If op is "Xor" for example then the combined
+/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even;
+/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second.
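+///
+/// Illustrative reductions (weights are hypothetical):
+///   Xor: LHS = 1, RHS = 1 -> LHS becomes 0 (X ^ X == 0).
+///   And: LHS = 1, RHS = 1 -> LHS stays 1 (X & X == X).
+///   Mul with Bitwidth = 8 (CM = 64, threshold 72): LHS = 70, RHS = 10
+///   -> 80, which is reduced to 80 - 64 = 16.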
+static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
+ // If we were working with infinite precision arithmetic then the combined
+ // weight would be LHS + RHS. But we are using finite precision arithmetic,
+ // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct
+ // for nilpotent operations and addition, but not for idempotent operations
+ // and multiplication), so it is important to correctly reduce the combined
+ // weight back into range if wrapping would be wrong.
+
+ // If RHS is zero then the weight didn't change.
+ if (RHS.isMinValue())
+ return;
+ // If LHS is zero then the combined weight is RHS.
+ if (LHS.isMinValue()) {
+ LHS = RHS;
+ return;
+ }
+ // From this point on we know that neither LHS nor RHS is zero.
+
+ if (Instruction::isIdempotent(Opcode)) {
+ // Idempotent means X op X === X, so any non-zero weight is equivalent to a
+ // weight of 1. Keeping weights at zero or one also means that wrapping is
+ // not a problem.
+ assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+ return; // Return a weight of 1.
+ }
+ if (Instruction::isNilpotent(Opcode)) {
+ // Nilpotent means X op X === 0, so reduce weights modulo 2.
+ assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+ LHS = 0; // 1 + 1 === 0 modulo 2.
+ return;
+ }
+ if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) {
+ // TODO: Reduce the weight by exploiting nsw/nuw?
+ LHS += RHS;
+ return;
+ }
+
+ assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
+ "Unknown associative operation!");
+ unsigned Bitwidth = LHS.getBitWidth();
+ // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
+ // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth
+ // bit number x, since either x is odd in which case x^CM = 1, or x is even in
+ // which case both x^W and x^(W - CM) are zero. By subtracting off multiples
+ // of CM like this weights can always be reduced to the range [0, CM+Bitwidth)
+ // which by a happy accident means that they can always be represented using
+ // Bitwidth bits.
+ // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than
+ // the Carmichael number).
+ if (Bitwidth > 3) {
+ /// CM - The value of Carmichael's lambda function.
+ APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
+ // Any weight W >= Threshold can be replaced with W - CM.
+ APInt Threshold = CM + Bitwidth;
+ assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!");
+ // For Bitwidth 4 or more the following sum does not overflow.
+ LHS += RHS;
+ while (LHS.uge(Threshold))
+ LHS -= CM;
+ } else {
+ // To avoid problems with overflow do everything the same as above but using
+ // a larger type.
+ unsigned CM = 1U << CarmichaelShift(Bitwidth);
+ unsigned Threshold = CM + Bitwidth;
+ assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold &&
+ "Weights not reduced!");
+ unsigned Total = LHS.getZExtValue() + RHS.getZExtValue();
+ while (Total >= Threshold)
+ Total -= CM;
+ LHS = Total;
+ }
+}
+
+using RepeatedValue = std::pair<Value*, APInt>;
+
+/// Given an associative binary expression, return the leaf
+/// nodes in Ops along with their weights (how many times the leaf occurs). The
+/// original expression is the same as
+/// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times
+/// op
+/// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times
+/// op
+/// ...
+/// op
+/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times
+///
+/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct.
+///
+/// This routine may modify the function, in which case it returns 'true'. The
+/// changes it makes may well be destructive, changing the value computed by 'I'
+/// to something completely different. Thus if the routine returns 'true' then
+/// you MUST either replace I with a new expression computed from the Ops array,
+/// or use RewriteExprTree to put the values back in.
+///
+/// A leaf node is either not a binary operation of the same kind as the root
+/// node 'I' (i.e. is not a binary operator at all, or is, but with a different
+/// opcode), or is the same kind of binary operator but has a use which either
+/// does not belong to the expression, or does belong to the expression but is
+/// a leaf node. Every leaf node has at least one use that is a non-leaf node
+/// of the expression, while for non-leaf nodes (except for the root 'I') every
+/// use is a non-leaf node of the expression.
+///
+/// For example:
+///  expression        graph node names
+///
+///                     +        |        I
+///                    / \       |
+///                   +   +      |      A,  B
+///                  / \ / \     |
+///                 *   +   *    |      C,  D,  E
+///                / \ / \ / \   |
+///                   +       *  |      F,  G
+///
+/// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in
+/// that order) (C, 1), (E, 1), (F, 2), (G, 2).
+///
+/// The expression is maximal: if some instruction is a binary operator of the
+/// same kind as 'I', and all of its uses are non-leaf nodes of the expression,
+/// then the instruction also belongs to the expression, is not a leaf node of
+/// it, and its operands also belong to the expression (but may be leaf nodes).
+///
+/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in
+/// order to ensure that every non-root node in the expression has *exactly one*
+/// use by a non-leaf node of the expression. This destruction means that the
+/// caller MUST either replace 'I' with a new expression or use something like
+/// RewriteExprTree to put the values back in if the routine indicates that it
+/// made a change by returning 'true'.
+///
+/// In the above example either the right operand of A or the left operand of B
+/// will be replaced by undef. If it is B's operand then this gives:
+///
+///                     +        |        I
+///                    / \       |
+///                   +   +      |      A,  B - operand of B replaced with undef
+///                  / \   \     |
+///                 *   +   *    |      C,  D,  E
+///                / \ / \ / \   |
+///                   +       *  |      F,  G
+///
+/// Note that such undef operands can only be reached by passing through 'I'.
+/// For example, if you visit operands recursively starting from a leaf node
+/// then you will never see such an undef operand unless you get back to 'I',
+/// which requires passing through a phi node.
+///
+/// Note that this routine may also mutate binary operators of the wrong type
+/// that have all uses inside the expression (i.e. only used by non-leaf nodes
+/// of the expression) if it can turn them into binary operators of the right
+/// type and thus make the expression bigger.
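+///
+/// A small concrete illustration (the IR names here are hypothetical): for
+///   %t = mul i32 %a, %a
+///   %r = mul i32 %t, %b
+/// where %t has no other uses and %a, %b are function arguments, calling this
+/// on %r fills Ops with (a, 2) and (b, 1) in some order, i.e. %r == a*a*b.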
+static bool LinearizeExprTree(BinaryOperator *I,
+ SmallVectorImpl<RepeatedValue> &Ops) {
+ DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
+ unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
+ unsigned Opcode = I->getOpcode();
+ assert(I->isAssociative() && I->isCommutative() &&
+ "Expected an associative and commutative operation!");
+
+ // Visit all operands of the expression, keeping track of their weight (the
+ // number of paths from the expression root to the operand, or if you like
+ // the number of times that operand occurs in the linearized expression).
+ // For example, if I = X + A, where X = A + B, then I, X and B have weight 1
+ // while A has weight two.
+
+ // Worklist of non-leaf nodes (their operands are in the expression too) along
+ // with their weights, representing a certain number of paths to the operator.
+ // If an operator occurs in the worklist multiple times then we found multiple
+ // ways to get to it.
+ SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight)
+ Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1)));
+ bool Changed = false;
+
+ // Leaves of the expression are values that either aren't the right kind of
+ // operation (eg: a constant, or a multiply in an add tree), or are, but have
+ // some uses that are not inside the expression. For example, in I = X + X,
+ // X = A + B, the value X has two uses (by I) that are in the expression. If
+ // X has any other uses, for example in a return instruction, then we consider
+ // X to be a leaf, and won't analyze it further. When we first visit a value,
+ // if it has more than one use then at first we conservatively consider it to
+ // be a leaf. Later, as the expression is explored, we may discover some more
+ // uses of the value from inside the expression. If all uses turn out to be
+ // from within the expression (and the value is a binary operator of the right
+ // kind) then the value is no longer considered to be a leaf, and its operands
+ // are explored.
+
+ // Leaves - Keeps track of the set of putative leaves as well as the number of
+ // paths to each leaf seen so far.
+ using LeafMap = DenseMap<Value *, APInt>;
+ LeafMap Leaves; // Leaf -> Total weight so far.
+ SmallVector<Value *, 8> LeafOrder; // Ensure deterministic leaf output order.
+
+#ifndef NDEBUG
+ SmallPtrSet<Value *, 8> Visited; // For sanity checking the iteration scheme.
+#endif
+ while (!Worklist.empty()) {
+ std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val();
+ I = P.first; // We examine the operands of this binary operator.
+
+ for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands.
+ Value *Op = I->getOperand(OpIdx);
+ APInt Weight = P.second; // Number of paths to this operand.
+ DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
+ assert(!Op->use_empty() && "No uses, so how did we get to it?!");
+
+ // If this is a binary operation of the right kind with only one use then
+ // add its operands to the expression.
+ if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+ assert(Visited.insert(Op).second && "Not first visit!");
+ DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
+ Worklist.push_back(std::make_pair(BO, Weight));
+ continue;
+ }
+
+ // Appears to be a leaf. Is the operand already in the set of leaves?
+ LeafMap::iterator It = Leaves.find(Op);
+ if (It == Leaves.end()) {
+ // Not in the leaf map. Must be the first time we saw this operand.
+ assert(Visited.insert(Op).second && "Not first visit!");
+ if (!Op->hasOneUse()) {
+ // This value has uses not accounted for by the expression, so it is
+ // not safe to modify. Mark it as being a leaf.
+ DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
+ LeafOrder.push_back(Op);
+ Leaves[Op] = Weight;
+ continue;
+ }
+ // No uses outside the expression, try morphing it.
+ } else {
+ // Already in the leaf map.
+ assert(It != Leaves.end() && Visited.count(Op) &&
+ "In leaf map but not visited!");
+
+ // Update the number of paths to the leaf.
+ IncorporateWeight(It->second, Weight, Opcode);
+
+#if 0 // TODO: Re-enable once PR13021 is fixed.
+ // The leaf already has one use from inside the expression. As we want
+ // exactly one such use, drop this new use of the leaf.
+ assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
+ I->setOperand(OpIdx, UndefValue::get(I->getType()));
+ Changed = true;
+
+ // If the leaf is a binary operation of the right kind and we now see
+ // that its multiple original uses were in fact all by nodes belonging
+ // to the expression, then no longer consider it to be a leaf and add
+ // its operands to the expression.
+ if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+ DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
+ Worklist.push_back(std::make_pair(BO, It->second));
+ Leaves.erase(It);
+ continue;
+ }
+#endif
+
+ // If we still have uses that are not accounted for by the expression
+ // then it is not safe to modify the value.
+ if (!Op->hasOneUse())
+ continue;
+
+ // No uses outside the expression, try morphing it.
+ Weight = It->second;
+ Leaves.erase(It); // Since the value may be morphed below.
+ }
+
+ // At this point we have a value which, first of all, is not a binary
+ // expression of the right kind, and secondly, is only used inside the
+ // expression. This means that it can safely be modified. See if we
+ // can usefully morph it into an expression of the right kind.
+ assert((!isa<Instruction>(Op) ||
+ cast<Instruction>(Op)->getOpcode() != Opcode
+ || (isa<FPMathOperator>(Op) &&
+ !cast<Instruction>(Op)->isFast())) &&
+ "Should have been handled above!");
+ assert(Op->hasOneUse() && "Has uses outside the expression tree!");
+
+ // If this is a multiply expression, turn any internal negations into
+ // multiplies by -1 so they can be reassociated.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op))
+ if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) ||
+ (Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) {
+ DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
+ BO = LowerNegateToMultiply(BO);
+ DEBUG(dbgs() << *BO << '\n');
+ Worklist.push_back(std::make_pair(BO, Weight));
+ Changed = true;
+ continue;
+ }
+
+ // Failed to morph into an expression of the right type. This really is
+ // a leaf.
+ DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
+ assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
+ LeafOrder.push_back(Op);
+ Leaves[Op] = Weight;
+ }
+ }
+
+ // The leaves, repeated according to their weights, represent the linearized
+ // form of the expression.
+ for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) {
+ Value *V = LeafOrder[i];
+ LeafMap::iterator It = Leaves.find(V);
+ if (It == Leaves.end())
+ // Node initially thought to be a leaf wasn't.
+ continue;
+ assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!");
+ APInt Weight = It->second;
+ if (Weight.isMinValue())
+ // Leaf already output or weight reduction eliminated it.
+ continue;
+ // Ensure the leaf is only output once.
+ It->second = 0;
+ Ops.push_back(std::make_pair(V, Weight));
+ }
+
+ // For nilpotent operations or addition there may be no operands, for example
+ // because the expression was "X xor X" or consisted of 2^Bitwidth additions:
+ // in both cases the weight reduces to 0 causing the value to be skipped.
+ if (Ops.empty()) {
+ Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());
+ assert(Identity && "Associative operation without identity!");
+ Ops.emplace_back(Identity, APInt(Bitwidth, 1));
+ }
+
+ return Changed;
+}
+
+/// Now that the operands for this expression tree are
+/// linearized and optimized, emit them in-order.
+void ReassociatePass::RewriteExprTree(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ assert(Ops.size() > 1 && "Single values should be used directly!");
+
+ // Since our optimizations should never increase the number of operations, the
+ // new expression can usually be written reusing the existing binary operators
+ // from the original expression tree, without creating any new instructions,
+ // though the rewritten expression may have a completely different topology.
+ // We take care to not change anything if the new expression will be the same
+ // as the original. If more than trivial changes (like commuting operands)
+ // were made then we are obliged to clear out any optional subclass data like
+ // nsw flags.
+
+ /// NodesToRewrite - Nodes from the original expression available for writing
+ /// the new expression into.
+ SmallVector<BinaryOperator*, 8> NodesToRewrite;
+ unsigned Opcode = I->getOpcode();
+ BinaryOperator *Op = I;
+
+ /// NotRewritable - The operands being written will be the leaves of the new
+ /// expression and must not be used as inner nodes (via NodesToRewrite) by
+ /// mistake. Inner nodes are always reassociable, and usually leaves are not
+ /// (if they were they would have been incorporated into the expression and so
+ /// would not be leaves), so most of the time there is no danger of this. But
+ /// in rare cases a leaf may become reassociable if an optimization kills uses
+ /// of it, or it may momentarily become reassociable during rewriting (below)
+ /// due to it being removed as an operand of one of its uses. Ensure that
+ /// misuse of leaf nodes as inner nodes cannot occur by remembering all of
+ /// the future leaves and refusing to reuse any of them as inner nodes.
+ SmallPtrSet<Value*, 8> NotRewritable;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ NotRewritable.insert(Ops[i].Op);
+
+ // ExpressionChanged - Non-null if the rewritten expression differs from the
+ // original in some non-trivial way, requiring the clearing of optional flags.
+ // Flags are cleared from the operator in ExpressionChanged up to I inclusive.
+ BinaryOperator *ExpressionChanged = nullptr;
+ for (unsigned i = 0; ; ++i) {
+ // The last operation (which comes earliest in the IR) is special as both
+ // operands will come from Ops, rather than just one with the other being
+ // a subexpression.
+ if (i+2 == Ops.size()) {
+ Value *NewLHS = Ops[i].Op;
+ Value *NewRHS = Ops[i+1].Op;
+ Value *OldLHS = Op->getOperand(0);
+ Value *OldRHS = Op->getOperand(1);
+
+ if (NewLHS == OldLHS && NewRHS == OldRHS)
+ // Nothing changed, leave it alone.
+ break;
+
+ if (NewLHS == OldRHS && NewRHS == OldLHS) {
+ // The order of the operands was reversed. Swap them.
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ Op->swapOperands();
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
+ MadeChange = true;
+ ++NumChanged;
+ break;
+ }
+
+ // The new operation differs non-trivially from the original. Overwrite
+ // the old operands with the new ones.
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ if (NewLHS != OldLHS) {
+ BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
+ if (BO && !NotRewritable.count(BO))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(0, NewLHS);
+ }
+ if (NewRHS != OldRHS) {
+ BinaryOperator *BO = isReassociableOp(OldRHS, Opcode);
+ if (BO && !NotRewritable.count(BO))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(1, NewRHS);
+ }
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
+
+ ExpressionChanged = Op;
+ MadeChange = true;
+ ++NumChanged;
+
+ break;
+ }
+
+ // Not the last operation. The left-hand side will be a sub-expression
+ // while the right-hand side will be the current element of Ops.
+ Value *NewRHS = Ops[i].Op;
+ if (NewRHS != Op->getOperand(1)) {
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ if (NewRHS == Op->getOperand(0)) {
+ // The new right-hand side was already present as the left operand. If
+ // we are lucky then swapping the operands will sort out both of them.
+ Op->swapOperands();
+ } else {
+ // Overwrite with the new right-hand side.
+ BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode);
+ if (BO && !NotRewritable.count(BO))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(1, NewRHS);
+ ExpressionChanged = Op;
+ }
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
+ MadeChange = true;
+ ++NumChanged;
+ }
+
+ // Now deal with the left-hand side. If this is already an operation node
+ // from the original expression then just rewrite the rest of the expression
+ // into it.
+ BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode);
+ if (BO && !NotRewritable.count(BO)) {
+ Op = BO;
+ continue;
+ }
+
+ // Otherwise, grab a spare node from the original expression and use that as
+ // the left-hand side. If there are no nodes left then the optimizers made
+ // an expression with more nodes than the original! This usually means that
+ // they did something stupid, but it might mean that the problem was just too
+ // hard (finding the minimal number of multiplications needed to realize a
+ // multiplication expression is NP-complete). Whatever the reason, smart or
+ // stupid, create a new node if there are none left.
+ BinaryOperator *NewOp;
+ if (NodesToRewrite.empty()) {
+ Constant *Undef = UndefValue::get(I->getType());
+ NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
+ Undef, Undef, "", I);
+ if (NewOp->getType()->isFPOrFPVectorTy())
+ NewOp->setFastMathFlags(I->getFastMathFlags());
+ } else {
+ NewOp = NodesToRewrite.pop_back_val();
+ }
+
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ Op->setOperand(0, NewOp);
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
+ ExpressionChanged = Op;
+ MadeChange = true;
+ ++NumChanged;
+ Op = NewOp;
+ }
+
+ // If the expression changed non-trivially then clear out all subclass data
+ // starting from the operator specified in ExpressionChanged, and compactify
+ // the operators to just before the expression root to guarantee that the
+ // expression tree is dominated by all of Ops.
+ if (ExpressionChanged)
+ do {
+ // Preserve FastMathFlags.
+ if (isa<FPMathOperator>(I)) {
+ FastMathFlags Flags = I->getFastMathFlags();
+ ExpressionChanged->clearSubclassOptionalData();
+ ExpressionChanged->setFastMathFlags(Flags);
+ } else
+ ExpressionChanged->clearSubclassOptionalData();
+
+ if (ExpressionChanged == I)
+ break;
+ ExpressionChanged->moveBefore(I);
+ ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
+ } while (true);
+
+ // Throw away any left over nodes from the original expression.
+ for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i)
+ RedoInsts.insert(NodesToRewrite[i]);
+}
+
+/// Insert instructions before the instruction pointed to by BI that compute
+/// the negative version of the value specified. The negative
+/// version of the value is returned, and BI is left pointing at the instruction
+/// that should be processed next by the reassociation pass.
+/// Also add intermediate instructions to the redo list that are modified while
+/// pushing the negates through adds. These will be revisited to see if
+/// additional opportunities have been exposed.
+static Value *NegateValue(Value *V, Instruction *BI,
+ SetVector<AssertingVH<Instruction>> &ToRedo) {
+ if (auto *C = dyn_cast<Constant>(V))
+ return C->getType()->isFPOrFPVectorTy() ? ConstantExpr::getFNeg(C) :
+ ConstantExpr::getNeg(C);
+
+ // We are trying to expose opportunity for reassociation. One of the things
+ // that we want to do to achieve this is to push a negation as deep into an
+ // expression chain as possible, to expose the add instructions. In practice,
+ // this means that we turn this:
+ // X = -(A+12+C+D) into X = -A + -12 + -C + -D = -12 + -A + -C + -D
+ // so that a later expression Y = 12+X could get reassociated with the -12
+ // to eliminate the constants. We assume that instcombine will clean up the
+ // mess later if we introduce tons of unnecessary negation instructions.
+ //
+ if (BinaryOperator *I =
+ isReassociableOp(V, Instruction::Add, Instruction::FAdd)) {
+ // Push the negates through the add.
+ I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo));
+ I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo));
+ if (I->getOpcode() == Instruction::Add) {
+ I->setHasNoUnsignedWrap(false);
+ I->setHasNoSignedWrap(false);
+ }
+
+ // We must move the add instruction here, because the neg instructions do
+ // not dominate the old add instruction in general. By moving it, we are
+ // assured that the neg instructions we just inserted dominate the
+ // instruction we are about to insert after them.
+ //
+ I->moveBefore(BI);
+ I->setName(I->getName()+".neg");
+
+ // Add the intermediate negates to the redo list as processing them later
+ // could expose more reassociating opportunities.
+ ToRedo.insert(I);
+ return I;
+ }
+
+ // Okay, we need to materialize a negated version of V with an instruction.
+ // Scan the use lists of V to see if we have one already.
+ for (User *U : V->users()) {
+ if (!BinaryOperator::isNeg(U) && !BinaryOperator::isFNeg(U))
+ continue;
+
+ // We found one! Now we have to make sure that the definition dominates
+ // this use. We do this by moving it to the entry block (if it is a
+ // non-instruction value) or right after the definition. These negates will
+ // be zapped by reassociate later, so we don't need much finesse here.
+ BinaryOperator *TheNeg = cast<BinaryOperator>(U);
+
+ // Verify that the negate is in this function, V might be a constant expr.
+ if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
+ continue;
+
+ BasicBlock::iterator InsertPt;
+ if (Instruction *InstInput = dyn_cast<Instruction>(V)) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) {
+ InsertPt = II->getNormalDest()->begin();
+ } else {
+ InsertPt = ++InstInput->getIterator();
+ }
+ while (isa<PHINode>(InsertPt)) ++InsertPt;
+ } else {
+ InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin();
+ }
+ TheNeg->moveBefore(&*InsertPt);
+ if (TheNeg->getOpcode() == Instruction::Sub) {
+ TheNeg->setHasNoUnsignedWrap(false);
+ TheNeg->setHasNoSignedWrap(false);
+ } else {
+ TheNeg->andIRFlags(BI);
+ }
+ ToRedo.insert(TheNeg);
+ return TheNeg;
+ }
+
+ // Insert a 'neg' instruction that subtracts the value from zero to get the
+ // negation.
+ BinaryOperator *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI);
+ ToRedo.insert(NewNeg);
+ return NewNeg;
+}
+
+/// Return true if we should break up this subtract of X-Y into (X + -Y).
+static bool ShouldBreakUpSubtract(Instruction *Sub) {
+ // If this is a negation, we can't split it up!
+ if (BinaryOperator::isNeg(Sub) || BinaryOperator::isFNeg(Sub))
+ return false;
+
+ // Don't break up X - undef.
+ if (isa<UndefValue>(Sub->getOperand(1)))
+ return false;
+
+ // Don't bother to break this up unless either operand is a reassociable add
+ // or subtract, or this sub's only use is itself a reassociable add or
+ // subtract.
+ Value *V0 = Sub->getOperand(0);
+ if (isReassociableOp(V0, Instruction::Add, Instruction::FAdd) ||
+ isReassociableOp(V0, Instruction::Sub, Instruction::FSub))
+ return true;
+ Value *V1 = Sub->getOperand(1);
+ if (isReassociableOp(V1, Instruction::Add, Instruction::FAdd) ||
+ isReassociableOp(V1, Instruction::Sub, Instruction::FSub))
+ return true;
+ Value *VB = Sub->user_back();
+ if (Sub->hasOneUse() &&
+ (isReassociableOp(VB, Instruction::Add, Instruction::FAdd) ||
+ isReassociableOp(VB, Instruction::Sub, Instruction::FSub)))
+ return true;
+
+ return false;
+}
+
+/// If we have (X-Y), and if either X is an add, or if this is only used by an
+/// add, transform this into (X+(0-Y)) to promote better reassociation.
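+///
+/// For example (illustrative): "T = (A + B) - C" becomes "T = (A + B) + (-C)",
+/// so the resulting add chain A + B + (-C) can then be reassociated.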
+static BinaryOperator *
+BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {
+ // Convert a subtract into an add and a neg instruction. This allows sub
+ // instructions to be commuted with other add instructions.
+ //
+ // Calculate the negative value of Operand 1 of the sub instruction,
+ // and set it as the RHS of the add instruction we just made.
+ Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo);
+ BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub);
+ Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.
+ Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op.
+ New->takeName(Sub);
+
+ // Everyone now refers to the add instruction.
+ Sub->replaceAllUsesWith(New);
+ New->setDebugLoc(Sub->getDebugLoc());
+
+ DEBUG(dbgs() << "Negated: " << *New << '\n');
+ return New;
+}
+
+/// If this is a shift of a reassociable multiply or is used by one, change
+/// this into a multiply by a constant to assist with further reassociation.
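+///
+/// For example (illustrative): "X << 3" becomes "X * 8", which the rest of
+/// the pass can fold into surrounding multiply expressions.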
+static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
+ Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
+ MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1)));
+
+ BinaryOperator *Mul =
+ BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
+ Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op.
+ Mul->takeName(Shl);
+
+ // Everyone now refers to the mul instruction.
+ Shl->replaceAllUsesWith(Mul);
+ Mul->setDebugLoc(Shl->getDebugLoc());
+
+ // We can safely preserve the nuw flag in all cases. It's also safe to turn a
+ // nuw nsw shl into a nuw nsw mul. However, nsw in isolation requires special
+ // handling.
+ bool NSW = cast<BinaryOperator>(Shl)->hasNoSignedWrap();
+ bool NUW = cast<BinaryOperator>(Shl)->hasNoUnsignedWrap();
+ if (NSW && NUW)
+ Mul->setHasNoSignedWrap(true);
+ Mul->setHasNoUnsignedWrap(NUW);
+ return Mul;
+}
+
+/// Scan backwards and forwards among values with the same rank as element i
+/// to see if X exists. If X does not exist, return i. This is useful when
+/// scanning for 'x' when we see '-x' because they both get the same rank.
+static unsigned FindInOperandList(const SmallVectorImpl<ValueEntry> &Ops,
+ unsigned i, Value *X) {
+ unsigned XRank = Ops[i].Rank;
+ unsigned e = Ops.size();
+ for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) {
+ if (Ops[j].Op == X)
+ return j;
+ if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
+ if (Instruction *I2 = dyn_cast<Instruction>(X))
+ if (I1->isIdenticalTo(I2))
+ return j;
+ }
+ // Scan backwards.
+ for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) {
+ if (Ops[j].Op == X)
+ return j;
+ if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
+ if (Instruction *I2 = dyn_cast<Instruction>(X))
+ if (I1->isIdenticalTo(I2))
+ return j;
+ }
+ return i;
+}
+
+/// Emit a tree of add instructions, summing Ops together
+/// and returning the result. Insert the tree before I.
+static Value *EmitAddTreeOfValues(Instruction *I,
+ SmallVectorImpl<WeakTrackingVH> &Ops) {
+ if (Ops.size() == 1) return Ops.back();
+
+ Value *V1 = Ops.back();
+ Ops.pop_back();
+ Value *V2 = EmitAddTreeOfValues(I, Ops);
+ return CreateAdd(V2, V1, "reass.add", I, I);
+}
+
+/// If V is an expression tree that is a multiplication sequence,
+/// and if this sequence contains a multiply by Factor,
+/// remove Factor from the tree and return the new tree.
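+///
+/// For example (illustrative): removing the factor B from the tree for
+/// "A * B * C" leaves "A * C", and removing the factor -5 from "X * 5"
+/// yields "-X" (the NeedsNegate path below).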
+Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
+ BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
+ if (!BO)
+ return nullptr;
+
+ SmallVector<RepeatedValue, 8> Tree;
+ MadeChange |= LinearizeExprTree(BO, Tree);
+ SmallVector<ValueEntry, 8> Factors;
+ Factors.reserve(Tree.size());
+ for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
+ RepeatedValue E = Tree[i];
+ Factors.append(E.second.getZExtValue(),
+ ValueEntry(getRank(E.first), E.first));
+ }
+
+ bool FoundFactor = false;
+ bool NeedsNegate = false;
+ for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+ if (Factors[i].Op == Factor) {
+ FoundFactor = true;
+ Factors.erase(Factors.begin()+i);
+ break;
+ }
+
+ // If this is a negative version of this factor, remove it.
+ if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) {
+ if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
+ if (FC1->getValue() == -FC2->getValue()) {
+ FoundFactor = NeedsNegate = true;
+ Factors.erase(Factors.begin()+i);
+ break;
+ }
+ } else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) {
+ if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) {
+ const APFloat &F1 = FC1->getValueAPF();
+ APFloat F2(FC2->getValueAPF());
+ F2.changeSign();
+ if (F1.compare(F2) == APFloat::cmpEqual) {
+ FoundFactor = NeedsNegate = true;
+ Factors.erase(Factors.begin() + i);
+ break;
+ }
+ }
+ }
+ }
+
+ if (!FoundFactor) {
+ // Make sure to restore the operands to the expression tree.
+ RewriteExprTree(BO, Factors);
+ return nullptr;
+ }
+
+ BasicBlock::iterator InsertPt = ++BO->getIterator();
+
+ // If this was just a single multiply, remove the multiply and return the only
+ // remaining operand.
+ if (Factors.size() == 1) {
+ RedoInsts.insert(BO);
+ V = Factors[0].Op;
+ } else {
+ RewriteExprTree(BO, Factors);
+ V = BO;
+ }
+
+ if (NeedsNegate)
+ V = CreateNeg(V, "neg", &*InsertPt, BO);
+
+ return V;
+}
+
+/// If V is a single-use multiply, recursively add its operands as factors,
+/// otherwise add V to the list of factors.
+///
+/// Ops is the top-level list of add operands we're trying to factor.
+static void FindSingleUseMultiplyFactors(Value *V,
+ SmallVectorImpl<Value*> &Factors) {
+ BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
+ if (!BO) {
+ Factors.push_back(V);
+ return;
+ }
+
+ // Otherwise, add the LHS and RHS to the list of factors.
+ FindSingleUseMultiplyFactors(BO->getOperand(1), Factors);
+ FindSingleUseMultiplyFactors(BO->getOperand(0), Factors);
+}
+
+/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
+/// This optimizes based on identities. If it can be reduced to a single Value,
+/// it is returned, otherwise the Ops list is mutated as necessary.
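+///
+/// Illustrative reductions (operand names are hypothetical):
+///   and: A & B & ~A -> 0
+///   or:  A | A | B  -> A | B   (duplicate dropped)
+///   xor: A ^ B ^ A  -> B       (the pair of A's cancels)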
+static Value *OptimizeAndOrXor(unsigned Opcode,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
+ // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ // First, check for X and ~X in the operand list.
+ assert(i < Ops.size());
+ if (BinaryOperator::isNot(Ops[i].Op)) { // Cannot occur for ^.
+ Value *X = BinaryOperator::getNotArgument(Ops[i].Op);
+ unsigned FoundX = FindInOperandList(Ops, i, X);
+ if (FoundX != i) {
+ if (Opcode == Instruction::And) // ...&X&~X = 0
+ return Constant::getNullValue(X->getType());
+
+ if (Opcode == Instruction::Or) // ...|X|~X = -1
+ return Constant::getAllOnesValue(X->getType());
+ }
+ }
+
+ // Next, check for duplicate pairs of values, which we assume are next to
+ // each other, due to our sorting criteria.
+ assert(i < Ops.size());
+ if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
+ if (Opcode == Instruction::And || Opcode == Instruction::Or) {
+ // Drop duplicate values for And and Or.
+ Ops.erase(Ops.begin()+i);
+ --i; --e;
+ ++NumAnnihil;
+ continue;
+ }
+
+ // Drop pairs of values for Xor.
+ assert(Opcode == Instruction::Xor);
+ if (e == 2)
+ return Constant::getNullValue(Ops[0].Op->getType());
+
+ // Y ^ X^X -> Y
+ Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
+ i -= 1; e -= 2;
+ ++NumAnnihil;
+ }
+ }
+ return nullptr;
+}
+
+/// Helper function of CombineXorOpnd(). It creates a bitwise-and
+/// instruction with the given two operands, and returns the resulting
+/// instruction. There are two special cases: 1) if the constant operand is 0,
+/// it will return NULL. 2) if the constant is ~0, the symbolic operand will
+/// be returned.
+static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
+ const APInt &ConstOpnd) {
+ if (ConstOpnd.isNullValue())
+ return nullptr;
+
+ if (ConstOpnd.isAllOnesValue())
+ return Opnd;
+
+ Instruction *I = BinaryOperator::CreateAnd(
+ Opnd, ConstantInt::get(Opnd->getType(), ConstOpnd), "and.ra",
+ InsertBefore);
+ I->setDebugLoc(InsertBefore->getDebugLoc());
+ return I;
+}
+
+// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
+// into "R ^ C", where C would be 0, and R is a symbolic value.
+//
+// If it was successful, true is returned, and the "R" and "C" are returned
+// via "Res" and "ConstOpnd", respectively; otherwise, false is returned,
+// and both "Res" and "ConstOpnd" remain unchanged.
+bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
+ APInt &ConstOpnd, Value *&Res) {
+ // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2
+ // = ((x | c1) ^ c1) ^ (c1 ^ c2)
+ // = (x & ~c1) ^ (c1 ^ c2)
+ // It is useful only when c1 == c2.
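+ //
+ // Worked example with hypothetical 8-bit constants, c1 == c2 == 0x0C:
+ //   (x | 0x0C) ^ 0x0C == (x & ~0x0C) ^ (0x0C ^ 0x0C) == x & 0xF3.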
+ if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isNullValue())
+ return false;
+
+ if (!Opnd1->getValue()->hasOneUse())
+ return false;
+
+ const APInt &C1 = Opnd1->getConstPart();
+ if (C1 != ConstOpnd)
+ return false;
+
+ Value *X = Opnd1->getSymbolicPart();
+ Res = createAndInstr(I, X, ~C1);
+ // ConstOpnd was C2, now C1 ^ C2.
+ ConstOpnd ^= C1;
+
+ if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+ RedoInsts.insert(T);
+ return true;
+}
+
+// Helper function of OptimizeXor(). It tries to simplify
+// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a
+// symbolic value.
+//
+// If it was successful, true is returned, and the "R" and "C" are returned
+// via "Res" and "ConstOpnd", respectively (if the entire expression is
+// evaluated to a constant, Res is set to NULL); otherwise, false is
+// returned, and both "Res" and "ConstOpnd" remain unchanged.
+bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
+ XorOpnd *Opnd2, APInt &ConstOpnd,
+ Value *&Res) {
+ Value *X = Opnd1->getSymbolicPart();
+ if (X != Opnd2->getSymbolicPart())
+ return false;
+
+ // This many instructions become dead. (At least "Opnd1 ^ Opnd2" will die.)
+ int DeadInstNum = 1;
+ if (Opnd1->getValue()->hasOneUse())
+ DeadInstNum++;
+ if (Opnd2->getValue()->hasOneUse())
+ DeadInstNum++;
+
+ // Xor-Rule 2:
+ // (x | c1) ^ (x & c2)
+ // = (x|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x|c1) ^ c1) ^ (x & c2) ^ c1
+ // = (x & ~c1) ^ (x & c2) ^ c1 // Xor-Rule 1
+ // = (x & c3) ^ c1, where c3 = ~c1 ^ c2 // Xor-rule 3
+ //
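+ // Worked example with hypothetical 8-bit constants, c1 = 0x0C, c2 = 0x0A:
+ //   (x | 0x0C) ^ (x & 0x0A) == (x & 0xF9) ^ 0x0C, since ~0x0C ^ 0x0A == 0xF9.
+ //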
+ if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) {
+ if (Opnd2->isOrExpr())
+ std::swap(Opnd1, Opnd2);
+
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+ APInt C3((~C1) ^ C2);
+
+ // Do not increase code size!
+ if (!C3.isNullValue() && !C3.isAllOnesValue()) {
+ int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
+ if (NewInstNum > DeadInstNum)
+ return false;
+ }
+
+ Res = createAndInstr(I, X, C3);
+ ConstOpnd ^= C1;
+ } else if (Opnd1->isOrExpr()) {
+ // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2
+ //
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+ APInt C3 = C1 ^ C2;
+
+ // Do not increase code size
+ if (!C3.isNullValue() && !C3.isAllOnesValue()) {
+ int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
+ if (NewInstNum > DeadInstNum)
+ return false;
+ }
+
+ Res = createAndInstr(I, X, C3);
+ ConstOpnd ^= C3;
+ } else {
+ // Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2))
+ //
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+ APInt C3 = C1 ^ C2;
+ Res = createAndInstr(I, X, C3);
+ }
+
+ // Put the original operands in the Redo list; hope they will be deleted
+ // as dead code.
+ if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+ RedoInsts.insert(T);
+ if (Instruction *T = dyn_cast<Instruction>(Opnd2->getValue()))
+ RedoInsts.insert(T);
+
+ return true;
+}
+
+/// Optimize a series of operands to an 'xor' instruction. If it can be reduced
+/// to a single Value, it is returned, otherwise the Ops list is mutated as
+/// necessary.
+Value *ReassociatePass::OptimizeXor(Instruction *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops))
+ return V;
+
+ if (Ops.size() == 1)
+ return nullptr;
+
+ SmallVector<XorOpnd, 8> Opnds;
+ SmallVector<XorOpnd*, 8> OpndPtrs;
+ Type *Ty = Ops[0].Op->getType();
+ APInt ConstOpnd(Ty->getScalarSizeInBits(), 0);
+
+ // Step 1: Convert ValueEntry to XorOpnd
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ Value *V = Ops[i].Op;
+ const APInt *C;
+ // TODO: Support non-splat vectors.
+ if (match(V, PatternMatch::m_APInt(C))) {
+ ConstOpnd ^= *C;
+ } else {
+ XorOpnd O(V);
+ O.setSymbolicRank(getRank(O.getSymbolicPart()));
+ Opnds.push_back(O);
+ }
+ }
+
+ // NOTE: From this point on, do *NOT* add elements to or delete elements
+ // from "Opnds". Doing so would invalidate pointers into "Opnds", and hence
+ // the entries of "OpndPtrs" as well. For a similar reason, do not fuse this
+ // loop with the previous loop --- pointers into "Opnds" may be invalidated
+ // when new elements are added to the vector.
+ for (unsigned i = 0, e = Opnds.size(); i != e; ++i)
+ OpndPtrs.push_back(&Opnds[i]);
+
+ // Step 2: Sort the Xor-Operands in a way such that the operands containing
+ // the same symbolic value cluster together. For instance, the input operand
+ // sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
+ // ("x | 123", "x & 789", "y & 456").
+ //
+ // The purpose is twofold:
+ // 1) Cluster together the operands sharing the same symbolic-value.
+ // 2) Operands having a smaller symbolic-value-rank are permuted earlier,
+ // which could potentially shorten the critical path and expose more
+ // loop-invariants.
+ // Note that values' ranks are basically defined in RPO order (FIXME).
+ // So, if Rank(X) < Rank(Y) < Rank(Z), it means X is defined earlier
+ // than Y, which is defined earlier than Z. Permuting "x | 1", "Y & 2",
+ // "z" in the order X-Y-Z is better than any other order.
+ std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(),
+ [](XorOpnd *LHS, XorOpnd *RHS) {
+ return LHS->getSymbolicRank() < RHS->getSymbolicRank();
+ });
+
+ // Step 3: Combine adjacent operands
+ XorOpnd *PrevOpnd = nullptr;
+ bool Changed = false;
+ for (unsigned i = 0, e = Opnds.size(); i < e; i++) {
+ XorOpnd *CurrOpnd = OpndPtrs[i];
+ // The combined value
+ Value *CV;
+
+ // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
+ if (!ConstOpnd.isNullValue() &&
+ CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
+ Changed = true;
+ if (CV)
+ *CurrOpnd = XorOpnd(CV);
+ else {
+ CurrOpnd->Invalidate();
+ continue;
+ }
+ }
+
+ if (!PrevOpnd || CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) {
+ PrevOpnd = CurrOpnd;
+ continue;
+ }
+
+ // Step 3.2: When previous and current operands share the same symbolic
+ // value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd".
+ if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) {
+ // Remove previous operand
+ PrevOpnd->Invalidate();
+ if (CV) {
+ *CurrOpnd = XorOpnd(CV);
+ PrevOpnd = CurrOpnd;
+ } else {
+ CurrOpnd->Invalidate();
+ PrevOpnd = nullptr;
+ }
+ Changed = true;
+ }
+ }
+
+ // Step 4: Reassemble the Ops
+ if (Changed) {
+ Ops.clear();
+ for (unsigned int i = 0, e = Opnds.size(); i < e; i++) {
+ XorOpnd &O = Opnds[i];
+ if (O.isInvalid())
+ continue;
+ ValueEntry VE(getRank(O.getValue()), O.getValue());
+ Ops.push_back(VE);
+ }
+ if (!ConstOpnd.isNullValue()) {
+ Value *C = ConstantInt::get(Ty, ConstOpnd);
+ ValueEntry VE(getRank(C), C);
+ Ops.push_back(VE);
+ }
+ unsigned Sz = Ops.size();
+ if (Sz == 1)
+ return Ops.back().Op;
+ if (Sz == 0) {
+ assert(ConstOpnd.isNullValue());
+ return ConstantInt::get(Ty, ConstOpnd);
+ }
+ }
+
+ return nullptr;
+}
+
+/// Optimize a series of operands to an 'add' instruction. This
+/// optimizes based on identities. If it can be reduced to a single Value, it
+/// is returned, otherwise the Ops list is mutated as necessary.
+Value *ReassociatePass::OptimizeAdd(Instruction *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // Scan the operand list looking for X and -X pairs. If we find any, we
+ // can simplify expressions like X+-X == 0 and X+~X == -1. While we're at
+ // it, scan for any duplicates; we want to canonicalize Y+Y+Y+Z -> 3*Y+Z.
+
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ Value *TheOp = Ops[i].Op;
+ // Check to see if we've seen this operand before. If so, we factor all
+ // instances of the operand together. Due to our sorting criteria, we know
+ // that these need to be next to each other in the vector.
+ if (i+1 != Ops.size() && Ops[i+1].Op == TheOp) {
+ // Rescan the list, remove all instances of this operand from the expr.
+ unsigned NumFound = 0;
+ do {
+ Ops.erase(Ops.begin()+i);
+ ++NumFound;
+ } while (i != Ops.size() && Ops[i].Op == TheOp);
+
+ DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
+ ++NumFactor;
+
+ // Insert a new multiply.
+ Type *Ty = TheOp->getType();
+ Constant *C = Ty->isIntOrIntVectorTy() ?
+ ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound);
+ Instruction *Mul = CreateMul(TheOp, C, "factor", I, I);
+
+ // Now that we have inserted a multiply, optimize it. This allows us to
+ // handle cases that require multiple factoring steps, such as this:
+ // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
+ RedoInsts.insert(Mul);
+
+ // If every add operand was a duplicate, return the multiply.
+ if (Ops.empty())
+ return Mul;
+
+ // Otherwise, we had some input that didn't have the dupe, such as
+ // "A + A + B" -> "A*2 + B". Add the new multiply to the list of
+ // things being added by this operation.
+ Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul));
+
+ --i;
+ e = Ops.size();
+ continue;
+ }
+
+ // Check for X and -X or X and ~X in the operand list.
+ if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isFNeg(TheOp) &&
+ !BinaryOperator::isNot(TheOp))
+ continue;
+
+ Value *X = nullptr;
+ if (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp))
+ X = BinaryOperator::getNegArgument(TheOp);
+ else if (BinaryOperator::isNot(TheOp))
+ X = BinaryOperator::getNotArgument(TheOp);
+
+ unsigned FoundX = FindInOperandList(Ops, i, X);
+ if (FoundX == i)
+ continue;
+
+ // Remove X and -X from the operand list.
+ if (Ops.size() == 2 &&
+ (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp)))
+ return Constant::getNullValue(X->getType());
+
+ // Remove X and ~X from the operand list.
+ if (Ops.size() == 2 && BinaryOperator::isNot(TheOp))
+ return Constant::getAllOnesValue(X->getType());
+
+ Ops.erase(Ops.begin()+i);
+ if (i < FoundX)
+ --FoundX;
+ else
+ --i; // Need to back up an extra one.
+ Ops.erase(Ops.begin()+FoundX);
+ ++NumAnnihil;
+ --i; // Revisit element.
+ e -= 2; // Removed two elements.
+
+ // If X and ~X were found, append -1 to the operand list.
+ if (BinaryOperator::isNot(TheOp)) {
+ Value *V = Constant::getAllOnesValue(X->getType());
+ Ops.insert(Ops.end(), ValueEntry(getRank(V), V));
+ e += 1;
+ }
+ }
+
+ // Scan the operand list, checking to see if there are any common factors
+ // between operands. Consider something like A*A+A*B*C+D. We would like to
+ // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
+ // To efficiently find this, we count the number of times a factor occurs
+ // for any ADD operands that are MULs.
+ DenseMap<Value*, unsigned> FactorOccurrences;
+
+ // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4)
+ // where they are actually the same multiply.
+ unsigned MaxOcc = 0;
+ Value *MaxOccVal = nullptr;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ BinaryOperator *BOp =
+ isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
+ if (!BOp)
+ continue;
+
+ // Compute all of the factors of this added value.
+ SmallVector<Value*, 8> Factors;
+ FindSingleUseMultiplyFactors(BOp, Factors);
+ assert(Factors.size() > 1 && "Bad linearize!");
+
+ // Add one to FactorOccurrences for each unique factor in this op.
+ SmallPtrSet<Value*, 8> Duplicates;
+ for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+ Value *Factor = Factors[i];
+ if (!Duplicates.insert(Factor).second)
+ continue;
+
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+
+ // If Factor is a negative constant, add the negated value as a factor
+ // because we can percolate the negate out. Watch for minint, which
+ // cannot be made positive without overflowing.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) {
+ if (CI->isNegative() && !CI->isMinValue(true)) {
+ Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
+ if (!Duplicates.insert(Factor).second)
+ continue;
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+ }
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(Factor)) {
+ if (CF->isNegative()) {
+ APFloat F(CF->getValueAPF());
+ F.changeSign();
+ Factor = ConstantFP::get(CF->getContext(), F);
+ if (!Duplicates.insert(Factor).second)
+ continue;
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+ }
+ }
+ }
+ }
+
+ // If any factor occurred more than one time, we can pull it out.
+ if (MaxOcc > 1) {
+ DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
+ ++NumFactor;
+
+ // Create a new instruction that uses the MaxOccVal twice. If we don't do
+ // this, we could otherwise run into situations where removing a factor
+ // from an expression will drop a use of maxocc, and this can cause
+ // RemoveFactorFromExpression on successive values to behave differently.
+ Instruction *DummyInst =
+ I->getType()->isIntOrIntVectorTy()
+ ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
+ : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal);
+
+ SmallVector<WeakTrackingVH, 4> NewMulOps;
+ for (unsigned i = 0; i != Ops.size(); ++i) {
+ // Only try to remove factors from expressions we're allowed to.
+ BinaryOperator *BOp =
+ isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
+ if (!BOp)
+ continue;
+
+ if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
+ // The factorized operand may occur several times. Convert them all in
+ // one fell swoop.
+ for (unsigned j = Ops.size(); j != i;) {
+ --j;
+ if (Ops[j].Op == Ops[i].Op) {
+ NewMulOps.push_back(V);
+ Ops.erase(Ops.begin()+j);
+ }
+ }
+ --i;
+ }
+ }
+
+ // No need for extra uses anymore.
+ DummyInst->deleteValue();
+
+ unsigned NumAddedValues = NewMulOps.size();
+ Value *V = EmitAddTreeOfValues(I, NewMulOps);
+
+ // Now that we have inserted the add tree, optimize it. This allows us to
+ // handle cases that require multiple factoring steps, such as this:
+ // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C))
+ assert(NumAddedValues > 1 && "Each occurrence should contribute a value");
+ (void)NumAddedValues;
+ if (Instruction *VI = dyn_cast<Instruction>(V))
+ RedoInsts.insert(VI);
+
+ // Create the multiply.
+ Instruction *V2 = CreateMul(V, MaxOccVal, "reass.mul", I, I);
+
+ // Rerun associate on the multiply in case the inner expression turned into
+ // a multiply. We want to make sure that we keep things in canonical form.
+ RedoInsts.insert(V2);
+
+ // If every add operand included the factor (e.g. "A*B + A*C"), then the
+ // entire result expression is just the multiply "A*(B+C)".
+ if (Ops.empty())
+ return V2;
+
+ // Otherwise, we had some input that didn't have the factor, such as
+ // "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of
+ // things being added by this operation.
+ Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
+ }
+
+ return nullptr;
+}
+
+/// \brief Build up a vector of value/power pairs factoring a product.
+///
+/// Given a series of multiplication operands, build a vector of factors and
+/// the powers each is raised to when forming the final product. Sort them in
+/// the order of descending power.
+///
+/// (x*x) -> [(x, 2)]
+/// ((x*x)*x) -> [(x, 3)]
+/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
+///
+/// \returns Whether any factors have a power greater than one.
+static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
+ SmallVectorImpl<Factor> &Factors) {
+ // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
+ // Compute the sum of powers of simplifiable factors.
+ unsigned FactorPowerSum = 0;
+ for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) {
+ Value *Op = Ops[Idx-1].Op;
+
+ // Count the number of occurrences of this value.
+ unsigned Count = 1;
+ for (; Idx < Size && Ops[Idx].Op == Op; ++Idx)
+ ++Count;
+ // Track for simplification all factors which occur 2 or more times.
+ if (Count > 1)
+ FactorPowerSum += Count;
+ }
+
+ // We can only simplify factors if the sum of the powers of our simplifiable
+ // factors is 4 or higher. When that is the case, we will *always* have
+ // a simplification. This is an important invariant that prevents us from
+ // cyclically trying to simplify already-minimal expressions.
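+ // For instance, x*x*y*y has a factor-power sum of 4 and simplifies to
+ // (x*y)*(x*y), which reuses the single multiply x*y.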
+ if (FactorPowerSum < 4)
+ return false;
+
+ // Now gather the simplifiable factors, removing them from Ops.
+ FactorPowerSum = 0;
+ for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) {
+ Value *Op = Ops[Idx-1].Op;
+
+ // Count the number of occurrences of this value.
+ unsigned Count = 1;
+ for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx)
+ ++Count;
+ if (Count == 1)
+ continue;
+ // Move an even number of occurrences to Factors.
+ Count &= ~1U;
+ Idx -= Count;
+ FactorPowerSum += Count;
+ Factors.push_back(Factor(Op, Count));
+ Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count);
+ }
+
+ // None of the adjustments above should have reduced the sum of factor powers
+ // below our minimum of '4'.
+ assert(FactorPowerSum >= 4);
+
+ std::stable_sort(Factors.begin(), Factors.end(),
+ [](const Factor &LHS, const Factor &RHS) {
+ return LHS.Power > RHS.Power;
+ });
+ return true;
+}
+
+/// \brief Build a tree of multiplies, computing the product of Ops.
+static Value *buildMultiplyTree(IRBuilder<> &Builder,
+ SmallVectorImpl<Value*> &Ops) {
+ if (Ops.size() == 1)
+ return Ops.back();
+
+ Value *LHS = Ops.pop_back_val();
+ do {
+ if (LHS->getType()->isIntOrIntVectorTy())
+ LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
+ else
+ LHS = Builder.CreateFMul(LHS, Ops.pop_back_val());
+ } while (!Ops.empty());
+
+ return LHS;
+}
+
+/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
+///
+/// Given a vector of values raised to various powers, where no two values are
+/// equal and the powers are sorted in decreasing order, compute the minimal
+/// DAG of multiplies to compute the final product, and return that product
+/// value.
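+///
+/// For example, the single factor (a, 4) is emitted as t = a*a followed by
+/// t*t, using two multiplies instead of three.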
+Value *
+ReassociatePass::buildMinimalMultiplyDAG(IRBuilder<> &Builder,
+ SmallVectorImpl<Factor> &Factors) {
+ assert(Factors[0].Power);
+ SmallVector<Value *, 4> OuterProduct;
+ for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size();
+ Idx < Size && Factors[Idx].Power > 0; ++Idx) {
+ if (Factors[Idx].Power != Factors[LastIdx].Power) {
+ LastIdx = Idx;
+ continue;
+ }
+
+ // We want to multiply across all the factors with the same power so that
+ // we can raise them to that power as a single entity. Build a mini tree
+ // for that.
+ SmallVector<Value *, 4> InnerProduct;
+ InnerProduct.push_back(Factors[LastIdx].Base);
+ do {
+ InnerProduct.push_back(Factors[Idx].Base);
+ ++Idx;
+ } while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power);
+
+ // Reset the base value of the first factor to the new expression tree.
+ // We'll remove all the factors with the same power in a second pass.
+ Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct);
+ if (Instruction *MI = dyn_cast<Instruction>(M))
+ RedoInsts.insert(MI);
+
+ LastIdx = Idx;
+ }
+ // Unique factors with equal powers -- we've folded them into the first one's
+ // base.
+ Factors.erase(std::unique(Factors.begin(), Factors.end(),
+ [](const Factor &LHS, const Factor &RHS) {
+ return LHS.Power == RHS.Power;
+ }),
+ Factors.end());
+
+ // Iteratively collect the base of each factor with an odd power into the
+ // outer product, and halve each power in preparation for squaring the
+ // expression.
+ for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) {
+ if (Factors[Idx].Power & 1)
+ OuterProduct.push_back(Factors[Idx].Base);
+ Factors[Idx].Power >>= 1;
+ }
+ if (Factors[0].Power) {
+ Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors);
+ OuterProduct.push_back(SquareRoot);
+ OuterProduct.push_back(SquareRoot);
+ }
+ if (OuterProduct.size() == 1)
+ return OuterProduct.front();
+
+ Value *V = buildMultiplyTree(Builder, OuterProduct);
+ return V;
+}
+
+Value *ReassociatePass::OptimizeMul(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // We can only optimize the multiplies when there is a chain of more than
+ // three, such that a balanced tree might require fewer total multiplies.
+ if (Ops.size() < 4)
+ return nullptr;
+
+ // Try to turn linear trees of multiplies without other uses of the
+ // intermediate stages into minimal multiply DAGs with perfect sub-expression
+ // re-use.
+ SmallVector<Factor, 4> Factors;
+ if (!collectMultiplyFactors(Ops, Factors))
+ return nullptr; // All distinct factors, so nothing left for us to do.
+
+ IRBuilder<> Builder(I);
+ // The reassociate transformation for FP operations is performed only
+ // if unsafe algebra is permitted by FastMathFlags. Propagate those flags
+ // to the newly generated operations.
+ if (auto FPI = dyn_cast<FPMathOperator>(I))
+ Builder.setFastMathFlags(FPI->getFastMathFlags());
+
+ Value *V = buildMinimalMultiplyDAG(Builder, Factors);
+ if (Ops.empty())
+ return V;
+
+ ValueEntry NewEntry = ValueEntry(getRank(V), V);
+ Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry);
+ return nullptr;
+}
+
+Value *ReassociatePass::OptimizeExpression(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // Now that we have the linearized expression tree, try to optimize it.
+ // Start by folding any constants that we found.
+ Constant *Cst = nullptr;
+ unsigned Opcode = I->getOpcode();
+ while (!Ops.empty() && isa<Constant>(Ops.back().Op)) {
+ Constant *C = cast<Constant>(Ops.pop_back_val().Op);
+ Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C;
+ }
+ // If there was nothing but constants then we are done.
+ if (Ops.empty())
+ return Cst;
+
+ // Put the combined constant back at the end of the operand list, except if
+ // there is no point. For example, an add of 0 gets dropped here, while a
+ // multiplication by zero turns the whole expression into zero.
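+ // Illustration: (x + 3) + 5 folds its constants to 8 and keeps [x, 8],
+ // while (x + 3) + (-3) folds to the add identity 0, leaving just [x].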
+ if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) {
+ if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType()))
+ return Cst;
+ Ops.push_back(ValueEntry(0, Cst));
+ }
+
+ if (Ops.size() == 1) return Ops[0].Op;
+
+ // Handle destructive annihilation due to identities between elements in the
+ // argument list here.
+ unsigned NumOps = Ops.size();
+ switch (Opcode) {
+ default: break;
+ case Instruction::And:
+ case Instruction::Or:
+ if (Value *Result = OptimizeAndOrXor(Opcode, Ops))
+ return Result;
+ break;
+
+ case Instruction::Xor:
+ if (Value *Result = OptimizeXor(I, Ops))
+ return Result;
+ break;
+
+ case Instruction::Add:
+ case Instruction::FAdd:
+ if (Value *Result = OptimizeAdd(I, Ops))
+ return Result;
+ break;
+
+ case Instruction::Mul:
+ case Instruction::FMul:
+ if (Value *Result = OptimizeMul(I, Ops))
+ return Result;
+ break;
+ }
+
+ if (Ops.size() != NumOps)
+ return OptimizeExpression(I, Ops);
+ return nullptr;
+}
+
+// Remove dead instructions and, if any operands become trivially dead, add
+// them to Insts so they will be removed as well.
+void ReassociatePass::RecursivelyEraseDeadInsts(
+ Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) {
+ assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+ SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end());
+ ValueRankMap.erase(I);
+ Insts.remove(I);
+ RedoInsts.remove(I);
+ I->eraseFromParent();
+ for (auto Op : Ops)
+ if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+ if (OpInst->use_empty())
+ Insts.insert(OpInst);
+}
+
+/// Zap the given instruction, adding interesting operands to the work list.
+void ReassociatePass::EraseInst(Instruction *I) {
+ assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+ DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
+
+ SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
+ // Erase the dead instruction.
+ ValueRankMap.erase(I);
+ RedoInsts.remove(I);
+ I->eraseFromParent();
+ // Optimize its operands.
+ SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) {
+ // If this is a node in an expression tree, climb to the expression root
+ // and add that since that's where optimization actually happens.
+ unsigned Opcode = Op->getOpcode();
+ while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
+ Visited.insert(Op).second)
+ Op = Op->user_back();
+ RedoInsts.insert(Op);
+ }
+
+ MadeChange = true;
+}
+
+// Canonicalize expressions of the following form:
+// x + (-Constant * y) -> x - (Constant * y)
+// x - (-Constant * y) -> x + (Constant * y)
+Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) {
+ if (!I->hasOneUse() || I->getType()->isVectorTy())
+ return nullptr;
+
+ // Must be a fmul or fdiv instruction.
+ unsigned Opcode = I->getOpcode();
+ if (Opcode != Instruction::FMul && Opcode != Instruction::FDiv)
+ return nullptr;
+
+ auto *C0 = dyn_cast<ConstantFP>(I->getOperand(0));
+ auto *C1 = dyn_cast<ConstantFP>(I->getOperand(1));
+
+ // Both operands are constant, let it get constant folded away.
+ if (C0 && C1)
+ return nullptr;
+
+ ConstantFP *CF = C0 ? C0 : C1;
+
+ // Must have one constant operand.
+ if (!CF)
+ return nullptr;
+
+ // Must be a negative ConstantFP.
+ if (!CF->isNegative())
+ return nullptr;
+
+ // User must be a binary operator with one or more uses.
+ Instruction *User = I->user_back();
+ if (!isa<BinaryOperator>(User) || User->use_empty())
+ return nullptr;
+
+ unsigned UserOpcode = User->getOpcode();
+ if (UserOpcode != Instruction::FAdd && UserOpcode != Instruction::FSub)
+ return nullptr;
+
+ // Subtraction is not commutative. Explicitly, the following transform is
+ // not valid: (-Constant * y) - x -> x + (Constant * y)
+ if (!User->isCommutative() && User->getOperand(1) != I)
+ return nullptr;
+
+ // Don't canonicalize x + (-Constant * y) -> x - (Constant * y), if the
+ // resulting subtract will be broken up later. This can get us into an
+ // infinite loop during reassociation.
+ if (UserOpcode == Instruction::FAdd && ShouldBreakUpSubtract(User))
+ return nullptr;
+
+ // Change the sign of the constant.
+ APFloat Val = CF->getValueAPF();
+ Val.changeSign();
+ I->setOperand(C0 ? 0 : 1, ConstantFP::get(CF->getContext(), Val));
+
+ // Canonicalize I to RHS to simplify the next bit of logic. E.g.,
+ // ((-Const*y) + x) -> (x + (-Const*y)).
+ if (User->getOperand(0) == I && User->isCommutative())
+ cast<BinaryOperator>(User)->swapOperands();
+
+ Value *Op0 = User->getOperand(0);
+ Value *Op1 = User->getOperand(1);
+ BinaryOperator *NI;
+ switch (UserOpcode) {
+ default:
+ llvm_unreachable("Unexpected Opcode!");
+ case Instruction::FAdd:
+ NI = BinaryOperator::CreateFSub(Op0, Op1);
+ NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
+ break;
+ case Instruction::FSub:
+ NI = BinaryOperator::CreateFAdd(Op0, Op1);
+ NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
+ break;
+ }
+
+ NI->insertBefore(User);
+ NI->setName(User->getName());
+ User->replaceAllUsesWith(NI);
+ NI->setDebugLoc(I->getDebugLoc());
+ RedoInsts.insert(I);
+ MadeChange = true;
+ return NI;
+}
+
+/// Inspect and optimize the given instruction. Note that erasing
+/// instructions is not allowed.
+void ReassociatePass::OptimizeInst(Instruction *I) {
+ // Only consider operations that we understand.
+ if (!isa<BinaryOperator>(I))
+ return;
+
+ if (I->getOpcode() == Instruction::Shl && isa<ConstantInt>(I->getOperand(1)))
+ // If an operand of this shift is a reassociable multiply, or if the shift
+ // is used by a reassociable multiply or add, turn it into a multiply.
+ if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
+ (I->hasOneUse() &&
+ (isReassociableOp(I->user_back(), Instruction::Mul) ||
+ isReassociableOp(I->user_back(), Instruction::Add)))) {
+ Instruction *NI = ConvertShiftToMul(I);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+
+ // Canonicalize negative constants out of expressions.
+ if (Instruction *Res = canonicalizeNegConstExpr(I))
+ I = Res;
+
+ // Commute binary operators, to canonicalize the order of their operands.
+ // This can potentially expose more CSE opportunities, and makes writing other
+ // transformations simpler.
+ if (I->isCommutative())
+ canonicalizeOperands(I);
+
+ // Don't optimize floating-point instructions unless they are 'fast'.
+ if (I->getType()->isFPOrFPVectorTy() && !I->isFast())
+ return;
+
+ // Do not reassociate boolean (i1) expressions. We want to preserve the
+ // original order of evaluation for short-circuited comparisons that
+ // SimplifyCFG has folded to AND/OR expressions. If the expression
+ // is not further optimized, it is likely to be transformed back to a
+ // short-circuited form for code gen, and the source order may have been
+ // optimized for the most likely conditions.
+ if (I->getType()->isIntegerTy(1))
+ return;
+
+ // If this is a subtract instruction which is not already in negate form,
+ // see if we can convert it to X+-Y.
+ if (I->getOpcode() == Instruction::Sub) {
+ if (ShouldBreakUpSubtract(I)) {
+ Instruction *NI = BreakUpSubtract(I, RedoInsts);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ } else if (BinaryOperator::isNeg(I)) {
+ // Otherwise, this is a negation. See if the operand is a multiply tree
+ // and if this is not an inner node of a multiply tree.
+ if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
+ (!I->hasOneUse() ||
+ !isReassociableOp(I->user_back(), Instruction::Mul))) {
+ Instruction *NI = LowerNegateToMultiply(I);
+ // If the negate was simplified, revisit the users to see if we can
+ // reassociate further.
+ for (User *U : NI->users()) {
+ if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
+ RedoInsts.insert(Tmp);
+ }
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+ }
+ } else if (I->getOpcode() == Instruction::FSub) {
+ if (ShouldBreakUpSubtract(I)) {
+ Instruction *NI = BreakUpSubtract(I, RedoInsts);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ } else if (BinaryOperator::isFNeg(I)) {
+ // Otherwise, this is a negation. See if the operand is a multiply tree
+ // and if this is not an inner node of a multiply tree.
+ if (isReassociableOp(I->getOperand(1), Instruction::FMul) &&
+ (!I->hasOneUse() ||
+ !isReassociableOp(I->user_back(), Instruction::FMul))) {
+ // If the negate was simplified, revisit the users to see if we can
+ // reassociate further.
+ Instruction *NI = LowerNegateToMultiply(I);
+ for (User *U : NI->users()) {
+ if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
+ RedoInsts.insert(Tmp);
+ }
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+ }
+ }
+
+ // If this instruction is an associative binary operator, process it.
+ if (!I->isAssociative()) return;
+ BinaryOperator *BO = cast<BinaryOperator>(I);
+
+ // If this is an interior node of a reassociable tree, ignore it until we
+ // get to the root of the tree, to avoid N^2 analysis.
+ unsigned Opcode = BO->getOpcode();
+ if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) {
+ // During the initial run we will get to the root of the tree.
+ // But if we get here while we are redoing instructions, there is no
+ // guarantee that the root will be visited, so queue it to be redone later.
+ if (BO->user_back() != BO &&
+ BO->getParent() == BO->user_back()->getParent())
+ RedoInsts.insert(BO->user_back());
+ return;
+ }
+
+ // If this is an add tree that is used by a sub instruction, ignore it
+ // until we process the subtract.
+ if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
+ cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub)
+ return;
+ if (BO->hasOneUse() && BO->getOpcode() == Instruction::FAdd &&
+ cast<Instruction>(BO->user_back())->getOpcode() == Instruction::FSub)
+ return;
+
+ ReassociateExpression(BO);
+}
+
+void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
+ // First, walk the expression tree, linearizing the tree, collecting the
+ // operand information.
+ SmallVector<RepeatedValue, 8> Tree;
+ MadeChange |= LinearizeExprTree(I, Tree);
+ SmallVector<ValueEntry, 8> Ops;
+ Ops.reserve(Tree.size());
+ for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
+ RepeatedValue E = Tree[i];
+ Ops.append(E.second.getZExtValue(),
+ ValueEntry(getRank(E.first), E.first));
+ }
+
+ DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+ // Now that we have linearized the tree to a list and have gathered all of
+ // the operands and their ranks, sort the operands by their rank. Use a
+ // stable_sort so that values with equal ranks will have their relative
+ // positions maintained (and so the compiler is deterministic). Note that
+ // this sorts so that the highest ranking values end up at the beginning of
+ // the vector.
+ std::stable_sort(Ops.begin(), Ops.end());
+
+ // Now that we have the expression tree in a convenient
+ // sorted form, optimize it globally if possible.
+ if (Value *V = OptimizeExpression(I, Ops)) {
+ if (V == I)
+ // Self-referential expression in unreachable code.
+ return;
+ // This expression tree simplified to something that isn't a tree,
+ // eliminate it.
+ DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
+ I->replaceAllUsesWith(V);
+ if (Instruction *VI = dyn_cast<Instruction>(V))
+ if (I->getDebugLoc())
+ VI->setDebugLoc(I->getDebugLoc());
+ RedoInsts.insert(I);
+ ++NumAnnihil;
+ return;
+ }
+
+ // We want to sink immediates as deeply as possible except in the case where
+ // this is a multiply tree used only by an add, and the immediate is a -1.
+ // In this case we reassociate to put the negation on the outside so that we
+ // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
+ if (I->hasOneUse()) {
+ if (I->getOpcode() == Instruction::Mul &&
+ cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
+ isa<ConstantInt>(Ops.back().Op) &&
+ cast<ConstantInt>(Ops.back().Op)->isMinusOne()) {
+ ValueEntry Tmp = Ops.pop_back_val();
+ Ops.insert(Ops.begin(), Tmp);
+ } else if (I->getOpcode() == Instruction::FMul &&
+ cast<Instruction>(I->user_back())->getOpcode() ==
+ Instruction::FAdd &&
+ isa<ConstantFP>(Ops.back().Op) &&
+ cast<ConstantFP>(Ops.back().Op)->isExactlyValue(-1.0)) {
+ ValueEntry Tmp = Ops.pop_back_val();
+ Ops.insert(Ops.begin(), Tmp);
+ }
+ }
+
+ DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+ if (Ops.size() == 1) {
+ if (Ops[0].Op == I)
+ // Self-referential expression in unreachable code.
+ return;
+
+ // This expression tree simplified to something that isn't a tree,
+ // eliminate it.
+ I->replaceAllUsesWith(Ops[0].Op);
+ if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op))
+ OI->setDebugLoc(I->getDebugLoc());
+ RedoInsts.insert(I);
+ return;
+ }
+
+ if (Ops.size() > 2 && Ops.size() <= GlobalReassociateLimit) {
+ // Find the pair with the highest count in the pairmap and move it to the
+ // back of the list so that it can later be CSE'd.
+ // example:
+ // a*b*c*d*e
+ // if c*e is the most "popular" pair, we can express this as
+ // (((c*e)*d)*b)*a
+ unsigned Max = 1;
+ unsigned BestRank = 0;
+ std::pair<unsigned, unsigned> BestPair;
+ unsigned Idx = I->getOpcode() - Instruction::BinaryOpsBegin;
+ for (unsigned i = 0; i < Ops.size() - 1; ++i)
+ for (unsigned j = i + 1; j < Ops.size(); ++j) {
+ unsigned Score = 0;
+ Value *Op0 = Ops[i].Op;
+ Value *Op1 = Ops[j].Op;
+ if (std::less<Value *>()(Op1, Op0))
+ std::swap(Op0, Op1);
+ auto it = PairMap[Idx].find({Op0, Op1});
+ if (it != PairMap[Idx].end())
+ Score += it->second;
+
+ unsigned MaxRank = std::max(Ops[i].Rank, Ops[j].Rank);
+ if (Score > Max || (Score == Max && MaxRank < BestRank)) {
+ BestPair = {i, j};
+ Max = Score;
+ BestRank = MaxRank;
+ }
+ }
+ if (Max > 1) {
+ auto Op0 = Ops[BestPair.first];
+ auto Op1 = Ops[BestPair.second];
+ Ops.erase(&Ops[BestPair.second]);
+ Ops.erase(&Ops[BestPair.first]);
+ Ops.push_back(Op0);
+ Ops.push_back(Op1);
+ }
+ }
+ // Now that we ordered and optimized the expressions, splat them back into
+ // the expression tree, removing any unneeded nodes.
+ RewriteExprTree(I, Ops);
+}
+
+void
+ReassociatePass::BuildPairMap(ReversePostOrderTraversal<Function *> &RPOT) {
+ // Make a "pairmap" of how often each operand pair occurs.
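+ // For example, the chains a*b*c and a*b*d record the pairs {a,b}, {a,c},
+ // {b,c} and {a,b}, {a,d}, {b,d} respectively, so {a,b} gets the highest
+ // count; ReassociateExpression later keeps a and b adjacent so that the
+ // common a*b can be CSE'd.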
+ for (BasicBlock *BI : RPOT) {
+ for (Instruction &I : *BI) {
+ if (!I.isAssociative())
+ continue;
+
+ // Ignore nodes that aren't at the root of trees.
+ if (I.hasOneUse() && I.user_back()->getOpcode() == I.getOpcode())
+ continue;
+
+ // Collect all operands in a single reassociable expression.
+ // Since Reassociate has already been run once, we can assume things
+ // are already canonical according to Reassociation's regime.
+ SmallVector<Value *, 8> Worklist = { I.getOperand(0), I.getOperand(1) };
+ SmallVector<Value *, 8> Ops;
+ while (!Worklist.empty() && Ops.size() <= GlobalReassociateLimit) {
+ Value *Op = Worklist.pop_back_val();
+ Instruction *OpI = dyn_cast<Instruction>(Op);
+ if (!OpI || OpI->getOpcode() != I.getOpcode() || !OpI->hasOneUse()) {
+ Ops.push_back(Op);
+ continue;
+ }
+ // Be paranoid about self-referencing expressions in unreachable code.
+ if (OpI->getOperand(0) != OpI)
+ Worklist.push_back(OpI->getOperand(0));
+ if (OpI->getOperand(1) != OpI)
+ Worklist.push_back(OpI->getOperand(1));
+ }
+ // Skip extremely long expressions.
+ if (Ops.size() > GlobalReassociateLimit)
+ continue;
+
+ // Add all pairwise combinations of operands to the pair map.
+ unsigned BinaryIdx = I.getOpcode() - Instruction::BinaryOpsBegin;
+ SmallSet<std::pair<Value *, Value*>, 32> Visited;
+ for (unsigned i = 0; i < Ops.size() - 1; ++i) {
+ for (unsigned j = i + 1; j < Ops.size(); ++j) {
+ // Canonicalize operand orderings.
+ Value *Op0 = Ops[i];
+ Value *Op1 = Ops[j];
+ if (std::less<Value *>()(Op1, Op0))
+ std::swap(Op0, Op1);
+ if (!Visited.insert({Op0, Op1}).second)
+ continue;
+ auto res = PairMap[BinaryIdx].insert({{Op0, Op1}, 1});
+ if (!res.second)
+ ++res.first->second;
+ }
+ }
+ }
+ }
+}
+
+PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
+ // Get the function's basic blocks in Reverse Post Order. This order is used
+ // by BuildRankMap to precalculate ranks correctly. It also excludes dead
+ // basic blocks (the analysis in this pass has been seen to hang when
+ // analysing dead basic blocks).
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+
+ // Calculate the rank map for F.
+ BuildRankMap(F, RPOT);
+
+ // Build the pair map before running reassociate.
+ // Technically this would be more accurate if we did it after one round
+ // of reassociation, but in practice it doesn't seem to help much on
+ // real-world code, so don't waste the compile time running reassociate
+ // twice.
+ // If a user wants, they could explicitly run reassociate twice in their
+ // pass pipeline for further potential gains.
+ // It might also be possible to update the pair map at runtime, but the
+ // overhead of that may be large if there are many reassociable chains.
+ BuildPairMap(RPOT);
+
+ MadeChange = false;
+
+ // Traverse the same blocks that were analysed by BuildRankMap.
+ for (BasicBlock *BI : RPOT) {
+ assert(RankMap.count(&*BI) && "BB should be ranked.");
+ // Optimize every instruction in the basic block.
+ for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;)
+ if (isInstructionTriviallyDead(&*II)) {
+ EraseInst(&*II++);
+ } else {
+ OptimizeInst(&*II);
+ assert(II->getParent() == &*BI && "Moved to a different block!");
+ ++II;
+ }
+
+ // Make a copy of all the instructions to be redone so we can remove dead
+ // instructions.
+ SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts);
+ // Iterate over all instructions to be reevaluated and remove trivially dead
+ // instructions. If any operand of the trivially dead instruction becomes
+ // dead mark it for deletion as well. Continue this process until all
+ // trivially dead instructions have been removed.
+ while (!ToRedo.empty()) {
+ Instruction *I = ToRedo.pop_back_val();
+ if (isInstructionTriviallyDead(I)) {
+ RecursivelyEraseDeadInsts(I, ToRedo);
+ MadeChange = true;
+ }
+ }
+
+ // Now that we have removed dead instructions, we can reoptimize the
+ // remaining instructions.
+ while (!RedoInsts.empty()) {
+ Instruction *I = RedoInsts.pop_back_val();
+ if (isInstructionTriviallyDead(I))
+ EraseInst(I);
+ else
+ OptimizeInst(I);
+ }
+ }
+
+ // We are done with the rank map and pair map.
+ RankMap.clear();
+ ValueRankMap.clear();
+ for (auto &Entry : PairMap)
+ Entry.clear();
+
+ if (MadeChange) {
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+ }
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+
+ class ReassociateLegacyPass : public FunctionPass {
+ ReassociatePass Impl;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+
+ ReassociateLegacyPass() : FunctionPass(ID) {
+ initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ FunctionAnalysisManager DummyFAM;
+ auto PA = Impl.run(F, DummyFAM);
+ return !PA.areAllPreserved();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+ };
+
+} // end anonymous namespace
+
+char ReassociateLegacyPass::ID = 0;
+
+INITIALIZE_PASS(ReassociateLegacyPass, "reassociate",
+ "Reassociate expressions", false, false)
+
+// Public interface to the Reassociate pass
+FunctionPass *llvm::createReassociatePass() {
+ return new ReassociateLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
new file mode 100644
index 000000000000..96295683314c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -0,0 +1,128 @@
+//===- Reg2Mem.cpp - Convert registers to allocas -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file demotes all registers to memory references. It is intended to be
+// the inverse of PromoteMemoryToRegister. By converting to loads, the only
+// values live across basic blocks are allocas and loads before phi nodes.
+// It is intended that this should make CFG hacking much easier.
+// To make later hacking easier, the entry block is split into two, such that
+// all introduced allocas and nothing else are in the entry block.
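+//
+// Roughly, a value whose uses escape its defining block is given an alloca
+// slot: its definition is followed by a store into the slot and its uses are
+// rewritten to reload from it, so only allocas and loads remain live across
+// basic blocks; phi nodes are likewise demoted to stack slots.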
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <list>
+using namespace llvm;
+
+#define DEBUG_TYPE "reg2mem"
+
+STATISTIC(NumRegsDemoted, "Number of registers demoted");
+STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted");
+
+namespace {
+ struct RegToMem : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ RegToMem() : FunctionPass(ID) {
+ initializeRegToMemPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(BreakCriticalEdgesID);
+ AU.addPreservedID(BreakCriticalEdgesID);
+ }
+
+ bool valueEscapes(const Instruction *Inst) const {
+ const BasicBlock *BB = Inst->getParent();
+ for (const User *U : Inst->users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if (UI->getParent() != BB || isa<PHINode>(UI))
+ return true;
+ }
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override;
+ };
+}
+
+char RegToMem::ID = 0;
+INITIALIZE_PASS_BEGIN(RegToMem, "reg2mem", "Demote all values to stack slots",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
+INITIALIZE_PASS_END(RegToMem, "reg2mem", "Demote all values to stack slots",
+ false, false)
+
+bool RegToMem::runOnFunction(Function &F) {
+ if (F.isDeclaration() || skipFunction(F))
+ return false;
+
+ // Insert all new allocas into entry block.
+ BasicBlock *BBEntry = &F.getEntryBlock();
+ assert(pred_empty(BBEntry) &&
+ "Entry block to function must not have predecessors!");
+
+ // Find the first non-alloca instruction and create an insertion point. This
+ // is safe if the block is well-formed: it always has a terminator, otherwise
+ // we'll hit an assertion.
+ BasicBlock::iterator I = BBEntry->begin();
+ while (isa<AllocaInst>(I)) ++I;
+
+ CastInst *AllocaInsertionPoint = new BitCastInst(
+ Constant::getNullValue(Type::getInt32Ty(F.getContext())),
+ Type::getInt32Ty(F.getContext()), "reg2mem alloca point", &*I);
+
+ // Find the escaped instructions, but don't create stack slots for
+ // allocas in the entry block.
+ std::list<Instruction*> WorkList;
+ for (BasicBlock &ibb : F)
+ for (BasicBlock::iterator iib = ibb.begin(), iie = ibb.end(); iib != iie;
+ ++iib) {
+ if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) &&
+ valueEscapes(&*iib)) {
+ WorkList.push_front(&*iib);
+ }
+ }
+
+ // Demote escaped instructions
+ NumRegsDemoted += WorkList.size();
+ for (Instruction *ilb : WorkList)
+ DemoteRegToStack(*ilb, false, AllocaInsertionPoint);
+
+ WorkList.clear();
+
+ // Find all phi nodes.
+ for (BasicBlock &ibb : F)
+ for (BasicBlock::iterator iib = ibb.begin(), iie = ibb.end(); iib != iie;
+ ++iib)
+ if (isa<PHINode>(iib))
+ WorkList.push_front(&*iib);
+
+ // Demote phi nodes
+ NumPhisDemoted += WorkList.size();
+ for (Instruction *ilb : WorkList)
+ DemotePHIToStack(cast<PHINode>(ilb), AllocaInsertionPoint);
+
+ return true;
+}
+
+
+// createDemoteRegisterToMemory - Provide an entry point to create this pass.
+char &llvm::DemoteRegisterToMemoryID = RegToMem::ID;
+FunctionPass *llvm::createDemoteRegisterToMemoryPass() {
+ return new RegToMem();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
new file mode 100644
index 000000000000..c44edbed8ed9
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -0,0 +1,2830 @@
+//===- RewriteStatepointsForGC.cpp - Make GC relocations explicit ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Rewrite call/invoke instructions so as to make potential relocations
+// performed by the garbage collector explicit in the IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/RewriteStatepointsForGC.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#define DEBUG_TYPE "rewrite-statepoints-for-gc"
+
+using namespace llvm;
+
+// Print the liveset found at the insert location
+static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden,
+ cl::init(false));
+static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size", cl::Hidden,
+ cl::init(false));
+
+// Print out the base pointers for debugging
+static cl::opt<bool> PrintBasePointers("spp-print-base-pointers", cl::Hidden,
+ cl::init(false));
+
+// Cost threshold measuring when it is profitable to rematerialize a value
+// instead of relocating it
+static cl::opt<unsigned>
+RematerializationThreshold("spp-rematerialization-threshold", cl::Hidden,
+ cl::init(6));
+
+#ifdef EXPENSIVE_CHECKS
+static bool ClobberNonLive = true;
+#else
+static bool ClobberNonLive = false;
+#endif
+
+static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live",
+ cl::location(ClobberNonLive),
+ cl::Hidden);
+
+static cl::opt<bool>
+ AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info",
+ cl::Hidden, cl::init(true));
+
+/// The IR fed into RewriteStatepointsForGC may have had attributes and
+/// metadata implying dereferenceability that are no longer valid/correct after
+/// RewriteStatepointsForGC has run. This is because semantically, after
+/// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
+/// heap. stripNonValidData (conservatively) restores
+/// correctness by erasing all attributes in the module that externally imply
+/// dereferenceability. Similar reasoning also applies to the noalias
+/// attributes and metadata. gc.statepoint can touch the entire heap including
+/// noalias objects.
+/// Apart from attributes and metadata, we also remove instructions that imply
+/// constant physical memory: llvm.invariant.start.
+static void stripNonValidData(Module &M);
+
+static bool shouldRewriteStatepointsIn(Function &F);
+
+PreservedAnalyses RewriteStatepointsForGC::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ bool Changed = false;
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ for (Function &F : M) {
+ // Nothing to do for declarations.
+ if (F.isDeclaration() || F.empty())
+ continue;
+
+ // Policy choice says not to rewrite - the most common reason is that we're
+ // compiling code without a GCStrategy.
+ if (!shouldRewriteStatepointsIn(F))
+ continue;
+
+ auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+ Changed |= runOnFunction(F, DT, TTI, TLI);
+ }
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ // stripNonValidData asserts that shouldRewriteStatepointsIn
+ // returns true for at least one function in the module. Since at least
+ // one function changed, we know that the precondition is satisfied.
+ stripNonValidData(M);
+
+ PreservedAnalyses PA;
+ PA.preserve<TargetIRAnalysis>();
+ PA.preserve<TargetLibraryAnalysis>();
+ return PA;
+}
+
+namespace {
+
+class RewriteStatepointsForGCLegacyPass : public ModulePass {
+ RewriteStatepointsForGC Impl;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ RewriteStatepointsForGCLegacyPass() : ModulePass(ID), Impl() {
+ initializeRewriteStatepointsForGCLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ bool Changed = false;
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ for (Function &F : M) {
+ // Nothing to do for declarations.
+ if (F.isDeclaration() || F.empty())
+ continue;
+
+ // Policy choice says not to rewrite - the most common reason is that
+ // we're compiling code without a GCStrategy.
+ if (!shouldRewriteStatepointsIn(F))
+ continue;
+
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+
+ Changed |= Impl.runOnFunction(F, DT, TTI, TLI);
+ }
+
+ if (!Changed)
+ return false;
+
+ // stripNonValidData asserts that shouldRewriteStatepointsIn
+ // returns true for at least one function in the module. Since at least
+ // one function changed, we know that the precondition is satisfied.
+ stripNonValidData(M);
+ return true;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // We add and rewrite a bunch of instructions, but don't really do much
+ // else. We could in theory preserve a lot more analyses here.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char RewriteStatepointsForGCLegacyPass::ID = 0;
+
+ModulePass *llvm::createRewriteStatepointsForGCLegacyPass() {
+ return new RewriteStatepointsForGCLegacyPass();
+}
+
+INITIALIZE_PASS_BEGIN(RewriteStatepointsForGCLegacyPass,
+ "rewrite-statepoints-for-gc",
+ "Make relocations explicit at statepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(RewriteStatepointsForGCLegacyPass,
+ "rewrite-statepoints-for-gc",
+ "Make relocations explicit at statepoints", false, false)
+
+namespace {
+
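+// Liveness sketch (assuming the standard backward dataflow formulation used
+// by computeLiveInValues below):
+//   LiveOut(BB) = union of LiveIn(Succ) over all successors of BB
+//   LiveIn(BB)  = LiveSet(BB) union (LiveOut(BB) minus KillSet(BB))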
+struct GCPtrLivenessData {
+ /// Values defined in this block.
+ MapVector<BasicBlock *, SetVector<Value *>> KillSet;
+
+ /// Values used in this block (and thus live); does not include values
+ /// killed within this block.
+ MapVector<BasicBlock *, SetVector<Value *>> LiveSet;
+
+ /// Values live into this basic block (i.e. used by any
+ /// instruction in this basic block or ones reachable from here)
+ MapVector<BasicBlock *, SetVector<Value *>> LiveIn;
+
+ /// Values live out of this basic block (i.e. live into
+ /// any successor block)
+ MapVector<BasicBlock *, SetVector<Value *>> LiveOut;
+};
+
+// The type of the internal cache used inside the findBasePointers family
+// of functions. From the caller's perspective, this is an opaque type and
+// should not be inspected.
+//
+// In the actual implementation this caches two relations:
+// - The base relation itself (i.e. this pointer is based on that one)
+// - The base defining value relation (i.e. before base_phi insertion)
+// Generally, after the execution of a full findBasePointer call, only the
+// base relation will remain. Internally, we add a mixture of the two
+// types, then update all entries of the second type to the first type.
+using DefiningValueMapTy = MapVector<Value *, Value *>;
+using StatepointLiveSetTy = SetVector<Value *>;
+using RematerializedValueMapTy =
+ MapVector<AssertingVH<Instruction>, AssertingVH<Value>>;
+
+struct PartiallyConstructedSafepointRecord {
+ /// The set of values known to be live across this safepoint
+ StatepointLiveSetTy LiveSet;
+
+ /// Mapping from live pointers to a base-defining-value
+ MapVector<Value *, Value *> PointerToBase;
+
+ /// The *new* gc.statepoint instruction itself. This produces the token
+ /// that normal path gc.relocates and the gc.result are tied to.
+ Instruction *StatepointToken;
+
+ /// Instruction to which exceptional gc relocates are attached
+ /// Makes it easier to iterate through them during relocationViaAlloca.
+ Instruction *UnwindToken;
+
+ /// Record live values that we rematerialized instead of relocating.
+ /// They are not included in the 'LiveSet' field.
+ /// Maps each rematerialized copy to its original value.
+ RematerializedValueMapTy RematerializedValues;
+};
+
+} // end anonymous namespace
+
+static ArrayRef<Use> GetDeoptBundleOperands(ImmutableCallSite CS) {
+ Optional<OperandBundleUse> DeoptBundle =
+ CS.getOperandBundle(LLVMContext::OB_deopt);
+
+ if (!DeoptBundle.hasValue()) {
+ assert(AllowStatepointWithNoDeoptInfo &&
+ "Found non-leaf call without deopt info!");
+ return None;
+ }
+
+ return DeoptBundle.getValue().Inputs;
+}
+
+/// Compute the live-in set for every basic block in the function
+static void computeLiveInValues(DominatorTree &DT, Function &F,
+ GCPtrLivenessData &Data);
+
+/// Given results from the dataflow liveness computation, find the set of live
+/// Values at a particular instruction.
+static void findLiveSetAtInst(Instruction *inst, GCPtrLivenessData &Data,
+ StatepointLiveSetTy &out);
+
+// TODO: Once we can get to the GCStrategy, this becomes
+// Optional<bool> isGCManagedPointer(const Type *Ty) const override {
+
+static bool isGCPointerType(Type *T) {
+ if (auto *PT = dyn_cast<PointerType>(T))
+ // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
+ // GC managed heap. We know that a pointer into this heap needs to be
+ // updated and that no other pointer does.
+ return PT->getAddressSpace() == 1;
+ return false;
+}
+
+// Return true if this type is one which a) is a gc pointer or contains a GC
+// pointer and b) is of a type this code expects to encounter as a live value.
+// (The insertion code will assert that a type which matches (a) and not (b)
+// is not encountered.)
+static bool isHandledGCPointerType(Type *T) {
+ // We fully support gc pointers
+ if (isGCPointerType(T))
+ return true;
+ // We partially support vectors of gc pointers. The code will assert if it
+ // can't handle something.
+ if (auto VT = dyn_cast<VectorType>(T))
+ if (isGCPointerType(VT->getElementType()))
+ return true;
+ return false;
+}
+
+#ifndef NDEBUG
+/// Returns true if this type contains a gc pointer whether we know how to
+/// handle that type or not.
+static bool containsGCPtrType(Type *Ty) {
+ if (isGCPointerType(Ty))
+ return true;
+ if (VectorType *VT = dyn_cast<VectorType>(Ty))
+ return isGCPointerType(VT->getScalarType());
+ if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
+ return containsGCPtrType(AT->getElementType());
+ if (StructType *ST = dyn_cast<StructType>(Ty))
+ return llvm::any_of(ST->subtypes(), containsGCPtrType);
+ return false;
+}
+
+// Returns true if this is a type which a) is a gc pointer or contains a GC
+// pointer and b) is of a type which the code doesn't expect (i.e. first class
+// aggregates). Used to trip assertions.
+static bool isUnhandledGCPointerType(Type *Ty) {
+ return containsGCPtrType(Ty) && !isHandledGCPointerType(Ty);
+}
+#endif
+
+// Return the name of the value suffixed with the provided suffix, or, if the
+// value didn't have a name, the specified default name.
+static std::string suffixed_name_or(Value *V, StringRef Suffix,
+ StringRef DefaultName) {
+ return V->hasName() ? (V->getName() + Suffix).str() : DefaultName.str();
+}
+
+// Conservatively identifies any definitions which might be live at the
+// given instruction. The analysis is performed immediately before the
+// given instruction. Values defined by that instruction are not considered
+// live. Values used by that instruction are considered live.
+static void
+analyzeParsePointLiveness(DominatorTree &DT,
+ GCPtrLivenessData &OriginalLivenessData, CallSite CS,
+ PartiallyConstructedSafepointRecord &Result) {
+ Instruction *Inst = CS.getInstruction();
+
+ StatepointLiveSetTy LiveSet;
+ findLiveSetAtInst(Inst, OriginalLivenessData, LiveSet);
+
+ if (PrintLiveSet) {
+ dbgs() << "Live Variables:\n";
+ for (Value *V : LiveSet)
+ dbgs() << " " << V->getName() << " " << *V << "\n";
+ }
+ if (PrintLiveSetSize) {
+ dbgs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n";
+ dbgs() << "Number live values: " << LiveSet.size() << "\n";
+ }
+ Result.LiveSet = LiveSet;
+}
+
+static bool isKnownBaseResult(Value *V);
+
+namespace {
+
+/// A single base defining value - An immediate base defining value for an
+/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'.
+/// For instructions which have multiple pointer [vector] inputs or that
+/// transition between vector and scalar types, there is no immediate base
+/// defining value. The 'base defining value' for 'Def' is the transitive
+/// closure of this relation stopping at the first instruction which has no
+/// immediate base defining value. The b.d.v. might itself be a base pointer,
+/// but it can also be an arbitrary derived pointer.
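+/// For example, if %q is a getelementptr whose pointer operand is %p, the
+/// immediate base defining value of %q is %p; if %p is in turn a phi of two
+/// derived pointers, the walk stops there and that phi is %q's b.d.v., even
+/// though it need not itself be a base.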
+struct BaseDefiningValueResult {
+ /// Contains the value which is the base defining value.
+ Value * const BDV;
+
+ /// True if the base defining value is also known to be an actual base
+ /// pointer.
+ const bool IsKnownBase;
+
+ BaseDefiningValueResult(Value *BDV, bool IsKnownBase)
+ : BDV(BDV), IsKnownBase(IsKnownBase) {
+#ifndef NDEBUG
+ // Check consistency between new and old means of checking whether a BDV is
+ // a base.
+ bool MustBeBase = isKnownBaseResult(BDV);
+ assert(!MustBeBase || MustBeBase == IsKnownBase);
+#endif
+ }
+};
+
+} // end anonymous namespace
+
+static BaseDefiningValueResult findBaseDefiningValue(Value *I);
+
+/// Return a base defining value for the 'Index' element of the given vector
+/// instruction 'I'. If Index is null, returns a BDV for the entire vector
+/// 'I'. As an optimization, this method will try to determine when the
+/// element is known to already be a base pointer. If this can be established,
+/// the second value in the returned pair will be true. Note that either a
+/// vector or a pointer typed value can be returned. For the former, the
+/// vector returned is a BDV (and possibly a base) of the entire vector 'I'.
+/// If the latter, the returned pointer is a BDV (or possibly a base) for the
+/// particular element in 'I'.
+static BaseDefiningValueResult
+findBaseDefiningValueOfVector(Value *I) {
+ // Each case parallels findBaseDefiningValue below, see that code for
+ // detailed motivation.
+
+ if (isa<Argument>(I))
+ // An incoming argument to the function is a base pointer
+ return BaseDefiningValueResult(I, true);
+
+ if (isa<Constant>(I))
+ // Base of constant vector consists only of constant null pointers.
+ // For reasoning see similar case inside 'findBaseDefiningValue' function.
+ return BaseDefiningValueResult(ConstantAggregateZero::get(I->getType()),
+ true);
+
+ if (isa<LoadInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ if (isa<InsertElementInst>(I))
+ // We don't know whether this vector contains entirely base pointers or
+ // not. To be conservatively correct, we treat it as a BDV and will
+ // duplicate code as needed to construct a parallel vector of bases.
+ return BaseDefiningValueResult(I, false);
+
+ if (isa<ShuffleVectorInst>(I))
+ // We don't know whether this vector contains entirely base pointers or
+ // not. To be conservatively correct, we treat it as a BDV and will
+ // duplicate code as needed to construct a parallel vector of bases.
+ // TODO: There are a number of local optimizations which could be applied
+ // here for particular shufflevector patterns.
+ return BaseDefiningValueResult(I, false);
+
+ // The behavior of getelementptr instructions is the same for vector and
+ // non-vector data types.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ return findBaseDefiningValue(GEP->getPointerOperand());
+
+ // If the pointer comes through a bitcast of a vector of pointers to
+ // a vector of another type of pointer, then look through the bitcast
+ if (auto *BC = dyn_cast<BitCastInst>(I))
+ return findBaseDefiningValue(BC->getOperand(0));
+
+ // A PHI or Select is a base defining value. The outer findBasePointer
+ // algorithm is responsible for constructing a base value for this BDV.
+ assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
+ "unknown vector instruction - no base found for vector element");
+ return BaseDefiningValueResult(I, false);
+}
+
+/// Helper function for findBasePointer - Will return a value which either a)
+/// defines the base pointer for the input, b) blocks the simple search
+/// (i.e. a PHI or Select of two derived pointers), or c) involves a change
+/// from pointer to vector type or back.
+static BaseDefiningValueResult findBaseDefiningValue(Value *I) {
+ assert(I->getType()->isPtrOrPtrVectorTy() &&
+ "Illegal to ask for the base pointer of a non-pointer type");
+
+ if (I->getType()->isVectorTy())
+ return findBaseDefiningValueOfVector(I);
+
+ if (isa<Argument>(I))
+ // An incoming argument to the function is a base pointer.
+ // We should never have reached here if this argument isn't a gc value.
+ return BaseDefiningValueResult(I, true);
+
+ if (isa<Constant>(I)) {
+ // We assume that objects with a constant base (e.g. a global) can't move
+ // and don't need to be reported to the collector because they are always
+ // live. Besides global references, all kinds of constants (e.g. undef,
+ // constant expressions, null pointers) can be introduced by the inliner or
+ // the optimizer, especially on dynamically dead paths.
+ // Here we treat all of them as having a single null base. By doing this we
+ // try to avoid problems reporting various conflicts in the form of
+ // "phi (const1, const2)" or "phi (const, regular gc ptr)".
+ // See constant.ll file for relevant test cases.
+
+ return BaseDefiningValueResult(
+ ConstantPointerNull::get(cast<PointerType>(I->getType())), true);
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Value *Def = CI->stripPointerCasts();
+ // If stripping pointer casts changes the address space there is an
+ // addrspacecast in between.
+ assert(cast<PointerType>(Def->getType())->getAddressSpace() ==
+ cast<PointerType>(CI->getType())->getAddressSpace() &&
+ "unsupported addrspacecast");
+ // If we find a cast instruction here, it means we've found a cast which is
+ // not simply a pointer cast (i.e. an inttoptr). We don't know how to
+ // handle int->ptr conversion.
+ assert(!isa<CastInst>(Def) && "shouldn't find another cast here");
+ return findBaseDefiningValue(Def);
+ }
+
+ if (isa<LoadInst>(I))
+ // The value loaded is a gc base itself
+ return BaseDefiningValueResult(I, true);
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))
+ // The base of this GEP is the base
+ return findBaseDefiningValue(GEP->getPointerOperand());
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ // fall through to general call handling
+ break;
+ case Intrinsic::experimental_gc_statepoint:
+ llvm_unreachable("statepoints don't produce pointers");
+ case Intrinsic::experimental_gc_relocate:
+ // Rerunning safepoint insertion after safepoints are already
+ // inserted is not supported. It could probably be made to work,
+ // but why are you doing this? There's no good reason.
+ llvm_unreachable("repeat safepoint insertion is not supported");
+ case Intrinsic::gcroot:
+ // Currently, this mechanism hasn't been extended to work with gcroot.
+ // There's no reason it couldn't be, but I haven't thought about the
+ // implications much.
+ llvm_unreachable(
+ "interaction with the gcroot mechanism is not supported");
+ }
+ }
+ // We assume that functions in the source language only return base
+ // pointers. This should probably be generalized via attributes to support
+ // both source language and internal functions.
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ // TODO: I have absolutely no idea how to implement this part yet. It's not
+ // necessarily hard, I just haven't really looked at it yet.
+ assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");
+
+ if (isa<AtomicCmpXchgInst>(I))
+ // A CAS is effectively an atomic store and load combined under a
+ // predicate. From the perspective of base pointers, we just treat it
+ // like a load.
+ return BaseDefiningValueResult(I, true);
+
+ assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are "
+ "binary ops which don't apply to pointers");
+
+ // The aggregate ops. Aggregates can either be in the heap or on the
+ // stack, but in either case, this is simply a field load. As a result,
+ // this defines the base just like a load does.
+ if (isa<ExtractValueInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ // We should never see an insert vector since that would require we be
+ // tracing back a struct value not a pointer value.
+ assert(!isa<InsertValueInst>(I) &&
+ "Base pointer for a struct is meaningless");
+
+ // An extractelement produces a base result exactly when its input does.
+ // We may need to insert a parallel instruction to extract the appropriate
+ // element out of the base vector corresponding to the input. Given this,
+ // it's analogous to the phi and select case even though it's not a merge.
+ if (isa<ExtractElementInst>(I))
+ // Note: There are a lot of obvious peephole cases here. These are deliberately
+ // handled after the main base pointer inference algorithm to make writing
+ // test cases to exercise that code easier.
+ return BaseDefiningValueResult(I, false);
+
+ // The last two cases here don't return a base pointer. Instead, they
+ // return a value which dynamically selects from among several base
+ // derived pointers (each potentially with its own base). It's the job of
+ // the caller to resolve these.
+ assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
+ "missing instruction case in findBaseDefiningValue");
+ return BaseDefiningValueResult(I, false);
+}
+
+/// Returns the base defining value for this value.
+static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) {
+ Value *&Cached = Cache[I];
+ if (!Cached) {
+ Cached = findBaseDefiningValue(I).BDV;
+ DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
+ << Cached->getName() << "\n");
+ }
+ assert(Cache[I] != nullptr);
+ return Cached;
+}
+
+/// Return a base pointer for this value if known. Otherwise, return its
+/// base defining value.
+static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) {
+ Value *Def = findBaseDefiningValueCached(I, Cache);
+ auto Found = Cache.find(Def);
+ if (Found != Cache.end()) {
+ // Either a base-of relation, or a self reference. Caller must check.
+ return Found->second;
+ }
+ // Only a BDV available
+ return Def;
+}
+
+/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV,
+/// is it known to be a base pointer, or do we need to continue searching?
+static bool isKnownBaseResult(Value *V) {
+ if (!isa<PHINode>(V) && !isa<SelectInst>(V) &&
+ !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) &&
+ !isa<ShuffleVectorInst>(V)) {
+ // no recursion possible
+ return true;
+ }
+ if (isa<Instruction>(V) &&
+ cast<Instruction>(V)->getMetadata("is_base_value")) {
+ // This is a previously inserted base phi or select. We know
+ // that this is a base value.
+ return true;
+ }
+
+ // We need to keep searching
+ return false;
+}
+
+namespace {
+
+/// Models the state of a single base defining value in the findBasePointer
+/// algorithm for determining where a new instruction is needed to propagate
+/// the base of this BDV.
+class BDVState {
+public:
+ enum Status { Unknown, Base, Conflict };
+
+ BDVState() : BaseValue(nullptr) {}
+
+ explicit BDVState(Status Status, Value *BaseValue = nullptr)
+ : Status(Status), BaseValue(BaseValue) {
+ assert(Status != Base || BaseValue);
+ }
+
+ explicit BDVState(Value *BaseValue) : Status(Base), BaseValue(BaseValue) {}
+
+ Status getStatus() const { return Status; }
+ Value *getBaseValue() const { return BaseValue; }
+
+ bool isBase() const { return getStatus() == Base; }
+ bool isUnknown() const { return getStatus() == Unknown; }
+ bool isConflict() const { return getStatus() == Conflict; }
+
+ bool operator==(const BDVState &Other) const {
+ return BaseValue == Other.BaseValue && Status == Other.Status;
+ }
+
+ bool operator!=(const BDVState &other) const { return !(*this == other); }
+
+ LLVM_DUMP_METHOD
+ void dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+ }
+
+ void print(raw_ostream &OS) const {
+ switch (getStatus()) {
+ case Unknown:
+ OS << "U";
+ break;
+ case Base:
+ OS << "B";
+ break;
+ case Conflict:
+ OS << "C";
+ break;
+ }
+ OS << " (" << getBaseValue() << " - "
+ << (getBaseValue() ? getBaseValue()->getName() : "nullptr") << "): ";
+ }
+
+private:
+ Status Status = Unknown;
+ AssertingVH<Value> BaseValue; // Non-null only if Status == Base.
+};
+
+} // end anonymous namespace
+
+#ifndef NDEBUG
+static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) {
+ State.print(OS);
+ return OS;
+}
+#endif
+
+static BDVState meetBDVStateImpl(const BDVState &LHS, const BDVState &RHS) {
+ switch (LHS.getStatus()) {
+ case BDVState::Unknown:
+ return RHS;
+
+ case BDVState::Base:
+ assert(LHS.getBaseValue() && "can't be null");
+ if (RHS.isUnknown())
+ return LHS;
+
+ if (RHS.isBase()) {
+ if (LHS.getBaseValue() == RHS.getBaseValue()) {
+ assert(LHS == RHS && "equality broken!");
+ return LHS;
+ }
+ return BDVState(BDVState::Conflict);
+ }
+ assert(RHS.isConflict() && "only three states!");
+ return BDVState(BDVState::Conflict);
+
+ case BDVState::Conflict:
+ return LHS;
+ }
+ llvm_unreachable("only three states!");
+}
+
+// Values of type BDVState form a lattice, and this function implements the meet
+// operation.
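+// Schematically (mirroring meetBDVStateImpl above):
+//   meet(Unknown, X)         == X
+//   meet(Base(B), Base(B))   == Base(B)
+//   meet(Base(B1), Base(B2)) == Conflict   when B1 != B2
+//   meet(Conflict, X)        == Conflict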
+static BDVState meetBDVState(const BDVState &LHS, const BDVState &RHS) {
+ BDVState Result = meetBDVStateImpl(LHS, RHS);
+ assert(Result == meetBDVStateImpl(RHS, LHS) &&
+ "Math is wrong: meet does not commute!");
+ return Result;
+}
+
+/// For a given value or instruction, figure out what base ptr it's derived from.
+/// For gc objects, this is simply itself. On success, returns a value which is
+/// the base pointer. (This is reliable and can be used for relocation.) On
+/// failure, returns nullptr.
+static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
+ Value *Def = findBaseOrBDV(I, Cache);
+
+ if (isKnownBaseResult(Def))
+ return Def;
+
+ // Here's the rough algorithm:
+ // - For every SSA value, construct a mapping to either an actual base
+ // pointer or a PHI which obscures the base pointer.
+ // - Construct a mapping from PHI to unknown TOP state. Use an
+ // optimistic algorithm to propagate base pointer information. Lattice
+ // looks like:
+ // UNKNOWN
+ // b1 b2 b3 b4
+ // CONFLICT
+ // When algorithm terminates, all PHIs will either have a single concrete
+ // base or be in a conflict state.
+ // - For every conflict, insert a dummy PHI node without arguments. Add
+ // these to the base[Instruction] = BasePtr mapping. For every
+ // non-conflict, add the actual base.
+ // - For every conflict, add arguments for the base[a] of each input
+ // argument.
+ //
+ // Note: A simpler form of this would be to add the conflict form of all
+ // PHIs without running the optimistic algorithm. This would be
+ // analogous to pessimistic data flow and would likely lead to an
+ // overall worse solution.
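+ //
+ // A small illustrative example (names are made up):
+ //   loop:
+ //     %d = phi i8 addrspace(1)* [ %a, %entry ], [ %d.next, %loop ]
+ //     %d.next = getelementptr i8, i8 addrspace(1)* %d, i64 8
+ // Assuming %a is a known base (e.g. an argument), the meet over %d's inputs
+ // resolves to the single base %a, so no new instruction is needed. Had the
+ // two inputs carried different bases, %d would reach the Conflict state and
+ // a parallel "base phi" would be inserted and filled in below.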
+
+#ifndef NDEBUG
+ auto isExpectedBDVType = [](Value *BDV) {
+ return isa<PHINode>(BDV) || isa<SelectInst>(BDV) ||
+ isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV) ||
+ isa<ShuffleVectorInst>(BDV);
+ };
+#endif
+
+ // Once populated, will contain a mapping from each potentially non-base BDV
+ // to a lattice value (described above) which corresponds to that BDV.
+ // We use the order of insertion (DFS over the def/use graph) to provide a
+ // stable deterministic ordering for visiting DenseMaps (which are unordered)
+ // below. This is important for deterministic compilation.
+ MapVector<Value *, BDVState> States;
+
+ // Recursively fill in all base defining values reachable from the initial
+ // one for which we don't already know a definite base value for
+ /* scope */ {
+ SmallVector<Value*, 16> Worklist;
+ Worklist.push_back(Def);
+ States.insert({Def, BDVState()});
+ while (!Worklist.empty()) {
+ Value *Current = Worklist.pop_back_val();
+ assert(!isKnownBaseResult(Current) && "why did it get added?");
+
+ auto visitIncomingValue = [&](Value *InVal) {
+ Value *Base = findBaseOrBDV(InVal, Cache);
+ if (isKnownBaseResult(Base))
+ // Known bases won't need new instructions introduced and can be
+ // ignored safely
+ return;
+ assert(isExpectedBDVType(Base) && "the only non-base values "
+ "we see should be base defining values");
+ if (States.insert(std::make_pair(Base, BDVState())).second)
+ Worklist.push_back(Base);
+ };
+ if (PHINode *PN = dyn_cast<PHINode>(Current)) {
+ for (Value *InVal : PN->incoming_values())
+ visitIncomingValue(InVal);
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(Current)) {
+ visitIncomingValue(SI->getTrueValue());
+ visitIncomingValue(SI->getFalseValue());
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) {
+ visitIncomingValue(EE->getVectorOperand());
+ } else if (auto *IE = dyn_cast<InsertElementInst>(Current)) {
+ visitIncomingValue(IE->getOperand(0)); // vector operand
+ visitIncomingValue(IE->getOperand(1)); // scalar operand
+ } else if (auto *SV = dyn_cast<ShuffleVectorInst>(Current)) {
+ visitIncomingValue(SV->getOperand(0));
+ visitIncomingValue(SV->getOperand(1));
+ } else {
+ llvm_unreachable("Unimplemented instruction case");
+ }
+ }
+ }
+
+#ifndef NDEBUG
+ DEBUG(dbgs() << "States after initialization:\n");
+ for (auto Pair : States) {
+ DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ }
+#endif
+
+ // Return a phi state for a base defining value. We'll generate a new
+ // base state for known bases and expect to find a cached state otherwise.
+ auto getStateForBDV = [&](Value *baseValue) {
+ if (isKnownBaseResult(baseValue))
+ return BDVState(baseValue);
+ auto I = States.find(baseValue);
+ assert(I != States.end() && "lookup failed!");
+ return I->second;
+ };
+
+ bool Progress = true;
+ while (Progress) {
+#ifndef NDEBUG
+ const size_t OldSize = States.size();
+#endif
+ Progress = false;
+ // We're only changing values in this loop, thus safe to keep iterators.
+ // Since this is computing a fixed point, the order of visit does not
+ // affect the result. TODO: We could use a worklist here and make this run
+ // much faster.
+ for (auto Pair : States) {
+ Value *BDV = Pair.first;
+ assert(!isKnownBaseResult(BDV) && "why did it get added?");
+
+ // Given an input value for the current instruction, return a BDVState
+ // instance which represents the BDV of that value.
+ auto getStateForInput = [&](Value *V) mutable {
+ Value *BDV = findBaseOrBDV(V, Cache);
+ return getStateForBDV(BDV);
+ };
+
+ BDVState NewState;
+ if (SelectInst *SI = dyn_cast<SelectInst>(BDV)) {
+ NewState = meetBDVState(NewState, getStateForInput(SI->getTrueValue()));
+ NewState =
+ meetBDVState(NewState, getStateForInput(SI->getFalseValue()));
+ } else if (PHINode *PN = dyn_cast<PHINode>(BDV)) {
+ for (Value *Val : PN->incoming_values())
+ NewState = meetBDVState(NewState, getStateForInput(Val));
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) {
+ // The 'meet' for an extractelement is slightly trivial, but it's still
+ // useful in that it drives us to conflict if our input is.
+ NewState =
+ meetBDVState(NewState, getStateForInput(EE->getVectorOperand()));
+ } else if (auto *IE = dyn_cast<InsertElementInst>(BDV)) {
+ // Given there's an inherent type mismatch between the operands, this will
+ // *always* produce a Conflict.
+ NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(0)));
+ NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(1)));
+ } else {
+ // The only instance this does not return a Conflict is when both the
+ // vector operands are the same vector.
+ auto *SV = cast<ShuffleVectorInst>(BDV);
+ NewState = meetBDVState(NewState, getStateForInput(SV->getOperand(0)));
+ NewState = meetBDVState(NewState, getStateForInput(SV->getOperand(1)));
+ }
+
+ BDVState OldState = States[BDV];
+ if (OldState != NewState) {
+ Progress = true;
+ States[BDV] = NewState;
+ }
+ }
+
+ assert(OldSize == States.size() &&
+ "fixed point shouldn't be adding any new nodes to state");
+ }
+
+#ifndef NDEBUG
+ DEBUG(dbgs() << "States after meet iteration:\n");
+ for (auto Pair : States) {
+ DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ }
+#endif
+
+ // Insert Phis for all conflicts
+ // TODO: adjust naming patterns to avoid this order of iteration dependency
+ for (auto Pair : States) {
+ Instruction *I = cast<Instruction>(Pair.first);
+ BDVState State = Pair.second;
+ assert(!isKnownBaseResult(I) && "why did it get added?");
+ assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+
+ // extractelement instructions are a bit special in that we may need to
+ // insert an extract even when we know an exact base for the instruction.
+ // The problem is that we need to convert from a vector base to a scalar
+ // base for the particular index we're interested in.
+ if (State.isBase() && isa<ExtractElementInst>(I) &&
+ isa<VectorType>(State.getBaseValue()->getType())) {
+ auto *EE = cast<ExtractElementInst>(I);
+ // TODO: In many cases, the new instruction is just EE itself. We should
+ // exploit this, but can't do it here since it would break the invariant
+ // about the BDV not being known to be a base.
+ auto *BaseInst = ExtractElementInst::Create(
+ State.getBaseValue(), EE->getIndexOperand(), "base_ee", EE);
+ BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
+ States[I] = BDVState(BDVState::Base, BaseInst);
+ }
+
+ // Since we're joining a vector and scalar base, they can never be the
+ // same. As a result, we should always see insert element having reached
+ // the conflict state.
+ assert(!isa<InsertElementInst>(I) || State.isConflict());
+
+ if (!State.isConflict())
+ continue;
+
+ /// Create and insert a new instruction which will represent the base of
+ /// the given instruction 'I'.
+ auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* {
+ if (isa<PHINode>(I)) {
+ BasicBlock *BB = I->getParent();
+ int NumPreds = std::distance(pred_begin(BB), pred_end(BB));
+ assert(NumPreds > 0 && "how did we reach here");
+ std::string Name = suffixed_name_or(I, ".base", "base_phi");
+ return PHINode::Create(I->getType(), NumPreds, Name, I);
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ // The undef will be replaced later
+ UndefValue *Undef = UndefValue::get(SI->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_select");
+ return SelectInst::Create(SI->getCondition(), Undef, Undef, Name, SI);
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
+ UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_ee");
+ return ExtractElementInst::Create(Undef, EE->getIndexOperand(), Name,
+ EE);
+ } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
+ UndefValue *VecUndef = UndefValue::get(IE->getOperand(0)->getType());
+ UndefValue *ScalarUndef = UndefValue::get(IE->getOperand(1)->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_ie");
+ return InsertElementInst::Create(VecUndef, ScalarUndef,
+ IE->getOperand(2), Name, IE);
+ } else {
+ auto *SV = cast<ShuffleVectorInst>(I);
+ UndefValue *VecUndef = UndefValue::get(SV->getOperand(0)->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_sv");
+ return new ShuffleVectorInst(VecUndef, VecUndef, SV->getOperand(2),
+ Name, SV);
+ }
+ };
+ Instruction *BaseInst = MakeBaseInstPlaceholder(I);
+ // Add metadata marking this as a base value
+ BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
+ States[I] = BDVState(BDVState::Conflict, BaseInst);
+ }
+
+ // Returns an instruction which produces the base pointer for a given
+ // instruction. The instruction is assumed to be an input to one of the BDVs
+ // seen in the inference algorithm above. As such, we must either already
+ // know its base defining value is a base, or have inserted a new
+ // instruction to propagate the base of its BDV and have entered that newly
+ // introduced instruction into the state table. In either case, we are
+ // assured to be able to determine an instruction which produces its base
+ // pointer.
+ auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) {
+ Value *BDV = findBaseOrBDV(Input, Cache);
+ Value *Base = nullptr;
+ if (isKnownBaseResult(BDV)) {
+ Base = BDV;
+ } else {
+ // Either conflict or base.
+ assert(States.count(BDV));
+ Base = States[BDV].getBaseValue();
+ }
+ assert(Base && "Can't be null");
+ // The cast is needed since base traversal may strip away bitcasts
+ if (Base->getType() != Input->getType() && InsertPt)
+ Base = new BitCastInst(Base, Input->getType(), "cast", InsertPt);
+ return Base;
+ };
+
+ // Fixup all the inputs of the new PHIs. Visit order needs to be
+ // deterministic and predictable because we're naming newly created
+ // instructions.
+ for (auto Pair : States) {
+ Instruction *BDV = cast<Instruction>(Pair.first);
+ BDVState State = Pair.second;
+
+ assert(!isKnownBaseResult(BDV) && "why did it get added?");
+ assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+ if (!State.isConflict())
+ continue;
+
+ if (PHINode *BasePHI = dyn_cast<PHINode>(State.getBaseValue())) {
+ PHINode *PN = cast<PHINode>(BDV);
+ unsigned NumPHIValues = PN->getNumIncomingValues();
+ for (unsigned i = 0; i < NumPHIValues; i++) {
+ Value *InVal = PN->getIncomingValue(i);
+ BasicBlock *InBB = PN->getIncomingBlock(i);
+
+ // If we've already seen InBB, add the same incoming value
+ // we added for it earlier. The IR verifier requires phi
+ // nodes with multiple entries from the same basic block
+ // to have the same incoming value for each of those
+ // entries. If we don't do this check here and basephi
+ // has a different type than base, we'll end up adding two
+ // bitcasts (and hence two distinct values) as incoming
+ // values for the same basic block.
+
+ int BlockIndex = BasePHI->getBasicBlockIndex(InBB);
+ if (BlockIndex != -1) {
+ Value *OldBase = BasePHI->getIncomingValue(BlockIndex);
+ BasePHI->addIncoming(OldBase, InBB);
+
+#ifndef NDEBUG
+ Value *Base = getBaseForInput(InVal, nullptr);
+ // In essence this assert states: the only way two values
+ // incoming from the same basic block may be different is by
+ // being different bitcasts of the same value. A cleanup
+ // that remains TODO is changing findBaseOrBDV to return an
+ // llvm::Value of the correct type (and still remain pure).
+ // This will remove the need to add bitcasts.
+ assert(Base->stripPointerCasts() == OldBase->stripPointerCasts() &&
+ "Sanity -- findBaseOrBDV should be pure!");
+#endif
+ continue;
+ }
+
+ // Find the instruction which produces the base for each input. We may
+ // need to insert a bitcast in the incoming block.
+ // TODO: Need to split critical edges if insertion is needed
+ Value *Base = getBaseForInput(InVal, InBB->getTerminator());
+ BasePHI->addIncoming(Base, InBB);
+ }
+ assert(BasePHI->getNumIncomingValues() == NumPHIValues);
+ } else if (SelectInst *BaseSI =
+ dyn_cast<SelectInst>(State.getBaseValue())) {
+ SelectInst *SI = cast<SelectInst>(BDV);
+
+ // Find the instruction which produces the base for each input.
+ // We may need to insert a bitcast.
+ BaseSI->setTrueValue(getBaseForInput(SI->getTrueValue(), BaseSI));
+ BaseSI->setFalseValue(getBaseForInput(SI->getFalseValue(), BaseSI));
+ } else if (auto *BaseEE =
+ dyn_cast<ExtractElementInst>(State.getBaseValue())) {
+ Value *InVal = cast<ExtractElementInst>(BDV)->getVectorOperand();
+ // Find the instruction which produces the base for each input. We may
+ // need to insert a bitcast.
+ BaseEE->setOperand(0, getBaseForInput(InVal, BaseEE));
+ } else if (auto *BaseIE = dyn_cast<InsertElementInst>(State.getBaseValue())) {
+ auto *BdvIE = cast<InsertElementInst>(BDV);
+ auto UpdateOperand = [&](int OperandIdx) {
+ Value *InVal = BdvIE->getOperand(OperandIdx);
+ Value *Base = getBaseForInput(InVal, BaseIE);
+ BaseIE->setOperand(OperandIdx, Base);
+ };
+ UpdateOperand(0); // vector operand
+ UpdateOperand(1); // scalar operand
+ } else {
+ auto *BaseSV = cast<ShuffleVectorInst>(State.getBaseValue());
+ auto *BdvSV = cast<ShuffleVectorInst>(BDV);
+ auto UpdateOperand = [&](int OperandIdx) {
+ Value *InVal = BdvSV->getOperand(OperandIdx);
+ Value *Base = getBaseForInput(InVal, BaseSV);
+ BaseSV->setOperand(OperandIdx, Base);
+ };
+ UpdateOperand(0); // vector operand
+ UpdateOperand(1); // vector operand
+ }
+ }
+
+ // Cache all of our results so we can cheaply reuse them
+ // NOTE: This is actually two caches: one of the base defining value
+ // relation and one of the base pointer relation! FIXME
+ for (auto Pair : States) {
+ auto *BDV = Pair.first;
+ Value *Base = Pair.second.getBaseValue();
+ assert(BDV && Base);
+ assert(!isKnownBaseResult(BDV) && "why did it get added?");
+
+ DEBUG(dbgs() << "Updating base value cache"
+ << " for: " << BDV->getName() << " from: "
+ << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none")
+ << " to: " << Base->getName() << "\n");
+
+ if (Cache.count(BDV)) {
+ assert(isKnownBaseResult(Base) &&
+ "must be something we 'know' is a base pointer");
+ // Once we transition from the BDV relation being stored in the Cache to
+ // the base relation being stored, it must be stable
+ assert((!isKnownBaseResult(Cache[BDV]) || Cache[BDV] == Base) &&
+ "base relation should be stable");
+ }
+ Cache[BDV] = Base;
+ }
+ assert(Cache.count(Def));
+ return Cache[Def];
+}
+
+// For a set of live pointers (base and/or derived), identify the base
+// pointer of the object which they are derived from. This routine will
+// mutate the IR graph as needed to make the 'base' pointer live at the
+// definition site of 'derived'. This ensures that any use of 'derived' can
+// also use 'base'. This may involve the insertion of a number of
+// additional PHI nodes.
+//
+// preconditions: live is a set of pointer type Values
+//
+// side effects: may insert PHI nodes into the existing CFG, will preserve
+// CFG, will not remove or mutate any existing nodes
+//
+// post condition: PointerToBase contains one (derived, base) pair for every
+// pointer in live. Note that derived can be equal to base if the original
+// pointer was a base pointer.
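+//
+// For example (illustrative): if both %base and a %derived getelementptr off
+// of it are live, the map ends up with PointerToBase[%derived] == %base and
+// PointerToBase[%base] == %base.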
+static void
+findBasePointers(const StatepointLiveSetTy &live,
+ MapVector<Value *, Value *> &PointerToBase,
+ DominatorTree *DT, DefiningValueMapTy &DVCache) {
+ for (Value *ptr : live) {
+ Value *base = findBasePointer(ptr, DVCache);
+ assert(base && "failed to find base pointer");
+ PointerToBase[ptr] = base;
+ assert((!isa<Instruction>(base) || !isa<Instruction>(ptr) ||
+ DT->dominates(cast<Instruction>(base)->getParent(),
+ cast<Instruction>(ptr)->getParent())) &&
+ "The base we found better dominate the derived pointer");
+ }
+}
+
+/// Find the required base pointers (and adjust the live set) for the given
+/// parse point.
+static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
+ CallSite CS,
+ PartiallyConstructedSafepointRecord &result) {
+ MapVector<Value *, Value *> PointerToBase;
+ findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache);
+
+ if (PrintBasePointers) {
+ errs() << "Base Pairs (w/o Relocation):\n";
+ for (auto &Pair : PointerToBase) {
+ errs() << " derived ";
+ Pair.first->printAsOperand(errs(), false);
+ errs() << " base ";
+ Pair.second->printAsOperand(errs(), false);
+ errs() << "\n";
+ }
+ }
+
+ result.PointerToBase = PointerToBase;
+}
+
+/// Given an updated version of the dataflow liveness results, update the
+/// liveset and base pointer maps for the call site CS.
+static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
+ CallSite CS,
+ PartiallyConstructedSafepointRecord &result);
+
+static void recomputeLiveInValues(
+ Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate,
+ MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+ // TODO-PERF: reuse the original liveness, then simply run the dataflow
+ // again. The old values are still live and will help it stabilize quickly.
+ GCPtrLivenessData RevisedLivenessData;
+ computeLiveInValues(DT, F, RevisedLivenessData);
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ recomputeLiveInValues(RevisedLivenessData, toUpdate[i], info);
+ }
+}
+
+// When inserting gc.relocate and gc.result calls, we need to ensure there are
+// no uses of the original value / return value between the gc.statepoint and
+// the gc.relocate / gc.result call. One case which can arise is a phi node
+// at the start of one of the successor blocks. We also need to be able to
+// insert the gc.relocates only on the path which goes through the statepoint.
+// We might need to split an edge to make this possible.
+static BasicBlock *
+normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,
+ DominatorTree &DT) {
+ BasicBlock *Ret = BB;
+ if (!BB->getUniquePredecessor())
+ Ret = SplitBlockPredecessors(BB, InvokeParent, "", &DT);
+
+ // Now that 'Ret' has unique predecessor we can safely remove all phi nodes
+ // from it
+ FoldSingleEntryPHINodes(Ret);
+ assert(!isa<PHINode>(Ret->begin()) &&
+ "All PHI nodes should have been removed!");
+
+ // At this point, we can safely insert a gc.relocate or gc.result as the first
+ // instruction in Ret if needed.
+ return Ret;
+}
+
+// Create new attribute set containing only attributes which can be transferred
+// from original call to the safepoint.
+static AttributeList legalizeCallAttributes(AttributeList AL) {
+ if (AL.isEmpty())
+ return AL;
+
+ // Remove the readonly, readnone, and statepoint function attributes.
+ AttrBuilder FnAttrs = AL.getFnAttributes();
+ FnAttrs.removeAttribute(Attribute::ReadNone);
+ FnAttrs.removeAttribute(Attribute::ReadOnly);
+ for (Attribute A : AL.getFnAttributes()) {
+ if (isStatepointDirectiveAttr(A))
+ FnAttrs.remove(A);
+ }
+
+ // Just skip parameter and return attributes for now
+ LLVMContext &Ctx = AL.getContext();
+ return AttributeList::get(Ctx, AttributeList::FunctionIndex,
+ AttributeSet::get(Ctx, FnAttrs));
+}
+
+/// Helper function to place all gc relocates necessary for the given
+/// statepoint.
+/// Inputs:
+/// liveVariables - list of variables to be relocated.
+/// liveStart - index of the first live variable.
+/// basePtrs - base pointers.
+/// statepointToken - statepoint instruction to which relocates should be
+/// bound.
+/// Builder - LLVM IR builder to be used to construct new calls.
+static void CreateGCRelocates(ArrayRef<Value *> LiveVariables,
+ const int LiveStart,
+ ArrayRef<Value *> BasePtrs,
+ Instruction *StatepointToken,
+ IRBuilder<> Builder) {
+ if (LiveVariables.empty())
+ return;
+
+ auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) {
+ auto ValIt = llvm::find(LiveVec, Val);
+ assert(ValIt != LiveVec.end() && "Val not found in LiveVec!");
+ size_t Index = std::distance(LiveVec.begin(), ValIt);
+ assert(Index < LiveVec.size() && "Bug in std::find?");
+ return Index;
+ };
+ Module *M = StatepointToken->getModule();
+
+ // All gc_relocate are generated as i8 addrspace(1)* (or a vector type whose
+ // element type is i8 addrspace(1)*). We originally generated unique
+ // declarations for each pointer type, but this proved problematic because
+ // the intrinsic mangling code is incomplete and fragile. Since we're moving
+ // towards a single unified pointer type anyway, we can just cast everything
+ // to an i8* of the right address space. A bitcast is added later to convert
+ // gc_relocate to the actual value's type.
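+ //
+ // For example (illustrative), a live %T addrspace(1)* value is relocated
+ // roughly as:
+ //   %val.relocated = call coldcc i8 addrspace(1)*
+ //       @llvm.experimental.gc.relocate.p1i8(token %tok, i32 base, i32 derived)
+ // with a bitcast back to %T addrspace(1)* added later where needed.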
+ auto getGCRelocateDecl = [&] (Type *Ty) {
+ assert(isHandledGCPointerType(Ty));
+ auto AS = Ty->getScalarType()->getPointerAddressSpace();
+ Type *NewTy = Type::getInt8PtrTy(M->getContext(), AS);
+ if (auto *VT = dyn_cast<VectorType>(Ty))
+ NewTy = VectorType::get(NewTy, VT->getNumElements());
+ return Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate,
+ {NewTy});
+ };
+
+ // Lazily populated map from input types to the canonicalized form mentioned
+ // in the comment above. This should probably be cached somewhere more
+ // broadly.
+ DenseMap<Type*, Value*> TypeToDeclMap;
+
+ for (unsigned i = 0; i < LiveVariables.size(); i++) {
+ // Generate the gc.relocate call and save the result
+ Value *BaseIdx =
+ Builder.getInt32(LiveStart + FindIndex(LiveVariables, BasePtrs[i]));
+ Value *LiveIdx = Builder.getInt32(LiveStart + i);
+
+ Type *Ty = LiveVariables[i]->getType();
+ if (!TypeToDeclMap.count(Ty))
+ TypeToDeclMap[Ty] = getGCRelocateDecl(Ty);
+ Value *GCRelocateDecl = TypeToDeclMap[Ty];
+
+ // only specify a debug name if we can give a useful one
+ CallInst *Reloc = Builder.CreateCall(
+ GCRelocateDecl, {StatepointToken, BaseIdx, LiveIdx},
+ suffixed_name_or(LiveVariables[i], ".relocated", ""));
+ // Trick CodeGen into thinking there are lots of free registers at this
+ // fake call.
+ Reloc->setCallingConv(CallingConv::Cold);
+ }
+}
+
+namespace {
+
+/// This class is used to defer RAUWs and `eraseFromParent`s. Using this
+/// avoids having to worry about keeping around dangling pointers to Values.
+class DeferredReplacement {
+ AssertingVH<Instruction> Old;
+ AssertingVH<Instruction> New;
+ bool IsDeoptimize = false;
+
+ DeferredReplacement() = default;
+
+public:
+ static DeferredReplacement createRAUW(Instruction *Old, Instruction *New) {
+ assert(Old != New && Old && New &&
+ "Cannot RAUW equal values or to / from null!");
+
+ DeferredReplacement D;
+ D.Old = Old;
+ D.New = New;
+ return D;
+ }
+
+ static DeferredReplacement createDelete(Instruction *ToErase) {
+ DeferredReplacement D;
+ D.Old = ToErase;
+ return D;
+ }
+
+ static DeferredReplacement createDeoptimizeReplacement(Instruction *Old) {
+#ifndef NDEBUG
+ auto *F = cast<CallInst>(Old)->getCalledFunction();
+ assert(F && F->getIntrinsicID() == Intrinsic::experimental_deoptimize &&
+ "Only way to construct a deoptimize deferred replacement");
+#endif
+ DeferredReplacement D;
+ D.Old = Old;
+ D.IsDeoptimize = true;
+ return D;
+ }
+
+ /// Does the task represented by this instance.
+ void doReplacement() {
+ Instruction *OldI = Old;
+ Instruction *NewI = New;
+
+ assert(OldI != NewI && "Disallowed at construction?!");
+ assert((!IsDeoptimize || !New) &&
+ "Deoptimize intrinsics are not replaced!");
+
+ Old = nullptr;
+ New = nullptr;
+
+ if (NewI)
+ OldI->replaceAllUsesWith(NewI);
+
+ if (IsDeoptimize) {
+ // Note: we've inserted instructions, so the call to llvm.deoptimize may
+ // not necessarily be followed by the matching return.
+ auto *RI = cast<ReturnInst>(OldI->getParent()->getTerminator());
+ new UnreachableInst(RI->getContext(), RI);
+ RI->eraseFromParent();
+ }
+
+ OldI->eraseFromParent();
+ }
+};
+
+} // end anonymous namespace
+
+static StringRef getDeoptLowering(CallSite CS) {
+ const char *DeoptLowering = "deopt-lowering";
+ if (CS.hasFnAttr(DeoptLowering)) {
+ // FIXME: CallSite has a *really* confusing interface around attributes
+ // with values.
+ const AttributeList &CSAS = CS.getAttributes();
+ if (CSAS.hasAttribute(AttributeList::FunctionIndex, DeoptLowering))
+ return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering)
+ .getValueAsString();
+ Function *F = CS.getCalledFunction();
+ assert(F && F->hasFnAttribute(DeoptLowering));
+ return F->getFnAttribute(DeoptLowering).getValueAsString();
+ }
+ return "live-through";
+}
+
+static void
+makeStatepointExplicitImpl(const CallSite CS, /* to replace */
+ const SmallVectorImpl<Value *> &BasePtrs,
+ const SmallVectorImpl<Value *> &LiveVariables,
+ PartiallyConstructedSafepointRecord &Result,
+ std::vector<DeferredReplacement> &Replacements) {
+ assert(BasePtrs.size() == LiveVariables.size());
+
+ // Then go ahead and use the builder to actually do the inserts. We insert
+ // immediately before the previous instruction under the assumption that all
+ // arguments will be available here. We can't insert afterwards since we may
+ // be replacing a terminator.
+ Instruction *InsertBefore = CS.getInstruction();
+ IRBuilder<> Builder(InsertBefore);
+
+ ArrayRef<Value *> GCArgs(LiveVariables);
+ uint64_t StatepointID = StatepointDirectives::DefaultStatepointID;
+ uint32_t NumPatchBytes = 0;
+ uint32_t Flags = uint32_t(StatepointFlags::None);
+
+ ArrayRef<Use> CallArgs(CS.arg_begin(), CS.arg_end());
+ ArrayRef<Use> DeoptArgs = GetDeoptBundleOperands(CS);
+ ArrayRef<Use> TransitionArgs;
+ if (auto TransitionBundle =
+ CS.getOperandBundle(LLVMContext::OB_gc_transition)) {
+ Flags |= uint32_t(StatepointFlags::GCTransition);
+ TransitionArgs = TransitionBundle->Inputs;
+ }
+
+ // Instead of lowering calls to @llvm.experimental.deoptimize as normal calls
+ // with a return value, we lower them as never-returning calls to
+ // __llvm_deoptimize that are followed by unreachable to get better codegen.
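+ //
+ // Roughly (illustrative), the tail-call-like pattern
+ //   %r = call i32 (...) @llvm.experimental.deoptimize.i32(...) [ "deopt"(...) ]
+ //   ret i32 %r
+ // becomes a gc.statepoint whose call target is @__llvm_deoptimize, followed
+ // by unreachable (see DeferredReplacement::doReplacement above).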
+ bool IsDeoptimize = false;
+
+ StatepointDirectives SD =
+ parseStatepointDirectivesFromAttrs(CS.getAttributes());
+ if (SD.NumPatchBytes)
+ NumPatchBytes = *SD.NumPatchBytes;
+ if (SD.StatepointID)
+ StatepointID = *SD.StatepointID;
+
+ // Pass through the requested lowering if any. The default is live-through.
+ StringRef DeoptLowering = getDeoptLowering(CS);
+ if (DeoptLowering.equals("live-in"))
+ Flags |= uint32_t(StatepointFlags::DeoptLiveIn);
+ else {
+ assert(DeoptLowering.equals("live-through") && "Unsupported value!");
+ }
+
+ Value *CallTarget = CS.getCalledValue();
+ if (Function *F = dyn_cast<Function>(CallTarget)) {
+ if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize) {
+ // Calls to llvm.experimental.deoptimize are lowered to calls to the
+ // __llvm_deoptimize symbol. We want to resolve this now, since the
+ // verifier does not allow taking the address of an intrinsic function.
+
+ SmallVector<Type *, 8> DomainTy;
+ for (Value *Arg : CallArgs)
+ DomainTy.push_back(Arg->getType());
+ auto *FTy = FunctionType::get(Type::getVoidTy(F->getContext()), DomainTy,
+ /* isVarArg = */ false);
+
+ // Note: CallTarget can be a bitcast instruction of a symbol if there are
+ // calls to @llvm.experimental.deoptimize with different argument types in
+ // the same module. This is fine -- we assume the frontend knew what it
+ // was doing when generating this kind of IR.
+ CallTarget =
+ F->getParent()->getOrInsertFunction("__llvm_deoptimize", FTy);
+
+ IsDeoptimize = true;
+ }
+ }
+
+ // Create the statepoint given all the arguments
+ Instruction *Token = nullptr;
+ if (CS.isCall()) {
+ CallInst *ToReplace = cast<CallInst>(CS.getInstruction());
+ CallInst *Call = Builder.CreateGCStatepointCall(
+ StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs,
+ TransitionArgs, DeoptArgs, GCArgs, "safepoint_token");
+
+ Call->setTailCallKind(ToReplace->getTailCallKind());
+ Call->setCallingConv(ToReplace->getCallingConv());
+
+ // Currently we will fail on parameter attributes and on certain
+ // function attributes. For the attributes we can handle, we set the
+ // function attributes directly on the statepoint now and the return
+ // attributes later on the gc_result intrinsic.
+ Call->setAttributes(legalizeCallAttributes(ToReplace->getAttributes()));
+
+ Token = Call;
+
+ // Put the following gc_result and gc_relocate calls immediately after the
+ // old call (which we're about to delete)
+ assert(ToReplace->getNextNode() && "Not a terminator, must have next!");
+ Builder.SetInsertPoint(ToReplace->getNextNode());
+ Builder.SetCurrentDebugLocation(ToReplace->getNextNode()->getDebugLoc());
+ } else {
+ InvokeInst *ToReplace = cast<InvokeInst>(CS.getInstruction());
+
+ // Insert the new invoke into the old block. We'll remove the old one in a
+ // moment at which point this will become the new terminator for the
+ // original block.
+ InvokeInst *Invoke = Builder.CreateGCStatepointInvoke(
+ StatepointID, NumPatchBytes, CallTarget, ToReplace->getNormalDest(),
+ ToReplace->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs,
+ GCArgs, "statepoint_token");
+
+ Invoke->setCallingConv(ToReplace->getCallingConv());
+
+ // Currently we will fail on parameter attributes and on certain
+ // function attributes. For the attributes we can handle, we set the
+ // function attributes directly on the statepoint now and the return
+ // attributes later on the gc_result intrinsic.
+ Invoke->setAttributes(legalizeCallAttributes(ToReplace->getAttributes()));
+
+ Token = Invoke;
+
+ // Generate gc relocates in exceptional path
+ BasicBlock *UnwindBlock = ToReplace->getUnwindDest();
+ assert(!isa<PHINode>(UnwindBlock->begin()) &&
+ UnwindBlock->getUniquePredecessor() &&
+ "can't safely insert in this block!");
+
+ Builder.SetInsertPoint(&*UnwindBlock->getFirstInsertionPt());
+ Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());
+
+ // Attach exceptional gc relocates to the landingpad.
+ Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst();
+ Result.UnwindToken = ExceptionalToken;
+
+ const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx();
+ CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, ExceptionalToken,
+ Builder);
+
+ // Generate gc relocates and returns for normal block
+ BasicBlock *NormalDest = ToReplace->getNormalDest();
+ assert(!isa<PHINode>(NormalDest->begin()) &&
+ NormalDest->getUniquePredecessor() &&
+ "can't safely insert in this block!");
+
+ Builder.SetInsertPoint(&*NormalDest->getFirstInsertionPt());
+
+ // gc relocates will be generated later as if it were regular call
+ // statepoint
+ }
+ assert(Token && "Should be set in one of the above branches!");
+
+ if (IsDeoptimize) {
+ // If we're wrapping an @llvm.experimental.deoptimize in a statepoint, we
+ // transform the tail-call like structure to a call to a void function
+ // followed by unreachable to get better codegen.
+ Replacements.push_back(
+ DeferredReplacement::createDeoptimizeReplacement(CS.getInstruction()));
+ } else {
+ Token->setName("statepoint_token");
+ if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) {
+ StringRef Name =
+ CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : "";
+ CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name);
+ GCResult->setAttributes(
+ AttributeList::get(GCResult->getContext(), AttributeList::ReturnIndex,
+ CS.getAttributes().getRetAttributes()));
+
+ // We cannot RAUW or delete CS.getInstruction() because it could be in the
+ // live set of some other safepoint, in which case that safepoint's
+ // PartiallyConstructedSafepointRecord will hold a raw pointer to this
+ // llvm::Instruction. Instead, we defer the replacement and deletion to
+ // after the live sets have been made explicit in the IR, and we no longer
+ // have raw pointers to worry about.
+ Replacements.emplace_back(
+ DeferredReplacement::createRAUW(CS.getInstruction(), GCResult));
+ } else {
+ Replacements.emplace_back(
+ DeferredReplacement::createDelete(CS.getInstruction()));
+ }
+ }
+
+ Result.StatepointToken = Token;
+
+ // Second, create a gc.relocate for every live variable
+ const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx();
+ CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, Token, Builder);
+}
+
+// Replace an existing gc.statepoint with a new one and a set of gc.relocates
+// which make the relocations happening at this safepoint explicit.
+//
+// WARNING: Does not do any fixup to adjust users of the original live
+// values. That's the caller's responsibility.
+static void
+makeStatepointExplicit(DominatorTree &DT, CallSite CS,
+ PartiallyConstructedSafepointRecord &Result,
+ std::vector<DeferredReplacement> &Replacements) {
+ const auto &LiveSet = Result.LiveSet;
+ const auto &PointerToBase = Result.PointerToBase;
+
+ // Convert to vector for efficient cross referencing.
+ SmallVector<Value *, 64> BaseVec, LiveVec;
+ LiveVec.reserve(LiveSet.size());
+ BaseVec.reserve(LiveSet.size());
+ for (Value *L : LiveSet) {
+ LiveVec.push_back(L);
+ assert(PointerToBase.count(L));
+ Value *Base = PointerToBase.find(L)->second;
+ BaseVec.push_back(Base);
+ }
+ assert(LiveVec.size() == BaseVec.size());
+
+ // Do the actual rewriting and delete the old statepoint
+ makeStatepointExplicitImpl(CS, BaseVec, LiveVec, Result, Replacements);
+}
+
+// Helper function for relocationViaAlloca.
+//
+// It receives an iterator range over the statepoint gc relocates and emits a
+// store to the assigned location (via AllocaMap) for each one of them. It adds
+// the visited values to the VisitedLiveValues set, which we later use for
+// sanity checking.
+static void
+insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
+ DenseMap<Value *, Value *> &AllocaMap,
+ DenseSet<Value *> &VisitedLiveValues) {
+ for (User *U : GCRelocs) {
+ GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U);
+ if (!Relocate)
+ continue;
+
+ Value *OriginalValue = Relocate->getDerivedPtr();
+ assert(AllocaMap.count(OriginalValue));
+ Value *Alloca = AllocaMap[OriginalValue];
+
+ // Emit store into the related alloca
+ // All gc_relocates are i8 addrspace(1)* typed; the value must be bitcast
+ // to the correct type according to the alloca.
+ assert(Relocate->getNextNode() &&
+ "Should always have one since it's not a terminator");
+ IRBuilder<> Builder(Relocate->getNextNode());
+ Value *CastedRelocatedValue =
+ Builder.CreateBitCast(Relocate,
+ cast<AllocaInst>(Alloca)->getAllocatedType(),
+ suffixed_name_or(Relocate, ".casted", ""));
+
+ StoreInst *Store = new StoreInst(CastedRelocatedValue, Alloca);
+ Store->insertAfter(cast<Instruction>(CastedRelocatedValue));
+
+#ifndef NDEBUG
+ VisitedLiveValues.insert(OriginalValue);
+#endif
+ }
+}
+
+// Helper function for the "relocationViaAlloca". Similar to the
+// "insertRelocationStores" but works for rematerialized values.
+static void insertRematerializationStores(
+ const RematerializedValueMapTy &RematerializedValues,
+ DenseMap<Value *, Value *> &AllocaMap,
+ DenseSet<Value *> &VisitedLiveValues) {
+ for (auto RematerializedValuePair: RematerializedValues) {
+ Instruction *RematerializedValue = RematerializedValuePair.first;
+ Value *OriginalValue = RematerializedValuePair.second;
+
+ assert(AllocaMap.count(OriginalValue) &&
+ "Can not find alloca for rematerialized value");
+ Value *Alloca = AllocaMap[OriginalValue];
+
+ StoreInst *Store = new StoreInst(RematerializedValue, Alloca);
+ Store->insertAfter(RematerializedValue);
+
+#ifndef NDEBUG
+ VisitedLiveValues.insert(OriginalValue);
+#endif
+ }
+}
+
+/// Do all the relocation update via allocas and mem2reg
+static void relocationViaAlloca(
+ Function &F, DominatorTree &DT, ArrayRef<Value *> Live,
+ ArrayRef<PartiallyConstructedSafepointRecord> Records) {
+#ifndef NDEBUG
+ // record initial number of (static) allocas; we'll check we have the same
+ // number when we get done.
+ int InitialAllocaNum = 0;
+ for (Instruction &I : F.getEntryBlock())
+ if (isa<AllocaInst>(I))
+ InitialAllocaNum++;
+#endif
+
+ // TODO-PERF: change data structures, reserve
+ DenseMap<Value *, Value *> AllocaMap;
+ SmallVector<AllocaInst *, 200> PromotableAllocas;
+ // Used later to check that we have enough allocas to store all values
+ std::size_t NumRematerializedValues = 0;
+ PromotableAllocas.reserve(Live.size());
+
+ // Emit alloca for "LiveValue" and record it in "allocaMap" and
+ // "PromotableAllocas"
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ auto emitAllocaFor = [&](Value *LiveValue) {
+ AllocaInst *Alloca = new AllocaInst(LiveValue->getType(),
+ DL.getAllocaAddrSpace(), "",
+ F.getEntryBlock().getFirstNonPHI());
+ AllocaMap[LiveValue] = Alloca;
+ PromotableAllocas.push_back(Alloca);
+ };
+
+ // Emit alloca for each live gc pointer
+ for (Value *V : Live)
+ emitAllocaFor(V);
+
+ // Emit allocas for rematerialized values
+ for (const auto &Info : Records)
+ for (auto RematerializedValuePair : Info.RematerializedValues) {
+ Value *OriginalValue = RematerializedValuePair.second;
+ if (AllocaMap.count(OriginalValue) != 0)
+ continue;
+
+ emitAllocaFor(OriginalValue);
+ ++NumRematerializedValues;
+ }
+
+ // The next two loops are part of the same conceptual operation. We need to
+ // insert a store to the alloca after the original def and at each
+ // redefinition. We need to insert a load before each use. These are split
+ // into distinct loops for performance reasons.
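+ //
+ // Schematically (illustrative), for a live gc pointer %v crossing a
+ // statepoint, the code below produces:
+ //   %v.alloca = alloca ...              ; in the entry block
+ //   store %v, %v.alloca                 ; after the original def of %v
+ //   store %v.relocated, %v.alloca       ; after each statepoint
+ //   %v.reload = load %v.alloca          ; before each use of %v
+ // and mem2reg afterwards folds the allocas back into SSA form.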
+
+ // Update gc pointer after each statepoint: either store a relocated value or
+ // null (if no relocated value was found for this gc pointer and it is not a
+ // gc_result). This must happen before we update the statepoint with a load
+ // of the alloca, otherwise we lose the link between statepoint and old def.
+ for (const auto &Info : Records) {
+ Value *Statepoint = Info.StatepointToken;
+
+ // This will be used for consistency check
+ DenseSet<Value *> VisitedLiveValues;
+
+ // Insert stores for normal statepoint gc relocates
+ insertRelocationStores(Statepoint->users(), AllocaMap, VisitedLiveValues);
+
+ // If it was an invoke statepoint, we also insert stores for the
+ // exceptional path gc relocates.
+ if (isa<InvokeInst>(Statepoint)) {
+ insertRelocationStores(Info.UnwindToken->users(), AllocaMap,
+ VisitedLiveValues);
+ }
+
+ // Do the same for rematerialized values
+ insertRematerializationStores(Info.RematerializedValues, AllocaMap,
+ VisitedLiveValues);
+
+ if (ClobberNonLive) {
+ // As a debugging aid, pretend that an unrelocated pointer becomes null at
+ // the gc.statepoint. This will turn some subtle GC problems into
+ // slightly easier to debug SEGVs. Note that on large IR files with
+ // lots of gc.statepoints this is extremely costly in both memory and time.
+ SmallVector<AllocaInst *, 64> ToClobber;
+ for (auto Pair : AllocaMap) {
+ Value *Def = Pair.first;
+ AllocaInst *Alloca = cast<AllocaInst>(Pair.second);
+
+ // This value was relocated
+ if (VisitedLiveValues.count(Def)) {
+ continue;
+ }
+ ToClobber.push_back(Alloca);
+ }
+
+ auto InsertClobbersAt = [&](Instruction *IP) {
+ for (auto *AI : ToClobber) {
+ auto PT = cast<PointerType>(AI->getAllocatedType());
+ Constant *CPN = ConstantPointerNull::get(PT);
+ StoreInst *Store = new StoreInst(CPN, AI);
+ Store->insertBefore(IP);
+ }
+ };
+
+ // Insert the clobbering stores. These may get intermixed with the
+ // gc.results and gc.relocates, but that's fine.
+ if (auto II = dyn_cast<InvokeInst>(Statepoint)) {
+ InsertClobbersAt(&*II->getNormalDest()->getFirstInsertionPt());
+ InsertClobbersAt(&*II->getUnwindDest()->getFirstInsertionPt());
+ } else {
+ InsertClobbersAt(cast<Instruction>(Statepoint)->getNextNode());
+ }
+ }
+ }
+
+ // Update uses with loads from the allocas and add a store for each original def.
+ for (auto Pair : AllocaMap) {
+ Value *Def = Pair.first;
+ Value *Alloca = Pair.second;
+
+ // We pre-record the uses of allocas so that we don't have to worry about
+ // later updates that change the user information.
+
+ SmallVector<Instruction *, 20> Uses;
+ // PERF: trade a linear scan for repeated reallocation
+ Uses.reserve(std::distance(Def->user_begin(), Def->user_end()));
+ for (User *U : Def->users()) {
+ if (!isa<ConstantExpr>(U)) {
+ // If the def has a ConstantExpr use, then the def is either a
+ // ConstantExpr use itself or null. In either case
+ // (recursively in the first, directly in the second), the oop
+ // it is ultimately dependent on is null and this particular
+ // use does not need to be fixed up.
+ Uses.push_back(cast<Instruction>(U));
+ }
+ }
+
+ std::sort(Uses.begin(), Uses.end());
+ auto Last = std::unique(Uses.begin(), Uses.end());
+ Uses.erase(Last, Uses.end());
+
+ for (Instruction *Use : Uses) {
+ if (isa<PHINode>(Use)) {
+ PHINode *Phi = cast<PHINode>(Use);
+ for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++) {
+ if (Def == Phi->getIncomingValue(i)) {
+ LoadInst *Load = new LoadInst(
+ Alloca, "", Phi->getIncomingBlock(i)->getTerminator());
+ Phi->setIncomingValue(i, Load);
+ }
+ }
+ } else {
+ LoadInst *Load = new LoadInst(Alloca, "", Use);
+ Use->replaceUsesOfWith(Def, Load);
+ }
+ }
+
+ // Emit store for the initial gc value. Store must be inserted after load,
+ // otherwise store will be in alloca's use list and an extra load will be
+ // inserted before it.
+ StoreInst *Store = new StoreInst(Def, Alloca);
+ if (Instruction *Inst = dyn_cast<Instruction>(Def)) {
+ if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) {
+ // InvokeInst is a TerminatorInst so the store needs to be inserted
+ // into its normal destination block.
+ BasicBlock *NormalDest = Invoke->getNormalDest();
+ Store->insertBefore(NormalDest->getFirstNonPHI());
+ } else {
+ assert(!Inst->isTerminator() &&
+ "The only TerminatorInst that can produce a value is "
+ "InvokeInst which is handled above.");
+ Store->insertAfter(Inst);
+ }
+ } else {
+ assert(isa<Argument>(Def));
+ Store->insertAfter(cast<Instruction>(Alloca));
+ }
+ }
+
+ assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues &&
+ "we must have the same allocas with lives");
+ if (!PromotableAllocas.empty()) {
+ // Apply mem2reg to promote alloca to SSA
+ PromoteMemToReg(PromotableAllocas, DT);
+ }
+
+#ifndef NDEBUG
+ for (auto &I : F.getEntryBlock())
+ if (isa<AllocaInst>(I))
+ InitialAllocaNum--;
+ assert(InitialAllocaNum == 0 && "We must not introduce any extra allocas");
+#endif
+}
+
+/// Implement a unique function which doesn't require us to sort the input
+/// vector. Sorting would change the output of a couple of tests in ways
+/// which make them less useful for testing fused safepoints.
+template <typename T> static void unique_unsorted(SmallVectorImpl<T> &Vec) {
+ SmallSet<T, 8> Seen;
+ Vec.erase(remove_if(Vec, [&](const T &V) { return !Seen.insert(V).second; }),
+ Vec.end());
+}
+
+/// Insert holders so that each Value is obviously live through the entire
+/// lifetime of the call.
+static void insertUseHolderAfter(CallSite &CS, const ArrayRef<Value *> Values,
+ SmallVectorImpl<CallInst *> &Holders) {
+ if (Values.empty())
+ // No values to hold live, might as well not insert the empty holder
+ return;
+
+ Module *M = CS.getInstruction()->getModule();
+ // Use a dummy vararg function to actually hold the values live
+ Function *Func = cast<Function>(M->getOrInsertFunction(
+ "__tmp_use", FunctionType::get(Type::getVoidTy(M->getContext()), true)));
+ if (CS.isCall()) {
+ // For call safepoints insert dummy calls right after safepoint
+ Holders.push_back(CallInst::Create(Func, Values, "",
+ &*++CS.getInstruction()->getIterator()));
+ return;
+ }
+ // For invoke safepoints insert dummy calls in both the normal and
+ // exceptional destination blocks
+ auto *II = cast<InvokeInst>(CS.getInstruction());
+ Holders.push_back(CallInst::Create(
+ Func, Values, "", &*II->getNormalDest()->getFirstInsertionPt()));
+ Holders.push_back(CallInst::Create(
+ Func, Values, "", &*II->getUnwindDest()->getFirstInsertionPt()));
+}
+
+static void findLiveReferences(
+ Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate,
+ MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+ GCPtrLivenessData OriginalLivenessData;
+ computeLiveInValues(DT, F, OriginalLivenessData);
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ analyzeParsePointLiveness(DT, OriginalLivenessData, toUpdate[i], info);
+ }
+}
+
+// Helper function for "rematerializeLiveValues". It walks the use chain
+// starting from "CurrentValue" until it reaches the root of the chain, i.e.
+// the base or a value it cannot process. Only "simple" values are processed
+// (currently GEPs and no-op casts). The returned root is examined by the
+// callers of findRematerializableChainToBasePointer. Fills the "ChainToBase"
+// array with all visited values.
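+//
+// For example (illustrative): for %d = bitcast(%g) where %g is a getelementptr
+// off of %b, ChainToBase ends up as [bitcast, gep] and the returned root is %b.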
+static Value* findRematerializableChainToBasePointer(
+ SmallVectorImpl<Instruction*> &ChainToBase,
+ Value *CurrentValue) {
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurrentValue)) {
+ ChainToBase.push_back(GEP);
+ return findRematerializableChainToBasePointer(ChainToBase,
+ GEP->getPointerOperand());
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(CurrentValue)) {
+ if (!CI->isNoopCast(CI->getModule()->getDataLayout()))
+ return CI;
+
+ ChainToBase.push_back(CI);
+ return findRematerializableChainToBasePointer(ChainToBase,
+ CI->getOperand(0));
+ }
+
+ // We have reached the root of the chain, which is either equal to the base or
+ // is the first unsupported value along the use chain.
+ return CurrentValue;
+}
+
+// Helper function for the "rematerializeLiveValues". Compute cost of the use
+// chain we are going to rematerialize.
+static unsigned
+chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
+ TargetTransformInfo &TTI) {
+ unsigned Cost = 0;
+
+ for (Instruction *Instr : Chain) {
+ if (CastInst *CI = dyn_cast<CastInst>(Instr)) {
+ assert(CI->isNoopCast(CI->getModule()->getDataLayout()) &&
+ "non noop cast is found during rematerialization");
+
+ Type *SrcTy = CI->getOperand(0)->getType();
+ Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy, CI);
+
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
+ // Cost of the address calculation
+ Type *ValTy = GEP->getSourceElementType();
+ Cost += TTI.getAddressComputationCost(ValTy);
+
+ // And cost of the GEP itself
+ // TODO: Use TTI->getGEPCost here (it exists, but does not appear to be
+ // allowed for external usage)
+ if (!GEP->hasAllConstantIndices())
+ Cost += 2;
+
+ } else {
+ llvm_unreachable("unsupported instruction type during rematerialization");
+ }
+ }
+
+ return Cost;
+}
+
+static bool AreEquivalentPhiNodes(PHINode &OrigRootPhi, PHINode &AlternateRootPhi) {
+ unsigned PhiNum = OrigRootPhi.getNumIncomingValues();
+ if (PhiNum != AlternateRootPhi.getNumIncomingValues() ||
+ OrigRootPhi.getParent() != AlternateRootPhi.getParent())
+ return false;
+ // Map of incoming values and their corresponding basic blocks of
+ // OrigRootPhi.
+ SmallDenseMap<Value *, BasicBlock *, 8> CurrentIncomingValues;
+ for (unsigned i = 0; i < PhiNum; i++)
+ CurrentIncomingValues[OrigRootPhi.getIncomingValue(i)] =
+ OrigRootPhi.getIncomingBlock(i);
+
+ // Both current and base PHIs should have the same incoming values and
+ // the same basic blocks corresponding to the incoming values.
+ for (unsigned i = 0; i < PhiNum; i++) {
+ auto CIVI =
+ CurrentIncomingValues.find(AlternateRootPhi.getIncomingValue(i));
+ if (CIVI == CurrentIncomingValues.end())
+ return false;
+ BasicBlock *CurrentIncomingBB = CIVI->second;
+ if (CurrentIncomingBB != AlternateRootPhi.getIncomingBlock(i))
+ return false;
+ }
+ return true;
+}
+
+// From the statepoint live set pick values that are cheaper to recompute than
+// to relocate. Remove these values from the live set, rematerialize them after
+// the statepoint and record them in the "Info" structure. Note that, similar
+// to relocated values, we don't do any user adjustments here.
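+// A typical candidate (illustrative) is a derived pointer such as
+//   %d = getelementptr %base, i64 8
+// that is live across the statepoint: rather than relocating %d, we re-create
+// the GEP from the (relocated) base after the statepoint.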
+static void rematerializeLiveValues(CallSite CS,
+ PartiallyConstructedSafepointRecord &Info,
+ TargetTransformInfo &TTI) {
+ const unsigned int ChainLengthThreshold = 10;
+
+ // Record values we are going to delete from this statepoint live set.
+ // We cannot do this in the following loop due to iterator invalidation.
+ SmallVector<Value *, 32> LiveValuesToBeDeleted;
+
+ for (Value *LiveValue: Info.LiveSet) {
+ // For each live pointer find its defining chain
+ SmallVector<Instruction *, 3> ChainToBase;
+ assert(Info.PointerToBase.count(LiveValue));
+ Value *RootOfChain =
+ findRematerializableChainToBasePointer(ChainToBase,
+ LiveValue);
+
+ // Nothing to do, or chain is too long
+ if (ChainToBase.size() == 0 ||
+ ChainToBase.size() > ChainLengthThreshold)
+ continue;
+
+ // Handle the scenario where the RootOfChain is not equal to the
+ // Base Value, but they are essentially the same phi values.
+ if (RootOfChain != Info.PointerToBase[LiveValue]) {
+ PHINode *OrigRootPhi = dyn_cast<PHINode>(RootOfChain);
+ PHINode *AlternateRootPhi = dyn_cast<PHINode>(Info.PointerToBase[LiveValue]);
+ if (!OrigRootPhi || !AlternateRootPhi)
+ continue;
+ // PHI nodes that have the same incoming values and belong to the same
+ // basic block are essentially the same SSA value. When the original phi
+ // has incoming values with different base pointers, the original phi is
+ // marked as conflict, and an additional `AlternateRootPhi` with the same
+ // incoming values gets generated by the findBasePointer function. We need
+ // to identify that the newly generated AlternateRootPhi (the .base version
+ // of the phi) and RootOfChain (the original phi node itself) are the same,
+ // so that we can rematerialize the gep and casts. This is a workaround for
+ // a deficiency in the findBasePointer algorithm.
+ if (!AreEquivalentPhiNodes(*OrigRootPhi, *AlternateRootPhi))
+ continue;
+ // Now that the phi nodes are proved to be the same, assert that
+ // findBasePointer's newly generated AlternateRootPhi is present in the
+ // liveset of the call.
+ assert(Info.LiveSet.count(AlternateRootPhi));
+ }
+ // Compute cost of this chain
+ unsigned Cost = chainToBasePointerCost(ChainToBase, TTI);
+ // TODO: We can also account for cases when we will be able to remove some
+ //       of the rematerialized values by later optimization passes, e.g. if
+ //       we rematerialized several intersecting chains, or if the original
+ //       values don't have any uses besides this statepoint.
+
+ // For invokes we need to rematerialize each chain twice - for normal and
+ // for unwind basic blocks. Model this by multiplying cost by two.
+ if (CS.isInvoke()) {
+ Cost *= 2;
+ }
+ // If it's too expensive - skip it
+ if (Cost >= RematerializationThreshold)
+ continue;
+
+ // Remove value from the live set
+ LiveValuesToBeDeleted.push_back(LiveValue);
+
+ // Clone instructions and record them inside "Info" structure
+
+ // Walk backwards to visit top-most instructions first
+ std::reverse(ChainToBase.begin(), ChainToBase.end());
+
+ // Utility function which clones all instructions from "ChainToBase"
+ // and inserts them before "InsertBefore". Returns rematerialized value
+ // which should be used after statepoint.
+ auto rematerializeChain = [&ChainToBase](
+ Instruction *InsertBefore, Value *RootOfChain, Value *AlternateLiveBase) {
+ Instruction *LastClonedValue = nullptr;
+ Instruction *LastValue = nullptr;
+ for (Instruction *Instr: ChainToBase) {
+ // Only GEP's and casts are supported as we need to be careful to not
+ // introduce any new uses of pointers not in the liveset.
+ // Note that it's fine to introduce new uses of pointers which were
+ // otherwise not used after this statepoint.
+ assert(isa<GetElementPtrInst>(Instr) || isa<CastInst>(Instr));
+
+ Instruction *ClonedValue = Instr->clone();
+ ClonedValue->insertBefore(InsertBefore);
+ ClonedValue->setName(Instr->getName() + ".remat");
+
+ // If it is not the first instruction in the chain then it uses the
+ // previously cloned value. We should update it to use the cloned value.
+ if (LastClonedValue) {
+ assert(LastValue);
+ ClonedValue->replaceUsesOfWith(LastValue, LastClonedValue);
+#ifndef NDEBUG
+ for (auto OpValue : ClonedValue->operand_values()) {
+ // Assert that cloned instruction does not use any instructions from
+ // this chain other than LastClonedValue
+ assert(!is_contained(ChainToBase, OpValue) &&
+ "incorrect use in rematerialization chain");
+ // Assert that the cloned instruction does not use the RootOfChain
+ // or the AlternateLiveBase.
+ assert(OpValue != RootOfChain && OpValue != AlternateLiveBase);
+ }
+#endif
+ } else {
+ // For the first instruction, replace the use of the unrelocated base, i.e.
+ // RootOfChain/OrigRootPhi, with the corresponding PHI present in the
+ // live set. They have been proved to be the same PHI nodes. Note
+ // that the *only* use of the RootOfChain in the ChainToBase list is
+ // the first Value in the list.
+ if (RootOfChain != AlternateLiveBase)
+ ClonedValue->replaceUsesOfWith(RootOfChain, AlternateLiveBase);
+ }
+
+ LastClonedValue = ClonedValue;
+ LastValue = Instr;
+ }
+ assert(LastClonedValue);
+ return LastClonedValue;
+ };
+
+ // Different cases for calls and invokes. For invokes we need to clone
+ // instructions both on normal and unwind path.
+ if (CS.isCall()) {
+ Instruction *InsertBefore = CS.getInstruction()->getNextNode();
+ assert(InsertBefore);
+ Instruction *RematerializedValue = rematerializeChain(
+ InsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
+ Info.RematerializedValues[RematerializedValue] = LiveValue;
+ } else {
+ InvokeInst *Invoke = cast<InvokeInst>(CS.getInstruction());
+
+ Instruction *NormalInsertBefore =
+ &*Invoke->getNormalDest()->getFirstInsertionPt();
+ Instruction *UnwindInsertBefore =
+ &*Invoke->getUnwindDest()->getFirstInsertionPt();
+
+ Instruction *NormalRematerializedValue = rematerializeChain(
+ NormalInsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
+ Instruction *UnwindRematerializedValue = rematerializeChain(
+ UnwindInsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
+
+ Info.RematerializedValues[NormalRematerializedValue] = LiveValue;
+ Info.RematerializedValues[UnwindRematerializedValue] = LiveValue;
+ }
+ }
+
+ // Remove rematerialized values from the live set
+ for (auto LiveValue: LiveValuesToBeDeleted) {
+ Info.LiveSet.remove(LiveValue);
+ }
+}
+
+static bool insertParsePoints(Function &F, DominatorTree &DT,
+ TargetTransformInfo &TTI,
+ SmallVectorImpl<CallSite> &ToUpdate) {
+#ifndef NDEBUG
+ // sanity check the input
+ std::set<CallSite> Uniqued;
+ Uniqued.insert(ToUpdate.begin(), ToUpdate.end());
+ assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!");
+
+ for (CallSite CS : ToUpdate)
+ assert(CS.getInstruction()->getFunction() == &F);
+#endif
+
+ // When inserting gc.relocates for invokes, we need to be able to insert at
+ // the top of the successor blocks. See the comment on
+ // normalizeForInvokeSafepoint for exactly what is needed. Note that this step
+ // may restructure the CFG.
+ for (CallSite CS : ToUpdate) {
+ if (!CS.isInvoke())
+ continue;
+ auto *II = cast<InvokeInst>(CS.getInstruction());
+ normalizeForInvokeSafepoint(II->getNormalDest(), II->getParent(), DT);
+ normalizeForInvokeSafepoint(II->getUnwindDest(), II->getParent(), DT);
+ }
+
+ // A list of dummy calls added to the IR to keep various values obviously
+ // live in the IR. We'll remove all of these when done.
+ SmallVector<CallInst *, 64> Holders;
+
+ // Insert a dummy call with all of the deopt operands we'll need for the
+ // actual safepoint insertion as arguments. This ensures reference operands
+ // in the deopt argument list are considered live through the safepoint (and
+ // thus makes sure they get relocated.)
+ for (CallSite CS : ToUpdate) {
+ SmallVector<Value *, 64> DeoptValues;
+
+ for (Value *Arg : GetDeoptBundleOperands(CS)) {
+ assert(!isUnhandledGCPointerType(Arg->getType()) &&
+ "support for FCA unimplemented");
+ if (isHandledGCPointerType(Arg->getType()))
+ DeoptValues.push_back(Arg);
+ }
+
+ insertUseHolderAfter(CS, DeoptValues, Holders);
+ }
+
+ SmallVector<PartiallyConstructedSafepointRecord, 64> Records(ToUpdate.size());
+
+ // A) Identify all gc pointers which are statically live at the given call
+ // site.
+ findLiveReferences(F, DT, ToUpdate, Records);
+
+ // B) Find the base pointers for each live pointer
+ /* scope for caching */ {
+ // Cache the 'defining value' relation used in the computation and
+ // insertion of base phis and selects. This ensures that we don't insert
+ // large numbers of duplicate base_phis.
+ DefiningValueMapTy DVCache;
+
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &info = Records[i];
+ findBasePointers(DT, DVCache, ToUpdate[i], info);
+ }
+ } // end of cache scope
+
+ // The base phi insertion logic (for any safepoint) may have inserted new
+ // instructions which are now live at some safepoint. The simplest such
+ // example is:
+ // loop:
+ // phi a <-- will be a new base_phi here
+ // safepoint 1 <-- that needs to be live here
+ // gep a + 1
+ // safepoint 2
+ // br loop
+ // We insert some dummy calls after each safepoint to definitely hold live
+ // the base pointers which were identified for that safepoint. We'll then
+ // ask liveness for _every_ base inserted to see what is now live. Then we
+ // remove the dummy calls.
+ Holders.reserve(Holders.size() + Records.size());
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &Info = Records[i];
+
+ SmallVector<Value *, 128> Bases;
+ for (auto Pair : Info.PointerToBase)
+ Bases.push_back(Pair.second);
+
+ insertUseHolderAfter(ToUpdate[i], Bases, Holders);
+ }
+
+ // By selecting base pointers, we've effectively inserted new uses. Thus, we
+ // need to rerun liveness. We may *also* have inserted new defs, but that's
+ // not the key issue.
+ recomputeLiveInValues(F, DT, ToUpdate, Records);
+
+ if (PrintBasePointers) {
+ for (auto &Info : Records) {
+ errs() << "Base Pairs: (w/Relocation)\n";
+ for (auto Pair : Info.PointerToBase) {
+ errs() << " derived ";
+ Pair.first->printAsOperand(errs(), false);
+ errs() << " base ";
+ Pair.second->printAsOperand(errs(), false);
+ errs() << "\n";
+ }
+ }
+ }
+
+ // It is possible that non-constant live variables have a constant base. For
+ // example, a GEP with a variable offset from a global. In this case we can
+ // remove it from the liveset. We already don't add constants to the liveset
+ // because we assume they won't move at runtime and the GC doesn't need to be
+ // informed about them. The same reasoning applies if the base is constant.
+ // Note that the relocation placement code relies on this filtering for
+ // correctness as it expects the base to be in the liveset, which isn't true
+ // if the base is constant.
+ for (auto &Info : Records)
+ for (auto &BasePair : Info.PointerToBase)
+ if (isa<Constant>(BasePair.second))
+ Info.LiveSet.remove(BasePair.first);
+
+ for (CallInst *CI : Holders)
+ CI->eraseFromParent();
+
+ Holders.clear();
+
+ // In order to reduce the live set of a statepoint we might choose to
+ // rematerialize some values instead of relocating them. This is purely an
+ // optimization and does not influence correctness.
+ for (size_t i = 0; i < Records.size(); i++)
+ rematerializeLiveValues(ToUpdate[i], Records[i], TTI);
+
+ // We need this to safely RAUW and delete call or invoke return values that
+ // may themselves be live over a statepoint. For details, please see usage in
+ // makeStatepointExplicitImpl.
+ std::vector<DeferredReplacement> Replacements;
+
+ // Now run through and replace the existing statepoints with new ones with
+ // the live variables listed. We do not yet update uses of the values being
+ // relocated. We have references to live variables that need to
+ // survive to the last iteration of this loop. (By construction, the
+ // previous statepoint cannot be a live variable, thus we can and do remove
+ // the old statepoint calls as we go.)
+ for (size_t i = 0; i < Records.size(); i++)
+ makeStatepointExplicit(DT, ToUpdate[i], Records[i], Replacements);
+
+ ToUpdate.clear(); // prevent accidental use of invalid CallSites
+
+ for (auto &PR : Replacements)
+ PR.doReplacement();
+
+ Replacements.clear();
+
+ for (auto &Info : Records) {
+ // These live sets may contain stale Value pointers, since we replaced calls
+ // with operand bundles with calls wrapped in gc.statepoint, and some of
+ // those calls may have been def'ing live gc pointers. Clear these out to
+ // avoid accidentally using them.
+ //
+ // TODO: We should create a separate data structure that does not contain
+ // these live sets, and migrate to using that data structure from this point
+ // onward.
+ Info.LiveSet.clear();
+ Info.PointerToBase.clear();
+ }
+
+ // Do all the fixups of the original live variables to their relocated selves
+ SmallVector<Value *, 128> Live;
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &Info = Records[i];
+
+ // We can't simply save the live set from the original insertion. One of
+ // the live values might be the result of a call which needs a safepoint.
+ // That Value* no longer exists and we need to use the new gc_result.
+ // Thankfully, the live set is embedded in the statepoint (and updated), so
+ // we just grab that.
+ Statepoint Statepoint(Info.StatepointToken);
+ Live.insert(Live.end(), Statepoint.gc_args_begin(),
+ Statepoint.gc_args_end());
+#ifndef NDEBUG
+ // Do some basic sanity checks on our liveness results before performing
+ // relocation. Relocation can and will turn mistakes in liveness results
+ // into nonsensical code which is much harder to debug.
+ // TODO: It would be nice to test consistency as well
+ assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) &&
+ "statepoint must be reachable or liveness is meaningless");
+ for (Value *V : Statepoint.gc_args()) {
+ if (!isa<Instruction>(V))
+ // Non-instruction values trivially dominate all possible uses
+ continue;
+ auto *LiveInst = cast<Instruction>(V);
+ assert(DT.isReachableFromEntry(LiveInst->getParent()) &&
+ "unreachable values should never be live");
+ assert(DT.dominates(LiveInst, Info.StatepointToken) &&
+ "basic SSA liveness expectation violated by liveness analysis");
+ }
+#endif
+ }
+ unique_unsorted(Live);
+
+#ifndef NDEBUG
+ // sanity check
+ for (auto *Ptr : Live)
+ assert(isHandledGCPointerType(Ptr->getType()) &&
+ "must be a gc pointer type");
+#endif
+
+ relocationViaAlloca(F, DT, Live, Records);
+ return !Records.empty();
+}
+
+// Handles both return values and arguments for Functions and CallSites.
+template <typename AttrHolder>
+static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
+ unsigned Index) {
+ AttrBuilder R;
+ if (AH.getDereferenceableBytes(Index))
+ R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable,
+ AH.getDereferenceableBytes(Index)));
+ if (AH.getDereferenceableOrNullBytes(Index))
+ R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull,
+ AH.getDereferenceableOrNullBytes(Index)));
+ if (AH.getAttributes().hasAttribute(Index, Attribute::NoAlias))
+ R.addAttribute(Attribute::NoAlias);
+
+ if (!R.empty())
+ AH.setAttributes(AH.getAttributes().removeAttributes(Ctx, Index, R));
+}
+
+static void stripNonValidAttributesFromPrototype(Function &F) {
+ LLVMContext &Ctx = F.getContext();
+
+ for (Argument &A : F.args())
+ if (isa<PointerType>(A.getType()))
+ RemoveNonValidAttrAtIndex(Ctx, F,
+ A.getArgNo() + AttributeList::FirstArgIndex);
+
+ if (isa<PointerType>(F.getReturnType()))
+ RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex);
+}
+
+/// Certain metadata on instructions are invalid after running RS4GC.
+/// Optimizations that run after RS4GC can incorrectly use this metadata to
+/// optimize functions. We drop such metadata on the instruction.
+static void stripInvalidMetadataFromInstruction(Instruction &I) {
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+ return;
+ // These are the metadata kinds that are still valid on loads and stores
+ // after RS4GC.
+ // The metadata implying dereferenceability and noalias are (conservatively)
+ // dropped. This is because semantically, after RewriteStatepointsForGC runs,
+ // all calls to gc.statepoint "free" the entire heap. Also, gc.statepoint can
+ // touch the entire heap including noalias objects. Note: The reasoning is the
+ // same as for stripping the dereferenceability and noalias attributes that
+ // are analogous to the metadata counterparts.
+ // We also drop the invariant.load metadata on the load because that metadata
+ // implies the address operand to the load points to memory that is never
+ // changed once it became dereferenceable. This is no longer true after RS4GC.
+ // Similar reasoning applies to invariant.group metadata, which applies to
+ // loads within a group.
+ unsigned ValidMetadataAfterRS4GC[] = {LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_nontemporal,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_align,
+ LLVMContext::MD_type};
+
+ // Drops all metadata on the instruction other than ValidMetadataAfterRS4GC.
+ I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC);
+}
+
+static void stripNonValidDataFromBody(Function &F) {
+ if (F.empty())
+ return;
+
+ LLVMContext &Ctx = F.getContext();
+ MDBuilder Builder(Ctx);
+
+ // Set of invariant.start instructions that we need to remove.
+ // Use this to avoid invalidating the instruction iterator.
+ SmallVector<IntrinsicInst*, 12> InvariantStartInstructions;
+
+ for (Instruction &I : instructions(F)) {
+ // invariant.start on a memory location implies that the referenced memory
+ // location is constant and unchanging. This is no longer true after
+ // RewriteStatepointsForGC runs because there can be calls to gc.statepoint
+ // which frees the entire heap and the presence of invariant.start allows
+ // the optimizer to sink the load of a memory location past a statepoint,
+ // which is incorrect.
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::invariant_start) {
+ InvariantStartInstructions.push_back(II);
+ continue;
+ }
+
+ if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
+ assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
+ bool IsImmutableTBAA =
+ MD->getNumOperands() == 4 &&
+ mdconst::extract<ConstantInt>(MD->getOperand(3))->getValue() == 1;
+
+ if (!IsImmutableTBAA)
+ continue; // no work to do, MD_tbaa is already marked mutable
+
+ MDNode *Base = cast<MDNode>(MD->getOperand(0));
+ MDNode *Access = cast<MDNode>(MD->getOperand(1));
+ uint64_t Offset =
+ mdconst::extract<ConstantInt>(MD->getOperand(2))->getZExtValue();
+
+ MDNode *MutableTBAA =
+ Builder.createTBAAStructTagNode(Base, Access, Offset);
+ I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA);
+ }
+
+ stripInvalidMetadataFromInstruction(I);
+
+ if (CallSite CS = CallSite(&I)) {
+ for (int i = 0, e = CS.arg_size(); i != e; i++)
+ if (isa<PointerType>(CS.getArgument(i)->getType()))
+ RemoveNonValidAttrAtIndex(Ctx, CS, i + AttributeList::FirstArgIndex);
+ if (isa<PointerType>(CS.getType()))
+ RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex);
+ }
+ }
+
+ // Delete the invariant.start instructions and RAUW undef.
+ for (auto *II : InvariantStartInstructions) {
+ II->replaceAllUsesWith(UndefValue::get(II->getType()));
+ II->eraseFromParent();
+ }
+}
+
+/// Returns true if this function should be rewritten by this pass. The main
+/// point of this function is as an extension point for custom logic.
+static bool shouldRewriteStatepointsIn(Function &F) {
+ // TODO: This should check the GCStrategy
+ if (F.hasGC()) {
+ const auto &FunctionGCName = F.getGC();
+ const StringRef StatepointExampleName("statepoint-example");
+ const StringRef CoreCLRName("coreclr");
+ return (StatepointExampleName == FunctionGCName) ||
+ (CoreCLRName == FunctionGCName);
+ } else
+ return false;
+}
+
+static void stripNonValidData(Module &M) {
+#ifndef NDEBUG
+ assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!");
+#endif
+
+ for (Function &F : M)
+ stripNonValidAttributesFromPrototype(F);
+
+ for (Function &F : M)
+ stripNonValidDataFromBody(F);
+}
+
+bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
+ TargetTransformInfo &TTI,
+ const TargetLibraryInfo &TLI) {
+ assert(!F.isDeclaration() && !F.empty() &&
+ "need function body to rewrite statepoints in");
+ assert(shouldRewriteStatepointsIn(F) && "mismatch in rewrite decision");
+
+ auto NeedsRewrite = [&TLI](Instruction &I) {
+ if (ImmutableCallSite CS = ImmutableCallSite(&I))
+ return !callsGCLeafFunction(CS, TLI) && !isStatepoint(CS);
+ return false;
+ };
+
+ // Gather all the statepoints which need to be rewritten. Be careful to only
+ // consider those in reachable code since we need to ask dominance queries
+ // when rewriting. We'll delete the unreachable ones in a moment.
+ SmallVector<CallSite, 64> ParsePointNeeded;
+ bool HasUnreachableStatepoint = false;
+ for (Instruction &I : instructions(F)) {
+ // TODO: only the ones with the flag set!
+ if (NeedsRewrite(I)) {
+ if (DT.isReachableFromEntry(I.getParent()))
+ ParsePointNeeded.push_back(CallSite(&I));
+ else
+ HasUnreachableStatepoint = true;
+ }
+ }
+
+ bool MadeChange = false;
+
+ // Delete any unreachable statepoints so that we don't have unrewritten
+ // statepoints surviving this pass. This makes testing easier and the
+ // resulting IR less confusing to human readers. Rather than be fancy, we
+ // just reuse a utility function which removes the unreachable blocks.
+ if (HasUnreachableStatepoint)
+ MadeChange |= removeUnreachableBlocks(F);
+
+ // Return early if no work to do.
+ if (ParsePointNeeded.empty())
+ return MadeChange;
+
+ // As a prepass, go ahead and aggressively destroy single-entry phi nodes.
+ // These are created by LCSSA. They have the effect of increasing the size
+ // of liveness sets for no good reason. It may be harder to do this
+ // post-insertion since relocations and base phis can confuse things.
+ for (BasicBlock &BB : F)
+ if (BB.getUniquePredecessor()) {
+ MadeChange = true;
+ FoldSingleEntryPHINodes(&BB);
+ }
+
+ // Before we start introducing relocations, we want to tweak the IR a bit to
+ // avoid unfortunate code generation effects. The main example is that we
+ // want to try to make sure the comparison feeding a branch is after any
+ // safepoints. Otherwise, we end up with a comparison of pre-relocation
+ // values feeding a branch after relocation. This is semantically correct,
+ // but results in extra register pressure since both the pre-relocation and
+ // post-relocation copies must be available in registers. For code without
+ // relocations this is handled elsewhere, but teaching the scheduler to
+ // reverse the transform we're about to do would be slightly complex.
+ // Note: This may extend the live range of the inputs to the icmp and thus
+ // increase the liveset of any statepoint we move over. This is profitable
+ // as long as all statepoints are in rare blocks. If we had in-register
+ // lowering for live values this would be a much safer transform.
+ auto getConditionInst = [](TerminatorInst *TI) -> Instruction* {
+ if (auto *BI = dyn_cast<BranchInst>(TI))
+ if (BI->isConditional())
+ return dyn_cast<Instruction>(BI->getCondition());
+ // TODO: Extend this to handle switches
+ return nullptr;
+ };
+ for (BasicBlock &BB : F) {
+ TerminatorInst *TI = BB.getTerminator();
+ if (auto *Cond = getConditionInst(TI))
+ // TODO: Handle more than just ICmps here. We should be able to move
+ // most instructions without side effects or memory access.
+ if (isa<ICmpInst>(Cond) && Cond->hasOneUse()) {
+ MadeChange = true;
+ Cond->moveBefore(TI);
+ }
+ }
+
+ MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded);
+ return MadeChange;
+}
+
+// liveness computation via standard dataflow
+// -------------------------------------------------------------------
+
+// TODO: Consider using bitvectors for liveness, the set of potentially
+// interesting values should be small and easy to pre-compute.
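+
+// The propagation below uses the usual backward dataflow equations (informal
+// sketch):
+//   LiveOut(BB) = union over successors S of LiveIn(S)
+//                 (plus the values BB contributes to PHI nodes in S)
+//   LiveIn(BB)  = (Gen(BB) | LiveOut(BB)) - Kill(BB)
+// where Gen(BB) is the set of gc pointers used in BB (LiveSet) and Kill(BB)
+// is the set of gc pointers defined in BB (KillSet).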
+
+/// Compute the live-in set for the location Begin, starting from
+/// the live-out set of the basic block.
+static void computeLiveInValues(BasicBlock::reverse_iterator Begin,
+ BasicBlock::reverse_iterator End,
+ SetVector<Value *> &LiveTmp) {
+ for (auto &I : make_range(Begin, End)) {
+ // KILL/Def - Remove this definition from LiveIn
+ LiveTmp.remove(&I);
+
+ // Don't consider *uses* in PHI nodes, we handle their contribution to
+ // predecessor blocks when we seed the LiveOut sets
+ if (isa<PHINode>(I))
+ continue;
+
+ // USE - Add to the LiveIn set for this instruction
+ for (Value *V : I.operands()) {
+ assert(!isUnhandledGCPointerType(V->getType()) &&
+ "support for FCA unimplemented");
+ if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {
+ // The choice to exclude all things constant here is slightly subtle.
+ // There are two independent reasons:
+ // - We assume that things which are constant (from LLVM's definition)
+ // do not move at runtime. For example, the address of a global
+ //   variable is fixed, even though its contents may not be.
+ // - Second, we can't disallow arbitrary inttoptr constants even
+ // if the language frontend does. Optimization passes are free to
+ // locally exploit facts without respect to global reachability. This
+ // can create sections of code which are dynamically unreachable and
+ // contain just about anything. (see constants.ll in tests)
+ LiveTmp.insert(V);
+ }
+ }
+ }
+}
+
+static void computeLiveOutSeed(BasicBlock *BB, SetVector<Value *> &LiveTmp) {
+ for (BasicBlock *Succ : successors(BB)) {
+ for (auto &I : *Succ) {
+ PHINode *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+
+ Value *V = PN->getIncomingValueForBlock(BB);
+ assert(!isUnhandledGCPointerType(V->getType()) &&
+ "support for FCA unimplemented");
+ if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V))
+ LiveTmp.insert(V);
+ }
+ }
+}
+
+static SetVector<Value *> computeKillSet(BasicBlock *BB) {
+ SetVector<Value *> KillSet;
+ for (Instruction &I : *BB)
+ if (isHandledGCPointerType(I.getType()))
+ KillSet.insert(&I);
+ return KillSet;
+}
+
+#ifndef NDEBUG
+/// Check that the items in 'Live' dominate 'TI'. This is used as a basic
+/// sanity check for the liveness computation.
+static void checkBasicSSA(DominatorTree &DT, SetVector<Value *> &Live,
+ TerminatorInst *TI, bool TermOkay = false) {
+ for (Value *V : Live) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ // The terminator can be a member of the LiveOut set. LLVM's definition
+ // of instruction dominance states that V does not dominate itself. As
+ // such, we need to special case this to allow it.
+ if (TermOkay && TI == I)
+ continue;
+ assert(DT.dominates(I, TI) &&
+ "basic SSA liveness expectation violated by liveness analysis");
+ }
+ }
+}
+
+/// Check that all the liveness sets used during the computation of liveness
+/// obey basic SSA properties. This is useful for finding cases where we miss
+/// a def.
+static void checkBasicSSA(DominatorTree &DT, GCPtrLivenessData &Data,
+ BasicBlock &BB) {
+ checkBasicSSA(DT, Data.LiveSet[&BB], BB.getTerminator());
+ checkBasicSSA(DT, Data.LiveOut[&BB], BB.getTerminator(), true);
+ checkBasicSSA(DT, Data.LiveIn[&BB], BB.getTerminator());
+}
+#endif
+
+static void computeLiveInValues(DominatorTree &DT, Function &F,
+ GCPtrLivenessData &Data) {
+ SmallSetVector<BasicBlock *, 32> Worklist;
+
+ // Seed the liveness for each individual block
+ for (BasicBlock &BB : F) {
+ Data.KillSet[&BB] = computeKillSet(&BB);
+ Data.LiveSet[&BB].clear();
+ computeLiveInValues(BB.rbegin(), BB.rend(), Data.LiveSet[&BB]);
+
+#ifndef NDEBUG
+ for (Value *Kill : Data.KillSet[&BB])
+ assert(!Data.LiveSet[&BB].count(Kill) && "live set contains kill");
+#endif
+
+ Data.LiveOut[&BB] = SetVector<Value *>();
+ computeLiveOutSeed(&BB, Data.LiveOut[&BB]);
+ Data.LiveIn[&BB] = Data.LiveSet[&BB];
+ Data.LiveIn[&BB].set_union(Data.LiveOut[&BB]);
+ Data.LiveIn[&BB].set_subtract(Data.KillSet[&BB]);
+ if (!Data.LiveIn[&BB].empty())
+ Worklist.insert(pred_begin(&BB), pred_end(&BB));
+ }
+
+ // Propagate that liveness until stable
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.pop_back_val();
+
+ // Compute our new liveout set, then exit early if it hasn't changed despite
+ // the contribution of our successors.
+ SetVector<Value *> LiveOut = Data.LiveOut[BB];
+ const auto OldLiveOutSize = LiveOut.size();
+ for (BasicBlock *Succ : successors(BB)) {
+ assert(Data.LiveIn.count(Succ));
+ LiveOut.set_union(Data.LiveIn[Succ]);
+ }
+ // assert OldLiveOut is a subset of LiveOut
+ if (OldLiveOutSize == LiveOut.size()) {
+ // If the sets are the same size, then we didn't actually add anything
+ // when unioning our successors' LiveIn. Thus, the LiveIn of this block
+ // hasn't changed.
+ continue;
+ }
+ Data.LiveOut[BB] = LiveOut;
+
+ // Apply the effects of this basic block
+ SetVector<Value *> LiveTmp = LiveOut;
+ LiveTmp.set_union(Data.LiveSet[BB]);
+ LiveTmp.set_subtract(Data.KillSet[BB]);
+
+ assert(Data.LiveIn.count(BB));
+ const SetVector<Value *> &OldLiveIn = Data.LiveIn[BB];
+ // assert: OldLiveIn is a subset of LiveTmp
+ if (OldLiveIn.size() != LiveTmp.size()) {
+ Data.LiveIn[BB] = LiveTmp;
+ Worklist.insert(pred_begin(BB), pred_end(BB));
+ }
+ } // while (!Worklist.empty())
+
+#ifndef NDEBUG
+ // Sanity check our output against SSA properties. This helps catch any
+ // missing kills during the above iteration.
+ for (BasicBlock &BB : F)
+ checkBasicSSA(DT, Data, BB);
+#endif
+}
+
+static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,
+ StatepointLiveSetTy &Out) {
+ BasicBlock *BB = Inst->getParent();
+
+ // Note: The copy is intentional and required
+ assert(Data.LiveOut.count(BB));
+ SetVector<Value *> LiveOut = Data.LiveOut[BB];
+
+ // We want to handle the statepoint itself oddly. Its
+ // call result is not live (normally), nor are its arguments
+ // (unless they're used again later). This adjustment is
+ // specifically what we need to relocate
+ computeLiveInValues(BB->rbegin(), ++Inst->getIterator().getReverse(),
+ LiveOut);
+ LiveOut.remove(Inst);
+ Out.insert(LiveOut.begin(), LiveOut.end());
+}
+
+static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
+ CallSite CS,
+ PartiallyConstructedSafepointRecord &Info) {
+ Instruction *Inst = CS.getInstruction();
+ StatepointLiveSetTy Updated;
+ findLiveSetAtInst(Inst, RevisedLivenessData, Updated);
+
+ // We may have base pointers which are now live that weren't before. We need
+ // to update the PointerToBase structure to reflect this.
+ for (auto V : Updated)
+ if (Info.PointerToBase.insert({V, V}).second) {
+ assert(isKnownBaseResult(V) &&
+ "Can't find base for unexpected live value!");
+ continue;
+ }
+
+#ifndef NDEBUG
+ for (auto V : Updated)
+ assert(Info.PointerToBase.count(V) &&
+ "Must be able to find base for live value!");
+#endif
+
+ // Remove any stale base mappings - this can happen since our liveness is
+ // more precise than the one inherent in the base pointer analysis.
+ DenseSet<Value *> ToErase;
+ for (auto KVPair : Info.PointerToBase)
+ if (!Updated.count(KVPair.first))
+ ToErase.insert(KVPair.first);
+
+ for (auto *V : ToErase)
+ Info.PointerToBase.erase(V);
+
+#ifndef NDEBUG
+ for (auto KVPair : Info.PointerToBase)
+ assert(Updated.count(KVPair.first) && "record for non-live value");
+#endif
+
+ Info.LiveSet = Updated;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
new file mode 100644
index 000000000000..9dc550ceaeca
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -0,0 +1,2068 @@
+//===- SCCP.cpp - Sparse Conditional Constant Propagation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements sparse conditional constant propagation and merging:
+//
+// Specifically, this:
+// * Assumes values are constant unless proven otherwise
+// * Assumes BasicBlocks are dead unless proven otherwise
+// * Proves values to be constant, and replaces them with constants
+// * Proves conditional branches to be unconditional
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/SCCP.h"
+#include "llvm/Transforms/Scalar/SCCP.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueLattice.h"
+#include "llvm/Analysis/ValueLatticeUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sccp"
+
+STATISTIC(NumInstRemoved, "Number of instructions removed");
+STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
+
+STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
+STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
+STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
+STATISTIC(IPNumRangeInfoUsed, "Number of times constant range info was used by "
+ "IPSCCP");
+
+namespace {
+
+/// LatticeVal class - This class represents the different lattice values that
+/// an LLVM value may occupy. It is a simple class with value semantics.
+///
+class LatticeVal {
+ enum LatticeValueTy {
+ /// unknown - This LLVM Value has no known value yet.
+ unknown,
+
+ /// constant - This LLVM Value has a specific constant value.
+ constant,
+
+ /// forcedconstant - This LLVM Value was thought to be undef until
+ /// ResolvedUndefsIn. This is treated just like 'constant', but if merged
+ /// with another (different) constant, it goes to overdefined, instead of
+ /// asserting.
+ forcedconstant,
+
+ /// overdefined - This instruction is not known to be constant, and we know
+ /// it has a value.
+ overdefined
+ };
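+
+ // Informal lattice ordering: unknown < constant/forcedconstant < overdefined.
+ // Values only move down the lattice: an unknown value may become a
+ // (forced)constant or overdefined, and a forcedconstant that is later merged
+ // with a different constant falls to overdefined rather than asserting.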
+
+ /// Val: This stores the current lattice value along with the Constant* for
+ /// the constant if this is a 'constant' or 'forcedconstant' value.
+ PointerIntPair<Constant *, 2, LatticeValueTy> Val;
+
+ LatticeValueTy getLatticeValue() const {
+ return Val.getInt();
+ }
+
+public:
+ LatticeVal() : Val(nullptr, unknown) {}
+
+ bool isUnknown() const { return getLatticeValue() == unknown; }
+
+ bool isConstant() const {
+ return getLatticeValue() == constant || getLatticeValue() == forcedconstant;
+ }
+
+ bool isOverdefined() const { return getLatticeValue() == overdefined; }
+
+ Constant *getConstant() const {
+ assert(isConstant() && "Cannot get the constant of a non-constant!");
+ return Val.getPointer();
+ }
+
+ /// markOverdefined - Return true if this is a change in status.
+ bool markOverdefined() {
+ if (isOverdefined())
+ return false;
+
+ Val.setInt(overdefined);
+ return true;
+ }
+
+ /// markConstant - Return true if this is a change in status.
+ bool markConstant(Constant *V) {
+ if (getLatticeValue() == constant) { // Constant but not forcedconstant.
+ assert(getConstant() == V && "Marking constant with different value");
+ return false;
+ }
+
+ if (isUnknown()) {
+ Val.setInt(constant);
+ assert(V && "Marking constant with NULL");
+ Val.setPointer(V);
+ } else {
+ assert(getLatticeValue() == forcedconstant &&
+ "Cannot move from overdefined to constant!");
+ // Stay at forcedconstant if the constant is the same.
+ if (V == getConstant()) return false;
+
+ // Otherwise, we go to overdefined. Assumptions made based on the
+ // forced value are possibly wrong. Assuming this is another constant
+ // could expose a contradiction.
+ Val.setInt(overdefined);
+ }
+ return true;
+ }
+
+ /// getConstantInt - If this is a constant with a ConstantInt value, return it
+ /// otherwise return null.
+ ConstantInt *getConstantInt() const {
+ if (isConstant())
+ return dyn_cast<ConstantInt>(getConstant());
+ return nullptr;
+ }
+
+ /// getBlockAddress - If this is a constant with a BlockAddress value, return
+ /// it, otherwise return null.
+ BlockAddress *getBlockAddress() const {
+ if (isConstant())
+ return dyn_cast<BlockAddress>(getConstant());
+ return nullptr;
+ }
+
+ void markForcedConstant(Constant *V) {
+ assert(isUnknown() && "Can't force a defined value!");
+ Val.setInt(forcedconstant);
+ Val.setPointer(V);
+ }
+
+ ValueLatticeElement toValueLattice() const {
+ if (isOverdefined())
+ return ValueLatticeElement::getOverdefined();
+ if (isConstant())
+ return ValueLatticeElement::get(getConstant());
+ return ValueLatticeElement();
+ }
+};
+
+//===----------------------------------------------------------------------===//
+//
+/// SCCPSolver - This class is a general purpose solver for Sparse Conditional
+/// Constant Propagation.
+///
+class SCCPSolver : public InstVisitor<SCCPSolver> {
+ const DataLayout &DL;
+ const TargetLibraryInfo *TLI;
+ SmallPtrSet<BasicBlock *, 8> BBExecutable; // The BBs that are executable.
+ DenseMap<Value *, LatticeVal> ValueState; // The state each value is in.
+ // The state each parameter is in.
+ DenseMap<Value *, ValueLatticeElement> ParamState;
+
+ /// StructValueState - This maintains ValueState for values that have
+ /// StructType, for example for formal arguments, calls, insertelement, etc.
+ DenseMap<std::pair<Value *, unsigned>, LatticeVal> StructValueState;
+
+ /// GlobalValue - If we are tracking any values for the contents of a global
+ /// variable, we keep a mapping from the constant accessor to the element of
+ /// the global, to the currently known value. If the value becomes
+ /// overdefined, its entry is simply removed from this map.
+ DenseMap<GlobalVariable *, LatticeVal> TrackedGlobals;
+
+ /// TrackedRetVals - If we are tracking arguments into and the return
+ /// value out of a function, it will have an entry in this map, indicating
+ /// what the known return value for the function is.
+ DenseMap<Function *, LatticeVal> TrackedRetVals;
+
+ /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions
+ /// that return multiple values.
+ DenseMap<std::pair<Function *, unsigned>, LatticeVal> TrackedMultipleRetVals;
+
+ /// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is
+ /// represented here for efficient lookup.
+ SmallPtrSet<Function *, 16> MRVFunctionsTracked;
+
+ /// TrackingIncomingArguments - This is the set of functions for whose
+ /// arguments we make optimistic assumptions about and try to prove as
+ /// constants.
+ SmallPtrSet<Function *, 16> TrackingIncomingArguments;
+
+ /// The reason for two worklists is that overdefined is the lowest state
+ /// on the lattice, and moving things to overdefined as fast as possible
+ /// makes SCCP converge much faster.
+ ///
+ /// By having a separate worklist, we accomplish this because everything
+ /// possibly overdefined will become overdefined at the soonest possible
+ /// point.
+ SmallVector<Value *, 64> OverdefinedInstWorkList;
+ SmallVector<Value *, 64> InstWorkList;
+
+ // The BasicBlock work list
+ SmallVector<BasicBlock *, 64> BBWorkList;
+
+ /// KnownFeasibleEdges - Entries in this set are edges which have already had
+ /// PHI nodes retriggered.
+ using Edge = std::pair<BasicBlock *, BasicBlock *>;
+ DenseSet<Edge> KnownFeasibleEdges;
+
+public:
+ SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli)
+ : DL(DL), TLI(tli) {}
+
+ /// MarkBlockExecutable - This method can be used by clients to mark all of
+ /// the blocks that are known to be intrinsically live in the processed unit.
+ ///
+ /// This returns true if the block was not considered live before.
+ bool MarkBlockExecutable(BasicBlock *BB) {
+ if (!BBExecutable.insert(BB).second)
+ return false;
+ DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
+ BBWorkList.push_back(BB); // Add the block to the work list!
+ return true;
+ }
+
+ /// TrackValueOfGlobalVariable - Clients can use this method to
+ /// inform the SCCPSolver that it should track loads and stores to the
+ /// specified global variable if it can. This is only legal to call if
+ /// performing Interprocedural SCCP.
+ void TrackValueOfGlobalVariable(GlobalVariable *GV) {
+ // We only track the contents of scalar globals.
+ if (GV->getValueType()->isSingleValueType()) {
+ LatticeVal &IV = TrackedGlobals[GV];
+ if (!isa<UndefValue>(GV->getInitializer()))
+ IV.markConstant(GV->getInitializer());
+ }
+ }
+
+ /// AddTrackedFunction - If the SCCP solver is supposed to track calls into
+ /// and out of the specified function (which cannot have its address taken),
+ /// this method must be called.
+ void AddTrackedFunction(Function *F) {
+ // Add an entry, F -> undef.
+ if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
+ MRVFunctionsTracked.insert(F);
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ TrackedMultipleRetVals.insert(std::make_pair(std::make_pair(F, i),
+ LatticeVal()));
+ } else
+ TrackedRetVals.insert(std::make_pair(F, LatticeVal()));
+ }
+
+ void AddArgumentTrackedFunction(Function *F) {
+ TrackingIncomingArguments.insert(F);
+ }
+
+ /// Returns true if the given function is in the solver's set of
+ /// argument-tracked functions.
+ bool isArgumentTrackedFunction(Function *F) {
+ return TrackingIncomingArguments.count(F);
+ }
+
+ /// Solve - Solve for constants and executable blocks.
+ void Solve();
+
+ /// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+ /// that branches on undef values cannot reach any of their successors.
+ /// However, this is not a safe assumption. After we solve dataflow, this
+ /// method should be used to handle this. If this returns true, the solver
+ /// should be rerun.
+ bool ResolvedUndefsIn(Function &F);
+
+ bool isBlockExecutable(BasicBlock *BB) const {
+ return BBExecutable.count(BB);
+ }
+
+ std::vector<LatticeVal> getStructLatticeValueFor(Value *V) const {
+ std::vector<LatticeVal> StructValues;
+ auto *STy = dyn_cast<StructType>(V->getType());
+ assert(STy && "getStructLatticeValueFor() can be called only on structs");
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ auto I = StructValueState.find(std::make_pair(V, i));
+ assert(I != StructValueState.end() && "Value not in valuemap!");
+ StructValues.push_back(I->second);
+ }
+ return StructValues;
+ }
+
+ ValueLatticeElement getLatticeValueFor(Value *V) {
+ assert(!V->getType()->isStructTy() &&
+ "Should use getStructLatticeValueFor");
+ std::pair<DenseMap<Value*, ValueLatticeElement>::iterator, bool>
+ PI = ParamState.insert(std::make_pair(V, ValueLatticeElement()));
+ ValueLatticeElement &LV = PI.first->second;
+ if (PI.second) {
+ DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V);
+ assert(I != ValueState.end() &&
+ "V not found in ValueState nor Paramstate map!");
+ LV = I->second.toValueLattice();
+ }
+
+ return LV;
+ }
+
+ /// getTrackedRetVals - Get the inferred return value map.
+ const DenseMap<Function*, LatticeVal> &getTrackedRetVals() {
+ return TrackedRetVals;
+ }
+
+ /// getTrackedGlobals - Get and return the set of inferred initializers for
+ /// global variables.
+ const DenseMap<GlobalVariable*, LatticeVal> &getTrackedGlobals() {
+ return TrackedGlobals;
+ }
+
+ /// getMRVFunctionsTracked - Get the set of functions which return multiple
+ /// values tracked by the pass.
+ const SmallPtrSet<Function *, 16> getMRVFunctionsTracked() {
+ return MRVFunctionsTracked;
+ }
+
+ /// markOverdefined - Mark the specified value overdefined. This
+ /// works with both scalars and structs.
+ void markOverdefined(Value *V) {
+ if (auto *STy = dyn_cast<StructType>(V->getType()))
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ markOverdefined(getStructValueState(V, i), V);
+ else
+ markOverdefined(ValueState[V], V);
+ }
+
+ // isStructLatticeConstant - Return true if all the lattice values
+ // corresponding to elements of the structure are not overdefined,
+ // false otherwise.
+ bool isStructLatticeConstant(Function *F, StructType *STy) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ const auto &It = TrackedMultipleRetVals.find(std::make_pair(F, i));
+ assert(It != TrackedMultipleRetVals.end());
+ LatticeVal LV = It->second;
+ if (LV.isOverdefined())
+ return false;
+ }
+ return true;
+ }
+
+private:
+ // pushToWorkList - Helper for markConstant/markForcedConstant/markOverdefined
+ void pushToWorkList(LatticeVal &IV, Value *V) {
+ if (IV.isOverdefined())
+ return OverdefinedInstWorkList.push_back(V);
+ InstWorkList.push_back(V);
+ }
+
+ // markConstant - Make a value be marked as "constant". If the value
+ // is not already a constant, add it to the instruction work list so that
+ // the users of the instruction are updated later.
+ void markConstant(LatticeVal &IV, Value *V, Constant *C) {
+ if (!IV.markConstant(C)) return;
+ DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
+ pushToWorkList(IV, V);
+ }
+
+ void markConstant(Value *V, Constant *C) {
+ assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
+ markConstant(ValueState[V], V, C);
+ }
+
+ void markForcedConstant(Value *V, Constant *C) {
+ assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
+ LatticeVal &IV = ValueState[V];
+ IV.markForcedConstant(C);
+ DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n');
+ pushToWorkList(IV, V);
+ }
+
+ // markOverdefined - Make a value be marked as "overdefined". If the
+ // value is not already overdefined, add it to the overdefined instruction
+ // work list so that the users of the instruction are updated later.
+ void markOverdefined(LatticeVal &IV, Value *V) {
+ if (!IV.markOverdefined()) return;
+
+ DEBUG(dbgs() << "markOverdefined: ";
+ if (auto *F = dyn_cast<Function>(V))
+ dbgs() << "Function '" << F->getName() << "'\n";
+ else
+ dbgs() << *V << '\n');
+ // Only instructions go on the work list
+ pushToWorkList(IV, V);
+ }
+
+ void mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) {
+ if (IV.isOverdefined() || MergeWithV.isUnknown())
+ return; // Noop.
+ if (MergeWithV.isOverdefined())
+ return markOverdefined(IV, V);
+ if (IV.isUnknown())
+ return markConstant(IV, V, MergeWithV.getConstant());
+ if (IV.getConstant() != MergeWithV.getConstant())
+ return markOverdefined(IV, V);
+ }
+
+ void mergeInValue(Value *V, LatticeVal MergeWithV) {
+ assert(!V->getType()->isStructTy() &&
+ "non-structs should use markConstant");
+ mergeInValue(ValueState[V], V, MergeWithV);
+ }
+
+ /// getValueState - Return the LatticeVal object that corresponds to the
+ /// value. This function handles the case when the value hasn't been seen yet
+ /// by properly seeding constants etc.
+ LatticeVal &getValueState(Value *V) {
+ assert(!V->getType()->isStructTy() && "Should use getStructValueState");
+
+ std::pair<DenseMap<Value*, LatticeVal>::iterator, bool> I =
+ ValueState.insert(std::make_pair(V, LatticeVal()));
+ LatticeVal &LV = I.first->second;
+
+ if (!I.second)
+ return LV; // Common case, already in the map.
+
+ if (auto *C = dyn_cast<Constant>(V)) {
+ // Undef values remain unknown.
+ if (!isa<UndefValue>(V))
+ LV.markConstant(C); // Constants are constant
+ }
+
+ // All others are underdefined by default.
+ return LV;
+ }
+
+ ValueLatticeElement &getParamState(Value *V) {
+ assert(!V->getType()->isStructTy() && "Should use getStructValueState");
+
+ std::pair<DenseMap<Value*, ValueLatticeElement>::iterator, bool>
+ PI = ParamState.insert(std::make_pair(V, ValueLatticeElement()));
+ ValueLatticeElement &LV = PI.first->second;
+ if (PI.second)
+ LV = getValueState(V).toValueLattice();
+
+ return LV;
+ }
+
+ /// getStructValueState - Return the LatticeVal object that corresponds to the
+ /// value/field pair. This function handles the case when the value hasn't
+ /// been seen yet by properly seeding constants etc.
+ LatticeVal &getStructValueState(Value *V, unsigned i) {
+ assert(V->getType()->isStructTy() && "Should use getValueState");
+ assert(i < cast<StructType>(V->getType())->getNumElements() &&
+ "Invalid element #");
+
+ std::pair<DenseMap<std::pair<Value*, unsigned>, LatticeVal>::iterator,
+ bool> I = StructValueState.insert(
+ std::make_pair(std::make_pair(V, i), LatticeVal()));
+ LatticeVal &LV = I.first->second;
+
+ if (!I.second)
+ return LV; // Common case, already in the map.
+
+ if (auto *C = dyn_cast<Constant>(V)) {
+ Constant *Elt = C->getAggregateElement(i);
+
+ if (!Elt)
+ LV.markOverdefined(); // Unknown sort of constant.
+ else if (isa<UndefValue>(Elt))
+ ; // Undef values remain unknown.
+ else
+ LV.markConstant(Elt); // Constants are constant.
+ }
+
+ // All others are underdefined by default.
+ return LV;
+ }
+
+ /// markEdgeExecutable - Mark a basic block as executable, adding it to the BB
+ /// work list if it is not already executable.
+ void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
+ if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
+ return; // This edge is already known to be executable!
+
+ if (!MarkBlockExecutable(Dest)) {
+ // If the destination is already executable, we just made an *edge*
+ // feasible that wasn't before. Revisit the PHI nodes in the block
+ // because they have potentially new operands.
+ DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
+ << " -> " << Dest->getName() << '\n');
+
+ for (PHINode &PN : Dest->phis())
+ visitPHINode(PN);
+ }
+ }
+
+ // getFeasibleSuccessors - Return a vector of booleans to indicate which
+ // successors are reachable from a given terminator instruction.
+ void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs);
+
+ // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+ // block to the 'To' basic block is currently feasible.
+ bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
+
+ // OperandChangedState - This method is invoked on all of the users of an
+ // instruction that was just changed state somehow. Based on this
+ // information, we need to update the specified user of this instruction.
+ void OperandChangedState(Instruction *I) {
+ if (BBExecutable.count(I->getParent())) // Inst is executable?
+ visit(*I);
+ }
+
+private:
+ friend class InstVisitor<SCCPSolver>;
+
+ // visit implementations - Something changed in this instruction. Either an
+ // operand made a transition, or the instruction is newly executable. Change
+ // the value type of I to reflect these changes if appropriate.
+ void visitPHINode(PHINode &I);
+
+ // Terminators
+
+ void visitReturnInst(ReturnInst &I);
+ void visitTerminatorInst(TerminatorInst &TI);
+
+ void visitCastInst(CastInst &I);
+ void visitSelectInst(SelectInst &I);
+ void visitBinaryOperator(Instruction &I);
+ void visitCmpInst(CmpInst &I);
+ void visitExtractValueInst(ExtractValueInst &EVI);
+ void visitInsertValueInst(InsertValueInst &IVI);
+
+ void visitCatchSwitchInst(CatchSwitchInst &CPI) {
+ markOverdefined(&CPI);
+ visitTerminatorInst(CPI);
+ }
+
+ // Instructions that cannot be folded away.
+
+ void visitStoreInst (StoreInst &I);
+ void visitLoadInst (LoadInst &I);
+ void visitGetElementPtrInst(GetElementPtrInst &I);
+
+ void visitCallInst (CallInst &I) {
+ visitCallSite(&I);
+ }
+
+ void visitInvokeInst (InvokeInst &II) {
+ visitCallSite(&II);
+ visitTerminatorInst(II);
+ }
+
+ void visitCallSite (CallSite CS);
+ void visitResumeInst (TerminatorInst &I) { /*returns void*/ }
+ void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
+ void visitFenceInst (FenceInst &I) { /*returns void*/ }
+
+ void visitInstruction(Instruction &I) {
+ // All the instructions we don't do any special handling for just
+ // go to overdefined.
+ DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
+ markOverdefined(&I);
+ }
+};
+
+} // end anonymous namespace
+
+// getFeasibleSuccessors - Return a vector of booleans to indicate which
+// successors are reachable from a given terminator instruction.
+void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
+ SmallVectorImpl<bool> &Succs) {
+ Succs.resize(TI.getNumSuccessors());
+ if (auto *BI = dyn_cast<BranchInst>(&TI)) {
+ if (BI->isUnconditional()) {
+ Succs[0] = true;
+ return;
+ }
+
+ LatticeVal BCValue = getValueState(BI->getCondition());
+ ConstantInt *CI = BCValue.getConstantInt();
+ if (!CI) {
+ // Overdefined condition variables, and branches on unfoldable constant
+ // conditions, mean the branch could go either way.
+ if (!BCValue.isUnknown())
+ Succs[0] = Succs[1] = true;
+ return;
+ }
+
+ // Constant condition variables mean the branch can only go a single way.
+ Succs[CI->isZero()] = true;
+ return;
+ }
+
+ // Unwinding instructions' successors are always executable.
+ if (TI.isExceptional()) {
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ if (auto *SI = dyn_cast<SwitchInst>(&TI)) {
+ if (!SI->getNumCases()) {
+ Succs[0] = true;
+ return;
+ }
+ LatticeVal SCValue = getValueState(SI->getCondition());
+ ConstantInt *CI = SCValue.getConstantInt();
+
+ if (!CI) { // Overdefined or unknown condition?
+ // All destinations are executable!
+ if (!SCValue.isUnknown())
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ Succs[SI->findCaseValue(CI)->getSuccessorIndex()] = true;
+ return;
+ }
+
+ // In the case of an indirect branch whose address is a blockaddress, we mark
+ // the target as executable.
+ if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) {
+ // Casts are folded by visitCastInst.
+ LatticeVal IBRValue = getValueState(IBR->getAddress());
+ BlockAddress *Addr = IBRValue.getBlockAddress();
+ if (!Addr) { // Overdefined or unknown condition?
+ // All destinations are executable!
+ if (!IBRValue.isUnknown())
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ BasicBlock* T = Addr->getBasicBlock();
+ assert(Addr->getFunction() == T->getParent() &&
+ "Block address of a different function ?");
+ for (unsigned i = 0; i < IBR->getNumSuccessors(); ++i) {
+ // This is the target.
+ if (IBR->getDestination(i) == T) {
+ Succs[i] = true;
+ return;
+ }
+ }
+
+ // If we didn't find our destination in the IBR successor list, then we
+ // have undefined behavior. It's OK to assume no successor is executable.
+ return;
+ }
+
+ DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n');
+ llvm_unreachable("SCCP: Don't know how to handle this terminator!");
+}
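+
+// Illustrative note (a sketch, not part of the pass): for `br i1 true, label %T,
+// label %F` the condition's lattice value is a ConstantInt, CI->isZero() is
+// false, and only Succs[0] (%T) is marked feasible. An overdefined condition
+// marks both entries; an unknown condition marks neither yet.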
+
+// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+// block to the 'To' basic block is currently feasible.
+bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
+ assert(BBExecutable.count(To) && "Dest should always be alive!");
+
+ // Make sure the source basic block is executable!!
+ if (!BBExecutable.count(From)) return false;
+
+ // Check to make sure this edge itself is actually feasible now.
+ TerminatorInst *TI = From->getTerminator();
+ if (auto *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isUnconditional())
+ return true;
+
+ LatticeVal BCValue = getValueState(BI->getCondition());
+
+ // Overdefined condition variables mean the branch could go either way,
+ // undef conditions mean that neither edge is feasible yet.
+ ConstantInt *CI = BCValue.getConstantInt();
+ if (!CI)
+ return !BCValue.isUnknown();
+
+ // Constant condition variables mean the branch can only go a single way.
+ return BI->getSuccessor(CI->isZero()) == To;
+ }
+
+ // Unwinding instructions' successors are always executable.
+ if (TI->isExceptional())
+ return true;
+
+ if (auto *SI = dyn_cast<SwitchInst>(TI)) {
+ if (SI->getNumCases() < 1)
+ return true;
+
+ LatticeVal SCValue = getValueState(SI->getCondition());
+ ConstantInt *CI = SCValue.getConstantInt();
+
+ if (!CI)
+ return !SCValue.isUnknown();
+
+ return SI->findCaseValue(CI)->getCaseSuccessor() == To;
+ }
+
+ // For an indirect branch on a known blockaddress, only the edge to that
+ // block is feasible; an overdefined address makes every edge feasible.
+ if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+ LatticeVal IBRValue = getValueState(IBR->getAddress());
+ BlockAddress *Addr = IBRValue.getBlockAddress();
+
+ if (!Addr)
+ return !IBRValue.isUnknown();
+
+ // At this point, the indirectbr is branching on a blockaddress.
+ return Addr->getBasicBlock() == To;
+ }
+
+ DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n');
+ llvm_unreachable("SCCP: Don't know how to handle this terminator!");
+}
+
+// visit Implementations - Something changed in this instruction, either an
+// operand made a transition, or the instruction is newly executable. Change
+// the value type of I to reflect these changes if appropriate. This method
+// makes sure to do the following actions:
+//
+// 1. If a phi node merges in constants with conflicting values coming from
+// different branches, or if it merges in an overdefined value, then the
+// PHI node becomes overdefined.
+// 2. If a phi node merges in only constants and they all agree on the same
+// value, the PHI node becomes that constant value.
+// 3. If V <- x (op) y && isConstant(x) && isConstant(y) V = Constant
+// 4. If V <- x (op) y && (isOverdefined(x) || isOverdefined(y)) V = Overdefined
+// 5. If V <- MEM or V <- CALL or V <- (unknown) then V = Overdefined
+// 6. If a conditional branch has a value that is constant, make the selected
+// destination executable
+// 7. If a conditional branch has a value that is overdefined, make all
+// successors executable.
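+//
+// For example (illustration only): in `%p = phi i32 [ 7, %a ], [ 7, %b ]`,
+// if both incoming edges are feasible and agree, %p becomes the constant 7
+// (rule 2); if one feasible edge instead carried 5, or an overdefined value,
+// %p would go overdefined (rule 1).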
+void SCCPSolver::visitPHINode(PHINode &PN) {
+ // If this PN returns a struct, just mark the result overdefined.
+ // TODO: We could do a lot better than this if code actually uses this.
+ if (PN.getType()->isStructTy())
+ return markOverdefined(&PN);
+
+ if (getValueState(&PN).isOverdefined())
+ return; // Quick exit
+
+ // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
+ // and slow us down a lot. Just mark them overdefined.
+ if (PN.getNumIncomingValues() > 64)
+ return markOverdefined(&PN);
+
+ // Look at all of the executable operands of the PHI node. If any of them
+ // are overdefined, the PHI becomes overdefined as well. If they are all
+ // constant, and they agree with each other, the PHI becomes the identical
+ // constant. If they are constant and don't agree, the PHI is overdefined.
+ // If there are no executable operands, the PHI remains unknown.
+ Constant *OperandVal = nullptr;
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ LatticeVal IV = getValueState(PN.getIncomingValue(i));
+ if (IV.isUnknown()) continue; // Doesn't influence PHI node.
+
+ if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
+ continue;
+
+ if (IV.isOverdefined()) // PHI node becomes overdefined!
+ return markOverdefined(&PN);
+
+ if (!OperandVal) { // Grab the first value.
+ OperandVal = IV.getConstant();
+ continue;
+ }
+
+ // There is already a reachable operand. If we conflict with it,
+ // then the PHI node becomes overdefined. If we agree with it, we
+ // can continue on.
+
+ // Check to see if there are two different constants merging; if so, the
+ // PHI node is overdefined.
+ if (IV.getConstant() != OperandVal)
+ return markOverdefined(&PN);
+ }
+
+ // If we exited the loop, this means that the PHI node only has constant
+ // arguments that agree with each other (and OperandVal is the constant) or
+ // OperandVal is null because there are no defined incoming arguments. If
+ // this is the case, the PHI remains unknown.
+ if (OperandVal)
+ markConstant(&PN, OperandVal); // Acquire operand value
+}
+
+void SCCPSolver::visitReturnInst(ReturnInst &I) {
+ if (I.getNumOperands() == 0) return; // ret void
+
+ Function *F = I.getParent()->getParent();
+ Value *ResultOp = I.getOperand(0);
+
+ // If we are tracking the return value of this function, merge it in.
+ if (!TrackedRetVals.empty() && !ResultOp->getType()->isStructTy()) {
+ DenseMap<Function*, LatticeVal>::iterator TFRVI =
+ TrackedRetVals.find(F);
+ if (TFRVI != TrackedRetVals.end()) {
+ mergeInValue(TFRVI->second, F, getValueState(ResultOp));
+ return;
+ }
+ }
+
+ // Handle functions that return multiple values.
+ if (!TrackedMultipleRetVals.empty()) {
+ if (auto *STy = dyn_cast<StructType>(ResultOp->getType()))
+ if (MRVFunctionsTracked.count(F))
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F,
+ getStructValueState(ResultOp, i));
+ }
+}
+
+void SCCPSolver::visitTerminatorInst(TerminatorInst &TI) {
+ SmallVector<bool, 16> SuccFeasible;
+ getFeasibleSuccessors(TI, SuccFeasible);
+
+ BasicBlock *BB = TI.getParent();
+
+ // Mark all feasible successors executable.
+ for (unsigned i = 0, e = SuccFeasible.size(); i != e; ++i)
+ if (SuccFeasible[i])
+ markEdgeExecutable(BB, TI.getSuccessor(i));
+}
+
+void SCCPSolver::visitCastInst(CastInst &I) {
+ LatticeVal OpSt = getValueState(I.getOperand(0));
+ if (OpSt.isOverdefined()) // Inherit overdefinedness of operand
+ markOverdefined(&I);
+ else if (OpSt.isConstant()) {
+ // Fold the constant as we build.
+ Constant *C = ConstantFoldCastOperand(I.getOpcode(), OpSt.getConstant(),
+ I.getType(), DL);
+ if (isa<UndefValue>(C))
+ return;
+ // Propagate constant value
+ markConstant(&I, C);
+ }
+}
+
+void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
+ // If this returns a struct, mark all elements overdefined; we don't track
+ // structs in structs.
+ if (EVI.getType()->isStructTy())
+ return markOverdefined(&EVI);
+
+ // If this is extracting from more than one level of struct, we don't know.
+ if (EVI.getNumIndices() != 1)
+ return markOverdefined(&EVI);
+
+ Value *AggVal = EVI.getAggregateOperand();
+ if (AggVal->getType()->isStructTy()) {
+ unsigned i = *EVI.idx_begin();
+ LatticeVal EltVal = getStructValueState(AggVal, i);
+ mergeInValue(getValueState(&EVI), &EVI, EltVal);
+ } else {
+ // Otherwise, must be extracting from an array.
+ return markOverdefined(&EVI);
+ }
+}
+
+void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
+ auto *STy = dyn_cast<StructType>(IVI.getType());
+ if (!STy)
+ return markOverdefined(&IVI);
+
+ // If this has more than one index, we can't handle it; mark all results
+ // overdefined.
+ if (IVI.getNumIndices() != 1)
+ return markOverdefined(&IVI);
+
+ Value *Aggr = IVI.getAggregateOperand();
+ unsigned Idx = *IVI.idx_begin();
+
+ // Compute the result based on what we're inserting.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ // This passes through all values that aren't the inserted element.
+ if (i != Idx) {
+ LatticeVal EltVal = getStructValueState(Aggr, i);
+ mergeInValue(getStructValueState(&IVI, i), &IVI, EltVal);
+ continue;
+ }
+
+ Value *Val = IVI.getInsertedValueOperand();
+ if (Val->getType()->isStructTy())
+ // We don't track structs in structs.
+ markOverdefined(getStructValueState(&IVI, i), &IVI);
+ else {
+ LatticeVal InVal = getValueState(Val);
+ mergeInValue(getStructValueState(&IVI, i), &IVI, InVal);
+ }
+ }
+}
+
+void SCCPSolver::visitSelectInst(SelectInst &I) {
+ // If this select returns a struct, just mark the result overdefined.
+ // TODO: We could do a lot better than this if code actually uses this.
+ if (I.getType()->isStructTy())
+ return markOverdefined(&I);
+
+ LatticeVal CondValue = getValueState(I.getCondition());
+ if (CondValue.isUnknown())
+ return;
+
+ if (ConstantInt *CondCB = CondValue.getConstantInt()) {
+ Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue();
+ mergeInValue(&I, getValueState(OpVal));
+ return;
+ }
+
+ // Otherwise, the condition is overdefined or a constant we can't evaluate.
+ // See if we can produce something better than overdefined based on the T/F
+ // value.
+ LatticeVal TVal = getValueState(I.getTrueValue());
+ LatticeVal FVal = getValueState(I.getFalseValue());
+
+ // select ?, C, C -> C.
+ if (TVal.isConstant() && FVal.isConstant() &&
+ TVal.getConstant() == FVal.getConstant())
+ return markConstant(&I, FVal.getConstant());
+
+ if (TVal.isUnknown()) // select ?, undef, X -> X.
+ return mergeInValue(&I, FVal);
+ if (FVal.isUnknown()) // select ?, X, undef -> X.
+ return mergeInValue(&I, TVal);
+ markOverdefined(&I);
+}
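+
+// Example sketch (illustration only): `select i1 %c, i32 4, i32 4` is marked
+// constant 4 even when %c is overdefined, per the "select ?, C, C -> C" rule
+// above; `select i1 %c, i32 undef, i32 %x` simply merges in the state of %x.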
+
+// Handle Binary Operators.
+void SCCPSolver::visitBinaryOperator(Instruction &I) {
+ LatticeVal V1State = getValueState(I.getOperand(0));
+ LatticeVal V2State = getValueState(I.getOperand(1));
+
+ LatticeVal &IV = ValueState[&I];
+ if (IV.isOverdefined()) return;
+
+ if (V1State.isConstant() && V2State.isConstant()) {
+ Constant *C = ConstantExpr::get(I.getOpcode(), V1State.getConstant(),
+ V2State.getConstant());
+ // X op Y -> undef.
+ if (isa<UndefValue>(C))
+ return;
+ return markConstant(IV, &I, C);
+ }
+
+ // If an operand is still unknown (undef), wait for it to resolve.
+ if (!V1State.isOverdefined() && !V2State.isOverdefined())
+ return;
+
+ // Otherwise, one of our operands is overdefined. Try to produce something
+ // better than overdefined with some tricks.
+ // If this is 0 / Y, it doesn't matter that the second operand is
+ // overdefined, and we can replace it with zero.
+ if (I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv)
+ if (V1State.isConstant() && V1State.getConstant()->isNullValue())
+ return markConstant(IV, &I, V1State.getConstant());
+
+ // If this is:
+ // -> AND/MUL with 0
+ // -> OR with -1
+ // it doesn't matter that the other operand is overdefined.
+ if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Mul ||
+ I.getOpcode() == Instruction::Or) {
+ LatticeVal *NonOverdefVal = nullptr;
+ if (!V1State.isOverdefined())
+ NonOverdefVal = &V1State;
+ else if (!V2State.isOverdefined())
+ NonOverdefVal = &V2State;
+
+ if (NonOverdefVal) {
+ if (NonOverdefVal->isUnknown())
+ return;
+
+ if (I.getOpcode() == Instruction::And ||
+ I.getOpcode() == Instruction::Mul) {
+ // X and 0 = 0
+ // X * 0 = 0
+ if (NonOverdefVal->getConstant()->isNullValue())
+ return markConstant(IV, &I, NonOverdefVal->getConstant());
+ } else {
+ // X or -1 = -1
+ if (ConstantInt *CI = NonOverdefVal->getConstantInt())
+ if (CI->isMinusOne())
+ return markConstant(IV, &I, NonOverdefVal->getConstant());
+ }
+ }
+ }
+
+ markOverdefined(&I);
+}
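+
+// Example sketch (illustration only): for `%r = and i32 %x, 0` with %x
+// overdefined, the non-overdefined operand is the constant 0, so %r is still
+// marked constant 0; likewise `or i32 %x, -1` yields -1 and `mul i32 %x, 0`
+// yields 0.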
+
+// Handle ICmpInst instruction.
+void SCCPSolver::visitCmpInst(CmpInst &I) {
+ LatticeVal V1State = getValueState(I.getOperand(0));
+ LatticeVal V2State = getValueState(I.getOperand(1));
+
+ LatticeVal &IV = ValueState[&I];
+ if (IV.isOverdefined()) return;
+
+ if (V1State.isConstant() && V2State.isConstant()) {
+ Constant *C = ConstantExpr::getCompare(
+ I.getPredicate(), V1State.getConstant(), V2State.getConstant());
+ if (isa<UndefValue>(C))
+ return;
+ return markConstant(IV, &I, C);
+ }
+
+ // If operands are still unknown, wait for them to resolve.
+ if (!V1State.isOverdefined() && !V2State.isOverdefined())
+ return;
+
+ markOverdefined(&I);
+}
+
+// Handle getelementptr instructions. If all operands are constants then we
+// can turn this into a getelementptr ConstantExpr.
+void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
+ if (ValueState[&I].isOverdefined()) return;
+
+ SmallVector<Constant*, 8> Operands;
+ Operands.reserve(I.getNumOperands());
+
+ for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+ LatticeVal State = getValueState(I.getOperand(i));
+ if (State.isUnknown())
+ return; // Operands are not resolved yet.
+
+ if (State.isOverdefined())
+ return markOverdefined(&I);
+
+ assert(State.isConstant() && "Unknown state!");
+ Operands.push_back(State.getConstant());
+ }
+
+ Constant *Ptr = Operands[0];
+ auto Indices = makeArrayRef(Operands.begin() + 1, Operands.end());
+ Constant *C =
+ ConstantExpr::getGetElementPtr(I.getSourceElementType(), Ptr, Indices);
+ if (isa<UndefValue>(C))
+ return;
+ markConstant(&I, C);
+}
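+
+// Example sketch (illustration only): with a pointer operand known to be the
+// constant global @arr and all indices known ConstantInts, the GEP folds to
+// the equivalent getelementptr ConstantExpr and is marked constant; any
+// overdefined operand drives the GEP overdefined, and an unknown one makes us
+// wait.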
+
+void SCCPSolver::visitStoreInst(StoreInst &SI) {
+ // If this store is of a struct, ignore it.
+ if (SI.getOperand(0)->getType()->isStructTy())
+ return;
+
+ if (TrackedGlobals.empty() || !isa<GlobalVariable>(SI.getOperand(1)))
+ return;
+
+ GlobalVariable *GV = cast<GlobalVariable>(SI.getOperand(1));
+ DenseMap<GlobalVariable*, LatticeVal>::iterator I = TrackedGlobals.find(GV);
+ if (I == TrackedGlobals.end() || I->second.isOverdefined()) return;
+
+ // Get the value we are storing into the global, then merge it.
+ mergeInValue(I->second, GV, getValueState(SI.getOperand(0)));
+ if (I->second.isOverdefined())
+ TrackedGlobals.erase(I); // No need to keep tracking this!
+}
+
+// Handle load instructions. If the operand is a constant pointer to a constant
+// global, we can replace the load with the loaded constant value!
+void SCCPSolver::visitLoadInst(LoadInst &I) {
+ // If this load is of a struct, just mark the result overdefined.
+ if (I.getType()->isStructTy())
+ return markOverdefined(&I);
+
+ LatticeVal PtrVal = getValueState(I.getOperand(0));
+ if (PtrVal.isUnknown()) return; // The pointer is not resolved yet!
+
+ LatticeVal &IV = ValueState[&I];
+ if (IV.isOverdefined()) return;
+
+ if (!PtrVal.isConstant() || I.isVolatile())
+ return markOverdefined(IV, &I);
+
+ Constant *Ptr = PtrVal.getConstant();
+
+ // load null is undefined.
+ if (isa<ConstantPointerNull>(Ptr) && I.getPointerAddressSpace() == 0)
+ return;
+
+ // Transform load (constant global) into the value loaded.
+ if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) {
+ if (!TrackedGlobals.empty()) {
+ // If we are tracking this global, merge in the known value for it.
+ DenseMap<GlobalVariable*, LatticeVal>::iterator It =
+ TrackedGlobals.find(GV);
+ if (It != TrackedGlobals.end()) {
+ mergeInValue(IV, &I, It->second);
+ return;
+ }
+ }
+ }
+
+ // Transform load from a constant into a constant if possible.
+ if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) {
+ if (isa<UndefValue>(C))
+ return;
+ return markConstant(IV, &I, C);
+ }
+
+ // Otherwise we cannot say for certain what value this load will produce.
+ // Bail out.
+ markOverdefined(IV, &I);
+}
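+
+// Example sketch (illustration only): given
+//   @g = internal constant i32 42
+//   %v = load i32, i32* @g
+// the pointer state is the constant @g, ConstantFoldLoadFromConstPtr returns
+// i32 42, and %v is marked constant 42. A volatile load goes straight to
+// overdefined.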
+
+void SCCPSolver::visitCallSite(CallSite CS) {
+ Function *F = CS.getCalledFunction();
+ Instruction *I = CS.getInstruction();
+
+ // The common case is that we aren't tracking the callee, either because we
+ // are not doing interprocedural analysis or the callee is indirect, or is
+ // external. Handle these cases first.
+ if (!F || F->isDeclaration()) {
+CallOverdefined:
+ // Void return and not tracking callee, just bail.
+ if (I->getType()->isVoidTy()) return;
+
+ // Otherwise, if we have a single return value case, and if the function is
+ // a declaration, maybe we can constant fold it.
+ if (F && F->isDeclaration() && !I->getType()->isStructTy() &&
+ canConstantFoldCallTo(CS, F)) {
+ SmallVector<Constant*, 8> Operands;
+ for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
+ AI != E; ++AI) {
+ LatticeVal State = getValueState(*AI);
+
+ if (State.isUnknown())
+ return; // Operands are not resolved yet.
+ if (State.isOverdefined())
+ return markOverdefined(I);
+ assert(State.isConstant() && "Unknown state!");
+ Operands.push_back(State.getConstant());
+ }
+
+ if (getValueState(I).isOverdefined())
+ return;
+
+ // If we can constant fold this, mark the result of the call as a
+ // constant.
+ if (Constant *C = ConstantFoldCall(CS, F, Operands, TLI)) {
+ // call -> undef.
+ if (isa<UndefValue>(C))
+ return;
+ return markConstant(I, C);
+ }
+ }
+
+ // Otherwise, we don't know anything about this call, mark it overdefined.
+ return markOverdefined(I);
+ }
+
+ // If this is a local function that doesn't have its address taken, mark its
+ // entry block executable and merge the actual arguments at the call site
+ // into the formal arguments of the function.
+ if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){
+ MarkBlockExecutable(&F->front());
+
+ // Propagate information from this call site into the callee.
+ CallSite::arg_iterator CAI = CS.arg_begin();
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+ AI != E; ++AI, ++CAI) {
+ // If this argument is byval, and if the function is not readonly, there
+ // will be an implicit copy formed of the input aggregate.
+ if (AI->hasByValAttr() && !F->onlyReadsMemory()) {
+ markOverdefined(&*AI);
+ continue;
+ }
+
+ if (auto *STy = dyn_cast<StructType>(AI->getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ LatticeVal CallArg = getStructValueState(*CAI, i);
+ mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg);
+ }
+ } else {
+ // Most other parts of the Solver still only use the simpler value
+ // lattice, so we propagate changes for parameters to both lattices.
+ getParamState(&*AI).mergeIn(getValueState(*CAI).toValueLattice(), DL);
+ mergeInValue(&*AI, getValueState(*CAI));
+ }
+ }
+ }
+
+ // If this is a single/zero retval case, see if we're tracking the function.
+ if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
+ if (!MRVFunctionsTracked.count(F))
+ goto CallOverdefined; // Not tracking this callee.
+
+ // If we are tracking this callee, propagate the result of the function
+ // into this call site.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ mergeInValue(getStructValueState(I, i), I,
+ TrackedMultipleRetVals[std::make_pair(F, i)]);
+ } else {
+ DenseMap<Function*, LatticeVal>::iterator TFRVI = TrackedRetVals.find(F);
+ if (TFRVI == TrackedRetVals.end())
+ goto CallOverdefined; // Not tracking this callee.
+
+ // If so, propagate the return value of the callee into this call result.
+ mergeInValue(I, TFRVI->second);
+ }
+}
+
+void SCCPSolver::Solve() {
+ // Process the work lists until they are empty!
+ while (!BBWorkList.empty() || !InstWorkList.empty() ||
+ !OverdefinedInstWorkList.empty()) {
+ // Process the overdefined instructions' work list first, which drives other
+ // things to overdefined more quickly.
+ while (!OverdefinedInstWorkList.empty()) {
+ Value *I = OverdefinedInstWorkList.pop_back_val();
+
+ DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');
+
+ // "I" got into the work list because it either made the transition from
+ // bottom to constant, or to overdefined.
+ //
+ // Anything on this worklist that is overdefined need not be visited
+ // since all of its users will have already been marked as overdefined.
+ // Update all of the users of this instruction's value.
+ //
+ for (User *U : I->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ OperandChangedState(UI);
+ }
+
+ // Process the instruction work list.
+ while (!InstWorkList.empty()) {
+ Value *I = InstWorkList.pop_back_val();
+
+ DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');
+
+ // "I" got into the work list because it made the transition from undef to
+ // constant.
+ //
+ // Anything on this worklist that is overdefined need not be visited
+ // since all of its users will have already been marked as overdefined.
+ // Update all of the users of this instruction's value.
+ //
+ if (I->getType()->isStructTy() || !getValueState(I).isOverdefined())
+ for (User *U : I->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ OperandChangedState(UI);
+ }
+
+ // Process the basic block work list.
+ while (!BBWorkList.empty()) {
+ BasicBlock *BB = BBWorkList.back();
+ BBWorkList.pop_back();
+
+ DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
+
+ // Notify all instructions in this basic block that they are newly
+ // executable.
+ visit(BB);
+ }
+ }
+}
+
+/// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+/// that branches on undef values cannot reach any of their successors.
+/// However, this is not a safe assumption. After we solve dataflow, this
+/// method should be used to handle this. If this returns true, the solver
+/// should be rerun.
+///
+/// This method handles this by finding an unresolved branch and marking one of
+/// the edges from the block as feasible, even though the condition doesn't
+/// otherwise say it should be. This allows SCCP to find the rest of the
+/// CFG and only slightly pessimizes the analysis results (by marking one,
+/// potentially infeasible, edge feasible). This cannot usefully modify the
+/// constraints on the condition of the branch, as that would impact other users
+/// of the value.
+///
+/// This scan also checks for values that use undefs, whose results are actually
+/// defined. For example, 'zext i8 undef to i32' should produce all zeros
+/// conservatively, as "(zext i8 X -> i32) & 0xFF00" must always return zero,
+/// even if X isn't defined.
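+///
+/// For example (illustration only), `br i1 undef, label %T, label %F` leaves
+/// both successor edges infeasible during Solve(); this method rewrites the
+/// condition to false so that %F becomes reachable and solving can continue.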
+bool SCCPSolver::ResolvedUndefsIn(Function &F) {
+ for (BasicBlock &BB : F) {
+ if (!BBExecutable.count(&BB))
+ continue;
+
+ for (Instruction &I : BB) {
+ // Look for instructions which produce undef values.
+ if (I.getType()->isVoidTy()) continue;
+
+ if (auto *STy = dyn_cast<StructType>(I.getType())) {
+ // Only a few things that can be structs matter for undef.
+
+ // Tracked calls must never be marked overdefined in ResolvedUndefsIn.
+ if (CallSite CS = CallSite(&I))
+ if (Function *F = CS.getCalledFunction())
+ if (MRVFunctionsTracked.count(F))
+ continue;
+
+ // extractvalue and insertvalue don't need to be marked; they are
+ // tracked as precisely as their operands.
+ if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I))
+ continue;
+
+ // Send the results of everything else to overdefined. We could be
+ // more precise than this but it isn't worth bothering.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ LatticeVal &LV = getStructValueState(&I, i);
+ if (LV.isUnknown())
+ markOverdefined(LV, &I);
+ }
+ continue;
+ }
+
+ LatticeVal &LV = getValueState(&I);
+ if (!LV.isUnknown()) continue;
+
+ // extractvalue is safe; check here because the argument is a struct.
+ if (isa<ExtractValueInst>(I))
+ continue;
+
+ // Compute the operand LatticeVals, for convenience below.
+ // Anything taking a struct is conservatively assumed to require
+ // overdefined markings.
+ if (I.getOperand(0)->getType()->isStructTy()) {
+ markOverdefined(&I);
+ return true;
+ }
+ LatticeVal Op0LV = getValueState(I.getOperand(0));
+ LatticeVal Op1LV;
+ if (I.getNumOperands() == 2) {
+ if (I.getOperand(1)->getType()->isStructTy()) {
+ markOverdefined(&I);
+ return true;
+ }
+
+ Op1LV = getValueState(I.getOperand(1));
+ }
+ // If this is an instruction whose result is defined even when its inputs
+ // are not fully defined, propagate the information.
+ Type *ITy = I.getType();
+ switch (I.getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast:
+ break; // Any undef -> undef
+ case Instruction::FSub:
+ case Instruction::FAdd:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ // Floating-point binary operation: be conservative.
+ if (Op0LV.isUnknown() && Op1LV.isUnknown())
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ else
+ markOverdefined(&I);
+ return true;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ // undef -> 0; some outputs are impossible
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+ case Instruction::Mul:
+ case Instruction::And:
+ // Both operands undef -> undef
+ if (Op0LV.isUnknown() && Op1LV.isUnknown())
+ break;
+ // undef * X -> 0. X could be zero.
+ // undef & X -> 0. X could be zero.
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+ case Instruction::Or:
+ // Both operands undef -> undef
+ if (Op0LV.isUnknown() && Op1LV.isUnknown())
+ break;
+ // undef | X -> -1. X could be -1.
+ markForcedConstant(&I, Constant::getAllOnesValue(ITy));
+ return true;
+ case Instruction::Xor:
+ // undef ^ undef -> 0; strictly speaking this is not necessary, but we
+ // try to be nice to people who expect this behavior in simple cases.
+ if (Op0LV.isUnknown() && Op1LV.isUnknown()) {
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+ }
+ // undef ^ X -> undef
+ break;
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ // X / undef -> undef. No change.
+ // X % undef -> undef. No change.
+ if (Op1LV.isUnknown()) break;
+
+ // X / 0 -> undef. No change.
+ // X % 0 -> undef. No change.
+ if (Op1LV.isConstant() && Op1LV.getConstant()->isZeroValue())
+ break;
+
+ // undef / X -> 0. X could be maxint.
+ // undef % X -> 0. X could be 1.
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+ case Instruction::AShr:
+ // X >>a undef -> undef.
+ if (Op1LV.isUnknown()) break;
+
+ // Shifting by the bitwidth or more is undefined.
+ if (Op1LV.isConstant()) {
+ if (auto *ShiftAmt = Op1LV.getConstantInt())
+ if (ShiftAmt->getLimitedValue() >=
+ ShiftAmt->getType()->getScalarSizeInBits())
+ break;
+ }
+
+ // undef >>a X -> 0
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+ case Instruction::LShr:
+ case Instruction::Shl:
+ // X << undef -> undef.
+ // X >> undef -> undef.
+ if (Op1LV.isUnknown()) break;
+
+ // Shifting by the bitwidth or more is undefined.
+ if (Op1LV.isConstant()) {
+ if (auto *ShiftAmt = Op1LV.getConstantInt())
+ if (ShiftAmt->getLimitedValue() >=
+ ShiftAmt->getType()->getScalarSizeInBits())
+ break;
+ }
+
+ // undef << X -> 0
+ // undef >> X -> 0
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+ case Instruction::Select:
+ Op1LV = getValueState(I.getOperand(1));
+ // undef ? X : Y -> X or Y. There could be commonality between X/Y.
+ if (Op0LV.isUnknown()) {
+ if (!Op1LV.isConstant()) // Pick the constant one if there is any.
+ Op1LV = getValueState(I.getOperand(2));
+ } else if (Op1LV.isUnknown()) {
+ // c ? undef : undef -> undef. No change.
+ Op1LV = getValueState(I.getOperand(2));
+ if (Op1LV.isUnknown())
+ break;
+ // Otherwise, c ? undef : x -> x.
+ } else {
+ // Leave Op1LV as Operand(1)'s LatticeValue.
+ }
+
+ if (Op1LV.isConstant())
+ markForcedConstant(&I, Op1LV.getConstant());
+ else
+ markOverdefined(&I);
+ return true;
+ case Instruction::Load:
+ // A load here means one of two things: a load of undef from a global, or
+ // a load from an unknown pointer. Either way, having it return undef
+ // is okay.
+ break;
+ case Instruction::ICmp:
+ // X == undef -> undef. Other comparisons get more complicated.
+ if (cast<ICmpInst>(&I)->isEquality())
+ break;
+ markOverdefined(&I);
+ return true;
+ case Instruction::Call:
+ case Instruction::Invoke:
+ // There are two reasons a call can have an undef result
+ // 1. It could be tracked.
+ // 2. It could be constant-foldable.
+ // Because of the way we solve return values, tracked calls must
+ // never be marked overdefined in ResolvedUndefsIn.
+ if (Function *F = CallSite(&I).getCalledFunction())
+ if (TrackedRetVals.count(F))
+ break;
+
+ // If the call is constant-foldable, we mark it overdefined because
+ // we do not know what return values are valid.
+ markOverdefined(&I);
+ return true;
+ default:
+ // If we don't know what should happen here, conservatively mark it
+ // overdefined.
+ markOverdefined(&I);
+ return true;
+ }
+ }
+
+ // Check to see if we have a branch or switch on an undefined value. If so
+ // we force the branch to go one way or the other to make the successor
+ // values live. It doesn't really matter which way we force it.
+ TerminatorInst *TI = BB.getTerminator();
+ if (auto *BI = dyn_cast<BranchInst>(TI)) {
+ if (!BI->isConditional()) continue;
+ if (!getValueState(BI->getCondition()).isUnknown())
+ continue;
+
+ // If the input to SCCP is actually branch on undef, fix the undef to
+ // false.
+ if (isa<UndefValue>(BI->getCondition())) {
+ BI->setCondition(ConstantInt::getFalse(BI->getContext()));
+ markEdgeExecutable(&BB, TI->getSuccessor(1));
+ return true;
+ }
+
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Handle this by forcing the input value to the
+ // branch to false.
+ markForcedConstant(BI->getCondition(),
+ ConstantInt::getFalse(TI->getContext()));
+ return true;
+ }
+
+ if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+ // Indirect branch with no successors? It's OK to assume it branches
+ // to no target.
+ if (IBR->getNumSuccessors() < 1)
+ continue;
+
+ if (!getValueState(IBR->getAddress()).isUnknown())
+ continue;
+
+ // If the input to SCCP is actually branch on undef, fix the undef to
+ // the first successor of the indirect branch.
+ if (isa<UndefValue>(IBR->getAddress())) {
+ IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0)));
+ markEdgeExecutable(&BB, IBR->getSuccessor(0));
+ return true;
+ }
+
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Handle this by forcing the input value to the
+ // branch to the first successor.
+ markForcedConstant(IBR->getAddress(),
+ BlockAddress::get(IBR->getSuccessor(0)));
+ return true;
+ }
+
+ if (auto *SI = dyn_cast<SwitchInst>(TI)) {
+ if (!SI->getNumCases() || !getValueState(SI->getCondition()).isUnknown())
+ continue;
+
+ // If the input to SCCP is actually switch on undef, fix the undef to
+ // the first constant.
+ if (isa<UndefValue>(SI->getCondition())) {
+ SI->setCondition(SI->case_begin()->getCaseValue());
+ markEdgeExecutable(&BB, SI->case_begin()->getCaseSuccessor());
+ return true;
+ }
+
+ markForcedConstant(SI->getCondition(), SI->case_begin()->getCaseValue());
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool tryToReplaceWithConstantRange(SCCPSolver &Solver, Value *V) {
+ bool Changed = false;
+
+ // Currently we only use range information for integer values.
+ if (!V->getType()->isIntegerTy())
+ return false;
+
+ const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
+ if (!IV.isConstantRange())
+ return false;
+
+ for (auto UI = V->uses().begin(), E = V->uses().end(); UI != E;) {
+ const Use &U = *UI++;
+ auto *Icmp = dyn_cast<ICmpInst>(U.getUser());
+ if (!Icmp || !Solver.isBlockExecutable(Icmp->getParent()))
+ continue;
+
+ auto getIcmpLatticeValue = [&](Value *Op) {
+ if (auto *C = dyn_cast<Constant>(Op))
+ return ValueLatticeElement::get(C);
+ return Solver.getLatticeValueFor(Op);
+ };
+
+ ValueLatticeElement A = getIcmpLatticeValue(Icmp->getOperand(0));
+ ValueLatticeElement B = getIcmpLatticeValue(Icmp->getOperand(1));
+
+ Constant *C = nullptr;
+ if (A.satisfiesPredicate(Icmp->getPredicate(), B))
+ C = ConstantInt::getTrue(Icmp->getType());
+ else if (A.satisfiesPredicate(Icmp->getInversePredicate(), B))
+ C = ConstantInt::getFalse(Icmp->getType());
+
+ if (C) {
+ Icmp->replaceAllUsesWith(C);
+ DEBUG(dbgs() << "Replacing " << *Icmp << " with " << *C
+ << ", because of range information " << A << " " << B
+ << "\n");
+ Icmp->eraseFromParent();
+ Changed = true;
+ }
+ }
+ return Changed;
+}
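+
+// Example sketch (illustration only): if the solver derives the range [0, 10)
+// for %x, then a use `icmp ult i32 %x, 10` satisfies the predicate for every
+// element of the range, so the icmp is replaced with `true` and erased.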
+
+static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
+ Constant *Const = nullptr;
+ if (V->getType()->isStructTy()) {
+ std::vector<LatticeVal> IVs = Solver.getStructLatticeValueFor(V);
+ if (llvm::any_of(IVs,
+ [](const LatticeVal &LV) { return LV.isOverdefined(); }))
+ return false;
+ std::vector<Constant *> ConstVals;
+ auto *ST = dyn_cast<StructType>(V->getType());
+ for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+ LatticeVal V = IVs[i];
+ ConstVals.push_back(V.isConstant()
+ ? V.getConstant()
+ : UndefValue::get(ST->getElementType(i)));
+ }
+ Const = ConstantStruct::get(ST, ConstVals);
+ } else {
+ const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
+ if (IV.isOverdefined())
+ return false;
+
+ if (IV.isConstantRange()) {
+ if (IV.getConstantRange().isSingleElement())
+ Const =
+ ConstantInt::get(V->getType(), IV.asConstantInteger().getValue());
+ else
+ return false;
+ } else
+ Const =
+ IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType());
+ }
+ assert(Const && "Constant is nullptr here!");
+ DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n');
+
+ // Replace all of the uses of the value with uses of the constant.
+ V->replaceAllUsesWith(Const);
+ return true;
+}
+
+// runSCCP() - Run the Sparse Conditional Constant Propagation algorithm,
+// and return true if the function was modified.
+static bool runSCCP(Function &F, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
+ SCCPSolver Solver(DL, TLI);
+
+ // Mark the first block of the function as being executable.
+ Solver.MarkBlockExecutable(&F.front());
+
+ // Mark all arguments to the function as being overdefined.
+ for (Argument &AI : F.args())
+ Solver.markOverdefined(&AI);
+
+ // Solve for constants.
+ bool ResolvedUndefs = true;
+ while (ResolvedUndefs) {
+ Solver.Solve();
+ DEBUG(dbgs() << "RESOLVING UNDEFs\n");
+ ResolvedUndefs = Solver.ResolvedUndefsIn(F);
+ }
+
+ bool MadeChanges = false;
+
+ // If we decided that there are basic blocks that are dead in this function,
+ // delete their contents now. Note that we cannot actually delete the blocks,
+ // as we cannot modify the CFG of the function.
+
+ for (BasicBlock &BB : F) {
+ if (!Solver.isBlockExecutable(&BB)) {
+ DEBUG(dbgs() << " BasicBlock Dead:" << BB);
+
+ ++NumDeadBlocks;
+ NumInstRemoved += removeAllNonTerminatorAndEHPadInstructions(&BB);
+
+ MadeChanges = true;
+ continue;
+ }
+
+ // Iterate over all of the instructions in a function, replacing them with
+ // constants if we have found them to be of constant values.
+ for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) {
+ Instruction *Inst = &*BI++;
+ if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst))
+ continue;
+
+ if (tryToReplaceWithConstant(Solver, Inst)) {
+ if (isInstructionTriviallyDead(Inst))
+ Inst->eraseFromParent();
+ // Hey, we just changed something!
+ MadeChanges = true;
+ ++NumInstRemoved;
+ }
+ }
+ }
+
+ return MadeChanges;
+}
+
+PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ if (!runSCCP(F, DL, &TLI))
+ return PreservedAnalyses::all();
+
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+namespace {
+
+//===--------------------------------------------------------------------===//
+//
+/// SCCP Class - This class uses the SCCPSolver to implement a per-function
+/// Sparse Conditional Constant Propagator.
+///
+class SCCPLegacyPass : public FunctionPass {
+public:
+ // Pass identification, replacement for typeid
+ static char ID;
+
+ SCCPLegacyPass() : FunctionPass(ID) {
+ initializeSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ // runOnFunction - Run the Sparse Conditional Constant Propagation
+ // algorithm, and return true if the function was modified.
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return runSCCP(F, DL, TLI);
+ }
+};
+
+} // end anonymous namespace
+
+char SCCPLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SCCPLegacyPass, "sccp",
+ "Sparse Conditional Constant Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(SCCPLegacyPass, "sccp",
+ "Sparse Conditional Constant Propagation", false, false)
+
+// createSCCPPass - This is the public interface to this file.
+FunctionPass *llvm::createSCCPPass() { return new SCCPLegacyPass(); }
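+
+// Usage sketch (client code, not part of this file; assumes an already
+// configured pass manager):
+//   legacy::PassManager PM;
+//   PM.add(createSCCPPass());   // legacy pass manager
+//   FunctionPassManager FPM;
+//   FPM.addPass(SCCPPass());    // new pass manager (see SCCPPass::run above)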
+
+static void findReturnsToZap(Function &F,
+ SmallVector<ReturnInst *, 8> &ReturnsToZap,
+ SCCPSolver &Solver) {
+ // We can only do this if we know that nothing else can call the function.
+ if (!Solver.isArgumentTrackedFunction(&F))
+ return;
+
+ for (BasicBlock &BB : F)
+ if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+ if (!isa<UndefValue>(RI->getOperand(0)))
+ ReturnsToZap.push_back(RI);
+}
+
+static bool runIPSCCP(Module &M, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ SCCPSolver Solver(DL, TLI);
+
+ // Loop over all functions, marking arguments to those with their addresses
+ // taken or that are external as overdefined.
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+
+ // Determine if we can track the function's return values. If so, add the
+ // function to the solver's set of return-tracked functions.
+ if (canTrackReturnsInterprocedurally(&F))
+ Solver.AddTrackedFunction(&F);
+
+ // Determine if we can track the function's arguments. If so, add the
+ // function to the solver's set of argument-tracked functions.
+ if (canTrackArgumentsInterprocedurally(&F)) {
+ Solver.AddArgumentTrackedFunction(&F);
+ continue;
+ }
+
+ // Assume the function is called.
+ Solver.MarkBlockExecutable(&F.front());
+
+ // Assume nothing about the incoming arguments.
+ for (Argument &AI : F.args())
+ Solver.markOverdefined(&AI);
+ }
+
+ // Determine if we can track any of the module's global variables. If so, add
+ // the global variables we can track to the solver's set of tracked global
+ // variables.
+ for (GlobalVariable &G : M.globals()) {
+ G.removeDeadConstantUsers();
+ if (canTrackGlobalVariableInterprocedurally(&G))
+ Solver.TrackValueOfGlobalVariable(&G);
+ }
+
+ // Solve for constants.
+ bool ResolvedUndefs = true;
+ while (ResolvedUndefs) {
+ Solver.Solve();
+
+ DEBUG(dbgs() << "RESOLVING UNDEFS\n");
+ ResolvedUndefs = false;
+ for (Function &F : M)
+ ResolvedUndefs |= Solver.ResolvedUndefsIn(F);
+ }
+
+ bool MadeChanges = false;
+
+ // Iterate over all of the instructions in the module, replacing them with
+ // constants if we have found them to be of constant values.
+ SmallVector<BasicBlock*, 512> BlocksToErase;
+
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+
+ if (Solver.isBlockExecutable(&F.front()))
+ for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;
+ ++AI) {
+ if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI)) {
+ ++IPNumArgsElimed;
+ continue;
+ }
+
+ if (!AI->use_empty() && tryToReplaceWithConstantRange(Solver, &*AI))
+ ++IPNumRangeInfoUsed;
+ }
+
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ if (!Solver.isBlockExecutable(&*BB)) {
+ DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
+
+ ++NumDeadBlocks;
+ NumInstRemoved +=
+ changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false);
+
+ MadeChanges = true;
+
+ if (&*BB != &F.front())
+ BlocksToErase.push_back(&*BB);
+ continue;
+ }
+
+ for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
+ Instruction *Inst = &*BI++;
+ if (Inst->getType()->isVoidTy())
+ continue;
+ if (tryToReplaceWithConstant(Solver, Inst)) {
+ if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst))
+ Inst->eraseFromParent();
+ // Hey, we just changed something!
+ MadeChanges = true;
+ ++IPNumInstRemoved;
+ }
+ }
+ }
+
+ // Now that all instructions in the function are constant folded, erase dead
+ // blocks, because we can now use ConstantFoldTerminator to get rid of
+ // in-edges.
+ for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) {
+ // If there are any PHI nodes in this successor, drop entries for BB now.
+ BasicBlock *DeadBB = BlocksToErase[i];
+ for (Value::user_iterator UI = DeadBB->user_begin(),
+ UE = DeadBB->user_end();
+ UI != UE;) {
+ // Grab the user and then increment the iterator early, as the user
+ // will be deleted. Step past all adjacent uses from the same user.
+ auto *I = dyn_cast<Instruction>(*UI);
+ do { ++UI; } while (UI != UE && *UI == I);
+
+ // Ignore blockaddress users; BasicBlock's dtor will handle them.
+ if (!I) continue;
+
+ bool Folded = ConstantFoldTerminator(I->getParent());
+ if (!Folded) {
+ // The constant folder may not have been able to fold the terminator
+ // if this is a branch or switch on undef. Fold it manually as a
+ // branch to the first successor.
+#ifndef NDEBUG
+ if (auto *BI = dyn_cast<BranchInst>(I)) {
+ assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
+ "Branch should be foldable!");
+ } else if (auto *SI = dyn_cast<SwitchInst>(I)) {
+ assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
+ } else {
+ llvm_unreachable("Didn't fold away reference to block!");
+ }
+#endif
+
+ // Make this an uncond branch to the first successor.
+ TerminatorInst *TI = I->getParent()->getTerminator();
+ BranchInst::Create(TI->getSuccessor(0), TI);
+
+ // Remove entries in successor phi nodes to remove edges.
+ for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
+ TI->getSuccessor(i)->removePredecessor(TI->getParent());
+
+ // Remove the old terminator.
+ TI->eraseFromParent();
+ }
+ }
+
+ // Finally, delete the basic block.
+ F.getBasicBlockList().erase(DeadBB);
+ }
+ BlocksToErase.clear();
+ }
+
+ // If we inferred constant or undef return values for a function, we replaced
+ // all call uses with the inferred value. This means we don't need to bother
+ // actually returning anything from the function. Replace all return
+ // instructions with return undef.
+ //
+ // Do this in two stages: first identify the functions we should process, then
+ // actually zap their returns. This is important because we can only do this
+ // if the address of the function isn't taken. In cases where a return is the
+ // last use of a function, the order of processing functions would affect
+ // whether other functions are optimizable.
+ SmallVector<ReturnInst*, 8> ReturnsToZap;
+
+ const DenseMap<Function*, LatticeVal> &RV = Solver.getTrackedRetVals();
+ for (const auto &I : RV) {
+ Function *F = I.first;
+ if (I.second.isOverdefined() || F->getReturnType()->isVoidTy())
+ continue;
+ findReturnsToZap(*F, ReturnsToZap, Solver);
+ }
+
+ for (const auto &F : Solver.getMRVFunctionsTracked()) {
+ assert(F->getReturnType()->isStructTy() &&
+ "The return type should be a struct");
+ StructType *STy = cast<StructType>(F->getReturnType());
+ if (Solver.isStructLatticeConstant(F, STy))
+ findReturnsToZap(*F, ReturnsToZap, Solver);
+ }
+
+ // Zap all of the returns that we identified above as needing to be changed.
+ for (unsigned i = 0, e = ReturnsToZap.size(); i != e; ++i) {
+ Function *F = ReturnsToZap[i]->getParent()->getParent();
+ ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
+ }
+
+ // If we inferred constant or undef values for global variables, we can
+ // delete the global and any stores that remain to it.
+ const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals();
+ for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(),
+ E = TG.end(); I != E; ++I) {
+ GlobalVariable *GV = I->first;
+ assert(!I->second.isOverdefined() &&
+ "Overdefined values should have been taken out of the map!");
+ DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n");
+ while (!GV->use_empty()) {
+ StoreInst *SI = cast<StoreInst>(GV->user_back());
+ SI->eraseFromParent();
+ }
+ M.getGlobalList().erase(GV);
+ ++IPNumGlobalConst;
+ }
+
+ return MadeChanges;
+}
+
+PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) {
+ const DataLayout &DL = M.getDataLayout();
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
+ if (!runIPSCCP(M, DL, &TLI))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+
+//===--------------------------------------------------------------------===//
+//
+/// IPSCCP Class - This class implements interprocedural Sparse Conditional
+/// Constant Propagation.
+///
+class IPSCCPLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ IPSCCPLegacyPass() : ModulePass(ID) {
+ initializeIPSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ const DataLayout &DL = M.getDataLayout();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return runIPSCCP(M, DL, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char IPSCCPLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(IPSCCPLegacyPass, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+
+// createIPSCCPPass - This is the public interface to this file.
+ModulePass *llvm::createIPSCCPPass() { return new IPSCCPLegacyPass(); }
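+
+// Usage sketch (client code, not part of this file; assumes an already
+// configured module pass manager):
+//   legacy::PassManager PM;
+//   PM.add(createIPSCCPPass());  // legacy pass manager
+//   ModulePassManager MPM;
+//   MPM.addPass(IPSCCPPass());   // new pass manager (see IPSCCPPass::run above)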
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
new file mode 100644
index 000000000000..bfe3754f0769
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -0,0 +1,4447 @@
+//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This transformation implements the well-known scalar replacement of
+/// aggregates transformation. It tries to identify promotable elements of an
+/// aggregate alloca, and promote them to registers. It will also try to
+/// convert uses of an element (or set of elements) of an alloca into a vector
+/// or bitfield-style integer scalar if appropriate.
+///
+/// It works to do this with minimal slicing of the alloca so that regions
+/// which are merely transferred in and out of external memory remain unchanged
+/// and are not decomposed to scalar code.
+///
+/// Because this also performs alloca promotion, it can be thought of as also
+/// serving the purpose of SSA formation. The algorithm iterates on the
+/// function until all opportunities for promotion have been realized.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SROA.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantFolder.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#ifndef NDEBUG
+// We only use this for a debug check.
+#include <random>
+#endif
+
+using namespace llvm;
+using namespace llvm::sroa;
+
+#define DEBUG_TYPE "sroa"
+
+STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
+STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
+STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
+STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
+STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
+STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
+STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
+STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
+STATISTIC(NumDeleted, "Number of instructions deleted");
+STATISTIC(NumVectorized, "Number of vectorized aggregates");
+
+/// Hidden option to enable randomly shuffling the slices to help uncover
+/// instability in their order.
+static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
+ cl::init(false), cl::Hidden);
+
+/// Hidden option to experiment with completely strict handling of inbounds
+/// GEPs.
+static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
+ cl::Hidden);
+
+/// Hidden option to allow more aggressive splitting.
+static cl::opt<bool>
+SROASplitNonWholeAllocaSlices("sroa-split-nonwhole-alloca-slices",
+ cl::init(false), cl::Hidden);
+
+namespace {
+
+/// \brief A custom IRBuilder inserter which prefixes all names, but only in
+/// Assert builds.
+class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter {
+ std::string Prefix;
+
+ const Twine getNameWithPrefix(const Twine &Name) const {
+ return Name.isTriviallyEmpty() ? Name : Prefix + Name;
+ }
+
+public:
+ void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
+
+protected:
+ void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
+ BasicBlock::iterator InsertPt) const {
+ IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB,
+ InsertPt);
+ }
+};
+
+/// \brief Provide a type for IRBuilder that drops names in release builds.
+using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
+
+/// \brief A used slice of an alloca.
+///
+/// This structure represents a slice of an alloca used by some instruction. It
+/// stores both the begin and end offsets of this use, a pointer to the use
+/// itself, and a flag indicating whether we can classify the use as splittable
+/// or not when forming partitions of the alloca.
+class Slice {
+ /// \brief The beginning offset of the range.
+ uint64_t BeginOffset = 0;
+
+ /// \brief The ending offset, not included in the range.
+ uint64_t EndOffset = 0;
+
+ /// \brief Storage for both the use of this slice and whether it can be
+ /// split.
+ PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
+
+public:
+ Slice() = default;
+
+ Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
+ : BeginOffset(BeginOffset), EndOffset(EndOffset),
+ UseAndIsSplittable(U, IsSplittable) {}
+
+ uint64_t beginOffset() const { return BeginOffset; }
+ uint64_t endOffset() const { return EndOffset; }
+
+ bool isSplittable() const { return UseAndIsSplittable.getInt(); }
+ void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
+
+ Use *getUse() const { return UseAndIsSplittable.getPointer(); }
+
+ bool isDead() const { return getUse() == nullptr; }
+ void kill() { UseAndIsSplittable.setPointer(nullptr); }
+
+ /// \brief Support for ordering ranges.
+ ///
+ /// This provides an ordering over ranges such that start offsets are
+ /// always increasing, and within equal start offsets, the end offsets are
+ /// decreasing. Thus the spanning range comes first in a cluster with the
+ /// same start position.
+ bool operator<(const Slice &RHS) const {
+ if (beginOffset() < RHS.beginOffset())
+ return true;
+ if (beginOffset() > RHS.beginOffset())
+ return false;
+ if (isSplittable() != RHS.isSplittable())
+ return !isSplittable();
+ if (endOffset() > RHS.endOffset())
+ return true;
+ return false;
+ }
+
+ /// \brief Support comparison with a single offset to allow binary searches.
+ friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
+ uint64_t RHSOffset) {
+ return LHS.beginOffset() < RHSOffset;
+ }
+ friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
+ const Slice &RHS) {
+ return LHSOffset < RHS.beginOffset();
+ }
+
+ bool operator==(const Slice &RHS) const {
+ return isSplittable() == RHS.isSplittable() &&
+ beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
+ }
+ bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
+};
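+
+// Ordering sketch (illustration only): given slices A = [0,16) splittable,
+// B = [0,8) unsplittable, and C = [8,12), the sorted order is B, A, C. When
+// begin offsets tie, unsplittable slices sort first; equal-splittability ties
+// are then broken by the larger end offset, so a spanning slice comes first.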
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <typename T> struct isPodLike;
+template <> struct isPodLike<Slice> { static const bool value = true; };
+
+} // end namespace llvm
+
+/// \brief Representation of the alloca slices.
+///
+/// This class represents the slices of an alloca which are formed by its
+/// various uses. If a pointer escapes, we can't fully build a representation
+/// for the slices used and we reflect that in this structure. The uses are
+/// stored, sorted by increasing beginning offset and with unsplittable slices
+/// starting at a particular offset before splittable slices.
+class llvm::sroa::AllocaSlices {
+public:
+ /// \brief Construct the slices of a particular alloca.
+ AllocaSlices(const DataLayout &DL, AllocaInst &AI);
+
+ /// \brief Test whether a pointer to the allocation escapes our analysis.
+ ///
+ /// If this is true, the slices are never fully built and should be
+ /// ignored.
+ bool isEscaped() const { return PointerEscapingInstr; }
+
+ /// \brief Support for iterating over the slices.
+ /// @{
+ using iterator = SmallVectorImpl<Slice>::iterator;
+ using range = iterator_range<iterator>;
+
+ iterator begin() { return Slices.begin(); }
+ iterator end() { return Slices.end(); }
+
+ using const_iterator = SmallVectorImpl<Slice>::const_iterator;
+ using const_range = iterator_range<const_iterator>;
+
+ const_iterator begin() const { return Slices.begin(); }
+ const_iterator end() const { return Slices.end(); }
+ /// @}
+
+ /// \brief Erase a range of slices.
+ void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
+
+ /// \brief Insert new slices for this alloca.
+ ///
+ /// This moves the slices into the alloca's slices collection, and re-sorts
+ /// everything so that the usual ordering properties of the alloca's slices
+ /// hold.
+ void insert(ArrayRef<Slice> NewSlices) {
+ int OldSize = Slices.size();
+ Slices.append(NewSlices.begin(), NewSlices.end());
+ auto SliceI = Slices.begin() + OldSize;
+ std::sort(SliceI, Slices.end());
+ std::inplace_merge(Slices.begin(), SliceI, Slices.end());
+ }
+
+ // Forward declare the iterator and range accessor for walking the
+ // partitions.
+ class partition_iterator;
+ iterator_range<partition_iterator> partitions();
+
+ /// \brief Access the dead users for this alloca.
+ ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
+
+ /// \brief Access the dead operands referring to this alloca.
+ ///
+ /// These are operands which cannot actually be used to refer to the
+ /// alloca as they are outside its range and the user doesn't correct for
+ /// that. These mostly consist of PHI node inputs and the like which we just
+ /// need to replace with undef.
+ ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
+ void printSlice(raw_ostream &OS, const_iterator I,
+ StringRef Indent = " ") const;
+ void printUse(raw_ostream &OS, const_iterator I,
+ StringRef Indent = " ") const;
+ void print(raw_ostream &OS) const;
+ void dump(const_iterator I) const;
+ void dump() const;
+#endif
+
+private:
+ template <typename DerivedT, typename RetT = void> class BuilderBase;
+ class SliceBuilder;
+
+ friend class AllocaSlices::SliceBuilder;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// \brief Handle to alloca instruction to simplify method interfaces.
+ AllocaInst &AI;
+#endif
+
+ /// \brief The instruction responsible for this alloca not having a known set
+ /// of slices.
+ ///
+ /// When an instruction (potentially) escapes the pointer to the alloca, we
+ /// store a pointer to that here and abort trying to form slices of the
+ /// alloca. This will be null if the alloca slices are analyzed successfully.
+ Instruction *PointerEscapingInstr;
+
+ /// \brief The slices of the alloca.
+ ///
+ /// We store a vector of the slices formed by uses of the alloca here. This
+ /// vector is sorted by increasing begin offset, and then the unsplittable
+ /// slices before the splittable ones. See the Slice inner class for more
+ /// details.
+ SmallVector<Slice, 8> Slices;
+
+ /// \brief Instructions which will become dead if we rewrite the alloca.
+ ///
+ /// Note that these are not separated by slice. This is because we expect an
+ /// alloca to be completely rewritten or not rewritten at all. If rewritten,
+ /// all these instructions can simply be removed and replaced with undef as
+ /// they come from outside of the allocated space.
+ SmallVector<Instruction *, 8> DeadUsers;
+
+ /// \brief Operands which will become dead if we rewrite the alloca.
+ ///
+ /// These are operands that in their particular use can be replaced with
+ /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
+ /// to PHI nodes and the like. They aren't entirely dead (there might be
+ /// a GEP back into the bounds using it elsewhere), and neither is the PHI, but we
+ /// want to swap this particular input for undef to simplify the use lists of
+ /// the alloca.
+ SmallVector<Use *, 8> DeadOperands;
+};
+
+/// \brief A partition of the slices.
+///
+/// An ephemeral representation for a range of slices which can be viewed as
+/// a partition of the alloca. This range represents a span of the alloca's
+/// memory which cannot be split, and provides access to all of the slices
+/// overlapping some part of the partition.
+///
+/// Objects of this type are produced by traversing the alloca's slices, but
+/// are only ephemeral and not persistent.
+class llvm::sroa::Partition {
+private:
+ friend class AllocaSlices;
+ friend class AllocaSlices::partition_iterator;
+
+ using iterator = AllocaSlices::iterator;
+
+ /// \brief The beginning and ending offsets of the alloca for this
+ /// partition.
+ uint64_t BeginOffset, EndOffset;
+
+ /// \brief The start and end iterators of this partition.
+ iterator SI, SJ;
+
+ /// \brief A collection of split slice tails overlapping the partition.
+ SmallVector<Slice *, 4> SplitTails;
+
+ /// \brief Raw constructor builds an empty partition starting and ending at
+ /// the given iterator.
+ Partition(iterator SI) : SI(SI), SJ(SI) {}
+
+public:
+ /// \brief The start offset of this partition.
+ ///
+ /// All of the contained slices start at or after this offset.
+ uint64_t beginOffset() const { return BeginOffset; }
+
+ /// \brief The end offset of this partition.
+ ///
+ /// All of the contained slices end at or before this offset.
+ uint64_t endOffset() const { return EndOffset; }
+
+ /// \brief The size of the partition.
+ ///
+ /// Note that this can never be zero.
+ uint64_t size() const {
+ assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
+ return EndOffset - BeginOffset;
+ }
+
+ /// \brief Test whether this partition contains no slices, and merely spans
+ /// a region occupied by split slices.
+ bool empty() const { return SI == SJ; }
+
+ /// \name Iterate slices that start within the partition.
+ /// These may be splittable or unsplittable. They have a begin offset >= the
+ /// partition begin offset.
+ /// @{
+ // FIXME: We should probably define a "concat_iterator" helper and use that
+ // to stitch together pointee_iterators over the split tails and the
+ // contiguous iterators of the partition. That would give a much nicer
+ // interface here. We could then additionally expose filtered iterators for
+ // split, unsplit, and unsplittable slices based on the usage patterns.
+ iterator begin() const { return SI; }
+ iterator end() const { return SJ; }
+ /// @}
+
+ /// \brief Get the sequence of split slice tails.
+ ///
+ /// These tails are of slices which start before this partition but are
+ /// split and overlap into the partition. We accumulate these while forming
+ /// partitions.
+ ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
+};
+
+/// \brief An iterator over partitions of the alloca's slices.
+///
+/// This iterator implements the core algorithm for partitioning the alloca's
+/// slices. It is a forward iterator as we don't support backtracking for
+/// efficiency reasons, and re-use a single storage area to maintain the
+/// current set of split slices.
+///
+/// It is built directly on AllocaSlices::iterator and re-uses a single
+/// Partition object as storage for the partition currently being visited.
+class AllocaSlices::partition_iterator
+ : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
+ Partition> {
+ friend class AllocaSlices;
+
+ /// \brief Most of the state for walking the partitions is held in a class
+ /// with a nice interface for examining them.
+ Partition P;
+
+ /// \brief We need to keep the end of the slices to know when to stop.
+ AllocaSlices::iterator SE;
+
+ /// \brief We also need to keep track of the maximum split end offset seen.
+ /// FIXME: Do we really?
+ uint64_t MaxSplitSliceEndOffset = 0;
+
+ /// \brief Sets the partition to be empty at given iterator, and sets the
+ /// end iterator.
+ partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
+ : P(SI), SE(SE) {
+ // If not already at the end, advance our state to form the initial
+ // partition.
+ if (SI != SE)
+ advance();
+ }
+
+ /// \brief Advance the iterator to the next partition.
+ ///
+ /// Requires that the iterator not be at the end of the slices.
+ void advance() {
+ assert((P.SI != SE || !P.SplitTails.empty()) &&
+ "Cannot advance past the end of the slices!");
+
+ // Clear out any split uses which have ended.
+ if (!P.SplitTails.empty()) {
+ if (P.EndOffset >= MaxSplitSliceEndOffset) {
+ // If we've finished all splits, this is easy.
+ P.SplitTails.clear();
+ MaxSplitSliceEndOffset = 0;
+ } else {
+ // Remove the uses which have ended in the prior partition. This
+ // cannot change the max split slice end because we just checked that
+ // the prior partition ended prior to that max.
+ P.SplitTails.erase(llvm::remove_if(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() <=
+ P.EndOffset;
+ }),
+ P.SplitTails.end());
+ assert(llvm::any_of(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() == MaxSplitSliceEndOffset;
+ }) &&
+ "Could not find the current max split slice offset!");
+ assert(llvm::all_of(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() <= MaxSplitSliceEndOffset;
+ }) &&
+ "Max split slice end offset is not actually the max!");
+ }
+ }
+
+ // If P.SI is already at the end, then we've cleared the split tail and
+ // now have an end iterator.
+ if (P.SI == SE) {
+ assert(P.SplitTails.empty() && "Failed to clear the split slices!");
+ return;
+ }
+
+ // If we had a non-empty partition previously, set up the state for
+ // subsequent partitions.
+ if (P.SI != P.SJ) {
+ // Accumulate all the splittable slices which started in the old
+ // partition into the split list.
+ for (Slice &S : P)
+ if (S.isSplittable() && S.endOffset() > P.EndOffset) {
+ P.SplitTails.push_back(&S);
+ MaxSplitSliceEndOffset =
+ std::max(S.endOffset(), MaxSplitSliceEndOffset);
+ }
+
+ // Start from the end of the previous partition.
+ P.SI = P.SJ;
+
+ // If P.SI is now at the end, we at most have a tail of split slices.
+ if (P.SI == SE) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = MaxSplitSliceEndOffset;
+ return;
+ }
+
+ // If we have split slices and the next slice is after a gap and is
+ // not splittable, immediately form an empty partition for the split
+ // slices up until the next slice begins.
+ if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
+ !P.SI->isSplittable()) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = P.SI->beginOffset();
+ return;
+ }
+ }
+
+ // OK, we need to consume new slices. Set the end offset based on the
+ // current slice, and step SJ past it. The beginning offset of the
+ // partition is the beginning offset of the next slice unless we have
+ // pre-existing split slices that are continuing, in which case we begin
+ // at the prior end offset.
+ P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
+ P.EndOffset = P.SI->endOffset();
+ ++P.SJ;
+
+ // There are two strategies to form a partition based on whether the
+ // partition starts with an unsplittable slice or a splittable slice.
+ if (!P.SI->isSplittable()) {
+ // When we're forming an unsplittable region, it must always start at
+ // the first slice and will extend through its end.
+ assert(P.BeginOffset == P.SI->beginOffset());
+
+ // Form a partition including all of the overlapping slices with this
+ // unsplittable slice.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ if (!P.SJ->isSplittable())
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+ // We have formed a partition across a set of overlapping unsplittable
+ // slices.
+ return;
+ }
+
+ // If we're starting with a splittable slice, then we need to form
+ // a synthetic partition spanning it and any other overlapping splittable
+ // slices.
+ assert(P.SI->isSplittable() && "Forming a splittable partition!");
+
+ // Collect all of the overlapping splittable slices.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
+ P.SJ->isSplittable()) {
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+ // Back up P.EndOffset if we ended the span early when encountering an
+ // unsplittable slice. This synthesizes the early end offset of
+ // a partition spanning only splittable slices.
+ if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ assert(!P.SJ->isSplittable());
+ P.EndOffset = P.SJ->beginOffset();
+ }
+ }
+
+public:
+ bool operator==(const partition_iterator &RHS) const {
+ assert(SE == RHS.SE &&
+ "End iterators don't match between compared partition iterators!");
+
+ // The observed position of a partition is marked by the P.SI iterator and
+ // the emptiness of the split slices. The latter is only relevant when
+ // P.SI == SE, as the end iterator will additionally have an empty split
+ // slices list, but the prior may have the same P.SI and a tail of split
+ // slices.
+ if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
+ assert(P.SJ == RHS.P.SJ &&
+ "Same set of slices formed two different sized partitions!");
+ assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
+ "Same slice position with differently sized non-empty split "
+ "slice tails!");
+ return true;
+ }
+ return false;
+ }
+
+ partition_iterator &operator++() {
+ advance();
+ return *this;
+ }
+
+ Partition &operator*() { return P; }
+};
+
+/// \brief A forward range over the partitions of the alloca's slices.
+///
+/// This accesses an iterator range over the partitions of the alloca's
+/// slices. It computes these partitions on the fly based on the overlapping
+/// offsets of the slices and the ability to split them. It will visit "empty"
+/// partitions to cover regions of the alloca only accessed via split
+/// slices.
+iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
+ return make_range(partition_iterator(begin(), end()),
+ partition_iterator(end(), end()));
+}
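+
+// Usage sketch (an illustration; `AS` is a hypothetical AllocaSlices built
+// for some alloca, not a name from this file):
+//
+//   for (Partition &P : AS.partitions()) {
+//     // [P.beginOffset(), P.endOffset()) bounds a span that cannot be
+//     // split further; it covers the slices in [P.begin(), P.end()) plus
+//     // the tails returned by P.splitSliceTails().
+//   }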
+
+static Value *foldSelectInst(SelectInst &SI) {
+ // If the condition being selected on is a constant or the same value is
+ // being selected between, fold the select. Yes this does (rarely) happen
+ // early on.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
+ return SI.getOperand(1 + CI->isZero());
+ if (SI.getOperand(1) == SI.getOperand(2))
+ return SI.getOperand(1);
+
+ return nullptr;
+}
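+
+// For illustration (hypothetical IR):
+//   select i1 true,  i32* %a, i32* %b  -->  %a   (constant condition)
+//   select i1 %cond, i32* %a, i32* %a  -->  %a   (identical operands)
+// Any other select is left alone and null is returned.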
+
+/// \brief A helper that folds a PHI node or a select.
+static Value *foldPHINodeOrSelectInst(Instruction &I) {
+ if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+ // If PN merges together the same value, return that value.
+ return PN->hasConstantValue();
+ }
+ return foldSelectInst(cast<SelectInst>(I));
+}
+
+/// \brief Builder for the alloca slices.
+///
+/// This class builds a set of alloca slices by recursively visiting the uses
+/// of an alloca and making a slice for each load and store at each offset.
+class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
+ friend class PtrUseVisitor<SliceBuilder>;
+ friend class InstVisitor<SliceBuilder>;
+
+ using Base = PtrUseVisitor<SliceBuilder>;
+
+ const uint64_t AllocSize;
+ AllocaSlices &AS;
+
+ SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
+ SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
+
+ /// \brief Set to de-duplicate dead instructions found in the use walk.
+ SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
+
+public:
+ SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
+ : PtrUseVisitor<SliceBuilder>(DL),
+ AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), AS(AS) {}
+
+private:
+ void markAsDead(Instruction &I) {
+ if (VisitedDeadInsts.insert(&I).second)
+ AS.DeadUsers.push_back(&I);
+ }
+
+ void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
+ bool IsSplittable = false) {
+ // Completely skip uses which have a zero size or start either before or
+ // past the end of the allocation.
+ if (Size == 0 || Offset.uge(AllocSize)) {
+ DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
+ << " which has zero size or starts outside of the "
+ << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
+ return markAsDead(I);
+ }
+
+ uint64_t BeginOffset = Offset.getZExtValue();
+ uint64_t EndOffset = BeginOffset + Size;
+
+ // Clamp the end offset to the end of the allocation. Note that this is
+ // formulated to handle even the case where "BeginOffset + Size" overflows.
+ // This may appear superficially to be something we could ignore entirely,
+ // but that is not so! There may be widened loads or PHI-node uses where
+ // some instructions are dead but not others. We can't completely ignore
+ // them, and so have to record at least the information here.
+ assert(AllocSize >= BeginOffset); // Established above.
+ if (Size > AllocSize - BeginOffset) {
+ DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
+ << " to remain within the " << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
+ EndOffset = AllocSize;
+ }
+
+ AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
+ }
+
+ void visitBitCastInst(BitCastInst &BC) {
+ if (BC.use_empty())
+ return markAsDead(BC);
+
+ return Base::visitBitCastInst(BC);
+ }
+
+ void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ if (GEPI.use_empty())
+ return markAsDead(GEPI);
+
+ if (SROAStrictInbounds && GEPI.isInBounds()) {
+ // FIXME: This is a manually un-factored variant of the basic code inside
+ // of GEPs with checking of the inbounds invariant specified in the
+ // langref in a very strict sense. If we ever want to enable
+ // SROAStrictInbounds, this code should be factored cleanly into
+ // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
+ // by writing out the code here where we have the underlying allocation
+ // size readily available.
+ APInt GEPOffset = Offset;
+ const DataLayout &DL = GEPI.getModule()->getDataLayout();
+ for (gep_type_iterator GTI = gep_type_begin(GEPI),
+ GTE = gep_type_end(GEPI);
+ GTI != GTE; ++GTI) {
+ ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+ if (!OpC)
+ break;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ unsigned ElementIdx = OpC->getZExtValue();
+ const StructLayout *SL = DL.getStructLayout(STy);
+ GEPOffset +=
+ APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
+ } else {
+ // For array or vector indices, scale the index by the size of the
+ // type.
+ APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
+ GEPOffset += Index * APInt(Offset.getBitWidth(),
+ DL.getTypeAllocSize(GTI.getIndexedType()));
+ }
+
+ // If this index has computed an intermediate pointer which is not
+ // inbounds, then the result of the GEP is a poison value and we can
+ // delete it and all uses.
+ if (GEPOffset.ugt(AllocSize))
+ return markAsDead(GEPI);
+ }
+ }
+
+ return Base::visitGetElementPtrInst(GEPI);
+ }
+
+ void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
+ uint64_t Size, bool IsVolatile) {
+ // We allow splitting of non-volatile loads and stores where the type is an
+ // integer type. These may be used to implement 'memcpy' or other "transfer
+ // of bits" patterns.
+ bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
+
+ insertUse(I, Offset, Size, IsSplittable);
+ }
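+
+ // For example, a plain `load i64` or `store i64` slice is recorded as
+ // splittable (it may come from a lowered memcpy), while a volatile access
+ // or a `load float` of the same width is recorded as unsplittable.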
+
+ void visitLoadInst(LoadInst &LI) {
+ assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
+ "All simple FCA loads should have been pre-split");
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&LI);
+
+ const DataLayout &DL = LI.getModule()->getDataLayout();
+ uint64_t Size = DL.getTypeStoreSize(LI.getType());
+ return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
+ }
+
+ void visitStoreInst(StoreInst &SI) {
+ Value *ValOp = SI.getValueOperand();
+ if (ValOp == *U)
+ return PI.setEscapedAndAborted(&SI);
+ if (!IsOffsetKnown)
+ return PI.setAborted(&SI);
+
+ const DataLayout &DL = SI.getModule()->getDataLayout();
+ uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
+
+ // If this memory access can be shown to *statically* extend outside the
+ // bounds of the allocation, its behavior is undefined, so simply
+ // ignore it. Note that this is more strict than the generic clamping
+ // behavior of insertUse. We also try to handle cases which might run the
+ // risk of overflow.
+ // FIXME: We should instead consider the pointer to have escaped if this
+ // function is being instrumented for addressing bugs or race conditions.
+ if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
+ DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
+ << " which extends past the end of the " << AllocSize
+ << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << SI << "\n");
+ return markAsDead(SI);
+ }
+
+ assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
+ "All simple FCA stores should have been pre-split");
+ handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
+ }
+
+ void visitMemSetInst(MemSetInst &II) {
+ assert(II.getRawDest() == *U && "Pointer use is not the destination?");
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ if ((Length && Length->getValue() == 0) ||
+ (IsOffsetKnown && Offset.uge(AllocSize)))
+ // Zero-length mem transfer intrinsics can be ignored entirely.
+ return markAsDead(II);
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
+ insertUse(II, Offset, Length ? Length->getLimitedValue()
+ : AllocSize - Offset.getLimitedValue(),
+ (bool)Length);
+ }
+
+ void visitMemTransferInst(MemTransferInst &II) {
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ if (Length && Length->getValue() == 0)
+ // Zero-length mem transfer intrinsics can be ignored entirely.
+ return markAsDead(II);
+
+ // Because we can visit these intrinsics twice (once for each pointer
+ // operand), check whether the first visit already marked this instruction
+ // as dead. If so, skip it.
+ if (VisitedDeadInsts.count(&II))
+ return;
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
+ // This side of the transfer is completely out-of-bounds, and so we can
+ // nuke the entire transfer. However, we also need to nuke the other side
+ // if already added to our partitions.
+ // FIXME: Yet another place we really should bypass this when
+ // instrumenting for ASan.
+ if (Offset.uge(AllocSize)) {
+ SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
+ MemTransferSliceMap.find(&II);
+ if (MTPI != MemTransferSliceMap.end())
+ AS.Slices[MTPI->second].kill();
+ return markAsDead(II);
+ }
+
+ uint64_t RawOffset = Offset.getLimitedValue();
+ uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
+
+ // Check for the special case where the same exact value is used for both
+ // source and dest.
+ if (*U == II.getRawDest() && *U == II.getRawSource()) {
+ // For non-volatile transfers this is a no-op.
+ if (!II.isVolatile())
+ return markAsDead(II);
+
+ return insertUse(II, Offset, Size, /*IsSplittable=*/false);
+ }
+
+ // If we have seen both source and destination for a mem transfer, then
+ // they both point to the same alloca.
+ bool Inserted;
+ SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
+ std::tie(MTPI, Inserted) =
+ MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
+ unsigned PrevIdx = MTPI->second;
+ if (!Inserted) {
+ Slice &PrevP = AS.Slices[PrevIdx];
+
+ // Check if the begin offsets match and this is a non-volatile transfer.
+ // In that case, we can completely elide the transfer.
+ if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
+ PrevP.kill();
+ return markAsDead(II);
+ }
+
+ // Otherwise we have an offset transfer within the same alloca. We can't
+ // split those.
+ PrevP.makeUnsplittable();
+ }
+
+ // Insert the use now that we've fixed up the splittable nature.
+ insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
+
+ // Check that we ended up with a valid index in the map.
+ assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
+ "Map index doesn't point back to a slice with this user.");
+ }
+
+ // Disable SRoA for any intrinsics except for lifetime invariants.
+ // FIXME: What about debug intrinsics? This matches old behavior, but
+ // doesn't make sense.
+ void visitIntrinsicInst(IntrinsicInst &II) {
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
+ if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
+ II.getIntrinsicID() == Intrinsic::lifetime_end) {
+ ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
+ uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
+ Length->getLimitedValue());
+ insertUse(II, Offset, Size, true);
+ return;
+ }
+
+ Base::visitIntrinsicInst(II);
+ }
+
+ Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
+ // We consider any PHI or select that results in a direct load or store of
+ // the same offset to be a viable use for slicing purposes. These uses
+ // are considered unsplittable and the size is the maximum loaded or stored
+ // size.
+ SmallPtrSet<Instruction *, 4> Visited;
+ SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
+ Visited.insert(Root);
+ Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
+ const DataLayout &DL = Root->getModule()->getDataLayout();
+ // If there are no loads or stores, the access is dead. We mark that as
+ // a size zero access.
+ Size = 0;
+ do {
+ Instruction *I, *UsedI;
+ std::tie(UsedI, I) = Uses.pop_back_val();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ Size = std::max(Size, DL.getTypeStoreSize(LI->getType()));
+ continue;
+ }
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ Value *Op = SI->getOperand(0);
+ if (Op == UsedI)
+ return SI;
+ Size = std::max(Size, DL.getTypeStoreSize(Op->getType()));
+ continue;
+ }
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (!GEP->hasAllZeroIndices())
+ return GEP;
+ } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
+ !isa<SelectInst>(I)) {
+ return I;
+ }
+
+ for (User *U : I->users())
+ if (Visited.insert(cast<Instruction>(U)).second)
+ Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
+ } while (!Uses.empty());
+
+ return nullptr;
+ }
+
+ void visitPHINodeOrSelectInst(Instruction &I) {
+ assert(isa<PHINode>(I) || isa<SelectInst>(I));
+ if (I.use_empty())
+ return markAsDead(I);
+
+ // TODO: We could use SimplifyInstruction here to fold PHINodes and
+ // SelectInsts. However, doing so would require changing the current
+ // dead-operand-tracking mechanism. For instance, suppose neither loading
+ // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
+ // trap either. However, if we simply replace %U with undef using the
+ // current dead-operand-tracking mechanism, "load (select undef, undef,
+ // %other)" may trap because the select may return the first operand
+ // "undef".
+ if (Value *Result = foldPHINodeOrSelectInst(I)) {
+ if (Result == *U)
+ // If the result of the constant fold will be the pointer, recurse
+ // through the PHI/select as if we had RAUW'ed it.
+ enqueueUsers(I);
+ else
+ // Otherwise the operand to the PHI/select is dead, and we can replace
+ // it with undef.
+ AS.DeadOperands.push_back(U);
+
+ return;
+ }
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&I);
+
+ // See if we already have computed info on this node.
+ uint64_t &Size = PHIOrSelectSizes[&I];
+ if (!Size) {
+ // This is a new PHI/Select, check for an unsafe use of it.
+ if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
+ return PI.setAborted(UnsafeI);
+ }
+
+ // For PHI and select operands outside the alloca, we can't nuke the entire
+ // phi or select -- the other side might still be relevant, so we special
+ // case them here and use a separate structure to track the operands
+ // themselves which should be replaced with undef.
+ // FIXME: This should instead be escaped in the event we're instrumenting
+ // for address sanitization.
+ if (Offset.uge(AllocSize)) {
+ AS.DeadOperands.push_back(U);
+ return;
+ }
+
+ insertUse(I, Offset, Size);
+ }
+
+ void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
+
+ void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
+
+ /// \brief Disable SROA entirely if there are unhandled users of the alloca.
+ void visitInstruction(Instruction &I) { PI.setAborted(&I); }
+};
+
+AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
+ :
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ AI(AI),
+#endif
+ PointerEscapingInstr(nullptr) {
+ SliceBuilder PB(DL, AI, *this);
+ SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
+ if (PtrI.isEscaped() || PtrI.isAborted()) {
+ // FIXME: We should sink the escape vs. abort info into the caller nicely,
+ // possibly by just storing the PtrInfo in the AllocaSlices.
+ PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
+ : PtrI.getAbortingInst();
+ assert(PointerEscapingInstr && "Did not track a bad instruction");
+ return;
+ }
+
+ Slices.erase(
+ llvm::remove_if(Slices, [](const Slice &S) { return S.isDead(); }),
+ Slices.end());
+
+#ifndef NDEBUG
+ if (SROARandomShuffleSlices) {
+ std::mt19937 MT(static_cast<unsigned>(
+ std::chrono::system_clock::now().time_since_epoch().count()));
+ std::shuffle(Slices.begin(), Slices.end(), MT);
+ }
+#endif
+
+ // Sort the uses. This arranges for the begin offsets to be in ascending
+ // order and, within equal begin offsets, for unsplittable slices to come
+ // first and end offsets to be in descending order.
+ std::sort(Slices.begin(), Slices.end());
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+void AllocaSlices::print(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ printSlice(OS, I, Indent);
+ OS << "\n";
+ printUse(OS, I, Indent);
+}
+
+void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
+ << " slice #" << (I - begin())
+ << (I->isSplittable() ? " (splittable)" : "");
+}
+
+void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
+}
+
+void AllocaSlices::print(raw_ostream &OS) const {
+ if (PointerEscapingInstr) {
+ OS << "Can't analyze slices for alloca: " << AI << "\n"
+ << " A pointer to this alloca escaped by:\n"
+ << " " << *PointerEscapingInstr << "\n";
+ return;
+ }
+
+ OS << "Slices of alloca: " << AI << "\n";
+ for (const_iterator I = begin(), E = end(); I != E; ++I)
+ print(OS, I);
+}
+
+LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
+ print(dbgs(), I);
+}
+LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
+
+#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+/// Walk the range of a partitioning looking for a common type to cover this
+/// sequence of slices.
+static Type *findCommonType(AllocaSlices::const_iterator B,
+ AllocaSlices::const_iterator E,
+ uint64_t EndOffset) {
+ Type *Ty = nullptr;
+ bool TyIsCommon = true;
+ IntegerType *ITy = nullptr;
+
+ // Note that we need to look at *every* alloca slice's Use to ensure we
+ // always get consistent results regardless of the order of slices.
+ for (AllocaSlices::const_iterator I = B; I != E; ++I) {
+ Use *U = I->getUse();
+ if (isa<IntrinsicInst>(*U->getUser()))
+ continue;
+ if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
+ continue;
+
+ Type *UserTy = nullptr;
+ if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+ UserTy = LI->getType();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+ UserTy = SI->getValueOperand()->getType();
+ }
+
+ if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
+ // If the type is larger than the partition, skip it. We only encounter
+ // this for split integer operations where we want to use the type of the
+ // entity causing the split. Also skip if the type is not a byte width
+ // multiple.
+ if (UserITy->getBitWidth() % 8 != 0 ||
+ UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
+ continue;
+
+ // Track the largest bitwidth integer type used in this way in case there
+ // is no common type.
+ if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
+ ITy = UserITy;
+ }
+
+ // To avoid depending on the order of slices, Ty and TyIsCommon must not
+ // depend on types skipped above.
+ if (!UserTy || (Ty && Ty != UserTy))
+ TyIsCommon = false; // Give up on anything but an iN type.
+ else
+ Ty = UserTy;
+ }
+
+ return TyIsCommon ? Ty : ITy;
+}
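+
+// For illustration: a partition covered by two `load i32`s has common type
+// i32. If it is instead covered by a `load i32` and a `load float`, there is
+// no common type and this falls back to the widest integer type seen (here
+// i32), which may still permit integer widening of the partition.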
+
+/// PHI instructions that use an alloca and are subsequently loaded can be
+/// rewritten to load both input pointers in the pred blocks and then PHI the
+/// results, allowing the load of the alloca to be promoted.
+/// From this:
+/// %P2 = phi [i32* %Alloca, i32* %Other]
+/// %V = load i32* %P2
+/// to:
+/// %V1 = load i32* %Alloca -> will be mem2reg'd
+/// ...
+/// %V2 = load i32* %Other
+/// ...
+/// %V = phi [i32 %V1, i32 %V2]
+///
+/// We can do this to a select if its only uses are loads and if the operands
+/// to the select can be loaded unconditionally.
+///
+/// FIXME: This should be hoisted into a generic utility, likely in
+/// Transforms/Util/Local.h
+static bool isSafePHIToSpeculate(PHINode &PN) {
+ // For now, we can only do this promotion if the load is in the same block
+ // as the PHI, and if there are no stores between the phi and load.
+ // TODO: Allow recursive phi users.
+ // TODO: Allow stores.
+ BasicBlock *BB = PN.getParent();
+ unsigned MaxAlign = 0;
+ bool HaveLoad = false;
+ for (User *U : PN.users()) {
+ LoadInst *LI = dyn_cast<LoadInst>(U);
+ if (!LI || !LI->isSimple())
+ return false;
+
+ // For now we only allow loads in the same block as the PHI. This is
+ // a common case that happens when instcombine merges two loads through
+ // a PHI.
+ if (LI->getParent() != BB)
+ return false;
+
+ // Ensure that there are no instructions between the PHI and the load that
+ // could store.
+ for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
+ if (BBI->mayWriteToMemory())
+ return false;
+
+ MaxAlign = std::max(MaxAlign, LI->getAlignment());
+ HaveLoad = true;
+ }
+
+ if (!HaveLoad)
+ return false;
+
+ const DataLayout &DL = PN.getModule()->getDataLayout();
+
+ // We can only transform this if it is safe to push the loads into the
+ // predecessor blocks. The only thing to watch out for is that we can't put
+ // a possibly trapping load in the predecessor if it is a critical edge.
+ for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+ TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
+ Value *InVal = PN.getIncomingValue(Idx);
+
+ // If the value is produced by the terminator of the predecessor (an
+ // invoke) or it has side-effects, there is no valid place to put a load
+ // in the predecessor.
+ if (TI == InVal || TI->mayHaveSideEffects())
+ return false;
+
+ // If the predecessor has a single successor, then the edge isn't
+ // critical.
+ if (TI->getNumSuccessors() == 1)
+ continue;
+
+ // If this pointer is always safe to load, or if we can prove that there
+ // is already a load in the block, then we can move the load to the pred
+ // block.
+ if (isSafeToLoadUnconditionally(InVal, MaxAlign, DL, TI))
+ continue;
+
+ return false;
+ }
+
+ return true;
+}
+
+static void speculatePHINodeLoads(PHINode &PN) {
+ DEBUG(dbgs() << " original: " << PN << "\n");
+
+ Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
+ IRBuilderTy PHIBuilder(&PN);
+ PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
+ PN.getName() + ".sroa.speculated");
+
+ // Get the AA tags and alignment to use from one of the loads. It doesn't
+ // matter which one we use, or whether any of them differ.
+ LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
+
+ AAMDNodes AATags;
+ SomeLoad->getAAMetadata(AATags);
+ unsigned Align = SomeLoad->getAlignment();
+
+ // Rewrite all loads of the PN to use the new PHI.
+ while (!PN.use_empty()) {
+ LoadInst *LI = cast<LoadInst>(PN.user_back());
+ LI->replaceAllUsesWith(NewPN);
+ LI->eraseFromParent();
+ }
+
+ // Inject loads into all of the pred blocks.
+ for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+ BasicBlock *Pred = PN.getIncomingBlock(Idx);
+ TerminatorInst *TI = Pred->getTerminator();
+ Value *InVal = PN.getIncomingValue(Idx);
+ IRBuilderTy PredBuilder(TI);
+
+ LoadInst *Load = PredBuilder.CreateLoad(
+ InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
+ ++NumLoadsSpeculated;
+ Load->setAlignment(Align);
+ if (AATags)
+ Load->setAAMetadata(AATags);
+ NewPN->addIncoming(Load, Pred);
+ }
+
+ DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
+ PN.eraseFromParent();
+}
+
+/// Select instructions that use an alloca and are subsequently loaded can be
+/// rewritten to load both input pointers and then select between the result,
+/// allowing the load of the alloca to be promoted.
+/// From this:
+/// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
+/// %V = load i32* %P2
+/// to:
+/// %V1 = load i32* %Alloca -> will be mem2reg'd
+/// %V2 = load i32* %Other
+/// %V = select i1 %cond, i32 %V1, i32 %V2
+///
+/// We can do this to a select if its only uses are loads and if the operand
+/// to the select can be loaded unconditionally.
+static bool isSafeSelectToSpeculate(SelectInst &SI) {
+ Value *TValue = SI.getTrueValue();
+ Value *FValue = SI.getFalseValue();
+ const DataLayout &DL = SI.getModule()->getDataLayout();
+
+ for (User *U : SI.users()) {
+ LoadInst *LI = dyn_cast<LoadInst>(U);
+ if (!LI || !LI->isSimple())
+ return false;
+
+ // Both operands to the select need to be dereferenceable, either
+ // absolutely (e.g. allocas) or at this point because we can see other
+ // accesses to them.
+ if (!isSafeToLoadUnconditionally(TValue, LI->getAlignment(), DL, LI))
+ return false;
+ if (!isSafeToLoadUnconditionally(FValue, LI->getAlignment(), DL, LI))
+ return false;
+ }
+
+ return true;
+}
+
+static void speculateSelectInstLoads(SelectInst &SI) {
+ DEBUG(dbgs() << " original: " << SI << "\n");
+
+ IRBuilderTy IRB(&SI);
+ Value *TV = SI.getTrueValue();
+ Value *FV = SI.getFalseValue();
+ // Replace the loads of the select with a select of two loads.
+ while (!SI.use_empty()) {
+ LoadInst *LI = cast<LoadInst>(SI.user_back());
+ assert(LI->isSimple() && "We only speculate simple loads");
+
+ IRB.SetInsertPoint(LI);
+ LoadInst *TL =
+ IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true");
+ LoadInst *FL =
+ IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false");
+ NumLoadsSpeculated += 2;
+
+ // Transfer alignment and AA info if present.
+ TL->setAlignment(LI->getAlignment());
+ FL->setAlignment(LI->getAlignment());
+
+ AAMDNodes Tags;
+ LI->getAAMetadata(Tags);
+ if (Tags) {
+ TL->setAAMetadata(Tags);
+ FL->setAAMetadata(Tags);
+ }
+
+ Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
+ LI->getName() + ".sroa.speculated");
+
+ DEBUG(dbgs() << " speculated to: " << *V << "\n");
+ LI->replaceAllUsesWith(V);
+ LI->eraseFromParent();
+ }
+ SI.eraseFromParent();
+}
+
+/// \brief Build a GEP out of a base pointer and indices.
+///
+/// This will return the BasePtr if that is valid, or build a new GEP
+/// instruction using the IRBuilder if GEP-ing is needed.
+static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
+ SmallVectorImpl<Value *> &Indices, Twine NamePrefix) {
+ if (Indices.empty())
+ return BasePtr;
+
+ // A single zero index is a no-op, so check for this and avoid building a GEP
+ // in that case.
+ if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
+ return BasePtr;
+
+ return IRB.CreateInBoundsGEP(nullptr, BasePtr, Indices,
+ NamePrefix + "sroa_idx");
+}
+
+/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
+/// TargetTy without changing the offset of the pointer.
+///
+/// This routine assumes we've already established a properly offset GEP with
+/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
+/// zero-indices down through type layers until we find one the same as
+/// TargetTy. If we can't find one with the same type, we at least try to use
+/// one with the same size. If none of that works, we just produce the GEP as
+/// indicated by Indices to have the correct offset.
+static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
+ Value *BasePtr, Type *Ty, Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
+ Twine NamePrefix) {
+ if (Ty == TargetTy)
+ return buildGEP(IRB, BasePtr, Indices, NamePrefix);
+
+ // Pointer size to use for the indices.
+ unsigned PtrSize = DL.getPointerTypeSizeInBits(BasePtr->getType());
+
+ // See if we can descend into a struct and locate a field with the correct
+ // type.
+ unsigned NumLayers = 0;
+ Type *ElementTy = Ty;
+ do {
+ if (ElementTy->isPointerTy())
+ break;
+
+ if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
+ ElementTy = ArrayTy->getElementType();
+ Indices.push_back(IRB.getIntN(PtrSize, 0));
+ } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
+ ElementTy = VectorTy->getElementType();
+ Indices.push_back(IRB.getInt32(0));
+ } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
+ if (STy->element_begin() == STy->element_end())
+ break; // Nothing left to descend into.
+ ElementTy = *STy->element_begin();
+ Indices.push_back(IRB.getInt32(0));
+ } else {
+ break;
+ }
+ ++NumLayers;
+ } while (ElementTy != TargetTy);
+ if (ElementTy != TargetTy)
+ Indices.erase(Indices.end() - NumLayers, Indices.end());
+
+ return buildGEP(IRB, BasePtr, Indices, NamePrefix);
+}
+
+/// \brief Recursively compute indices for a natural GEP.
+///
+/// This is the recursive step for getNaturalGEPWithOffset that walks down the
+/// element types adding appropriate indices for the GEP.
+static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
+ Value *Ptr, Type *Ty, APInt &Offset,
+ Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
+ Twine NamePrefix) {
+ if (Offset == 0)
+ return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
+ NamePrefix);
+
+ // We can't recurse through pointer types.
+ if (Ty->isPointerTy())
+ return nullptr;
+
+ // We try to analyze GEPs over vectors here, but note that these GEPs are
+ // extremely poorly defined currently. The long-term goal is to remove GEPing
+ // over a vector from the IR completely.
+ if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
+ unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType());
+ if (ElementSizeInBits % 8 != 0) {
+ // GEPs over vector elements whose size is not a multiple of 8 bits are invalid.
+ return nullptr;
+ }
+ APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+ if (NumSkippedElements.ugt(VecTy->getNumElements()))
+ return nullptr;
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
+ Offset, TargetTy, Indices, NamePrefix);
+ }
+
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+ Type *ElementTy = ArrTy->getElementType();
+ APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+ if (NumSkippedElements.ugt(ArrTy->getNumElements()))
+ return nullptr;
+
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+ Indices, NamePrefix);
+ }
+
+ StructType *STy = dyn_cast<StructType>(Ty);
+ if (!STy)
+ return nullptr;
+
+ const StructLayout *SL = DL.getStructLayout(STy);
+ uint64_t StructOffset = Offset.getZExtValue();
+ if (StructOffset >= SL->getSizeInBytes())
+ return nullptr;
+ unsigned Index = SL->getElementContainingOffset(StructOffset);
+ Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
+ Type *ElementTy = STy->getElementType(Index);
+ if (Offset.uge(DL.getTypeAllocSize(ElementTy)))
+ return nullptr; // The offset points into alignment padding.
+
+ Indices.push_back(IRB.getInt32(Index));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+ Indices, NamePrefix);
+}
+
+/// \brief Get a natural GEP from a base pointer to a particular offset and
+/// resulting in a particular type.
+///
+/// The goal is to produce a "natural" looking GEP that works with the existing
+/// composite types to arrive at the appropriate offset and element type for
+/// a pointer. TargetTy is the element type the returned GEP should point-to if
+/// possible. We recurse by decreasing Offset, adding the appropriate index to
+/// Indices, and setting Ty to the result subtype.
+///
+/// If no natural GEP can be constructed, this function returns null.
+static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
+ Value *Ptr, APInt Offset, Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
+ Twine NamePrefix) {
+ PointerType *Ty = cast<PointerType>(Ptr->getType());
+
+ // Don't consider any GEPs through an i8* as natural unless the TargetTy is
+ // an i8.
+ if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
+ return nullptr;
+
+ Type *ElementTy = Ty->getElementType();
+ if (!ElementTy->isSized())
+ return nullptr; // We can't GEP through an unsized element.
+ APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
+ if (ElementSize == 0)
+ return nullptr; // Zero-length arrays can't help us build a natural GEP.
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+ Indices, NamePrefix);
+}
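+
+// Worked example (hypothetical typed-pointer IR on a 64-bit target): for a
+// base pointer of type {i32, [4 x i8]}* with Offset == 6 and TargetTy == i8,
+// the natural GEP built here is
+//   getelementptr inbounds {i32, [4 x i8]}, {i32, [4 x i8]}* %base,
+//                          i64 0, i32 1, i64 2
+// i.e. skip no outer elements, step to the array field at byte 4, then index
+// 2 bytes into it.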
+
+/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
+/// resulting pointer has PointerTy.
+///
+/// This tries very hard to compute a "natural" GEP which arrives at the offset
+/// and produces the pointer type desired. Where it cannot, it will try to use
+/// the natural GEP to arrive at the offset and bitcast to the type. Where that
+/// fails, it will try to use an existing i8* and GEP to the byte offset and
+/// bitcast to the type.
+///
+/// The strategy for finding the more natural GEPs is to peel off layers of the
+/// pointer, walking back through bit casts and GEPs, searching for a base
+/// pointer from which we can compute a natural GEP with the desired
+/// properties. The algorithm tries to fold as many constant indices into
+/// a single GEP as possible, thus making each GEP more independent of the
+/// surrounding code.
+static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
+ APInt Offset, Type *PointerTy, Twine NamePrefix) {
+ // Even though we don't look through PHI nodes, we could be called on an
+ // instruction in an unreachable block, which may be on a cycle.
+ SmallPtrSet<Value *, 4> Visited;
+ Visited.insert(Ptr);
+ SmallVector<Value *, 4> Indices;
+
+ // We may end up computing an offset pointer that has the wrong type. If we
+ // never are able to compute one directly that has the correct type, we'll
+ // fall back to it, so keep it and the base it was computed from around here.
+ Value *OffsetPtr = nullptr;
+ Value *OffsetBasePtr;
+
+ // Remember any i8 pointer we come across to re-use if we need to do a raw
+ // byte offset.
+ Value *Int8Ptr = nullptr;
+ APInt Int8PtrOffset(Offset.getBitWidth(), 0);
+
+ Type *TargetTy = PointerTy->getPointerElementType();
+
+ do {
+ // First fold any existing GEPs into the offset.
+ while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
+ APInt GEPOffset(Offset.getBitWidth(), 0);
+ if (!GEP->accumulateConstantOffset(DL, GEPOffset))
+ break;
+ Offset += GEPOffset;
+ Ptr = GEP->getPointerOperand();
+ if (!Visited.insert(Ptr).second)
+ break;
+ }
+
+ // See if we can perform a natural GEP here.
+ Indices.clear();
+ if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
+ Indices, NamePrefix)) {
+ // If we have a new natural pointer at the offset, clear out any old
+ // offset pointer we computed. Unless it is the base pointer or
+ // a non-instruction, we built a GEP we don't need. Zap it.
+ if (OffsetPtr && OffsetPtr != OffsetBasePtr)
+ if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
+ assert(I->use_empty() && "Built a GEP with uses some how!");
+ I->eraseFromParent();
+ }
+ OffsetPtr = P;
+ OffsetBasePtr = Ptr;
+ // If we also found a pointer of the right type, we're done.
+ if (P->getType() == PointerTy)
+ return P;
+ }
+
+ // Stash this pointer if we've found an i8*.
+ if (Ptr->getType()->isIntegerTy(8)) {
+ Int8Ptr = Ptr;
+ Int8PtrOffset = Offset;
+ }
+
+ // Peel off a layer of the pointer and update the offset appropriately.
+ if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
+ Ptr = cast<Operator>(Ptr)->getOperand(0);
+ } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
+ if (GA->isInterposable())
+ break;
+ Ptr = GA->getAliasee();
+ } else {
+ break;
+ }
+ assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
+ } while (Visited.insert(Ptr).second);
+
+ if (!OffsetPtr) {
+ if (!Int8Ptr) {
+ Int8Ptr = IRB.CreateBitCast(
+ Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
+ NamePrefix + "sroa_raw_cast");
+ Int8PtrOffset = Offset;
+ }
+
+ OffsetPtr = Int8PtrOffset == 0
+ ? Int8Ptr
+ : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
+ IRB.getInt(Int8PtrOffset),
+ NamePrefix + "sroa_raw_idx");
+ }
+ Ptr = OffsetPtr;
+
+ // On the off chance we were targeting i8*, guard the bitcast here.
+ if (Ptr->getType() != PointerTy)
+ Ptr = IRB.CreateBitCast(Ptr, PointerTy, NamePrefix + "sroa_cast");
+
+ return Ptr;
+}
+
+/// \brief Compute the adjusted alignment for a load or store from an offset.
+static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
+ const DataLayout &DL) {
+ unsigned Alignment;
+ Type *Ty;
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ Alignment = LI->getAlignment();
+ Ty = LI->getType();
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ Alignment = SI->getAlignment();
+ Ty = SI->getValueOperand()->getType();
+ } else {
+ llvm_unreachable("Only loads and stores are allowed!");
+ }
+
+ if (!Alignment)
+ Alignment = DL.getABITypeAlignment(Ty);
+
+ return MinAlign(Alignment, Offset);
+}
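+
+// For example, a load with explicit alignment 8 being rewritten at byte
+// offset 4 within its partition gets MinAlign(8, 4) == 4; an access with no
+// explicit alignment first picks up the ABI alignment of its type.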
+
+/// \brief Test whether we can convert a value from the old to the new type.
+///
+/// This predicate should be used to guard calls to convertValue in order to
+/// ensure that we only try to convert viable values. The strategy is that we
+/// will peel off single element struct and array wrappings to get to an
+/// underlying value, and convert that value.
+static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
+ if (OldTy == NewTy)
+ return true;
+
+ // For integer types, we can't handle any bit-width differences. This would
+ // break vector conversions that require extension and would introduce
+ // endianness issues when used in conjunction with loads and stores.
+ if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
+ assert(cast<IntegerType>(OldTy)->getBitWidth() !=
+ cast<IntegerType>(NewTy)->getBitWidth() &&
+ "We can't have the same bitwidth for different int types");
+ return false;
+ }
+
+ if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy))
+ return false;
+ if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
+ return false;
+
+ // We can convert pointers to integers and vice-versa. Same for vectors
+ // of pointers and integers.
+ OldTy = OldTy->getScalarType();
+ NewTy = NewTy->getScalarType();
+ if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
+ if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
+ return cast<PointerType>(NewTy)->getPointerAddressSpace() ==
+ cast<PointerType>(OldTy)->getPointerAddressSpace();
+ }
+
+ // We can convert integers to integral pointers, but not to non-integral
+ // pointers.
+ if (OldTy->isIntegerTy())
+ return !DL.isNonIntegralPointerType(NewTy);
+
+ // We can convert integral pointers to integers, but non-integral pointers
+ // need to remain pointers.
+ if (!DL.isNonIntegralPointerType(OldTy))
+ return NewTy->isIntegerTy();
+
+ return false;
+ }
+
+ return true;
+}
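+
+// A few examples of what this admits, assuming a typical 64-bit target:
+//   i64 <-> double     : same bit width, both single-value types -> ok
+//   i64 <-> i8*        : ok unless i8* is a non-integral pointer type
+//   i32 <-> i64        : rejected, integer bit widths differ
+//   <2 x i32> <-> i64  : ok, same total size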
+
+/// \brief Generic routine to convert an SSA value to a value of a different
+/// type.
+///
+/// This will try various different casting techniques, such as bitcasts,
+/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
+/// two types for viability with this routine.
+static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
+ Type *NewTy) {
+ Type *OldTy = V->getType();
+ assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
+
+ if (OldTy == NewTy)
+ return V;
+
+ assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
+ "Integer types must be the exact same to convert.");
+
+ // See if we need inttoptr for this type pair. A cast involving both scalars
+ // and vectors requires an additional bitcast.
+ if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
+ // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
+ if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+ return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+ NewTy);
+
+ // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
+ if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+ return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+ NewTy);
+
+ return IRB.CreateIntToPtr(V, NewTy);
+ }
+
+ // See if we need ptrtoint for this type pair. A cast involving both scalars
+ // and vectors requires an additional bitcast.
+ if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
+ // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
+ if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+ return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+ NewTy);
+
+ // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
+ if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+ return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+ NewTy);
+
+ return IRB.CreatePtrToInt(V, NewTy);
+ }
+
+ return IRB.CreateBitCast(V, NewTy);
+}
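+
+// For instance (hypothetical values on a 64-bit target): converting an i8*
+// to i64 emits a single ptrtoint, while converting a <2 x i32> to an i8*
+// goes through the pointer-sized integer: a bitcast to i64, then an
+// inttoptr.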
+
+/// \brief Test whether the given slice use can be promoted to a vector.
+///
+/// This function is called to test each entry in a partition which is slated
+/// for a single slice.
+static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
+ VectorType *Ty,
+ uint64_t ElementSize,
+ const DataLayout &DL) {
+ // First validate the slice offsets.
+ uint64_t BeginOffset =
+ std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
+ uint64_t BeginIndex = BeginOffset / ElementSize;
+ if (BeginIndex * ElementSize != BeginOffset ||
+ BeginIndex >= Ty->getNumElements())
+ return false;
+ uint64_t EndOffset =
+ std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
+ uint64_t EndIndex = EndOffset / ElementSize;
+ if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements())
+ return false;
+
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ uint64_t NumElements = EndIndex - BeginIndex;
+ Type *SliceTy = (NumElements == 1)
+ ? Ty->getElementType()
+ : VectorType::get(Ty->getElementType(), NumElements);
+
+ Type *SplitIntTy =
+ Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
+
+ Use *U = S.getUse();
+
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+ if (MI->isVolatile())
+ return false;
+ if (!S.isSplittable())
+ return false; // Skip any unsplittable intrinsics.
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+ if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+ II->getIntrinsicID() != Intrinsic::lifetime_end)
+ return false;
+ } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
+ // Disable vector promotion when there are loads or stores of an FCA.
+ return false;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+ if (LI->isVolatile())
+ return false;
+ Type *LTy = LI->getType();
+ if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
+ assert(LTy->isIntegerTy());
+ LTy = SplitIntTy;
+ }
+ if (!canConvertValue(DL, SliceTy, LTy))
+ return false;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+ if (SI->isVolatile())
+ return false;
+ Type *STy = SI->getValueOperand()->getType();
+ if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
+ assert(STy->isIntegerTy());
+ STy = SplitIntTy;
+ }
+ if (!canConvertValue(DL, STy, SliceTy))
+ return false;
+ } else {
+ return false;
+ }
+
+ return true;
+}
+
+/// \brief Test whether the given alloca partitioning and range of slices can be
+/// promoted to a vector.
+///
+/// This is a quick test to check whether we can rewrite a particular alloca
+/// partition (and its newly formed alloca) into a vector alloca with only
+/// whole-vector loads and stores such that it could be promoted to a vector
+/// SSA value. We can only ensure this for a limited set of operations, and we
+/// don't want to do the rewrites unless we are confident that the result will
+/// be promotable, so we have an early test here.
+static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
+ // Collect the candidate types for vector-based promotion. Also track whether
+ // we have different element types.
+ SmallVector<VectorType *, 4> CandidateTys;
+ Type *CommonEltTy = nullptr;
+ bool HaveCommonEltTy = true;
+ auto CheckCandidateType = [&](Type *Ty) {
+ if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+ CandidateTys.push_back(VTy);
+ if (!CommonEltTy)
+ CommonEltTy = VTy->getElementType();
+ else if (CommonEltTy != VTy->getElementType())
+ HaveCommonEltTy = false;
+ }
+ };
+ // Consider any loads or stores that are the exact size of the slice.
+ for (const Slice &S : P)
+ if (S.beginOffset() == P.beginOffset() &&
+ S.endOffset() == P.endOffset()) {
+ if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
+ CheckCandidateType(LI->getType());
+ else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
+ CheckCandidateType(SI->getValueOperand()->getType());
+ }
+
+ // If we didn't find a vector type, nothing to do here.
+ if (CandidateTys.empty())
+ return nullptr;
+
+ // Remove non-integer vector types if we had multiple common element types.
+ // FIXME: It'd be nice to replace them with integer vector types, but we can't
+ // do that until all the backends are known to produce good code for all
+ // integer vector types.
+ if (!HaveCommonEltTy) {
+ CandidateTys.erase(
+ llvm::remove_if(CandidateTys,
+ [](VectorType *VTy) {
+ return !VTy->getElementType()->isIntegerTy();
+ }),
+ CandidateTys.end());
+
+ // If there were no integer vector types, give up.
+ if (CandidateTys.empty())
+ return nullptr;
+
+ // Rank the remaining candidate vector types. This is easy because we know
+ // they're all integer vectors. We sort by ascending number of elements.
+ auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+ (void)DL;
+ assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) &&
+ "Cannot have vector types of different sizes!");
+ assert(RHSTy->getElementType()->isIntegerTy() &&
+ "All non-integer types eliminated!");
+ assert(LHSTy->getElementType()->isIntegerTy() &&
+ "All non-integer types eliminated!");
+ return RHSTy->getNumElements() < LHSTy->getNumElements();
+ };
+ std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
+ CandidateTys.erase(
+ std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
+ CandidateTys.end());
+ } else {
+// The only way to have the same element type in every vector type is to
+// have the same vector type. Check that and remove all but one.
+#ifndef NDEBUG
+ for (VectorType *VTy : CandidateTys) {
+ assert(VTy->getElementType() == CommonEltTy &&
+ "Unaccounted for element type!");
+ assert(VTy == CandidateTys[0] &&
+ "Different vector types with the same element type!");
+ }
+#endif
+ CandidateTys.resize(1);
+ }
+
+ // Try each vector type, and return the one which works.
+ auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
+ uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType());
+
+ // While the definition of LLVM vectors is bitpacked, we don't support sizes
+ // that aren't byte sized.
+ if (ElementSize % 8)
+ return false;
+ assert((DL.getTypeSizeInBits(VTy) % 8) == 0 &&
+ "vector size not a multiple of element size?");
+ ElementSize /= 8;
+
+ for (const Slice &S : P)
+ if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+ return false;
+
+ for (const Slice *S : P.splitSliceTails())
+ if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+ return false;
+
+ return true;
+ };
+ for (VectorType *VTy : CandidateTys)
+ if (CheckVectorTypeForPromotion(VTy))
+ return VTy;
+
+ return nullptr;
+}
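+
+// Illustrative example (not from the upstream source): if a partition is
+// covered by both a 'load <4 x i32>' and a 'store <2 x i64>', both vector
+// types are collected as candidates. Their element types differ, but both are
+// integer vectors, so neither is discarded; the ranking then tries <2 x i64>
+// before <4 x i32> because it has fewer elements.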
+
+/// \brief Test whether a slice of an alloca is valid for integer widening.
+///
+/// This implements the necessary checking for the \c isIntegerWideningViable
+/// test below on a single slice of the alloca.
+static bool isIntegerWideningViableForSlice(const Slice &S,
+ uint64_t AllocBeginOffset,
+ Type *AllocaTy,
+ const DataLayout &DL,
+ bool &WholeAllocaOp) {
+ uint64_t Size = DL.getTypeStoreSize(AllocaTy);
+
+ uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
+ uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
+
+ // We can't reasonably handle cases where the load or store extends past
+ // the end of the alloca's type and into its padding.
+ if (RelEnd > Size)
+ return false;
+
+ Use *U = S.getUse();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+ if (LI->isVolatile())
+ return false;
+ // We can't handle loads that extend past the allocated memory.
+ if (DL.getTypeStoreSize(LI->getType()) > Size)
+ return false;
+ // Note that we don't count vector loads or stores as whole-alloca
+ // operations which enable integer widening because we would prefer to use
+ // vector widening instead.
+ if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
+ WholeAllocaOp = true;
+ if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
+ if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
+ return false;
+ } else if (RelBegin != 0 || RelEnd != Size ||
+ !canConvertValue(DL, AllocaTy, LI->getType())) {
+ // Non-integer loads need to be convertible from the alloca type so that
+ // they are promotable.
+ return false;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+ Type *ValueTy = SI->getValueOperand()->getType();
+ if (SI->isVolatile())
+ return false;
+ // We can't handle stores that extend past the allocated memory.
+ if (DL.getTypeStoreSize(ValueTy) > Size)
+ return false;
+ // Note that we don't count vector loads or stores as whole-alloca
+ // operations which enable integer widening because we would prefer to use
+ // vector widening instead.
+ if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
+ WholeAllocaOp = true;
+ if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
+ if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
+ return false;
+ } else if (RelBegin != 0 || RelEnd != Size ||
+ !canConvertValue(DL, ValueTy, AllocaTy)) {
+ // Non-integer stores need to be convertible to the alloca type so that
+ // they are promotable.
+ return false;
+ }
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+ if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
+ return false;
+ if (!S.isSplittable())
+ return false; // Skip any unsplittable intrinsics.
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+ if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+ II->getIntrinsicID() != Intrinsic::lifetime_end)
+ return false;
+ } else {
+ return false;
+ }
+
+ return true;
+}
+
+/// \brief Test whether the given alloca partition's integer operations can be
+/// widened to promotable ones.
+///
+/// This is a quick test to check whether we can rewrite the integer loads and
+/// stores to a particular alloca into wider loads and stores and be able to
+/// promote the resulting alloca.
+static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
+ const DataLayout &DL) {
+ uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy);
+ // Don't create integer types larger than the maximum bitwidth.
+ if (SizeInBits > IntegerType::MAX_INT_BITS)
+ return false;
+
+ // Don't try to handle allocas with bit-padding.
+ if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy))
+ return false;
+
+ // We need to ensure that an integer type with the appropriate bitwidth can
+ // be converted to the alloca type, whatever that is. We don't want to force
+ // the alloca itself to have an integer type if there is a more suitable one.
+ Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
+ if (!canConvertValue(DL, AllocaTy, IntTy) ||
+ !canConvertValue(DL, IntTy, AllocaTy))
+ return false;
+
+ // While examining uses, we ensure that the alloca has a covering load or
+ // store. We don't want to widen the integer operations only to fail to
+ // promote due to some other unsplittable entry (which we may make splittable
+ // later). However, if there are only splittable uses, go ahead and assume
+ // that we cover the alloca.
+ // FIXME: We shouldn't consider split slices that happen to start in the
+ // partition here...
+ bool WholeAllocaOp =
+ P.begin() != P.end() ? false : DL.isLegalInteger(SizeInBits);
+
+ for (const Slice &S : P)
+ if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
+ WholeAllocaOp))
+ return false;
+
+ for (const Slice *S : P.splitSliceTails())
+ if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
+ WholeAllocaOp))
+ return false;
+
+ return WholeAllocaOp;
+}
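+
+// Illustrative example (not from the upstream source): for an i64 alloca
+// whose partition contains i32 loads at offsets 0 and 4 plus an i64 store of
+// the whole value, the store sets WholeAllocaOp and every slice passes the
+// per-slice checks, so the partition is viable and the narrow i32 accesses
+// can later be rewritten as extracts from the wide i64 value.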
+
+static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
+ IntegerType *Ty, uint64_t Offset,
+ const Twine &Name) {
+ DEBUG(dbgs() << " start: " << *V << "\n");
+ IntegerType *IntTy = cast<IntegerType>(V->getType());
+ assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+ "Element extends past full value");
+ uint64_t ShAmt = 8 * Offset;
+ if (DL.isBigEndian())
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+ if (ShAmt) {
+ V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
+ DEBUG(dbgs() << " shifted: " << *V << "\n");
+ }
+ assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+ "Cannot extract to a larger integer!");
+ if (Ty != IntTy) {
+ V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
+ DEBUG(dbgs() << " trunced: " << *V << "\n");
+ }
+ return V;
+}
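+
+// Worked example (illustrative only): extracting an i16 at byte offset 2 from
+// an i64 on a little-endian target uses ShAmt == 16, producing an 'lshr' by
+// 16 followed by a 'trunc' to i16; on a big-endian target the shift amount
+// would instead be 8 * (8 - 2 - 2) == 32.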
+
+static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
+ Value *V, uint64_t Offset, const Twine &Name) {
+ IntegerType *IntTy = cast<IntegerType>(Old->getType());
+ IntegerType *Ty = cast<IntegerType>(V->getType());
+ assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+ "Cannot insert a larger integer!");
+ DEBUG(dbgs() << " start: " << *V << "\n");
+ if (Ty != IntTy) {
+ V = IRB.CreateZExt(V, IntTy, Name + ".ext");
+ DEBUG(dbgs() << " extended: " << *V << "\n");
+ }
+ assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+ "Element store outside of alloca store");
+ uint64_t ShAmt = 8 * Offset;
+ if (DL.isBigEndian())
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+ if (ShAmt) {
+ V = IRB.CreateShl(V, ShAmt, Name + ".shift");
+ DEBUG(dbgs() << " shifted: " << *V << "\n");
+ }
+
+ if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
+ APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
+ Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
+ DEBUG(dbgs() << " masked: " << *Old << "\n");
+ V = IRB.CreateOr(Old, V, Name + ".insert");
+ DEBUG(dbgs() << " inserted: " << *V << "\n");
+ }
+ return V;
+}
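+
+// Worked example (illustrative only): inserting an i16 at byte offset 2 into
+// an i64 on a little-endian target zero-extends the value, shifts it left by
+// 16, masks the old value with ~(0xFFFF << 16), and ORs the two together.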
+
+static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
+ unsigned EndIndex, const Twine &Name) {
+ VectorType *VecTy = cast<VectorType>(V->getType());
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+ if (NumElements == VecTy->getNumElements())
+ return V;
+
+ if (NumElements == 1) {
+ V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
+ Name + ".extract");
+ DEBUG(dbgs() << " extract: " << *V << "\n");
+ return V;
+ }
+
+ SmallVector<Constant *, 8> Mask;
+ Mask.reserve(NumElements);
+ for (unsigned i = BeginIndex; i != EndIndex; ++i)
+ Mask.push_back(IRB.getInt32(i));
+ V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ ConstantVector::get(Mask), Name + ".extract");
+ DEBUG(dbgs() << " shuffle: " << *V << "\n");
+ return V;
+}
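+
+// Illustrative example (not from the upstream source, hypothetical %v):
+// extracting elements [1, 3) from a <4 x float> %v would emit roughly
+//   %sub = shufflevector <4 x float> %v, <4 x float> undef,
+//                        <2 x i32> <i32 1, i32 2>
+// while a single-element extraction becomes a plain extractelement.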
+
+static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
+ unsigned BeginIndex, const Twine &Name) {
+ VectorType *VecTy = cast<VectorType>(Old->getType());
+ assert(VecTy && "Can only insert a vector into a vector");
+
+ VectorType *Ty = dyn_cast<VectorType>(V->getType());
+ if (!Ty) {
+ // Single element to insert.
+ V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
+ Name + ".insert");
+ DEBUG(dbgs() << " insert: " << *V << "\n");
+ return V;
+ }
+
+ assert(Ty->getNumElements() <= VecTy->getNumElements() &&
+ "Too many elements!");
+ if (Ty->getNumElements() == VecTy->getNumElements()) {
+ assert(V->getType() == VecTy && "Vector type mismatch");
+ return V;
+ }
+ unsigned EndIndex = BeginIndex + Ty->getNumElements();
+
+ // When inserting a smaller vector into the larger to store, we first
+ // use a shuffle vector to widen it with undef elements, and then
+ // a second shuffle vector to select between the loaded vector and the
+ // incoming vector.
+ SmallVector<Constant *, 8> Mask;
+ Mask.reserve(VecTy->getNumElements());
+ for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+ if (i >= BeginIndex && i < EndIndex)
+ Mask.push_back(IRB.getInt32(i - BeginIndex));
+ else
+ Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
+ V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ ConstantVector::get(Mask), Name + ".expand");
+ DEBUG(dbgs() << " shuffle: " << *V << "\n");
+
+ Mask.clear();
+ for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+ Mask.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
+
+ V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend");
+
+ DEBUG(dbgs() << " blend: " << *V << "\n");
+ return V;
+}
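+
+// Illustrative example (not from the upstream source, hypothetical values):
+// inserting a <2 x float> %v at BeginIndex 1 of a <4 x float> %old would emit
+// roughly
+//   %exp = shufflevector <2 x float> %v, <2 x float> undef,
+//                        <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
+//   %sel = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>,
+//                 <4 x float> %exp, <4 x float> %old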
+
+/// \brief Visitor to rewrite instructions using a particular slice of an alloca
+/// to use a new alloca.
+///
+/// Also implements the rewriting to vector-based accesses when the partition
+/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
+/// lives here.
+class llvm::sroa::AllocaSliceRewriter
+ : public InstVisitor<AllocaSliceRewriter, bool> {
+ // Befriend the base class so it can delegate to private visit methods.
+ friend class InstVisitor<AllocaSliceRewriter, bool>;
+
+ using Base = InstVisitor<AllocaSliceRewriter, bool>;
+
+ const DataLayout &DL;
+ AllocaSlices &AS;
+ SROA &Pass;
+ AllocaInst &OldAI, &NewAI;
+ const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
+ Type *NewAllocaTy;
+
+ // This is a convenience and flag variable that will be null unless the new
+ // alloca's integer operations should be widened to this integer type due to
+ // passing isIntegerWideningViable above. If it is non-null, the desired
+ // integer type will be stored here for easy access during rewriting.
+ IntegerType *IntTy;
+
+ // If we are rewriting an alloca partition which can be written as pure
+ // vector operations, we stash extra information here. When VecTy is
+ // non-null, we have some strict guarantees about the rewritten alloca:
+ // - The new alloca is exactly the size of the vector type here.
+ // - The accesses all either map to the entire vector or to a single
+ // element.
+ // - The set of accessing instructions is only one of those handled above
+ // in isVectorPromotionViable. Generally these are the same access kinds
+ // which are promotable via mem2reg.
+ VectorType *VecTy;
+ Type *ElementTy;
+ uint64_t ElementSize;
+
+ // The original offset of the slice currently being rewritten relative to
+ // the original alloca.
+ uint64_t BeginOffset = 0;
+ uint64_t EndOffset = 0;
+
+ // The new offsets of the slice currently being rewritten relative to the
+ // original alloca.
+ uint64_t NewBeginOffset, NewEndOffset;
+
+ uint64_t SliceSize;
+ bool IsSplittable = false;
+ bool IsSplit = false;
+ Use *OldUse = nullptr;
+ Instruction *OldPtr = nullptr;
+
+ // Track post-rewrite users which are PHI nodes and Selects.
+ SmallSetVector<PHINode *, 8> &PHIUsers;
+ SmallSetVector<SelectInst *, 8> &SelectUsers;
+
+ // Utility IR builder, whose name prefix is set up for each visited use, and
+ // the insertion point is set to point to the user.
+ IRBuilderTy IRB;
+
+public:
+ AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
+ AllocaInst &OldAI, AllocaInst &NewAI,
+ uint64_t NewAllocaBeginOffset,
+ uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
+ VectorType *PromotableVecTy,
+ SmallSetVector<PHINode *, 8> &PHIUsers,
+ SmallSetVector<SelectInst *, 8> &SelectUsers)
+ : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
+ NewAllocaBeginOffset(NewAllocaBeginOffset),
+ NewAllocaEndOffset(NewAllocaEndOffset),
+ NewAllocaTy(NewAI.getAllocatedType()),
+ IntTy(IsIntegerPromotable
+ ? Type::getIntNTy(
+ NewAI.getContext(),
+ DL.getTypeSizeInBits(NewAI.getAllocatedType()))
+ : nullptr),
+ VecTy(PromotableVecTy),
+ ElementTy(VecTy ? VecTy->getElementType() : nullptr),
+ ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
+ PHIUsers(PHIUsers), SelectUsers(SelectUsers),
+ IRB(NewAI.getContext(), ConstantFolder()) {
+ if (VecTy) {
+ assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
+ "Only multiple-of-8 sized vector elements are viable");
+ ++NumVectorized;
+ }
+ assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
+ }
+
+ bool visit(AllocaSlices::const_iterator I) {
+ bool CanSROA = true;
+ BeginOffset = I->beginOffset();
+ EndOffset = I->endOffset();
+ IsSplittable = I->isSplittable();
+ IsSplit =
+ BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
+ DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
+ DEBUG(AS.printSlice(dbgs(), I, ""));
+ DEBUG(dbgs() << "\n");
+
+ // Compute the intersecting offset range.
+ assert(BeginOffset < NewAllocaEndOffset);
+ assert(EndOffset > NewAllocaBeginOffset);
+ NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
+ NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
+
+ SliceSize = NewEndOffset - NewBeginOffset;
+
+ OldUse = I->getUse();
+ OldPtr = cast<Instruction>(OldUse->get());
+
+ Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
+ IRB.SetInsertPoint(OldUserI);
+ IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
+ IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
+
+ CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
+ if (VecTy || IntTy)
+ assert(CanSROA);
+ return CanSROA;
+ }
+
+private:
+ // Make sure the other visit overloads are visible.
+ using Base::visit;
+
+ // Every instruction which can end up as a user must have a rewrite rule.
+ bool visitInstruction(Instruction &I) {
+ DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
+ llvm_unreachable("No rewrite rule for this instruction!");
+ }
+
+ Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
+ // Note that the offset computation can use BeginOffset or NewBeginOffset
+ // interchangeably for unsplit slices.
+ assert(IsSplit || BeginOffset == NewBeginOffset);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+
+#ifndef NDEBUG
+ StringRef OldName = OldPtr->getName();
+ // Skip through the last '.sroa.' component of the name.
+ size_t LastSROAPrefix = OldName.rfind(".sroa.");
+ if (LastSROAPrefix != StringRef::npos) {
+ OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
+ // Look for an SROA slice index.
+ size_t IndexEnd = OldName.find_first_not_of("0123456789");
+ if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
+ // Strip the index and look for the offset.
+ OldName = OldName.substr(IndexEnd + 1);
+ size_t OffsetEnd = OldName.find_first_not_of("0123456789");
+ if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
+ // Strip the offset.
+ OldName = OldName.substr(OffsetEnd + 1);
+ }
+ }
+ // Strip any SROA suffixes as well.
+ OldName = OldName.substr(0, OldName.find(".sroa_"));
+#endif
+
+ return getAdjustedPtr(IRB, DL, &NewAI,
+ APInt(DL.getPointerTypeSizeInBits(PointerTy), Offset),
+ PointerTy,
+#ifndef NDEBUG
+ Twine(OldName) + "."
+#else
+ Twine()
+#endif
+ );
+ }
+
+ /// \brief Compute suitable alignment to access this slice of the *new*
+ /// alloca.
+ ///
+ /// You can optionally pass a type to this routine and if that type's ABI
+ /// alignment is itself suitable, this will return zero.
+ unsigned getSliceAlign(Type *Ty = nullptr) {
+ unsigned NewAIAlign = NewAI.getAlignment();
+ if (!NewAIAlign)
+ NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType());
+ unsigned Align =
+ MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset);
+ return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align;
+ }
+
+ unsigned getIndex(uint64_t Offset) {
+ assert(VecTy && "Can only call getIndex when rewriting a vector");
+ uint64_t RelOffset = Offset - NewAllocaBeginOffset;
+ assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
+ uint32_t Index = RelOffset / ElementSize;
+ assert(Index * ElementSize == RelOffset);
+ return Index;
+ }
+
+ void deleteIfTriviallyDead(Value *V) {
+ Instruction *I = cast<Instruction>(V);
+ if (isInstructionTriviallyDead(I))
+ Pass.DeadInsts.insert(I);
+ }
+
+ Value *rewriteVectorizedLoadInst() {
+ unsigned BeginIndex = getIndex(NewBeginOffset);
+ unsigned EndIndex = getIndex(NewEndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+
+ Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+ return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
+ }
+
+ Value *rewriteIntegerLoad(LoadInst &LI) {
+ assert(IntTy && "We cannot insert an integer to the alloca");
+ assert(!LI.isVolatile());
+ Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+ V = convertValue(DL, IRB, V, IntTy);
+ assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
+ IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
+ V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
+ }
+ // It is possible that the extracted type is not the load type. This
+ // happens if there is a load past the end of the alloca, and as
+ // a consequence the slice is narrower but still a candidate for integer
+ // lowering. To handle this case, we just zero extend the extracted
+ // integer.
+ assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
+ "Can only handle an extract for an overly wide load");
+ if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
+ V = IRB.CreateZExt(V, LI.getType());
+ return V;
+ }
+
+ bool visitLoadInst(LoadInst &LI) {
+ DEBUG(dbgs() << " original: " << LI << "\n");
+ Value *OldOp = LI.getOperand(0);
+ assert(OldOp == OldPtr);
+
+ unsigned AS = LI.getPointerAddressSpace();
+
+ Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
+ : LI.getType();
+ const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize;
+ bool IsPtrAdjusted = false;
+ Value *V;
+ if (VecTy) {
+ V = rewriteVectorizedLoadInst();
+ } else if (IntTy && LI.getType()->isIntegerTy()) {
+ V = rewriteIntegerLoad(LI);
+ } else if (NewBeginOffset == NewAllocaBeginOffset &&
+ NewEndOffset == NewAllocaEndOffset &&
+ (canConvertValue(DL, NewAllocaTy, TargetTy) ||
+ (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
+ TargetTy->isIntegerTy()))) {
+ LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ LI.isVolatile(), LI.getName());
+ if (LI.isVolatile())
+ NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+
+ // Any !nonnull metadata or !range metadata on the old load is also valid
+ // on the new load. This is true in some cases even when the loads
+ // are different types, for example by mapping !nonnull metadata to
+ // !range metadata by modeling the null pointer constant converted to the
+ // integer type.
+ // FIXME: Add support for range metadata here. Currently the utilities
+ // for this don't propagate range metadata in trivial cases from one
+ // integer load to another, don't handle non-addrspace-0 null pointers
+ // correctly, and don't have any support for mapping ranges as the
+ // integer type becomes wider or narrower.
+ if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull))
+ copyNonnullMetadata(LI, N, *NewLI);
+
+ // Try to preserve nonnull metadata
+ V = NewLI;
+
+ // If this is an integer load past the end of the slice (which means the
+ // bytes outside the slice are undef or this load is dead) just forcibly
+ // fix the integer size with correct handling of endianness.
+ if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
+ if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
+ if (AITy->getBitWidth() < TITy->getBitWidth()) {
+ V = IRB.CreateZExt(V, TITy, "load.ext");
+ if (DL.isBigEndian())
+ V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
+ "endian_shift");
+ }
+ } else {
+ Type *LTy = TargetTy->getPointerTo(AS);
+ LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
+ getSliceAlign(TargetTy),
+ LI.isVolatile(), LI.getName());
+ if (LI.isVolatile())
+ NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+
+ V = NewLI;
+ IsPtrAdjusted = true;
+ }
+ V = convertValue(DL, IRB, V, TargetTy);
+
+ if (IsSplit) {
+ assert(!LI.isVolatile());
+ assert(LI.getType()->isIntegerTy() &&
+ "Only integer type loads and stores are split");
+ assert(SliceSize < DL.getTypeStoreSize(LI.getType()) &&
+ "Split load isn't smaller than original load");
+ assert(LI.getType()->getIntegerBitWidth() ==
+ DL.getTypeStoreSizeInBits(LI.getType()) &&
+ "Non-byte-multiple bit width");
+ // Move the insertion point just past the load so that we can refer to it.
+ IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
+ // Create a placeholder value with the same type as LI to use as the
+ // basis for the new value. This allows us to replace the uses of LI with
+ // the computed value, and then replace the placeholder with LI, leaving
+ // LI only used for this computation.
+ Value *Placeholder =
+ new LoadInst(UndefValue::get(LI.getType()->getPointerTo(AS)));
+ V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
+ "insert");
+ LI.replaceAllUsesWith(V);
+ Placeholder->replaceAllUsesWith(&LI);
+ Placeholder->deleteValue();
+ } else {
+ LI.replaceAllUsesWith(V);
+ }
+
+ Pass.DeadInsts.insert(&LI);
+ deleteIfTriviallyDead(OldOp);
+ DEBUG(dbgs() << " to: " << *V << "\n");
+ return !LI.isVolatile() && !IsPtrAdjusted;
+ }
+
+ bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) {
+ if (V->getType() != VecTy) {
+ unsigned BeginIndex = getIndex(NewBeginOffset);
+ unsigned EndIndex = getIndex(NewEndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+ Type *SliceTy = (NumElements == 1)
+ ? ElementTy
+ : VectorType::get(ElementTy, NumElements);
+ if (V->getType() != SliceTy)
+ V = convertValue(DL, IRB, V, SliceTy);
+
+ // Mix in the existing elements.
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+ V = insertVector(IRB, Old, V, BeginIndex, "vec");
+ }
+ StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ Pass.DeadInsts.insert(&SI);
+
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
+
+ bool rewriteIntegerStore(Value *V, StoreInst &SI) {
+ assert(IntTy && "We cannot extract an integer from the alloca");
+ assert(!SI.isVolatile());
+ if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+ Old = convertValue(DL, IRB, Old, IntTy);
+ assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+ V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
+ }
+ V = convertValue(DL, IRB, V, NewAllocaTy);
+ StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ Store->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
+ Pass.DeadInsts.insert(&SI);
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
+
+ bool visitStoreInst(StoreInst &SI) {
+ DEBUG(dbgs() << " original: " << SI << "\n");
+ Value *OldOp = SI.getOperand(1);
+ assert(OldOp == OldPtr);
+
+ Value *V = SI.getValueOperand();
+
+ // Strip all inbounds GEPs and pointer casts to try to dig out any root
+ // alloca that should be re-examined after promoting this alloca.
+ if (V->getType()->isPointerTy())
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
+ Pass.PostPromotionWorklist.insert(AI);
+
+ if (SliceSize < DL.getTypeStoreSize(V->getType())) {
+ assert(!SI.isVolatile());
+ assert(V->getType()->isIntegerTy() &&
+ "Only integer type loads and stores are split");
+ assert(V->getType()->getIntegerBitWidth() ==
+ DL.getTypeStoreSizeInBits(V->getType()) &&
+ "Non-byte-multiple bit width");
+ IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
+ V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
+ "extract");
+ }
+
+ if (VecTy)
+ return rewriteVectorizedStoreInst(V, SI, OldOp);
+ if (IntTy && V->getType()->isIntegerTy())
+ return rewriteIntegerStore(V, SI);
+
+ const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize;
+ StoreInst *NewSI;
+ if (NewBeginOffset == NewAllocaBeginOffset &&
+ NewEndOffset == NewAllocaEndOffset &&
+ (canConvertValue(DL, V->getType(), NewAllocaTy) ||
+ (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
+ V->getType()->isIntegerTy()))) {
+ // If this is an integer store past the end of slice (and thus the bytes
+ // past that point are irrelevant or this is unreachable), truncate the
+ // value prior to storing.
+ if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
+ if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
+ if (VITy->getBitWidth() > AITy->getBitWidth()) {
+ if (DL.isBigEndian())
+ V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
+ "endian_shift");
+ V = IRB.CreateTrunc(V, AITy, "load.trunc");
+ }
+
+ V = convertValue(DL, IRB, V, NewAllocaTy);
+ NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+ SI.isVolatile());
+ } else {
+ unsigned AS = SI.getPointerAddressSpace();
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
+ NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
+ SI.isVolatile());
+ }
+ NewSI->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
+ if (SI.isVolatile())
+ NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
+ Pass.DeadInsts.insert(&SI);
+ deleteIfTriviallyDead(OldOp);
+
+ DEBUG(dbgs() << " to: " << *NewSI << "\n");
+ return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
+ }
+
+ /// \brief Compute an integer value from splatting an i8 across the given
+ /// number of bytes.
+ ///
+ /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
+ /// call this routine.
+ /// FIXME: Heed the advice above.
+ ///
+ /// \param V The i8 value to splat.
+ /// \param Size The number of bytes in the output (assuming i8 is one byte)
+ Value *getIntegerSplat(Value *V, unsigned Size) {
+ assert(Size > 0 && "Expected a positive number of bytes.");
+ IntegerType *VTy = cast<IntegerType>(V->getType());
+ assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
+ if (Size == 1)
+ return V;
+
+ Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
+ V = IRB.CreateMul(
+ IRB.CreateZExt(V, SplatIntTy, "zext"),
+ ConstantExpr::getUDiv(
+ Constant::getAllOnesValue(SplatIntTy),
+ ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
+ SplatIntTy)),
+ "isplat");
+ return V;
+ }
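+
+ // Worked example (illustrative only): splatting the byte 0xAB across four
+ // bytes zero-extends it to i32 and multiplies it by 0xFFFFFFFF / 0xFF ==
+ // 0x01010101, yielding 0xABABABAB.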
+
+ /// \brief Compute a vector splat for a given element value.
+ Value *getVectorSplat(Value *V, unsigned NumElements) {
+ V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
+ DEBUG(dbgs() << " splat: " << *V << "\n");
+ return V;
+ }
+
+ bool visitMemSetInst(MemSetInst &II) {
+ DEBUG(dbgs() << " original: " << II << "\n");
+ assert(II.getRawDest() == OldPtr);
+
+ // If the memset has a variable size, it cannot be split, just adjust the
+ // pointer to the new alloca.
+ if (!isa<Constant>(II.getLength())) {
+ assert(!IsSplit);
+ assert(NewBeginOffset == BeginOffset);
+ II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
+ Type *CstTy = II.getAlignmentCst()->getType();
+ II.setAlignment(ConstantInt::get(CstTy, getSliceAlign()));
+
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+
+ // Record this instruction for deletion.
+ Pass.DeadInsts.insert(&II);
+
+ Type *AllocaTy = NewAI.getAllocatedType();
+ Type *ScalarTy = AllocaTy->getScalarType();
+
+ // If this doesn't map cleanly onto the alloca type, and that type isn't
+ // a single value type, just emit a memset.
+ if (!VecTy && !IntTy &&
+ (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
+ SliceSize != DL.getTypeStoreSize(AllocaTy) ||
+ !AllocaTy->isSingleValueType() ||
+ !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) ||
+ DL.getTypeSizeInBits(ScalarTy) % 8 != 0)) {
+ Type *SizeTy = II.getLength()->getType();
+ Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
+ CallInst *New = IRB.CreateMemSet(
+ getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
+ getSliceAlign(), II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return false;
+ }
+
+ // If we can represent this as a simple value, we have to build the actual
+ // value to store, which requires expanding the byte present in memset to
+ // a sensible representation for the alloca type. This is essentially
+ // splatting the byte to a sufficiently wide integer, splatting it across
+ // any desired vector width, and bitcasting to the final type.
+ Value *V;
+
+ if (VecTy) {
+ // If this is a memset of a vectorized alloca, insert it.
+ assert(ElementTy == ScalarTy);
+
+ unsigned BeginIndex = getIndex(NewBeginOffset);
+ unsigned EndIndex = getIndex(NewEndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+ Value *Splat =
+ getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8);
+ Splat = convertValue(DL, IRB, Splat, ElementTy);
+ if (NumElements > 1)
+ Splat = getVectorSplat(Splat, NumElements);
+
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+ V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
+ } else if (IntTy) {
+ // If this is a memset on an alloca where we can widen stores, insert the
+ // set integer.
+ assert(!II.isVolatile());
+
+ uint64_t Size = NewEndOffset - NewBeginOffset;
+ V = getIntegerSplat(II.getValue(), Size);
+
+ if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
+ EndOffset != NewAllocaEndOffset)) {
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+ Old = convertValue(DL, IRB, Old, IntTy);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ V = insertInteger(DL, IRB, Old, V, Offset, "insert");
+ } else {
+ assert(V->getType() == IntTy &&
+ "Wrong type for an alloca wide integer!");
+ }
+ V = convertValue(DL, IRB, V, AllocaTy);
+ } else {
+ // Established these invariants above.
+ assert(NewBeginOffset == NewAllocaBeginOffset);
+ assert(NewEndOffset == NewAllocaEndOffset);
+
+ V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8);
+ if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
+ V = getVectorSplat(V, AllocaVecTy->getNumElements());
+
+ V = convertValue(DL, IRB, V, AllocaTy);
+ }
+
+ Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+ II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return !II.isVolatile();
+ }
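+
+ // Illustrative example (not from the upstream source): a whole-partition
+ // memset of the byte 0xAB over a <4 x i32>-promotable alloca splats the
+ // byte to i32 0xABABABAB, replicates it across all four lanes, and emits a
+ // single <4 x i32> store to the new alloca, leaving the intrinsic dead.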
+
+ bool visitMemTransferInst(MemTransferInst &II) {
+ // Rewriting of memory transfer instructions can be a bit tricky. We break
+ // them into two categories: split intrinsics and unsplit intrinsics.
+
+ DEBUG(dbgs() << " original: " << II << "\n");
+
+ bool IsDest = &II.getRawDestUse() == OldUse;
+ assert((IsDest && II.getRawDest() == OldPtr) ||
+ (!IsDest && II.getRawSource() == OldPtr));
+
+ unsigned SliceAlign = getSliceAlign();
+
+ // For unsplit intrinsics, we simply modify the source and destination
+ // pointers in place. This isn't just an optimization, it is a matter of
+ // correctness. With unsplit intrinsics we may be dealing with transfers
+ // within a single alloca before SROA ran, or with transfers that have
+ // a variable length. We may also be dealing with memmove instead of
+ // memcpy, and so simply updating the pointers is all that is necessary to
+ // update both source and dest of a single call.
+ if (!IsSplittable) {
+ Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ if (IsDest)
+ II.setDest(AdjustedPtr);
+ else
+ II.setSource(AdjustedPtr);
+
+ if (II.getAlignment() > SliceAlign) {
+ Type *CstTy = II.getAlignmentCst()->getType();
+ II.setAlignment(
+ ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign)));
+ }
+
+ DEBUG(dbgs() << " to: " << II << "\n");
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+ // For split transfer intrinsics we have an incredibly useful assurance:
+ // the source and destination do not reside within the same alloca, and at
+ // least one of them does not escape. This means that we can replace
+ // memmove with memcpy, and we don't need to worry about all manner of
+ // downsides to splitting and transforming the operations.
+
+ // If this doesn't map cleanly onto the alloca type, and that type isn't
+ // a single value type, just emit a memcpy.
+ bool EmitMemCpy =
+ !VecTy && !IntTy &&
+ (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
+ SliceSize != DL.getTypeStoreSize(NewAI.getAllocatedType()) ||
+ !NewAI.getAllocatedType()->isSingleValueType());
+
+ // If we're just going to emit a memcpy, the alloca hasn't changed, and the
+ // size hasn't been shrunk based on analysis of the viable range, this is
+ // a no-op.
+ if (EmitMemCpy && &OldAI == &NewAI) {
+ // Ensure the start lines up.
+ assert(NewBeginOffset == BeginOffset);
+
+ // Rewrite the size as needed.
+ if (NewEndOffset != EndOffset)
+ II.setLength(ConstantInt::get(II.getLength()->getType(),
+ NewEndOffset - NewBeginOffset));
+ return false;
+ }
+ // Record this instruction for deletion.
+ Pass.DeadInsts.insert(&II);
+
+ // Strip all inbounds GEPs and pointer casts to try to dig out any root
+ // alloca that should be re-examined after rewriting this instruction.
+ Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
+ if (AllocaInst *AI =
+ dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
+ assert(AI != &OldAI && AI != &NewAI &&
+ "Splittable transfers cannot reach the same alloca on both ends.");
+ Pass.Worklist.insert(AI);
+ }
+
+ Type *OtherPtrTy = OtherPtr->getType();
+ unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
+
+ // Compute the relative offset for the other pointer within the transfer.
+ unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS);
+ APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
+ unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1,
+ OtherOffset.zextOrTrunc(64).getZExtValue());
+
+ if (EmitMemCpy) {
+ // Compute the other pointer, folding as much as possible to produce
+ // a single, simple GEP in most cases.
+ OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+ OtherPtr->getName() + ".");
+
+ Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ Type *SizeTy = II.getLength()->getType();
+ Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
+
+ CallInst *New = IRB.CreateMemCpy(
+ IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size,
+ MinAlign(SliceAlign, OtherAlign), II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return false;
+ }
+
+ bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
+ NewEndOffset == NewAllocaEndOffset;
+ uint64_t Size = NewEndOffset - NewBeginOffset;
+ unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
+ unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
+ unsigned NumElements = EndIndex - BeginIndex;
+ IntegerType *SubIntTy =
+ IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
+
+ // Reset the other pointer type to match the register type we're going to
+ // use, but using the address space of the original other pointer.
+ if (VecTy && !IsWholeAlloca) {
+ if (NumElements == 1)
+ OtherPtrTy = VecTy->getElementType();
+ else
+ OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements);
+
+ OtherPtrTy = OtherPtrTy->getPointerTo(OtherAS);
+ } else if (IntTy && !IsWholeAlloca) {
+ OtherPtrTy = SubIntTy->getPointerTo(OtherAS);
+ } else {
+ OtherPtrTy = NewAllocaTy->getPointerTo(OtherAS);
+ }
+
+ Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+ OtherPtr->getName() + ".");
+ unsigned SrcAlign = OtherAlign;
+ Value *DstPtr = &NewAI;
+ unsigned DstAlign = SliceAlign;
+ if (!IsDest) {
+ std::swap(SrcPtr, DstPtr);
+ std::swap(SrcAlign, DstAlign);
+ }
+
+ Value *Src;
+ if (VecTy && !IsWholeAlloca && !IsDest) {
+ Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+ Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
+ } else if (IntTy && !IsWholeAlloca && !IsDest) {
+ Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+ Src = convertValue(DL, IRB, Src, IntTy);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
+ } else {
+ Src =
+ IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");
+ }
+
+ if (VecTy && !IsWholeAlloca && IsDest) {
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+ Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
+ } else if (IntTy && !IsWholeAlloca && IsDest) {
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+ Old = convertValue(DL, IRB, Old, IntTy);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
+ Src = convertValue(DL, IRB, Src, NewAllocaTy);
+ }
+
+ StoreInst *Store = cast<StoreInst>(
+ IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return !II.isVolatile();
+ }
+
+ bool visitIntrinsicInst(IntrinsicInst &II) {
+ assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
+ II.getIntrinsicID() == Intrinsic::lifetime_end);
+ DEBUG(dbgs() << " original: " << II << "\n");
+ assert(II.getArgOperand(1) == OldPtr);
+
+ // Record this instruction for deletion.
+ Pass.DeadInsts.insert(&II);
+
+ // Lifetime intrinsics are only promotable if they cover the whole alloca.
+ // Therefore, we drop lifetime intrinsics which don't cover the whole
+ // alloca.
+ // (In theory, intrinsics which partially cover an alloca could be
+ // promoted, but PromoteMemToReg doesn't handle that case.)
+ // FIXME: Check whether the alloca is promotable before dropping the
+ // lifetime intrinsics?
+ if (NewBeginOffset != NewAllocaBeginOffset ||
+ NewEndOffset != NewAllocaEndOffset)
+ return true;
+
+ ConstantInt *Size =
+ ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
+ NewEndOffset - NewBeginOffset);
+ Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ Value *New;
+ if (II.getIntrinsicID() == Intrinsic::lifetime_start)
+ New = IRB.CreateLifetimeStart(Ptr, Size);
+ else
+ New = IRB.CreateLifetimeEnd(Ptr, Size);
+
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+
+ return true;
+ }
+
+ bool visitPHINode(PHINode &PN) {
+ DEBUG(dbgs() << " original: " << PN << "\n");
+ assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
+ assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
+
+ // We would like to compute a new pointer in only one place, but have it be
+ // as local as possible to the PHI. To do that, we re-use the location of
+ // the old pointer, which necessarily must be in the right position to
+ // dominate the PHI.
+ IRBuilderTy PtrBuilder(IRB);
+ if (isa<PHINode>(OldPtr))
+ PtrBuilder.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
+ else
+ PtrBuilder.SetInsertPoint(OldPtr);
+ PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc());
+
+ Value *NewPtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType());
+ // Replace the operands which were using the old pointer.
+ std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
+
+ DEBUG(dbgs() << " to: " << PN << "\n");
+ deleteIfTriviallyDead(OldPtr);
+
+ // PHIs can't be promoted on their own, but often can be speculated. We
+ // check the speculation outside of the rewriter so that we see the
+ // fully-rewritten alloca.
+ PHIUsers.insert(&PN);
+ return true;
+ }
+
+ bool visitSelectInst(SelectInst &SI) {
+ DEBUG(dbgs() << " original: " << SI << "\n");
+ assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
+ "Pointer isn't an operand!");
+ assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
+ assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
+
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ // Replace the operands which were using the old pointer.
+ if (SI.getOperand(1) == OldPtr)
+ SI.setOperand(1, NewPtr);
+ if (SI.getOperand(2) == OldPtr)
+ SI.setOperand(2, NewPtr);
+
+ DEBUG(dbgs() << " to: " << SI << "\n");
+ deleteIfTriviallyDead(OldPtr);
+
+ // Selects can't be promoted on their own, but often can be speculated. We
+ // check the speculation outside of the rewriter so that we see the
+ // fully-rewritten alloca.
+ SelectUsers.insert(&SI);
+ return true;
+ }
+};
+
+namespace {
+
+/// \brief Visitor to rewrite aggregate loads and stores as scalar.
+///
+/// This pass aggressively rewrites all aggregate loads and stores on
+/// a particular pointer (or any pointer derived from it which we can identify)
+/// with scalar loads and stores.
+class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
+ // Befriend the base class so it can delegate to private visit methods.
+ friend class InstVisitor<AggLoadStoreRewriter, bool>;
+
+ /// Queue of pointer uses to analyze and potentially rewrite.
+ SmallVector<Use *, 8> Queue;
+
+ /// Set to prevent us from cycling with phi nodes and loops.
+ SmallPtrSet<User *, 8> Visited;
+
+ /// The current pointer use being rewritten. This is used to dig up the used
+ /// value (as opposed to the user).
+ Use *U;
+
+public:
+ /// Rewrite loads and stores through a pointer and all pointers derived from
+ /// it.
+ bool rewrite(Instruction &I) {
+ DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
+ enqueueUsers(I);
+ bool Changed = false;
+ while (!Queue.empty()) {
+ U = Queue.pop_back_val();
+ Changed |= visit(cast<Instruction>(U->getUser()));
+ }
+ return Changed;
+ }
+
+private:
+ /// Enqueue all the users of the given instruction for further processing.
+ /// This uses a set to de-duplicate users.
+ void enqueueUsers(Instruction &I) {
+ for (Use &U : I.uses())
+ if (Visited.insert(U.getUser()).second)
+ Queue.push_back(&U);
+ }
+
+ // Conservative default is to not rewrite anything.
+ bool visitInstruction(Instruction &I) { return false; }
+
+ /// \brief Generic recursive split emission class.
+ template <typename Derived> class OpSplitter {
+ protected:
+ /// The builder used to form new instructions.
+ IRBuilderTy IRB;
+
+ /// The indices to be used with insert- or extractvalue to select the
+ /// appropriate value within the aggregate.
+ SmallVector<unsigned, 4> Indices;
+
+ /// The indices to a GEP instruction which will move Ptr to the correct slot
+ /// within the aggregate.
+ SmallVector<Value *, 4> GEPIndices;
+
+ /// The base pointer of the original op, used as a base for GEPing the
+ /// split operations.
+ Value *Ptr;
+
+ /// Initialize the splitter with an insertion point, Ptr and start with a
+ /// single zero GEP index.
+ OpSplitter(Instruction *InsertionPoint, Value *Ptr)
+ : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
+
+ public:
+ /// \brief Generic recursive split emission routine.
+ ///
+ /// This method recursively splits an aggregate op (load or store) into
+ /// scalar or vector ops. It splits recursively until it hits a single value
+ /// and emits that single value operation via the template argument.
+ ///
+ /// The logic of this routine relies on GEPs and insertvalue and
+ /// extractvalue all operating with the same fundamental index list, merely
+ /// formatted differently (GEPs need actual values).
+ ///
+ /// \param Ty The type being split recursively into smaller ops.
+ /// \param Agg The aggregate value being built up or stored, depending on
+ /// whether this is splitting a load or a store respectively.
+ void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
+ if (Ty->isSingleValueType())
+ return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);
+
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ unsigned OldSize = Indices.size();
+ (void)OldSize;
+ for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
+ ++Idx) {
+ assert(Indices.size() == OldSize && "Did not return to the old size");
+ Indices.push_back(Idx);
+ GEPIndices.push_back(IRB.getInt32(Idx));
+ emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
+ GEPIndices.pop_back();
+ Indices.pop_back();
+ }
+ return;
+ }
+
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ unsigned OldSize = Indices.size();
+ (void)OldSize;
+ for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
+ ++Idx) {
+ assert(Indices.size() == OldSize && "Did not return to the old size");
+ Indices.push_back(Idx);
+ GEPIndices.push_back(IRB.getInt32(Idx));
+ emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
+ GEPIndices.pop_back();
+ Indices.pop_back();
+ }
+ return;
+ }
+
+ llvm_unreachable("Only arrays and structs are aggregate loadable types");
+ }
+ };
+
+ struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
+ LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+ : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
+
+ /// Emit a leaf load of a single value. This is called at the leaves of the
+ /// recursive emission to actually load values.
+ void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+ assert(Ty->isSingleValueType());
+ // Load the single value and insert it using the indices.
+ Value *GEP =
+ IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
+ Value *Load = IRB.CreateLoad(GEP, Name + ".load");
+ Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
+ DEBUG(dbgs() << " to: " << *Load << "\n");
+ }
+ };
+
+ bool visitLoadInst(LoadInst &LI) {
+ assert(LI.getPointerOperand() == *U);
+ if (!LI.isSimple() || LI.getType()->isSingleValueType())
+ return false;
+
+ // We have an aggregate being loaded, split it apart.
+ DEBUG(dbgs() << " original: " << LI << "\n");
+ LoadOpSplitter Splitter(&LI, *U);
+ Value *V = UndefValue::get(LI.getType());
+ Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
+ LI.replaceAllUsesWith(V);
+ LI.eraseFromParent();
+ return true;
+ }
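+
+ // Illustrative example (not from the upstream source, hypothetical names):
+ // an aggregate load such as '%v = load { i32, float }, { i32, float }* %p'
+ // is split into roughly
+ //   %v.fca.0.gep = getelementptr inbounds { i32, float },
+ //                                         { i32, float }* %p, i32 0, i32 0
+ //   %v.fca.0.load = load i32, i32* %v.fca.0.gep
+ //   %v.fca.0.insert = insertvalue { i32, float } undef, i32 %v.fca.0.load, 0
+ // and likewise for field 1, after which %v is replaced by the final
+ // insertvalue.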
+
+ struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
+ StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+ : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
+
+ /// Emit a leaf store of a single value. This is called at the leaves of the
+ /// recursive emission to actually produce stores.
+ void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+ assert(Ty->isSingleValueType());
+ // Extract the single value and store it using the indices.
+ //
+ // The gep and extractvalue values are factored out of the CreateStore
+ // call to make the output independent of the argument evaluation order.
+ Value *ExtractValue =
+ IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
+ Value *InBoundsGEP =
+ IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
+ Value *Store = IRB.CreateStore(ExtractValue, InBoundsGEP);
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ }
+ };
+
+ bool visitStoreInst(StoreInst &SI) {
+ if (!SI.isSimple() || SI.getPointerOperand() != *U)
+ return false;
+ Value *V = SI.getValueOperand();
+ if (V->getType()->isSingleValueType())
+ return false;
+
+ // We have an aggregate being stored, split it apart.
+ DEBUG(dbgs() << " original: " << SI << "\n");
+ StoreOpSplitter Splitter(&SI, *U);
+ Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
+ SI.eraseFromParent();
+ return true;
+ }
+
+ bool visitBitCastInst(BitCastInst &BC) {
+ enqueueUsers(BC);
+ return false;
+ }
+
+ bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ enqueueUsers(GEPI);
+ return false;
+ }
+
+ bool visitPHINode(PHINode &PN) {
+ enqueueUsers(PN);
+ return false;
+ }
+
+ bool visitSelectInst(SelectInst &SI) {
+ enqueueUsers(SI);
+ return false;
+ }
+};
+
+} // end anonymous namespace
+
+/// \brief Strip aggregate type wrapping.
+///
+/// This removes no-op aggregate types wrapping an underlying type. It will
+/// strip as many layers of types as it can without changing either the type
+/// size or the allocated size.
+static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
+ if (Ty->isSingleValueType())
+ return Ty;
+
+ uint64_t AllocSize = DL.getTypeAllocSize(Ty);
+ uint64_t TypeSize = DL.getTypeSizeInBits(Ty);
+
+ Type *InnerTy;
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+ InnerTy = ArrTy->getElementType();
+ } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ unsigned Index = SL->getElementContainingOffset(0);
+ InnerTy = STy->getElementType(Index);
+ } else {
+ return Ty;
+ }
+
+ if (AllocSize > DL.getTypeAllocSize(InnerTy) ||
+ TypeSize > DL.getTypeSizeInBits(InnerTy))
+ return Ty;
+
+ return stripAggregateTypeWrapping(DL, InnerTy);
+}
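+
+// Illustrative example (not from the upstream source): a type such as
+// { { float } } strips down to float, since each wrapper has the same
+// allocated and bit size as its single inner element, while { i32, i32 } is
+// returned unchanged because its first element is strictly smaller than the
+// struct itself.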
+
+/// \brief Try to find a partition of the aggregate type passed in for a given
+/// offset and size.
+///
+/// This recurses through the aggregate type and tries to compute a subtype
+/// based on the offset and size. When the offset and size span a sub-section
+/// of an array, it will even compute a new array type for that sub-section,
+/// and the same for structs.
+///
+/// Note that this routine is very strict and tries to find a partition of the
+/// type which produces the *exact* right offset and size. It is not forgiving
+/// when the size or offset causes either end of the type-based partition to be off.
+/// Also, this is a best-effort routine. It is reasonable to give up and not
+/// return a type if necessary.
+static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
+ uint64_t Size) {
+ if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size)
+ return stripAggregateTypeWrapping(DL, Ty);
+ if (Offset > DL.getTypeAllocSize(Ty) ||
+ (DL.getTypeAllocSize(Ty) - Offset) < Size)
+ return nullptr;
+
+ if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
+ Type *ElementTy = SeqTy->getElementType();
+ uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
+ uint64_t NumSkippedElements = Offset / ElementSize;
+ if (NumSkippedElements >= SeqTy->getNumElements())
+ return nullptr;
+ Offset -= NumSkippedElements * ElementSize;
+
+ // First check if we need to recurse.
+ if (Offset > 0 || Size < ElementSize) {
+ // Bail if the partition ends in a different array element.
+ if ((Offset + Size) > ElementSize)
+ return nullptr;
+ // Recurse through the element type trying to peel off offset bytes.
+ return getTypePartition(DL, ElementTy, Offset, Size);
+ }
+ assert(Offset == 0);
+
+ if (Size == ElementSize)
+ return stripAggregateTypeWrapping(DL, ElementTy);
+ assert(Size > ElementSize);
+ uint64_t NumElements = Size / ElementSize;
+ if (NumElements * ElementSize != Size)
+ return nullptr;
+ return ArrayType::get(ElementTy, NumElements);
+ }
+
+ StructType *STy = dyn_cast<StructType>(Ty);
+ if (!STy)
+ return nullptr;
+
+ const StructLayout *SL = DL.getStructLayout(STy);
+ if (Offset >= SL->getSizeInBytes())
+ return nullptr;
+ uint64_t EndOffset = Offset + Size;
+ if (EndOffset > SL->getSizeInBytes())
+ return nullptr;
+
+ unsigned Index = SL->getElementContainingOffset(Offset);
+ Offset -= SL->getElementOffset(Index);
+
+ Type *ElementTy = STy->getElementType(Index);
+ uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
+ if (Offset >= ElementSize)
+ return nullptr; // The offset points into alignment padding.
+
+ // See if any partition must be contained by the element.
+ if (Offset > 0 || Size < ElementSize) {
+ if ((Offset + Size) > ElementSize)
+ return nullptr;
+ return getTypePartition(DL, ElementTy, Offset, Size);
+ }
+ assert(Offset == 0);
+
+ if (Size == ElementSize)
+ return stripAggregateTypeWrapping(DL, ElementTy);
+
+ StructType::element_iterator EI = STy->element_begin() + Index,
+ EE = STy->element_end();
+ if (EndOffset < SL->getSizeInBytes()) {
+ unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
+ if (Index == EndIndex)
+ return nullptr; // Within a single element and its padding.
+
+ // Don't try to form "natural" types if the elements don't line up with the
+ // expected size.
+ // FIXME: We could potentially recurse down through the last element in the
+ // sub-struct to find a natural end point.
+ if (SL->getElementOffset(EndIndex) != EndOffset)
+ return nullptr;
+
+ assert(Index < EndIndex);
+ EE = STy->element_begin() + EndIndex;
+ }
+
+ // Try to build up a sub-structure.
+ StructType *SubTy =
+ StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
+ const StructLayout *SubSL = DL.getStructLayout(SubTy);
+ if (Size != SubSL->getSizeInBytes())
+ return nullptr; // The sub-struct doesn't have quite the size needed.
+
+ return SubTy;
+}
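+
+// Illustrative note (added here; not from the original comments): assuming a
+// DataLayout where i32 and float are both 4 bytes, for
+//   %T = type { i32, [4 x float] }
+// getTypePartition(DL, %T, /*Offset=*/4, /*Size=*/8) yields [2 x float], a
+// sub-array of the second element, whereas Offset=2, Size=4 straddles the
+// boundary between the two elements and yields nullptr.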
+
+/// \brief Pre-split loads and stores to simplify rewriting.
+///
+/// We want to break up the splittable load+store pairs as much as
+/// possible. This is important to do as a preprocessing step, as once we
+/// start rewriting the accesses to partitions of the alloca we lose the
+/// necessary information to correctly split apart paired loads and stores
+/// which both point into this alloca. The case to consider is something like
+/// the following:
+///
+/// %a = alloca [12 x i8]
+/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
+/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
+/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
+/// %iptr1 = bitcast i8* %gep1 to i64*
+/// %iptr2 = bitcast i8* %gep2 to i64*
+/// %fptr1 = bitcast i8* %gep1 to float*
+/// %fptr2 = bitcast i8* %gep2 to float*
+/// %fptr3 = bitcast i8* %gep3 to float*
+/// store float 0.0, float* %fptr1
+/// store float 1.0, float* %fptr2
+/// %v = load i64* %iptr1
+/// store i64 %v, i64* %iptr2
+/// %f1 = load float* %fptr2
+/// %f2 = load float* %fptr3
+///
+/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
+/// promote everything so we recover the 2 SSA values that should have been
+/// there all along.
+///
+/// \returns true if any changes are made.
+bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
+ DEBUG(dbgs() << "Pre-splitting loads and stores\n");
+
+ // Track the loads and stores which are candidates for pre-splitting here, in
+ // the order they first appear during the partition scan. These give stable
+ // iteration order and a basis for tracking which loads and stores we
+ // actually split.
+ SmallVector<LoadInst *, 4> Loads;
+ SmallVector<StoreInst *, 4> Stores;
+
+ // We need to accumulate the splits required of each load or store where we
+ // can find them via a direct lookup. This is important to cross-check loads
+ // and stores against each other. We also track the slice so that we can kill
+ // all the slices that end up split.
+ struct SplitOffsets {
+ Slice *S;
+ std::vector<uint64_t> Splits;
+ };
+ SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
+
+ // Track loads out of this alloca which cannot, for any reason, be pre-split.
+ // This is important as we also cannot pre-split stores of those loads!
+ // FIXME: This is all pretty gross. It means that we can be more aggressive
+ // in pre-splitting when the load feeding the store happens to come from
+ // a separate alloca. Put another way, the effectiveness of SROA would be
+ // decreased by a frontend which just concatenated all of its local allocas
+ // into one big flat alloca. But defeating such patterns is exactly the job
+ // SROA is tasked with! Sadly, to not have this discrepancy we would have to
+ // change store pre-splitting to actually force pre-splitting of the load
+ // that feeds it *and all stores*. That makes pre-splitting much harder, but
+ // maybe it would make it more principled?
+ SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
+
+ DEBUG(dbgs() << " Searching for candidate loads and stores\n");
+ for (auto &P : AS.partitions()) {
+ for (Slice &S : P) {
+ Instruction *I = cast<Instruction>(S.getUse()->getUser());
+ if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
+ // If this is a load we have to track that it can't participate in any
+ // pre-splitting. If this is a store of a load we have to track that
+ // that load also can't participate in any pre-splitting.
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ UnsplittableLoads.insert(LI);
+ else if (auto *SI = dyn_cast<StoreInst>(I))
+ if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
+ UnsplittableLoads.insert(LI);
+ continue;
+ }
+ assert(P.endOffset() > S.beginOffset() &&
+ "Empty or backwards partition!");
+
+ // Determine if this is a pre-splittable slice.
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ assert(!LI->isVolatile() && "Cannot split volatile loads!");
+
+ // The load must be used exclusively to store into other pointers for
+ // us to be able to arbitrarily pre-split it. The stores must also be
+ // simple to avoid changing semantics.
+ auto IsLoadSimplyStored = [](LoadInst *LI) {
+ for (User *LU : LI->users()) {
+ auto *SI = dyn_cast<StoreInst>(LU);
+ if (!SI || !SI->isSimple())
+ return false;
+ }
+ return true;
+ };
+ if (!IsLoadSimplyStored(LI)) {
+ UnsplittableLoads.insert(LI);
+ continue;
+ }
+
+ Loads.push_back(LI);
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
+ // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
+ continue;
+ auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
+ if (!StoredLoad || !StoredLoad->isSimple())
+ continue;
+ assert(!SI->isVolatile() && "Cannot split volatile stores!");
+
+ Stores.push_back(SI);
+ } else {
+ // Other uses cannot be pre-split.
+ continue;
+ }
+
+ // Record the initial split.
+ DEBUG(dbgs() << " Candidate: " << *I << "\n");
+ auto &Offsets = SplitOffsetsMap[I];
+ assert(Offsets.Splits.empty() &&
+ "Should not have splits the first time we see an instruction!");
+ Offsets.S = &S;
+ Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
+ }
+
+ // Now scan the already split slices, and add a split for any of them which
+ // we're going to pre-split.
+ for (Slice *S : P.splitSliceTails()) {
+ auto SplitOffsetsMapI =
+ SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
+ if (SplitOffsetsMapI == SplitOffsetsMap.end())
+ continue;
+ auto &Offsets = SplitOffsetsMapI->second;
+
+ assert(Offsets.S == S && "Found a mismatched slice!");
+ assert(!Offsets.Splits.empty() &&
+ "Cannot have an empty set of splits on the second partition!");
+ assert(Offsets.Splits.back() ==
+ P.beginOffset() - Offsets.S->beginOffset() &&
+ "Previous split does not end where this one begins!");
+
+ // Record each split. The last partition's end isn't needed as the size
+ // of the slice dictates that.
+ if (S->endOffset() > P.endOffset())
+ Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
+ }
+ }
+
+ // We may have split loads where some of their stores are split stores. For
+ // such loads and stores, we can only pre-split them if their splits exactly
+ // match relative to their starting offset. We have to verify this prior to
+ // any rewriting.
+ Stores.erase(
+ llvm::remove_if(Stores,
+ [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
+ // Lookup the load we are storing in our map of split
+ // offsets.
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ // If it was completely unsplittable, then we're done,
+ // and this store can't be pre-split.
+ if (UnsplittableLoads.count(LI))
+ return true;
+
+ auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+ if (LoadOffsetsI == SplitOffsetsMap.end())
+ return false; // Unrelated loads are definitely safe.
+ auto &LoadOffsets = LoadOffsetsI->second;
+
+ // Now lookup the store's offsets.
+ auto &StoreOffsets = SplitOffsetsMap[SI];
+
+ // If the relative offsets of each split in the load and
+ // store match exactly, then we can split them and we
+ // don't need to remove them here.
+ if (LoadOffsets.Splits == StoreOffsets.Splits)
+ return false;
+
+ DEBUG(dbgs()
+ << " Mismatched splits for load and store:\n"
+ << " " << *LI << "\n"
+ << " " << *SI << "\n");
+
+ // We've found a store and load that we need to split
+ // with mismatched relative splits. Just give up on them
+ // and remove both instructions from our list of
+ // candidates.
+ UnsplittableLoads.insert(LI);
+ return true;
+ }),
+ Stores.end());
+ // Now we have to go *back* through all the stores, because a later store may
+ // have caused an earlier store's load to become unsplittable. If the load is
+ // unsplittable for the later store, then we can't rely on it being split for
+ // the earlier store either.
+ Stores.erase(llvm::remove_if(Stores,
+ [&UnsplittableLoads](StoreInst *SI) {
+ auto *LI =
+ cast<LoadInst>(SI->getValueOperand());
+ return UnsplittableLoads.count(LI);
+ }),
+ Stores.end());
+ // Once we've established all the loads that can't be split for some reason,
+ // filter any that made it into our list out.
+ Loads.erase(llvm::remove_if(Loads,
+ [&UnsplittableLoads](LoadInst *LI) {
+ return UnsplittableLoads.count(LI);
+ }),
+ Loads.end());
+
+ // If no loads or stores are left, there is no pre-splitting to be done for
+ // this alloca.
+ if (Loads.empty() && Stores.empty())
+ return false;
+
+ // From here on, we can't fail and will be building new accesses, so rig up
+ // an IR builder.
+ IRBuilderTy IRB(&AI);
+
+ // Collect the new slices which we will merge into the alloca slices.
+ SmallVector<Slice, 4> NewSlices;
+
+ // Track any allocas we end up splitting loads and stores for so we iterate
+ // on them.
+ SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
+
+ // At this point, we have collected all of the loads and stores we can
+ // pre-split, and the specific splits needed for them. We actually do the
+ // splitting in a specific order so that we can handle the case where one of
+ // the loads is the value operand to one of the stores.
+ //
+ // First, we rewrite all of the split loads, and just accumulate each split
+ // load in a parallel structure. We also build the slices for them and append
+ // them to the alloca slices.
+ SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
+ std::vector<LoadInst *> SplitLoads;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+ for (LoadInst *LI : Loads) {
+ SplitLoads.clear();
+
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t LoadSize = Ty->getBitWidth() / 8;
+ assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
+
+ auto &Offsets = SplitOffsetsMap[LI];
+ assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + LoadSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
+ IRB.SetInsertPoint(LI);
+
+ DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto AS = LI->getPointerAddressSpace();
+ auto *PartPtrTy = PartTy->getPointerTo(AS);
+ LoadInst *PLoad = IRB.CreateAlignedLoad(
+ getAdjustedPtr(IRB, DL, BasePtr,
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ PartPtrTy, BasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
+ LI->getName());
+ PLoad->copyMetadata(*LI, LLVMContext::MD_mem_parallel_loop_access);
+
+ // Append this load onto the list of split loads so we can find it later
+ // to rewrite the stores.
+ SplitLoads.push_back(PLoad);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
+ /*IsSplittable*/ false));
+ DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset() << "): " << *PLoad
+ << "\n");
+
+ // See if we've handled all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
+ }
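+
+ // For illustration (an added note, not from the original comments): in the
+ // example from the function comment above, the 8-byte "%v = load i64" has
+ // Offsets.Splits == {4}, so the loop above emits two i32 loads covering
+ // byte ranges [0, 4) and [4, 8) of the original slice.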
+
+ // Now that we have the split loads, do the slow walk over all uses of the
+ // load and rewrite them as split stores, or save the split loads to use
+ // below if the store is going to be split there anyway.
+ bool DeferredStores = false;
+ for (User *LU : LI->users()) {
+ StoreInst *SI = cast<StoreInst>(LU);
+ if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
+ DeferredStores = true;
+ DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n");
+ continue;
+ }
+
+ Value *StoreBasePtr = SI->getPointerOperand();
+ IRB.SetInsertPoint(SI);
+
+ DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
+
+ for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
+ LoadInst *PLoad = SplitLoads[Idx];
+ uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
+ auto *PartPtrTy =
+ PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
+
+ auto AS = SI->getPointerAddressSpace();
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad,
+ getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
+ PStore->copyMetadata(*LI, LLVMContext::MD_mem_parallel_loop_access);
+ DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this store, and we have to track any promotable alloca (indicated by
+ // a direct store) as needing to be resplit because it is no longer
+ // promotable.
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ StoreBasePtr->stripInBoundsOffsets())) {
+ Worklist.insert(OtherAI);
+ }
+
+ // Mark the original store as dead.
+ DeadInsts.insert(SI);
+ }
+
+ // Save the split loads if there are deferred stores among the users.
+ if (DeferredStores)
+ SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
+
+ // Mark the original load as dead and kill the original slice.
+ DeadInsts.insert(LI);
+ Offsets.S->kill();
+ }
+
+ // Second, we rewrite all of the split stores. At this point, we know that
+ // all loads from this alloca have been split already. For stores of such
+ // loads, we can simply look up the pre-existing split loads. For stores of
+ // other loads, we split those loads first and then write split stores of
+ // them.
+ for (StoreInst *SI : Stores) {
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t StoreSize = Ty->getBitWidth() / 8;
+ assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
+
+ auto &Offsets = SplitOffsetsMap[SI];
+ assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + StoreSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Value *LoadBasePtr = LI->getPointerOperand();
+ Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
+
+ DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
+
+ // Check whether we have an already split load.
+ auto SplitLoadsMapI = SplitLoadsMap.find(LI);
+ std::vector<LoadInst *> *SplitLoads = nullptr;
+ if (SplitLoadsMapI != SplitLoadsMap.end()) {
+ SplitLoads = &SplitLoadsMapI->second;
+ assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
+ "Too few split loads for the number of splits in the store!");
+ } else {
+ DEBUG(dbgs() << " of load: " << *LI << "\n");
+ }
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+
+ // Either lookup a split load or create one.
+ LoadInst *PLoad;
+ if (SplitLoads) {
+ PLoad = (*SplitLoads)[Idx];
+ } else {
+ IRB.SetInsertPoint(LI);
+ auto AS = LI->getPointerAddressSpace();
+ PLoad = IRB.CreateAlignedLoad(
+ getAdjustedPtr(IRB, DL, LoadBasePtr,
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ LoadPartPtrTy, LoadBasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
+ LI->getName());
+ }
+
+ // And store this partition.
+ IRB.SetInsertPoint(SI);
+ auto AS = SI->getPointerAddressSpace();
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad,
+ getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ StorePartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PStore->getOperandUse(PStore->getPointerOperandIndex()),
+ /*IsSplittable*/ false));
+ DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset() << "): " << *PStore
+ << "\n");
+ if (!SplitLoads) {
+ DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
+ }
+
+ // See if we've finished all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this load, which is only relevant if it isn't a load of this alloca and
+ // thus we didn't already split the loads above. We also have to keep track
+ // of any promotable allocas we split loads on as they can no longer be
+ // promoted.
+ if (!SplitLoads) {
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ LoadBasePtr->stripInBoundsOffsets())) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ Worklist.insert(OtherAI);
+ }
+ }
+
+ // Mark the original store as dead now that we've split it up and kill its
+ // slice. Note that we leave the original load in place unless this store
+ // was its only use. It may in turn be split up if it is an alloca load
+ // for some other alloca, but it may be a normal load. This may introduce
+ // redundant loads, but where those can be merged the rest of the optimizer
+ // should handle the merging, and this uncovers SSA splits which is more
+ // important. In practice, the original loads will almost always be fully
+ // split and removed eventually, and the splits will be merged by any
+ // trivial CSE, including instcombine.
+ if (LI->hasOneUse()) {
+ assert(*LI->user_begin() == SI && "Single use isn't this store!");
+ DeadInsts.insert(LI);
+ }
+ DeadInsts.insert(SI);
+ Offsets.S->kill();
+ }
+
+ // Remove the killed slices that have been pre-split.
+ AS.erase(llvm::remove_if(AS, [](const Slice &S) { return S.isDead(); }),
+ AS.end());
+
+ // Insert our new slices. This will sort and merge them into the sorted
+ // sequence.
+ AS.insert(NewSlices);
+
+ DEBUG(dbgs() << " Pre-split slices:\n");
+#ifndef NDEBUG
+ for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
+ DEBUG(AS.print(dbgs(), I, " "));
+#endif
+
+ // Finally, don't try to promote any allocas that now require re-splitting.
+ // They have already been added to the worklist above.
+ PromotableAllocas.erase(
+ llvm::remove_if(
+ PromotableAllocas,
+ [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
+ PromotableAllocas.end());
+
+ return true;
+}
+
+/// \brief Rewrite an alloca partition's users.
+///
+/// This routine drives both of the rewriting goals of the SROA pass. It tries
+/// to rewrite uses of an alloca partition to be conducive for SSA value
+/// promotion. If the partition needs a new, more refined alloca, this will
+/// build that new alloca, preserving as much type information as possible, and
+/// rewrite the uses of the old alloca to point at the new one and have the
+/// appropriate new offsets. It also evaluates how successful the rewrite was
+/// at enabling promotion and if it was successful queues the alloca to be
+/// promoted.
+AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+ Partition &P) {
+ // Try to compute a friendly type for this partition of the alloca. This
+ // won't always succeed, in which case we fall back to a legal integer type
+ // or an i8 array of an appropriate size.
+ Type *SliceTy = nullptr;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+ if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
+ if (DL.getTypeAllocSize(CommonUseTy) >= P.size())
+ SliceTy = CommonUseTy;
+ if (!SliceTy)
+ if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
+ P.beginOffset(), P.size()))
+ SliceTy = TypePartitionTy;
+ if ((!SliceTy || (SliceTy->isArrayTy() &&
+ SliceTy->getArrayElementType()->isIntegerTy())) &&
+ DL.isLegalInteger(P.size() * 8))
+ SliceTy = Type::getIntNTy(*C, P.size() * 8);
+ if (!SliceTy)
+ SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
+ assert(DL.getTypeAllocSize(SliceTy) >= P.size());
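+
+ // For illustration (an added note, not from the original comments): a
+ // 3-byte partition with no common use type and no matching sub-type of the
+ // alloca typically ends up as [3 x i8] here, since i24 is not a legal
+ // integer type on most targets.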
+
+ bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
+
+ VectorType *VecTy =
+ IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
+ if (VecTy)
+ SliceTy = VecTy;
+
+ // Check for the case where we're going to rewrite to a new alloca of the
+ // exact same type as the original, and with the same access offsets. In that
+ // case, re-use the existing alloca, but still run through the rewriter to
+ // perform phi and select speculation.
+ AllocaInst *NewAI;
+ if (SliceTy == AI.getAllocatedType()) {
+ assert(P.beginOffset() == 0 &&
+ "Non-zero begin offset but same alloca type");
+ NewAI = &AI;
+ // FIXME: We should be able to bail at this point with "nothing changed".
+ // FIXME: We might want to defer PHI speculation until after here.
+ // FIXME: return nullptr;
+ } else {
+ unsigned Alignment = AI.getAlignment();
+ if (!Alignment) {
+ // The minimum alignment which users can rely on when the explicit
+ // alignment is omitted or zero is that required by the ABI for this
+ // type.
+ Alignment = DL.getABITypeAlignment(AI.getAllocatedType());
+ }
+ Alignment = MinAlign(Alignment, P.beginOffset());
+ // If we will get at least this much alignment from the type alone, leave
+ // the alloca's alignment unconstrained.
+ if (Alignment <= DL.getABITypeAlignment(SliceTy))
+ Alignment = 0;
+ NewAI = new AllocaInst(
+ SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment,
+ AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
+ ++NumNewAllocas;
+ }
+
+ DEBUG(dbgs() << "Rewriting alloca partition "
+ << "[" << P.beginOffset() << "," << P.endOffset()
+ << ") to: " << *NewAI << "\n");
+
+ // Track the high watermark on the worklist as it is only relevant for
+ // promoted allocas. We will reset it to this point if the alloca is not in
+ // fact scheduled for promotion.
+ unsigned PPWOldSize = PostPromotionWorklist.size();
+ unsigned NumUses = 0;
+ SmallSetVector<PHINode *, 8> PHIUsers;
+ SmallSetVector<SelectInst *, 8> SelectUsers;
+
+ AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
+ P.endOffset(), IsIntegerPromotable, VecTy,
+ PHIUsers, SelectUsers);
+ bool Promotable = true;
+ for (Slice *S : P.splitSliceTails()) {
+ Promotable &= Rewriter.visit(S);
+ ++NumUses;
+ }
+ for (Slice &S : P) {
+ Promotable &= Rewriter.visit(&S);
+ ++NumUses;
+ }
+
+ NumAllocaPartitionUses += NumUses;
+ MaxUsesPerAllocaPartition.updateMax(NumUses);
+
+ // Now that we've processed all the slices in the new partition, check if any
+ // PHIs or Selects would block promotion.
+ for (PHINode *PHI : PHIUsers)
+ if (!isSafePHIToSpeculate(*PHI)) {
+ Promotable = false;
+ PHIUsers.clear();
+ SelectUsers.clear();
+ break;
+ }
+
+ for (SelectInst *Sel : SelectUsers)
+ if (!isSafeSelectToSpeculate(*Sel)) {
+ Promotable = false;
+ PHIUsers.clear();
+ SelectUsers.clear();
+ break;
+ }
+
+ if (Promotable) {
+ if (PHIUsers.empty() && SelectUsers.empty()) {
+ // Promote the alloca.
+ PromotableAllocas.push_back(NewAI);
+ } else {
+ // If we have either PHIs or Selects to speculate, add them to those
+ // worklists and re-queue the new alloca so that we promote it on the
+ // next iteration.
+ for (PHINode *PHIUser : PHIUsers)
+ SpeculatablePHIs.insert(PHIUser);
+ for (SelectInst *SelectUser : SelectUsers)
+ SpeculatableSelects.insert(SelectUser);
+ Worklist.insert(NewAI);
+ }
+ } else {
+ // Drop any post-promotion work items if promotion didn't happen.
+ while (PostPromotionWorklist.size() > PPWOldSize)
+ PostPromotionWorklist.pop_back();
+
+ // We couldn't promote and we didn't create a new partition, nothing
+ // happened.
+ if (NewAI == &AI)
+ return nullptr;
+
+ // If we can't promote the alloca, iterate on it to check for new
+ // refinements exposed by splitting the current alloca. Don't iterate on an
+ // alloca which didn't actually change and didn't get promoted.
+ Worklist.insert(NewAI);
+ }
+
+ return NewAI;
+}
+
+/// \brief Walks the slices of an alloca and forms partitions based on them,
+/// rewriting each of their uses.
+bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
+ if (AS.begin() == AS.end())
+ return false;
+
+ unsigned NumPartitions = 0;
+ bool Changed = false;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+
+ // First try to pre-split loads and stores.
+ Changed |= presplitLoadsAndStores(AI, AS);
+
+ // Now that we have identified any pre-splitting opportunities,
+ // mark loads and stores unsplittable except for the following case.
+ // We leave a slice splittable if all other slices are disjoint or fully
+ // included in the slice, such as whole-alloca loads and stores.
+ // If we fail to split these during pre-splitting, we want to force them
+ // to be rewritten into a partition.
+ bool IsSorted = true;
+
+ uint64_t AllocaSize = DL.getTypeAllocSize(AI.getAllocatedType());
+ const uint64_t MaxBitVectorSize = 1024;
+ if (SROASplitNonWholeAllocaSlices && AllocaSize <= MaxBitVectorSize) {
+ // If a byte boundary is included in any load or store, a slice starting or
+ // ending at the boundary is not splittable.
+ SmallBitVector SplittableOffset(AllocaSize + 1, true);
+ for (Slice &S : AS)
+ for (unsigned O = S.beginOffset() + 1;
+ O < S.endOffset() && O < AllocaSize; O++)
+ SplittableOffset.reset(O);
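+
+ // For illustration (an added note, not from the original comments): each
+ // slice clears the bits for the byte offsets strictly inside it, so after
+ // the loop above SplittableOffset[O] is set only if no access straddles
+ // offset O; e.g. a 4-byte access covering [2, 6) of an 8-byte alloca
+ // clears bits 3 through 5.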
+
+ for (Slice &S : AS) {
+ if (!S.isSplittable())
+ continue;
+
+ if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
+ (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
+ continue;
+
+ if (isa<LoadInst>(S.getUse()->getUser()) ||
+ isa<StoreInst>(S.getUse()->getUser())) {
+ S.makeUnsplittable();
+ IsSorted = false;
+ }
+ }
+ } else {
+ // We only allow whole-alloca splittable loads and stores
+ // for a large alloca to avoid creating too large a BitVector.
+ for (Slice &S : AS) {
+ if (!S.isSplittable())
+ continue;
+
+ if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
+ continue;
+
+ if (isa<LoadInst>(S.getUse()->getUser()) ||
+ isa<StoreInst>(S.getUse()->getUser())) {
+ S.makeUnsplittable();
+ IsSorted = false;
+ }
+ }
+ }
+
+ if (!IsSorted)
+ std::sort(AS.begin(), AS.end());
+
+ /// Describes the allocas introduced by rewritePartition in order to migrate
+ /// the debug info.
+ struct Fragment {
+ AllocaInst *Alloca;
+ uint64_t Offset;
+ uint64_t Size;
+ Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
+ : Alloca(AI), Offset(O), Size(S) {}
+ };
+ SmallVector<Fragment, 4> Fragments;
+
+ // Rewrite each partition.
+ for (auto &P : AS.partitions()) {
+ if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
+ Changed = true;
+ if (NewAI != &AI) {
+ uint64_t SizeOfByte = 8;
+ uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType());
+ // Don't include any padding.
+ uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
+ Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
+ }
+ }
+ ++NumPartitions;
+ }
+
+ NumAllocaPartitions += NumPartitions;
+ MaxPartitionsPerAlloca.updateMax(NumPartitions);
+
+ // Migrate debug information from the old alloca to the new alloca(s)
+ // and the individual partitions.
+ TinyPtrVector<DbgInfoIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI);
+ if (!DbgDeclares.empty()) {
+ auto *Var = DbgDeclares.front()->getVariable();
+ auto *Expr = DbgDeclares.front()->getExpression();
+ auto VarSize = Var->getSizeInBits();
+ DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
+ uint64_t AllocaSize = DL.getTypeSizeInBits(AI.getAllocatedType());
+ for (auto Fragment : Fragments) {
+ // Create a fragment expression describing the new partition or reuse AI's
+ // expression if there is only one partition.
+ auto *FragmentExpr = Expr;
+ if (Fragment.Size < AllocaSize || Expr->isFragment()) {
+ // If this alloca is already a scalar replacement of a larger aggregate,
+ // Fragment.Offset describes the offset inside the scalar.
+ auto ExprFragment = Expr->getFragmentInfo();
+ uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0;
+ uint64_t Start = Offset + Fragment.Offset;
+ uint64_t Size = Fragment.Size;
+ if (ExprFragment) {
+ uint64_t AbsEnd =
+ ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
+ if (Start >= AbsEnd)
+ // No need to describe a SROAed padding.
+ continue;
+ Size = std::min(Size, AbsEnd - Start);
+ }
+ // The new, smaller fragment is stenciled out from the old fragment.
+ if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
+ assert(Start >= OrigFragment->OffsetInBits &&
+ "new fragment is outside of original fragment");
+ Start -= OrigFragment->OffsetInBits;
+ }
+
+ // The alloca may be larger than the variable.
+ if (VarSize) {
+ if (Size > *VarSize)
+ Size = *VarSize;
+ if (Size == 0 || Start + Size > *VarSize)
+ continue;
+ }
+
+ // Avoid creating a fragment expression that covers the entire variable.
+ if (!VarSize || *VarSize != Size) {
+ if (auto E =
+ DIExpression::createFragmentExpression(Expr, Start, Size))
+ FragmentExpr = *E;
+ else
+ continue;
+ }
+ }
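+
+ // For illustration (an added note, not from the original comments): if a
+ // 64-bit variable was backed by an alloca split into two 32-bit
+ // partitions, the first declare gets a fragment covering bits [0, 32) and
+ // the second one covers bits [32, 64).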
+
+ // Remove any existing intrinsics describing the same alloca.
+ for (DbgInfoIntrinsic *OldDII : FindDbgAddrUses(Fragment.Alloca))
+ OldDII->eraseFromParent();
+
+ DIB.insertDeclare(Fragment.Alloca, Var, FragmentExpr,
+ DbgDeclares.front()->getDebugLoc(), &AI);
+ }
+ }
+ return Changed;
+}
+
+/// \brief Clobber a use with undef, deleting the used value if it becomes dead.
+void SROA::clobberUse(Use &U) {
+ Value *OldV = U;
+ // Replace the use with an undef value.
+ U = UndefValue::get(OldV->getType());
+
+ // Check for this making an instruction dead. We have to garbage collect
+ // all the dead instructions to ensure the uses of any alloca end up being
+ // minimal.
+ if (Instruction *OldI = dyn_cast<Instruction>(OldV))
+ if (isInstructionTriviallyDead(OldI)) {
+ DeadInsts.insert(OldI);
+ }
+}
+
+/// \brief Analyze an alloca for SROA.
+///
+/// This analyzes the alloca to ensure we can reason about it, builds
+/// the slices of the alloca, and then hands it off to be split and
+/// rewritten as needed.
+bool SROA::runOnAlloca(AllocaInst &AI) {
+ DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
+ ++NumAllocasAnalyzed;
+
+ // Special case dead allocas, as they're trivial.
+ if (AI.use_empty()) {
+ AI.eraseFromParent();
+ return true;
+ }
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+
+ // Skip alloca forms that this analysis can't handle.
+ if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
+ DL.getTypeAllocSize(AI.getAllocatedType()) == 0)
+ return false;
+
+ bool Changed = false;
+
+ // First, split any FCA loads and stores touching this alloca to promote
+ // better splitting and promotion opportunities.
+ AggLoadStoreRewriter AggRewriter;
+ Changed |= AggRewriter.rewrite(AI);
+
+ // Build the slices using a recursive instruction-visiting builder.
+ AllocaSlices AS(DL, AI);
+ DEBUG(AS.print(dbgs()));
+ if (AS.isEscaped())
+ return Changed;
+
+ // Delete all the dead users of this alloca before splitting and rewriting it.
+ for (Instruction *DeadUser : AS.getDeadUsers()) {
+ // Free up everything used by this instruction.
+ for (Use &DeadOp : DeadUser->operands())
+ clobberUse(DeadOp);
+
+ // Now replace the uses of this instruction.
+ DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
+
+ // And mark it for deletion.
+ DeadInsts.insert(DeadUser);
+ Changed = true;
+ }
+ for (Use *DeadOp : AS.getDeadOperands()) {
+ clobberUse(*DeadOp);
+ Changed = true;
+ }
+
+ // No slices to split. Leave the dead alloca for a later pass to clean up.
+ if (AS.begin() == AS.end())
+ return Changed;
+
+ Changed |= splitAlloca(AI, AS);
+
+ DEBUG(dbgs() << " Speculating PHIs\n");
+ while (!SpeculatablePHIs.empty())
+ speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
+
+ DEBUG(dbgs() << " Speculating Selects\n");
+ while (!SpeculatableSelects.empty())
+ speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
+
+ return Changed;
+}
+
+/// \brief Delete the dead instructions accumulated in this run.
+///
+/// Recursively deletes the dead instructions we've accumulated. This is done
+/// at the very end to maximize locality of the recursive delete and to
+/// minimize the problems of invalidated instruction pointers as such pointers
+/// are used heavily in the intermediate stages of the algorithm.
+///
+/// We also record the alloca instructions deleted here so that they aren't
+/// subsequently handed to mem2reg to promote.
+bool SROA::deleteDeadInstructions(
+ SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
+ bool Changed = false;
+ while (!DeadInsts.empty()) {
+ Instruction *I = DeadInsts.pop_back_val();
+ DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
+
+ // If the instruction is an alloca, find the possible dbg.declare connected
+ // to it, and remove it too. We must do this before calling RAUW or we will
+ // not be able to find it.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+ DeletedAllocas.insert(AI);
+ for (DbgInfoIntrinsic *OldDII : FindDbgAddrUses(AI))
+ OldDII->eraseFromParent();
+ }
+
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+
+ for (Use &Operand : I->operands())
+ if (Instruction *U = dyn_cast<Instruction>(Operand)) {
+ // Zero out the operand and see if it becomes trivially dead.
+ Operand = nullptr;
+ if (isInstructionTriviallyDead(U))
+ DeadInsts.insert(U);
+ }
+
+ ++NumDeleted;
+ I->eraseFromParent();
+ Changed = true;
+ }
+ return Changed;
+}
+
+/// \brief Promote the allocas, using the best available technique.
+///
+/// This attempts to promote whatever allocas have been identified as viable in
+/// the PromotableAllocas list. If that list is empty, there is nothing to do.
+/// This function returns whether any promotion occurred.
+bool SROA::promoteAllocas(Function &F) {
+ if (PromotableAllocas.empty())
+ return false;
+
+ NumPromoted += PromotableAllocas.size();
+
+ DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
+ PromoteMemToReg(PromotableAllocas, *DT, AC);
+ PromotableAllocas.clear();
+ return true;
+}
+
+PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
+ AssumptionCache &RunAC) {
+ DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
+ C = &F.getContext();
+ DT = &RunDT;
+ AC = &RunAC;
+
+ BasicBlock &EntryBB = F.getEntryBlock();
+ for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
+ I != E; ++I) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+ Worklist.insert(AI);
+ }
+
+ bool Changed = false;
+ // A set of deleted alloca instruction pointers which should be removed from
+ // the list of promotable allocas.
+ SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
+
+ do {
+ while (!Worklist.empty()) {
+ Changed |= runOnAlloca(*Worklist.pop_back_val());
+ Changed |= deleteDeadInstructions(DeletedAllocas);
+
+ // Remove the deleted allocas from various lists so that we don't try to
+ // continue processing them.
+ if (!DeletedAllocas.empty()) {
+ auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
+ Worklist.remove_if(IsInSet);
+ PostPromotionWorklist.remove_if(IsInSet);
+ PromotableAllocas.erase(llvm::remove_if(PromotableAllocas, IsInSet),
+ PromotableAllocas.end());
+ DeletedAllocas.clear();
+ }
+ }
+
+ Changed |= promoteAllocas(F);
+
+ Worklist = PostPromotionWorklist;
+ PostPromotionWorklist.clear();
+ } while (!Worklist.empty());
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
+ return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
+ AM.getResult<AssumptionAnalysis>(F));
+}
+
+/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
+///
+/// This is in the llvm namespace purely to allow it to be a friend of the \c
+/// SROA pass.
+class llvm::sroa::SROALegacyPass : public FunctionPass {
+ /// The SROA implementation.
+ SROA Impl;
+
+public:
+ static char ID;
+
+ SROALegacyPass() : FunctionPass(ID) {
+ initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto PA = Impl.runImpl(
+ F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
+ return !PA.areAllPreserved();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+ StringRef getPassName() const override { return "SROA"; }
+};
+
+char SROALegacyPass::ID = 0;
+
+FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); }
+
+INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
+ "Scalar Replacement Of Aggregates", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
+ false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
new file mode 100644
index 000000000000..3b99ddff2e06
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -0,0 +1,282 @@
+//===-- Scalar.cpp --------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMScalarOpts.a, which
+// implements several scalar transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Scalar.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+
+using namespace llvm;
+
+/// initializeScalarOpts - Initialize all passes linked into the
+/// ScalarOpts library.
+void llvm::initializeScalarOpts(PassRegistry &Registry) {
+ initializeADCELegacyPassPass(Registry);
+ initializeBDCELegacyPassPass(Registry);
+ initializeAlignmentFromAssumptionsPass(Registry);
+ initializeCallSiteSplittingLegacyPassPass(Registry);
+ initializeConstantHoistingLegacyPassPass(Registry);
+ initializeConstantPropagationPass(Registry);
+ initializeCorrelatedValuePropagationPass(Registry);
+ initializeDCELegacyPassPass(Registry);
+ initializeDeadInstEliminationPass(Registry);
+ initializeDivRemPairsLegacyPassPass(Registry);
+ initializeScalarizerPass(Registry);
+ initializeDSELegacyPassPass(Registry);
+ initializeGuardWideningLegacyPassPass(Registry);
+ initializeGVNLegacyPassPass(Registry);
+ initializeNewGVNLegacyPassPass(Registry);
+ initializeEarlyCSELegacyPassPass(Registry);
+ initializeEarlyCSEMemSSALegacyPassPass(Registry);
+ initializeGVNHoistLegacyPassPass(Registry);
+ initializeGVNSinkLegacyPassPass(Registry);
+ initializeFlattenCFGPassPass(Registry);
+ initializeInductiveRangeCheckEliminationPass(Registry);
+ initializeIndVarSimplifyLegacyPassPass(Registry);
+ initializeInferAddressSpacesPass(Registry);
+ initializeJumpThreadingPass(Registry);
+ initializeLegacyLICMPassPass(Registry);
+ initializeLegacyLoopSinkPassPass(Registry);
+ initializeLoopDataPrefetchLegacyPassPass(Registry);
+ initializeLoopDeletionLegacyPassPass(Registry);
+ initializeLoopAccessLegacyAnalysisPass(Registry);
+ initializeLoopInstSimplifyLegacyPassPass(Registry);
+ initializeLoopInterchangePass(Registry);
+ initializeLoopPredicationLegacyPassPass(Registry);
+ initializeLoopRotateLegacyPassPass(Registry);
+ initializeLoopStrengthReducePass(Registry);
+ initializeLoopRerollPass(Registry);
+ initializeLoopUnrollPass(Registry);
+ initializeLoopUnswitchPass(Registry);
+ initializeLoopVersioningLICMPass(Registry);
+ initializeLoopIdiomRecognizeLegacyPassPass(Registry);
+ initializeLowerAtomicLegacyPassPass(Registry);
+ initializeLowerExpectIntrinsicPass(Registry);
+ initializeLowerGuardIntrinsicLegacyPassPass(Registry);
+ initializeMemCpyOptLegacyPassPass(Registry);
+ initializeMergeICmpsPass(Registry);
+ initializeMergedLoadStoreMotionLegacyPassPass(Registry);
+ initializeNaryReassociateLegacyPassPass(Registry);
+ initializePartiallyInlineLibCallsLegacyPassPass(Registry);
+ initializeReassociateLegacyPassPass(Registry);
+ initializeRegToMemPass(Registry);
+ initializeRewriteStatepointsForGCLegacyPassPass(Registry);
+ initializeSCCPLegacyPassPass(Registry);
+ initializeIPSCCPLegacyPassPass(Registry);
+ initializeSROALegacyPassPass(Registry);
+ initializeCFGSimplifyPassPass(Registry);
+ initializeStructurizeCFGPass(Registry);
+ initializeSimpleLoopUnswitchLegacyPassPass(Registry);
+ initializeSinkingLegacyPassPass(Registry);
+ initializeTailCallElimPass(Registry);
+ initializeSeparateConstOffsetFromGEPPass(Registry);
+ initializeSpeculativeExecutionLegacyPassPass(Registry);
+ initializeStraightLineStrengthReducePass(Registry);
+ initializePlaceBackedgeSafepointsImplPass(Registry);
+ initializePlaceSafepointsPass(Registry);
+ initializeFloat2IntLegacyPassPass(Registry);
+ initializeLoopDistributeLegacyPass(Registry);
+ initializeLoopLoadEliminationPass(Registry);
+ initializeLoopSimplifyCFGLegacyPassPass(Registry);
+ initializeLoopVersioningPassPass(Registry);
+ initializeEntryExitInstrumenterPass(Registry);
+ initializePostInlineEntryExitInstrumenterPass(Registry);
+}
+
+void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
+ initializeScalarOpts(*unwrap(R));
+}
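+
+// Illustrative usage sketch (added here; not part of the original source):
+// a client of the C bindings below might schedule these passes roughly as
+// follows, where Mod is an existing LLVMModuleRef:
+//
+//   LLVMPassManagerRef PM = LLVMCreatePassManager();
+//   LLVMAddScalarReplAggregatesPass(PM); // runs SROA
+//   LLVMAddEarlyCSEPass(PM);
+//   LLVMRunPassManager(PM, Mod);
+//   LLVMDisposePassManager(PM);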
+
+void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAggressiveDCEPass());
+}
+
+void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createBitTrackingDCEPass());
+}
+
+void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAlignmentFromAssumptionsPass());
+}
+
+void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createCFGSimplificationPass(1, false, false, true));
+}
+
+void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDeadStoreEliminationPass());
+}
+
+void LLVMAddScalarizerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createScalarizerPass());
+}
+
+void LLVMAddGVNPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGVNPass());
+}
+
+void LLVMAddNewGVNPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createNewGVNPass());
+}
+
+void LLVMAddMergedLoadStoreMotionPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createMergedLoadStoreMotionPass());
+}
+
+void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createIndVarSimplifyPass());
+}
+
+void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createInstructionCombiningPass());
+}
+
+void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createJumpThreadingPass());
+}
+
+void LLVMAddLoopSinkPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopSinkPass());
+}
+
+void LLVMAddLICMPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLICMPass());
+}
+
+void LLVMAddLoopDeletionPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopDeletionPass());
+}
+
+void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopIdiomPass());
+}
+
+void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopRotatePass());
+}
+
+void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopRerollPass());
+}
+
+void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopSimplifyCFGPass());
+}
+
+void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopUnrollPass());
+}
+
+void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopUnswitchPass());
+}
+
+void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createMemCpyOptPass());
+}
+
+void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPartiallyInlineLibCallsPass());
+}
+
+void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerSwitchPass());
+}
+
+void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPromoteMemoryToRegisterPass());
+}
+
+void LLVMAddReassociatePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createReassociatePass());
+}
+
+void LLVMAddSCCPPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSCCPPass());
+}
+
+void LLVMAddScalarReplAggregatesPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSROAPass());
+}
+
+void LLVMAddScalarReplAggregatesPassSSA(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSROAPass());
+}
+
+void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM,
+ int Threshold) {
+ unwrap(PM)->add(createSROAPass());
+}
+
+void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) {
+ // NOTE: The simplify-libcalls pass has been removed.
+}
+
+void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createTailCallEliminationPass());
+}
+
+void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createConstantPropagationPass());
+}
+
+void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDemoteRegisterToMemoryPass());
+}
+
+void LLVMAddVerifierPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createVerifierPass());
+}
+
+void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createCorrelatedValuePropagationPass());
+}
+
+void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createEarlyCSEPass(false/*=UseMemorySSA*/));
+}
+
+void LLVMAddEarlyCSEMemSSAPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createEarlyCSEPass(true/*=UseMemorySSA*/));
+}
+
+void LLVMAddGVNHoistLegacyPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGVNHoistPass());
+}
+
+void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createTypeBasedAAWrapperPass());
+}
+
+void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createScopedNoAliasAAWrapperPass());
+}
+
+void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createBasicAAWrapperPass());
+}
+
+void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerExpectIntrinsicPass());
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
new file mode 100644
index 000000000000..34ed126155be
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -0,0 +1,802 @@
+//===- Scalarizer.cpp - Scalarize vector operations -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts vector operations into scalar operations, in order
+// to expose optimization opportunities on the individual scalar operations.
+// It is mainly intended for targets that do not have vector units, but it
+// may also be useful for revectorizing code to different vector widths.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Options.h"
+#include "llvm/Transforms/Scalar.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarizer"
+
+namespace {
+
+// Used to store the scattered form of a vector.
+using ValueVector = SmallVector<Value *, 8>;
+
+// Used to map a vector Value to its scattered form. We use std::map
+// because we want iterators to persist across insertion and because the
+// values are relatively large.
+using ScatterMap = std::map<Value *, ValueVector>;
+
+// Lists Instructions that have been replaced with scalar implementations,
+// along with a pointer to their scattered forms.
+using GatherList = SmallVector<std::pair<Instruction *, ValueVector *>, 16>;
+
+// Provides a very limited vector-like interface for lazily accessing one
+// component of a scattered vector or vector pointer.
+class Scatterer {
+public:
+ Scatterer() = default;
+
+ // Scatter V into Size components. If new instructions are needed,
+// insert them before BBI in BB. If CachePtr is non-null, use it to cache
+ // the results.
+ Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+ ValueVector *cachePtr = nullptr);
+
+ // Return component I, creating a new Value for it if necessary.
+ Value *operator[](unsigned I);
+
+ // Return the number of components.
+ unsigned size() const { return Size; }
+
+private:
+ BasicBlock *BB;
+ BasicBlock::iterator BBI;
+ Value *V;
+ ValueVector *CachePtr;
+ PointerType *PtrTy;
+ ValueVector Tmp;
+ unsigned Size;
+};
+
+// FCmpSplitter(FCI)(Builder, X, Y, Name) uses Builder to create an FCmp
+// called Name that compares X and Y in the same way as FCI.
+struct FCmpSplitter {
+ FCmpSplitter(FCmpInst &fci) : FCI(fci) {}
+
+ Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+ const Twine &Name) const {
+ return Builder.CreateFCmp(FCI.getPredicate(), Op0, Op1, Name);
+ }
+
+ FCmpInst &FCI;
+};
+
+// ICmpSplitter(ICI)(Builder, X, Y, Name) uses Builder to create an ICmp
+// called Name that compares X and Y in the same way as ICI.
+struct ICmpSplitter {
+ ICmpSplitter(ICmpInst &ici) : ICI(ici) {}
+
+ Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+ const Twine &Name) const {
+ return Builder.CreateICmp(ICI.getPredicate(), Op0, Op1, Name);
+ }
+
+ ICmpInst &ICI;
+};
+
+// BinarySplitter(BO)(Builder, X, Y, Name) uses Builder to create
+// a binary operator like BO called Name with operands X and Y.
+struct BinarySplitter {
+ BinarySplitter(BinaryOperator &bo) : BO(bo) {}
+
+ Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+ const Twine &Name) const {
+ return Builder.CreateBinOp(BO.getOpcode(), Op0, Op1, Name);
+ }
+
+ BinaryOperator &BO;
+};
+
+// Information about a load or store that we're scalarizing.
+struct VectorLayout {
+ VectorLayout() = default;
+
+ // Return the alignment of element I.
+ uint64_t getElemAlign(unsigned I) {
+ return MinAlign(VecAlign, I * ElemSize);
+ }
+
+ // The type of the vector.
+ VectorType *VecTy = nullptr;
+
+ // The type of each element.
+ Type *ElemTy = nullptr;
+
+ // The alignment of the vector.
+ uint64_t VecAlign = 0;
+
+ // The size of each element.
+ uint64_t ElemSize = 0;
+};
+
+class Scalarizer : public FunctionPass,
+ public InstVisitor<Scalarizer, bool> {
+public:
+ static char ID;
+
+ Scalarizer() : FunctionPass(ID) {
+ initializeScalarizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+ // InstVisitor methods. They return true if the instruction was scalarized,
+ // false if nothing changed.
+ bool visitInstruction(Instruction &I) { return false; }
+ bool visitSelectInst(SelectInst &SI);
+ bool visitICmpInst(ICmpInst &ICI);
+ bool visitFCmpInst(FCmpInst &FCI);
+ bool visitBinaryOperator(BinaryOperator &BO);
+ bool visitGetElementPtrInst(GetElementPtrInst &GEPI);
+ bool visitCastInst(CastInst &CI);
+ bool visitBitCastInst(BitCastInst &BCI);
+ bool visitShuffleVectorInst(ShuffleVectorInst &SVI);
+ bool visitPHINode(PHINode &PHI);
+ bool visitLoadInst(LoadInst &LI);
+ bool visitStoreInst(StoreInst &SI);
+ bool visitCallInst(CallInst &ICI);
+
+ static void registerOptions() {
+ // This is disabled by default because having separate loads and stores
+ // makes it more likely that the -combiner-alias-analysis limits will be
+ // reached.
+ OptionRegistry::registerOption<bool, Scalarizer,
+ &Scalarizer::ScalarizeLoadStore>(
+ "scalarize-load-store",
+ "Allow the scalarizer pass to scalarize loads and store", false);
+ }
+
+private:
+ Scatterer scatter(Instruction *Point, Value *V);
+ void gather(Instruction *Op, const ValueVector &CV);
+ bool canTransferMetadata(unsigned Kind);
+ void transferMetadata(Instruction *Op, const ValueVector &CV);
+ bool getVectorLayout(Type *Ty, unsigned Alignment, VectorLayout &Layout,
+ const DataLayout &DL);
+ bool finish();
+
+ template<typename T> bool splitBinary(Instruction &, const T &);
+
+ bool splitCall(CallInst &CI);
+
+ ScatterMap Scattered;
+ GatherList Gathered;
+ unsigned ParallelLoopAccessMDKind;
+ bool ScalarizeLoadStore;
+};
+
+} // end anonymous namespace
+
+char Scalarizer::ID = 0;
+
+INITIALIZE_PASS_WITH_OPTIONS(Scalarizer, "scalarizer",
+ "Scalarize vector operations", false, false)
+
+Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+ ValueVector *cachePtr)
+ : BB(bb), BBI(bbi), V(v), CachePtr(cachePtr) {
+ Type *Ty = V->getType();
+ PtrTy = dyn_cast<PointerType>(Ty);
+ if (PtrTy)
+ Ty = PtrTy->getElementType();
+ Size = Ty->getVectorNumElements();
+ if (!CachePtr)
+ Tmp.resize(Size, nullptr);
+ else if (CachePtr->empty())
+ CachePtr->resize(Size, nullptr);
+ else
+ assert(Size == CachePtr->size() && "Inconsistent vector sizes");
+}
+
+// Return component I, creating a new Value for it if necessary.
+Value *Scatterer::operator[](unsigned I) {
+ ValueVector &CV = (CachePtr ? *CachePtr : Tmp);
+ // Try to reuse a previous value.
+ if (CV[I])
+ return CV[I];
+ IRBuilder<> Builder(BB, BBI);
+ if (PtrTy) {
+ if (!CV[0]) {
+ Type *Ty =
+ PointerType::get(PtrTy->getElementType()->getVectorElementType(),
+ PtrTy->getAddressSpace());
+ CV[0] = Builder.CreateBitCast(V, Ty, V->getName() + ".i0");
+ }
+ if (I != 0)
+ CV[I] = Builder.CreateConstGEP1_32(nullptr, CV[0], I,
+ V->getName() + ".i" + Twine(I));
+ } else {
+ // Search through a chain of InsertElementInsts looking for element I.
+ // Record other elements in the cache. The new V is still suitable
+ // for all uncached indices.
+ while (true) {
+ InsertElementInst *Insert = dyn_cast<InsertElementInst>(V);
+ if (!Insert)
+ break;
+ ConstantInt *Idx = dyn_cast<ConstantInt>(Insert->getOperand(2));
+ if (!Idx)
+ break;
+ unsigned J = Idx->getZExtValue();
+ V = Insert->getOperand(0);
+ if (I == J) {
+ CV[J] = Insert->getOperand(1);
+ return CV[J];
+ } else if (!CV[J]) {
+ // Only cache the first entry we find for each index we're not actively
+ // searching for. This prevents us from going too far up the chain and
+ // caching incorrect entries.
+ CV[J] = Insert->getOperand(1);
+ }
+ }
+ CV[I] = Builder.CreateExtractElement(V, Builder.getInt32(I),
+ V->getName() + ".i" + Twine(I));
+ }
+ return CV[I];
+}
+
+bool Scalarizer::doInitialization(Module &M) {
+ ParallelLoopAccessMDKind =
+ M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
+ ScalarizeLoadStore =
+ M.getContext().getOption<bool, Scalarizer, &Scalarizer::ScalarizeLoadStore>();
+ return false;
+}
+
+bool Scalarizer::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ assert(Gathered.empty() && Scattered.empty());
+ for (BasicBlock &BB : F) {
+ for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+ Instruction *I = &*II;
+ bool Done = visit(I);
+ ++II;
+ if (Done && I->getType()->isVoidTy())
+ I->eraseFromParent();
+ }
+ }
+ return finish();
+}
+
+// Return a scattered form of V that can be accessed by Point. V must be a
+// vector or a pointer to a vector.
+Scatterer Scalarizer::scatter(Instruction *Point, Value *V) {
+ if (Argument *VArg = dyn_cast<Argument>(V)) {
+    // Put the scattered form of the argument in the entry block,
+ // so that it can be used everywhere.
+ Function *F = VArg->getParent();
+ BasicBlock *BB = &F->getEntryBlock();
+ return Scatterer(BB, BB->begin(), V, &Scattered[V]);
+ }
+ if (Instruction *VOp = dyn_cast<Instruction>(V)) {
+ // Put the scattered form of an instruction directly after the
+ // instruction.
+ BasicBlock *BB = VOp->getParent();
+ return Scatterer(BB, std::next(BasicBlock::iterator(VOp)),
+ V, &Scattered[V]);
+ }
+  // In the fallback case, just put the scattered form before Point and
+ // keep the result local to Point.
+ return Scatterer(Point->getParent(), Point->getIterator(), V);
+}
+
+// Replace Op with the gathered form of the components in CV. Defer the
+// deletion of Op and creation of the gathered form to the end of the pass,
+// so that we can avoid creating the gathered form if all uses of Op are
+// replaced with uses of CV.
+void Scalarizer::gather(Instruction *Op, const ValueVector &CV) {
+ // Since we're not deleting Op yet, stub out its operands, so that it
+ // doesn't make anything live unnecessarily.
+ for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I)
+ Op->setOperand(I, UndefValue::get(Op->getOperand(I)->getType()));
+
+ transferMetadata(Op, CV);
+
+ // If we already have a scattered form of Op (created from ExtractElements
+ // of Op itself), replace them with the new form.
+ ValueVector &SV = Scattered[Op];
+ if (!SV.empty()) {
+ for (unsigned I = 0, E = SV.size(); I != E; ++I) {
+ Value *V = SV[I];
+ if (V == nullptr)
+ continue;
+
+ Instruction *Old = cast<Instruction>(V);
+ CV[I]->takeName(Old);
+ Old->replaceAllUsesWith(CV[I]);
+ Old->eraseFromParent();
+ }
+ }
+ SV = CV;
+ Gathered.push_back(GatherList::value_type(Op, &SV));
+}
+
+// Return true if it is safe to transfer the given metadata tag from
+// vector to scalar instructions.
+bool Scalarizer::canTransferMetadata(unsigned Tag) {
+ return (Tag == LLVMContext::MD_tbaa
+ || Tag == LLVMContext::MD_fpmath
+ || Tag == LLVMContext::MD_tbaa_struct
+ || Tag == LLVMContext::MD_invariant_load
+ || Tag == LLVMContext::MD_alias_scope
+ || Tag == LLVMContext::MD_noalias
+ || Tag == ParallelLoopAccessMDKind);
+}
+
+// Transfer metadata from Op to the instructions in CV if it is known
+// to be safe to do so.
+void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) {
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ Op->getAllMetadataOtherThanDebugLoc(MDs);
+ for (unsigned I = 0, E = CV.size(); I != E; ++I) {
+ if (Instruction *New = dyn_cast<Instruction>(CV[I])) {
+ for (const auto &MD : MDs)
+ if (canTransferMetadata(MD.first))
+ New->setMetadata(MD.first, MD.second);
+ if (Op->getDebugLoc() && !New->getDebugLoc())
+ New->setDebugLoc(Op->getDebugLoc());
+ }
+ }
+}
+
+// Try to fill in Layout from Ty, returning true on success. Alignment is
+// the alignment of the vector, or 0 if the ABI default should be used.
+bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment,
+ VectorLayout &Layout, const DataLayout &DL) {
+ // Make sure we're dealing with a vector.
+ Layout.VecTy = dyn_cast<VectorType>(Ty);
+ if (!Layout.VecTy)
+ return false;
+
+ // Check that we're dealing with full-byte elements.
+ Layout.ElemTy = Layout.VecTy->getElementType();
+ if (DL.getTypeSizeInBits(Layout.ElemTy) !=
+ DL.getTypeStoreSizeInBits(Layout.ElemTy))
+ return false;
+
+ if (Alignment)
+ Layout.VecAlign = Alignment;
+ else
+ Layout.VecAlign = DL.getABITypeAlignment(Layout.VecTy);
+ Layout.ElemSize = DL.getTypeStoreSize(Layout.ElemTy);
+ return true;
+}
+
+// Scalarize two-operand instruction I, using Split(Builder, X, Y, Name)
+// to create an instruction like I with operands X and Y and name Name.
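+// For example, a vector add such as
+//   %r = add <2 x i32> %a, %b
+// is split into one scalar add per element (%r.i0, %r.i1) on the scattered
+// operands; the vector form of %r is only rebuilt later, in finish(), if it
+// still has users.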
+template<typename Splitter>
+bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) {
+ VectorType *VT = dyn_cast<VectorType>(I.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = VT->getNumElements();
+ IRBuilder<> Builder(&I);
+ Scatterer Op0 = scatter(&I, I.getOperand(0));
+ Scatterer Op1 = scatter(&I, I.getOperand(1));
+ assert(Op0.size() == NumElems && "Mismatched binary operation");
+ assert(Op1.size() == NumElems && "Mismatched binary operation");
+ ValueVector Res;
+ Res.resize(NumElems);
+ for (unsigned Elem = 0; Elem < NumElems; ++Elem)
+ Res[Elem] = Split(Builder, Op0[Elem], Op1[Elem],
+ I.getName() + ".i" + Twine(Elem));
+ gather(&I, Res);
+ return true;
+}
+
+static bool isTriviallyScalariable(Intrinsic::ID ID) {
+ return isTriviallyVectorizable(ID);
+}
+
+// All of the current scalarizable intrinsics only have one mangled type.
+static Function *getScalarIntrinsicDeclaration(Module *M,
+ Intrinsic::ID ID,
+ VectorType *Ty) {
+ return Intrinsic::getDeclaration(M, ID, { Ty->getScalarType() });
+}
+
+/// If CI is a call to a vector-typed intrinsic function, split it into one
+/// scalar call per element when the intrinsic allows it.
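+/// For example, assuming @llvm.fabs is trivially scalarizable here, a call
+///   %r = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
+/// would become four calls to @llvm.fabs.f32, one per element of %x.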
+bool Scalarizer::splitCall(CallInst &CI) {
+ VectorType *VT = dyn_cast<VectorType>(CI.getType());
+ if (!VT)
+ return false;
+
+ Function *F = CI.getCalledFunction();
+ if (!F)
+ return false;
+
+ Intrinsic::ID ID = F->getIntrinsicID();
+ if (ID == Intrinsic::not_intrinsic || !isTriviallyScalariable(ID))
+ return false;
+
+ unsigned NumElems = VT->getNumElements();
+ unsigned NumArgs = CI.getNumArgOperands();
+
+ ValueVector ScalarOperands(NumArgs);
+ SmallVector<Scatterer, 8> Scattered(NumArgs);
+
+ Scattered.resize(NumArgs);
+
+  // Assumes that any vector operand has the same number of elements as the
+  // return vector type, which is true for all current intrinsics.
+ for (unsigned I = 0; I != NumArgs; ++I) {
+ Value *OpI = CI.getOperand(I);
+ if (OpI->getType()->isVectorTy()) {
+ Scattered[I] = scatter(&CI, OpI);
+ assert(Scattered[I].size() == NumElems && "mismatched call operands");
+ } else {
+ ScalarOperands[I] = OpI;
+ }
+ }
+
+ ValueVector Res(NumElems);
+ ValueVector ScalarCallOps(NumArgs);
+
+ Function *NewIntrin = getScalarIntrinsicDeclaration(F->getParent(), ID, VT);
+ IRBuilder<> Builder(&CI);
+
+ // Perform actual scalarization, taking care to preserve any scalar operands.
+ for (unsigned Elem = 0; Elem < NumElems; ++Elem) {
+ ScalarCallOps.clear();
+
+ for (unsigned J = 0; J != NumArgs; ++J) {
+ if (hasVectorInstrinsicScalarOpd(ID, J))
+ ScalarCallOps.push_back(ScalarOperands[J]);
+ else
+ ScalarCallOps.push_back(Scattered[J][Elem]);
+ }
+
+ Res[Elem] = Builder.CreateCall(NewIntrin, ScalarCallOps,
+ CI.getName() + ".i" + Twine(Elem));
+ }
+
+ gather(&CI, Res);
+ return true;
+}
+
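+// Scalarize a vector select. The condition may itself be a vector, giving a
+// per-element condition, or a single scalar that is shared by every element.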
+bool Scalarizer::visitSelectInst(SelectInst &SI) {
+ VectorType *VT = dyn_cast<VectorType>(SI.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = VT->getNumElements();
+ IRBuilder<> Builder(&SI);
+ Scatterer Op1 = scatter(&SI, SI.getOperand(1));
+ Scatterer Op2 = scatter(&SI, SI.getOperand(2));
+ assert(Op1.size() == NumElems && "Mismatched select");
+ assert(Op2.size() == NumElems && "Mismatched select");
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ if (SI.getOperand(0)->getType()->isVectorTy()) {
+ Scatterer Op0 = scatter(&SI, SI.getOperand(0));
+ assert(Op0.size() == NumElems && "Mismatched select");
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreateSelect(Op0[I], Op1[I], Op2[I],
+ SI.getName() + ".i" + Twine(I));
+ } else {
+ Value *Op0 = SI.getOperand(0);
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreateSelect(Op0, Op1[I], Op2[I],
+ SI.getName() + ".i" + Twine(I));
+ }
+ gather(&SI, Res);
+ return true;
+}
+
+bool Scalarizer::visitICmpInst(ICmpInst &ICI) {
+ return splitBinary(ICI, ICmpSplitter(ICI));
+}
+
+bool Scalarizer::visitFCmpInst(FCmpInst &FCI) {
+ return splitBinary(FCI, FCmpSplitter(FCI));
+}
+
+bool Scalarizer::visitBinaryOperator(BinaryOperator &BO) {
+ return splitBinary(BO, BinarySplitter(BO));
+}
+
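+// Scalarize a GEP that produces a vector of pointers. Scalar base pointers
+// and scalar indices are first splatted to vectors, so, for example,
+//   %p = getelementptr i32, i32* %base, <2 x i64> %idx
+// becomes two scalar GEPs, %p.i0 and %p.i1, one per element of %idx.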
+bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ VectorType *VT = dyn_cast<VectorType>(GEPI.getType());
+ if (!VT)
+ return false;
+
+ IRBuilder<> Builder(&GEPI);
+ unsigned NumElems = VT->getNumElements();
+ unsigned NumIndices = GEPI.getNumIndices();
+
+ // The base pointer might be scalar even if it's a vector GEP. In those cases,
+ // splat the pointer into a vector value, and scatter that vector.
+ Value *Op0 = GEPI.getOperand(0);
+ if (!Op0->getType()->isVectorTy())
+ Op0 = Builder.CreateVectorSplat(NumElems, Op0);
+ Scatterer Base = scatter(&GEPI, Op0);
+
+ SmallVector<Scatterer, 8> Ops;
+ Ops.resize(NumIndices);
+ for (unsigned I = 0; I < NumIndices; ++I) {
+ Value *Op = GEPI.getOperand(I + 1);
+
+ // The indices might be scalars even if it's a vector GEP. In those cases,
+ // splat the scalar into a vector value, and scatter that vector.
+ if (!Op->getType()->isVectorTy())
+ Op = Builder.CreateVectorSplat(NumElems, Op);
+
+ Ops[I] = scatter(&GEPI, Op);
+ }
+
+ ValueVector Res;
+ Res.resize(NumElems);
+ for (unsigned I = 0; I < NumElems; ++I) {
+ SmallVector<Value *, 8> Indices;
+ Indices.resize(NumIndices);
+ for (unsigned J = 0; J < NumIndices; ++J)
+ Indices[J] = Ops[J][I];
+ Res[I] = Builder.CreateGEP(GEPI.getSourceElementType(), Base[I], Indices,
+ GEPI.getName() + ".i" + Twine(I));
+ if (GEPI.isInBounds())
+ if (GetElementPtrInst *NewGEPI = dyn_cast<GetElementPtrInst>(Res[I]))
+ NewGEPI->setIsInBounds();
+ }
+ gather(&GEPI, Res);
+ return true;
+}
+
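+// Scalarize a cast with a vector result by applying the same cast opcode to
+// each scattered element of the source operand.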
+bool Scalarizer::visitCastInst(CastInst &CI) {
+ VectorType *VT = dyn_cast<VectorType>(CI.getDestTy());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = VT->getNumElements();
+ IRBuilder<> Builder(&CI);
+ Scatterer Op0 = scatter(&CI, CI.getOperand(0));
+ assert(Op0.size() == NumElems && "Mismatched cast");
+ ValueVector Res;
+ Res.resize(NumElems);
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreateCast(CI.getOpcode(), Op0[I], VT->getElementType(),
+ CI.getName() + ".i" + Twine(I));
+ gather(&CI, Res);
+ return true;
+}
+
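+// Scalarize a bitcast between two vector types. If the element counts differ,
+// the conversion goes through an intermediate vector type; for example,
+//   %r = bitcast <2 x i64> %x to <4 x i32>
+// bitcasts each i64 element of %x to <2 x i32> and spreads the resulting
+// pieces over the four elements of %r.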
+bool Scalarizer::visitBitCastInst(BitCastInst &BCI) {
+ VectorType *DstVT = dyn_cast<VectorType>(BCI.getDestTy());
+ VectorType *SrcVT = dyn_cast<VectorType>(BCI.getSrcTy());
+ if (!DstVT || !SrcVT)
+ return false;
+
+ unsigned DstNumElems = DstVT->getNumElements();
+ unsigned SrcNumElems = SrcVT->getNumElements();
+ IRBuilder<> Builder(&BCI);
+ Scatterer Op0 = scatter(&BCI, BCI.getOperand(0));
+ ValueVector Res;
+ Res.resize(DstNumElems);
+
+ if (DstNumElems == SrcNumElems) {
+ for (unsigned I = 0; I < DstNumElems; ++I)
+ Res[I] = Builder.CreateBitCast(Op0[I], DstVT->getElementType(),
+ BCI.getName() + ".i" + Twine(I));
+ } else if (DstNumElems > SrcNumElems) {
+ // <M x t1> -> <N*M x t2>. Convert each t1 to <N x t2> and copy the
+ // individual elements to the destination.
+ unsigned FanOut = DstNumElems / SrcNumElems;
+ Type *MidTy = VectorType::get(DstVT->getElementType(), FanOut);
+ unsigned ResI = 0;
+ for (unsigned Op0I = 0; Op0I < SrcNumElems; ++Op0I) {
+ Value *V = Op0[Op0I];
+ Instruction *VI;
+ // Look through any existing bitcasts before converting to <N x t2>.
+ // In the best case, the resulting conversion might be a no-op.
+ while ((VI = dyn_cast<Instruction>(V)) &&
+ VI->getOpcode() == Instruction::BitCast)
+ V = VI->getOperand(0);
+ V = Builder.CreateBitCast(V, MidTy, V->getName() + ".cast");
+ Scatterer Mid = scatter(&BCI, V);
+ for (unsigned MidI = 0; MidI < FanOut; ++MidI)
+ Res[ResI++] = Mid[MidI];
+ }
+ } else {
+ // <N*M x t1> -> <M x t2>. Convert each group of <N x t1> into a t2.
+ unsigned FanIn = SrcNumElems / DstNumElems;
+ Type *MidTy = VectorType::get(SrcVT->getElementType(), FanIn);
+ unsigned Op0I = 0;
+ for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) {
+ Value *V = UndefValue::get(MidTy);
+ for (unsigned MidI = 0; MidI < FanIn; ++MidI)
+ V = Builder.CreateInsertElement(V, Op0[Op0I++], Builder.getInt32(MidI),
+ BCI.getName() + ".i" + Twine(ResI)
+ + ".upto" + Twine(MidI));
+ Res[ResI] = Builder.CreateBitCast(V, DstVT->getElementType(),
+ BCI.getName() + ".i" + Twine(ResI));
+ }
+ }
+ gather(&BCI, Res);
+ return true;
+}
+
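+// Scalarize a shufflevector by selecting, for each result element, the
+// already-scattered element named by the mask (or undef for a -1 mask entry);
+// no new instructions are needed for the shuffle itself.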
+bool Scalarizer::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+ VectorType *VT = dyn_cast<VectorType>(SVI.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = VT->getNumElements();
+ Scatterer Op0 = scatter(&SVI, SVI.getOperand(0));
+ Scatterer Op1 = scatter(&SVI, SVI.getOperand(1));
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ for (unsigned I = 0; I < NumElems; ++I) {
+ int Selector = SVI.getMaskValue(I);
+ if (Selector < 0)
+ Res[I] = UndefValue::get(VT->getElementType());
+ else if (unsigned(Selector) < Op0.size())
+ Res[I] = Op0[Selector];
+ else
+ Res[I] = Op1[Selector - Op0.size()];
+ }
+ gather(&SVI, Res);
+ return true;
+}
+
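+// Scalarize a vector PHI by creating one scalar PHI per element and wiring
+// each of them to the scattered form of the corresponding incoming value.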
+bool Scalarizer::visitPHINode(PHINode &PHI) {
+ VectorType *VT = dyn_cast<VectorType>(PHI.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = VT->getNumElements();
+ IRBuilder<> Builder(&PHI);
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ unsigned NumOps = PHI.getNumOperands();
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreatePHI(VT->getElementType(), NumOps,
+ PHI.getName() + ".i" + Twine(I));
+
+ for (unsigned I = 0; I < NumOps; ++I) {
+ Scatterer Op = scatter(&PHI, PHI.getIncomingValue(I));
+ BasicBlock *IncomingBlock = PHI.getIncomingBlock(I);
+ for (unsigned J = 0; J < NumElems; ++J)
+ cast<PHINode>(Res[J])->addIncoming(Op[J], IncomingBlock);
+ }
+ gather(&PHI, Res);
+ return true;
+}
+
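+// Scalarize a simple vector load into one load per element, but only when
+// -scalarize-load-store is enabled and the element type occupies a whole
+// number of bytes. Each element load uses an alignment derived from the
+// vector layout (Layout.getElemAlign).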
+bool Scalarizer::visitLoadInst(LoadInst &LI) {
+ if (!ScalarizeLoadStore)
+ return false;
+ if (!LI.isSimple())
+ return false;
+
+ VectorLayout Layout;
+ if (!getVectorLayout(LI.getType(), LI.getAlignment(), Layout,
+ LI.getModule()->getDataLayout()))
+ return false;
+
+ unsigned NumElems = Layout.VecTy->getNumElements();
+ IRBuilder<> Builder(&LI);
+ Scatterer Ptr = scatter(&LI, LI.getPointerOperand());
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreateAlignedLoad(Ptr[I], Layout.getElemAlign(I),
+ LI.getName() + ".i" + Twine(I));
+ gather(&LI, Res);
+ return true;
+}
+
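+// Scalarize a simple vector store into one store per element, mirroring
+// visitLoadInst; metadata that is safe to propagate is copied to each store.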
+bool Scalarizer::visitStoreInst(StoreInst &SI) {
+ if (!ScalarizeLoadStore)
+ return false;
+ if (!SI.isSimple())
+ return false;
+
+ VectorLayout Layout;
+ Value *FullValue = SI.getValueOperand();
+ if (!getVectorLayout(FullValue->getType(), SI.getAlignment(), Layout,
+ SI.getModule()->getDataLayout()))
+ return false;
+
+ unsigned NumElems = Layout.VecTy->getNumElements();
+ IRBuilder<> Builder(&SI);
+ Scatterer Ptr = scatter(&SI, SI.getPointerOperand());
+ Scatterer Val = scatter(&SI, FullValue);
+
+ ValueVector Stores;
+ Stores.resize(NumElems);
+ for (unsigned I = 0; I < NumElems; ++I) {
+ unsigned Align = Layout.getElemAlign(I);
+ Stores[I] = Builder.CreateAlignedStore(Val[I], Ptr[I], Align);
+ }
+ transferMetadata(&SI, Stores);
+ return true;
+}
+
+bool Scalarizer::visitCallInst(CallInst &CI) {
+ return splitCall(CI);
+}
+
+// Delete the instructions that we scalarized. If a full vector result
+// is still needed, recreate it using InsertElements.
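+// For example, if a scalarized %r = add <2 x i32> ... still has users, it is
+// roughly rebuilt as
+//   %r.upto0 = insertelement <2 x i32> undef, i32 %r.i0, i32 0
+//   %r       = insertelement <2 x i32> %r.upto0, i32 %r.i1, i32 1
+// and the original vector instruction is then erased.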
+bool Scalarizer::finish() {
+ // The presence of data in Gathered or Scattered indicates changes
+ // made to the Function.
+ if (Gathered.empty() && Scattered.empty())
+ return false;
+ for (const auto &GMI : Gathered) {
+ Instruction *Op = GMI.first;
+ ValueVector &CV = *GMI.second;
+ if (!Op->use_empty()) {
+ // The value is still needed, so recreate it using a series of
+ // InsertElements.
+ Type *Ty = Op->getType();
+ Value *Res = UndefValue::get(Ty);
+ BasicBlock *BB = Op->getParent();
+ unsigned Count = Ty->getVectorNumElements();
+ IRBuilder<> Builder(Op);
+ if (isa<PHINode>(Op))
+ Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
+ for (unsigned I = 0; I < Count; ++I)
+ Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I),
+ Op->getName() + ".upto" + Twine(I));
+ Res->takeName(Op);
+ Op->replaceAllUsesWith(Res);
+ }
+ Op->eraseFromParent();
+ }
+ Gathered.clear();
+ Scattered.clear();
+ return true;
+}
+
+FunctionPass *llvm::createScalarizerPass() {
+ return new Scalarizer();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
new file mode 100644
index 000000000000..4a96e0ddca16
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -0,0 +1,1310 @@
+//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Loop unrolling may create many similar GEPs for array accesses.
+// e.g., a 2-level loop
+//
+// float a[32][32]; // global variable
+//
+// for (int i = 0; i < 2; ++i) {
+// for (int j = 0; j < 2; ++j) {
+// ...
+// ... = a[x + i][y + j];
+// ...
+// }
+// }
+//
+// will probably be unrolled to:
+//
+// gep %a, 0, %x, %y; load
+// gep %a, 0, %x, %y + 1; load
+// gep %a, 0, %x + 1, %y; load
+// gep %a, 0, %x + 1, %y + 1; load
+//
+// LLVM's GVN does not use partial redundancy elimination yet, and is thus
+// unable to reuse (gep %a, 0, %x, %y). As a result, this missed optimization incurs
+// significant slowdown in targets with limited addressing modes. For instance,
+// because the PTX target does not support the reg+reg addressing mode, the
+// NVPTX backend emits PTX code that literally computes the pointer address of
+// each GEP, wasting tons of registers. It emits the following PTX for the
+// first load and similar PTX for other loads.
+//
+// mov.u32 %r1, %x;
+// mov.u32 %r2, %y;
+// mul.wide.u32 %rl2, %r1, 128;
+// mov.u64 %rl3, a;
+// add.s64 %rl4, %rl3, %rl2;
+// mul.wide.u32 %rl5, %r2, 4;
+// add.s64 %rl6, %rl4, %rl5;
+// ld.global.f32 %f1, [%rl6];
+//
+// To reduce the register pressure, the optimization implemented in this file
+// merges the common part of a group of GEPs, so we can compute each pointer
+// address by adding a simple offset to the common part, saving many registers.
+//
+// It works by splitting each GEP into a variadic base and a constant offset.
+// The variadic base can be computed once and reused by multiple GEPs, and the
+// constant offsets can be nicely folded into the reg+immediate addressing mode
+// (supported by most targets) without using any extra register.
+//
+// For instance, we transform the four GEPs and four loads in the above example
+// into:
+//
+// base = gep a, 0, x, y
+// load base
+//   load base + 1 * sizeof(float)
+// load base + 32 * sizeof(float)
+// load base + 33 * sizeof(float)
+//
+// Given the transformed IR, a backend that supports the reg+immediate
+// addressing mode can easily fold the pointer arithmetics into the loads. For
+// example, the NVPTX backend can easily fold the pointer arithmetics into the
+// ld.global.f32 instructions, and the resultant PTX uses much fewer registers.
+//
+// mov.u32 %r1, %tid.x;
+// mov.u32 %r2, %tid.y;
+// mul.wide.u32 %rl2, %r1, 128;
+// mov.u64 %rl3, a;
+// add.s64 %rl4, %rl3, %rl2;
+// mul.wide.u32 %rl5, %r2, 4;
+// add.s64 %rl6, %rl4, %rl5;
+// ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX
+// ld.global.f32 %f2, [%rl6+4]; // much better
+// ld.global.f32 %f3, [%rl6+128]; // much better
+// ld.global.f32 %f4, [%rl6+132]; // much better
+//
+// Another improvement enabled by the LowerGEP flag is to lower a GEP with
+// multiple indices to either multiple GEPs with a single index or arithmetic
+// operations (depending on whether the target uses alias analysis in codegen).
+// Such a transformation can have the following benefits:
+// (1) It can always extract constants in the indices of structure type.
+// (2) After such lowering, there are more optimization opportunities such as
+// CSE, LICM and CGP.
+//
+// E.g. The following GEPs have multiple indices:
+// BB1:
+// %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
+// load %p
+// ...
+// BB2:
+// %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 2
+// load %p2
+// ...
+//
+// We cannot CSE the common part related to index "i64 %i". Lowering
+// GEPs can achieve this goal.
+// If the target does not use alias analysis in codegen, this pass will
+// lower a GEP with multiple indices into arithmetic operations:
+// BB1:
+// %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
+// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %3 = add i64 %1, %2 ; CSE opportunity
+// %4 = mul i64 %j1, length_of_struct
+// %5 = add i64 %3, %4
+// %6 = add i64 %3, struct_field_3 ; Constant offset
+// %p = inttoptr i64 %6 to i32*
+// load %p
+// ...
+// BB2:
+// %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
+// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %9 = add i64 %7, %8 ; CSE opportunity
+// %10 = mul i64 %j2, length_of_struct
+// %11 = add i64 %9, %10
+// %12 = add i64 %11, struct_field_2 ; Constant offset
+// %p = inttoptr i64 %12 to i32*
+// load %p2
+// ...
+//
+// If the target uses alias analysis in codegen, this pass will lower a GEP
+// with multiple indices into multiple GEPs with a single index:
+// BB1:
+// %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
+// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity
+// %4 = mul i64 %j1, length_of_struct
+// %5 = getelementptr i8* %3, i64 %4
+// %6 = getelementptr i8* %5, struct_field_3 ; Constant offset
+// %p = bitcast i8* %6 to i32*
+// load %p
+// ...
+// BB2:
+// %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
+// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity
+// %10 = mul i64 %j2, length_of_struct
+// %11 = getelementptr i8* %9, i64 %10
+// %12 = getelementptr i8* %11, struct_field_2 ; Constant offset
+// %p2 = bitcast i8* %12 to i32*
+// load %p2
+// ...
+//
+// Lowering GEPs can also benefit other passes such as LICM and CGP.
+// LICM (Loop Invariant Code Motion) cannot hoist/sink a GEP with multiple
+// indices if one of the indices is variant. If we lower such a GEP into invariant
+// parts and variant parts, LICM can hoist/sink those invariant parts.
+// CGP (CodeGen Prepare) tries to sink address calculations that match the
+// target's addressing modes. A GEP with multiple indices may not match and will
+// not be sunk. If we lower such a GEP into smaller parts, CGP may sink some of
+// them. So we end up with a better addressing mode.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
+ "disable-separate-const-offset-from-gep", cl::init(false),
+ cl::desc("Do not separate the constant offset from a GEP instruction"),
+ cl::Hidden);
+
+// Setting this flag may cause false positives when the input module already
+// contains dead instructions. Therefore, we set it only in unit tests that are
+// free of dead code.
+static cl::opt<bool>
+ VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
+ cl::desc("Verify this pass produces no dead code"),
+ cl::Hidden);
+
+namespace {
+
+/// \brief A helper class for separating a constant offset from a GEP index.
+///
+/// In real programs, a GEP index may be more complicated than a simple addition
+/// of something and a constant integer that can be trivially split. For
+/// example, to split ((a << 3) | 5) + b, we need to search deeper for the
+/// constant offset, so that we can separate the index to (a << 3) + b and 5.
+///
+/// Therefore, this class looks into the expression that computes a given GEP
+/// index, and tries to find a constant integer that can be hoisted to the
+/// outermost level of the expression as an addition. Not every constant in an
+/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a +
+/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case,
+/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
+class ConstantOffsetExtractor {
+public:
+ /// Extracts a constant offset from the given GEP index. It returns the
+ /// new index representing the remainder (equal to the original index minus
+ /// the constant offset), or nullptr if we cannot extract a constant offset.
+ /// \p Idx The given GEP index
+ /// \p GEP The given GEP
+ /// \p UserChainTail Outputs the tail of UserChain so that we can
+ /// garbage-collect unused instructions in UserChain.
+ static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
+ User *&UserChainTail, const DominatorTree *DT);
+
+ /// Looks for a constant offset from the given GEP index without extracting
+  /// it. It returns the numeric value of the constant offset it found, or 0 on
+  /// failure. The meaning of the arguments is the same as for Extract.
+ static int64_t Find(Value *Idx, GetElementPtrInst *GEP,
+ const DominatorTree *DT);
+
+private:
+ ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
+ : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
+ }
+
+ /// Searches the expression that computes V for a non-zero constant C s.t.
+  /// V can be reassociated into the form V' + C. If the search is
+  /// successful, returns C and updates UserChain as a def-use chain from C to V;
+ /// otherwise, UserChain is empty.
+ ///
+ /// \p V The given expression
+ /// \p SignExtended Whether V will be sign-extended in the computation of the
+ /// GEP index
+ /// \p ZeroExtended Whether V will be zero-extended in the computation of the
+ /// GEP index
+ /// \p NonNegative Whether V is guaranteed to be non-negative. For example,
+ /// an index of an inbounds GEP is guaranteed to be
+  ///                non-negative. Leveraging this, we can better split
+ /// inbounds GEPs.
+ APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
+
+ /// A helper function to look into both operands of a binary operator.
+ APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
+ bool ZeroExtended);
+
+ /// After finding the constant offset C from the GEP index I, we build a new
+ /// index I' s.t. I' + C = I. This function builds and returns the new
+ /// index I' according to UserChain produced by function "find".
+ ///
+ /// The building conceptually takes two steps:
+ /// 1) iteratively distribute s/zext towards the leaves of the expression tree
+ /// that computes I
+ /// 2) reassociate the expression tree to the form I' + C.
+ ///
+ /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
+ /// sext to a, b and 5 so that we have
+ /// sext(a) + (sext(b) + 5).
+ /// Then, we reassociate it to
+ /// (sext(a) + sext(b)) + 5.
+ /// Given this form, we know I' is sext(a) + sext(b).
+ Value *rebuildWithoutConstOffset();
+
+ /// After the first step of rebuilding the GEP index without the constant
+ /// offset, distribute s/zext to the operands of all operators in UserChain.
+  /// e.g., zext(sext(a + (b + 5))) (assuming no overflow) =>
+ /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
+ ///
+ /// The function also updates UserChain to point to new subexpressions after
+ /// distributing s/zext. e.g., the old UserChain of the above example is
+ /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
+ /// and the new UserChain is
+ /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
+  ///   zext(sext(a)) + (zext(sext(b)) + zext(sext(5)))
+ ///
+ /// \p ChainIndex The index to UserChain. ChainIndex is initially
+ /// UserChain.size() - 1, and is decremented during
+ /// the recursion.
+ Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+
+ /// Reassociates the GEP index to the form I' + C and returns I'.
+ Value *removeConstOffset(unsigned ChainIndex);
+
+ /// A helper function to apply ExtInsts, a list of s/zext, to value V.
+ /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
+ /// returns "sext i32 (zext i16 V to i32) to i64".
+ Value *applyExts(Value *V);
+
+ /// A helper function that returns whether we can trace into the operands
+ /// of binary operator BO for a constant offset.
+ ///
+ /// \p SignExtended Whether BO is surrounded by sext
+ /// \p ZeroExtended Whether BO is surrounded by zext
+ /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound
+ /// array index.
+ bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
+ bool NonNegative);
+
+ /// The path from the constant offset to the old GEP index. e.g., if the GEP
+ /// index is "a * b + (c + 5)". After running function find, UserChain[0] will
+ /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
+ /// UserChain[2] will be the entire expression "a * b + (c + 5)".
+ ///
+ /// This path helps to rebuild the new GEP index.
+ SmallVector<User *, 8> UserChain;
+
+ /// A data structure used in rebuildWithoutConstOffset. Contains all
+ /// sext/zext instructions along UserChain.
+ SmallVector<CastInst *, 16> ExtInsts;
+
+ /// Insertion position of cloned instructions.
+ Instruction *IP;
+
+ const DataLayout &DL;
+ const DominatorTree *DT;
+};
+
+/// \brief A pass that tries to split every GEP in the function into a variadic
+/// base and a constant offset. It is a FunctionPass because searching for the
+/// constant offset may inspect other basic blocks.
+class SeparateConstOffsetFromGEP : public FunctionPass {
+public:
+ static char ID;
+
+ SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr,
+ bool LowerGEP = false)
+ : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) {
+ initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool doInitialization(Module &M) override {
+ DL = &M.getDataLayout();
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ /// Tries to split the given GEP into a variadic base and a constant offset,
+ /// and returns true if the splitting succeeds.
+ bool splitGEP(GetElementPtrInst *GEP);
+
+ /// Lower a GEP with multiple indices into multiple GEPs with a single index.
+ /// Function splitGEP already split the original GEP into a variadic part and
+ /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+ /// variadic part into a set of GEPs with a single index and applies
+ /// AccumulativeByteOffset to it.
+ /// \p Variadic The variadic part of the original GEP.
+ /// \p AccumulativeByteOffset The constant offset.
+ void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset);
+
+ /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
+ /// Function splitGEP already split the original GEP into a variadic part and
+ /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+ /// variadic part into a set of arithmetic operations and applies
+ /// AccumulativeByteOffset to it.
+ /// \p Variadic The variadic part of the original GEP.
+ /// \p AccumulativeByteOffset The constant offset.
+ void lowerToArithmetics(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset);
+
+ /// Finds the constant offset within each index and accumulates them. If
+  /// LowerGEP is true, it looks in the indices of both sequential and structure
+  /// types; otherwise it only looks in sequential indices. The output
+  /// NeedsExtraction indicates whether we successfully found a non-zero constant
+ /// offset.
+ int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
+
+ /// Canonicalize array indices to pointer-size integers. This helps to
+ /// simplify the logic of splitting a GEP. For example, if a + b is a
+ /// pointer-size integer, we have
+ /// gep base, a + b = gep (gep base, a), b
+ /// However, this equality may not hold if the size of a + b is smaller than
+ /// the pointer size, because LLVM conceptually sign-extends GEP indices to
+ /// pointer size before computing the address
+ /// (http://llvm.org/docs/LangRef.html#id181).
+ ///
+ /// This canonicalization is very likely already done in clang and
+ /// instcombine. Therefore, the program will probably remain the same.
+ ///
+ /// Returns true if the module changes.
+ ///
+ /// Verified in @i32_add in split-gep.ll
+ bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
+
+ /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
+ /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
+ /// the constant offset. After extraction, it becomes desirable to reunion the
+ /// distributed sexts. For example,
+ ///
+  ///   &a[sext(i +nsw (j +nsw 5))]
+ /// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)]
+ /// => constant extraction &a[sext(i) + sext(j)] + 5
+ /// => reunion &a[sext(i +nsw j)] + 5
+ bool reuniteExts(Function &F);
+
+ /// A helper that reunites sexts in an instruction.
+ bool reuniteExts(Instruction *I);
+
+ /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
+ Instruction *findClosestMatchingDominator(const SCEV *Key,
+ Instruction *Dominatee);
+ /// Verify F is free of dead code.
+ void verifyNoDeadCode(Function &F);
+
+ bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
+
+  // Swap the index operands of two GEPs.
+ void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
+
+  // Check if it is safe to swap the operands of two GEPs.
+ bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
+ Loop *CurLoop);
+
+ const DataLayout *DL = nullptr;
+ DominatorTree *DT = nullptr;
+ ScalarEvolution *SE;
+ const TargetMachine *TM;
+
+ LoopInfo *LI;
+ TargetLibraryInfo *TLI;
+
+ /// Whether to lower a GEP with multiple indices into arithmetic operations or
+ /// multiple GEPs with a single index.
+ bool LowerGEP;
+
+ DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs;
+};
+
+} // end anonymous namespace
+
+char SeparateConstOffsetFromGEP::ID = 0;
+
+INITIALIZE_PASS_BEGIN(
+ SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
+ "Split GEPs to a variadic base and a constant offset for better CSE", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
+ "Split GEPs to a variadic base and a constant offset for better CSE", false,
+ false)
+
+FunctionPass *
+llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM,
+ bool LowerGEP) {
+ return new SeparateConstOffsetFromGEP(TM, LowerGEP);
+}
+
+bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
+ bool ZeroExtended,
+ BinaryOperator *BO,
+ bool NonNegative) {
+ // We only consider ADD, SUB and OR, because a non-zero constant found in
+ // expressions composed of these operations can be easily hoisted as a
+ // constant offset by reassociation.
+ if (BO->getOpcode() != Instruction::Add &&
+ BO->getOpcode() != Instruction::Sub &&
+ BO->getOpcode() != Instruction::Or) {
+ return false;
+ }
+
+ Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
+ // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
+ // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
+ if (BO->getOpcode() == Instruction::Or &&
+ !haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
+ return false;
+
+ // In addition, tracing into BO requires that its surrounding s/zext (if
+ // any) is distributable to both operands.
+ //
+ // Suppose BO = A op B.
+ // SignExtended | ZeroExtended | Distributable?
+ // --------------+--------------+----------------------------------
+ // 0 | 0 | true because no s/zext exists
+ // 0 | 1 | zext(BO) == zext(A) op zext(B)
+ // 1 | 0 | sext(BO) == sext(A) op sext(B)
+ // 1 | 1 | zext(sext(BO)) ==
+ // | | zext(sext(A)) op zext(sext(B))
+ if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
+ // If a + b >= 0 and (a >= 0 or b >= 0), then
+ // sext(a + b) = sext(a) + sext(b)
+ // even if the addition is not marked nsw.
+ //
+    // Leveraging this invariant, we can trace into an sext'ed inbound GEP
+ // index if the constant offset is non-negative.
+ //
+ // Verified in @sext_add in split-gep.ll.
+ if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
+ if (!ConstLHS->isNegative())
+ return true;
+ }
+ if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
+ if (!ConstRHS->isNegative())
+ return true;
+ }
+ }
+
+ // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
+ // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
+ if (BO->getOpcode() == Instruction::Add ||
+ BO->getOpcode() == Instruction::Sub) {
+ if (SignExtended && !BO->hasNoSignedWrap())
+ return false;
+ if (ZeroExtended && !BO->hasNoUnsignedWrap())
+ return false;
+ }
+
+ return true;
+}
+
+APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
+ bool SignExtended,
+ bool ZeroExtended) {
+ // BO being non-negative does not shed light on whether its operands are
+ // non-negative. Clear the NonNegative flag here.
+ APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
+ /* NonNegative */ false);
+ // If we found a constant offset in the left operand, stop and return that.
+ // This shortcut might cause us to miss opportunities of combining the
+ // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
+ // However, such cases are probably already handled by -instcombine,
+ // given this pass runs after the standard optimizations.
+ if (ConstantOffset != 0) return ConstantOffset;
+ ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
+ /* NonNegative */ false);
+ // If U is a sub operator, negate the constant offset found in the right
+ // operand.
+ if (BO->getOpcode() == Instruction::Sub)
+ ConstantOffset = -ConstantOffset;
+ return ConstantOffset;
+}
+
+APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
+ bool ZeroExtended, bool NonNegative) {
+ // TODO(jingyue): We could trace into integer/pointer casts, such as
+ // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
+ // integers because it gives good enough results for our benchmarks.
+ unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+
+ // We cannot do much with Values that are not a User, such as an Argument.
+ User *U = dyn_cast<User>(V);
+ if (U == nullptr) return APInt(BitWidth, 0);
+
+ APInt ConstantOffset(BitWidth, 0);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ // Hooray, we found it!
+ ConstantOffset = CI->getValue();
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
+ // Trace into subexpressions for more hoisting opportunities.
+ if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
+ ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
+ } else if (isa<SExtInst>(V)) {
+ ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
+ ZeroExtended, NonNegative).sext(BitWidth);
+ } else if (isa<ZExtInst>(V)) {
+ // As an optimization, we can clear the SignExtended flag because
+ // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
+ //
+ // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
+ ConstantOffset =
+ find(U->getOperand(0), /* SignExtended */ false,
+ /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
+ }
+
+ // If we found a non-zero constant offset, add it to the path for
+ // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
+ // help this optimization.
+ if (ConstantOffset != 0)
+ UserChain.push_back(U);
+ return ConstantOffset;
+}
+
+Value *ConstantOffsetExtractor::applyExts(Value *V) {
+ Value *Current = V;
+ // ExtInsts is built in the use-def order. Therefore, we apply them to V
+ // in the reversed order.
+ for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) {
+ if (Constant *C = dyn_cast<Constant>(Current)) {
+ // If Current is a constant, apply s/zext using ConstantExpr::getCast.
+ // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
+ Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType());
+ } else {
+ Instruction *Ext = (*I)->clone();
+ Ext->setOperand(0, Current);
+ Ext->insertBefore(IP);
+ Current = Ext;
+ }
+ }
+ return Current;
+}
+
+Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
+ distributeExtsAndCloneChain(UserChain.size() - 1);
+ // Remove all nullptrs (used to be s/zext) from UserChain.
+ unsigned NewSize = 0;
+ for (User *I : UserChain) {
+ if (I != nullptr) {
+ UserChain[NewSize] = I;
+ NewSize++;
+ }
+ }
+ UserChain.resize(NewSize);
+ return removeConstOffset(UserChain.size() - 1);
+}
+
+Value *
+ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
+ User *U = UserChain[ChainIndex];
+ if (ChainIndex == 0) {
+ assert(isa<ConstantInt>(U));
+ // If U is a ConstantInt, applyExts will return a ConstantInt as well.
+ return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
+ }
+
+ if (CastInst *Cast = dyn_cast<CastInst>(U)) {
+ assert((isa<SExtInst>(Cast) || isa<ZExtInst>(Cast)) &&
+ "We only traced into two types of CastInst: sext and zext");
+ ExtInsts.push_back(Cast);
+ UserChain[ChainIndex] = nullptr;
+ return distributeExtsAndCloneChain(ChainIndex - 1);
+ }
+
+  // Function find only traces into BinaryOperator and CastInst.
+ BinaryOperator *BO = cast<BinaryOperator>(U);
+ // OpNo = which operand of BO is UserChain[ChainIndex - 1]
+ unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+ Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
+ Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
+
+ BinaryOperator *NewBO = nullptr;
+ if (OpNo == 0) {
+ NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
+ BO->getName(), IP);
+ } else {
+ NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
+ BO->getName(), IP);
+ }
+ return UserChain[ChainIndex] = NewBO;
+}
+
+Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
+ if (ChainIndex == 0) {
+ assert(isa<ConstantInt>(UserChain[ChainIndex]));
+ return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
+ }
+
+ BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
+ assert(BO->getNumUses() <= 1 &&
+ "distributeExtsAndCloneChain clones each BinaryOperator in "
+ "UserChain, so no one should be used more than "
+ "once");
+
+ unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+ assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
+ Value *NextInChain = removeConstOffset(ChainIndex - 1);
+ Value *TheOther = BO->getOperand(1 - OpNo);
+
+ // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+ // sub-expression to be just TheOther.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
+ if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+ return TheOther;
+ }
+
+ BinaryOperator::BinaryOps NewOp = BO->getOpcode();
+ if (BO->getOpcode() == Instruction::Or) {
+ // Rebuild "or" as "add", because "or" may be invalid for the new
+    // expression.
+ //
+ // For instance, given
+ // a | (b + 5) where a and b + 5 have no common bits,
+ // we can extract 5 as the constant offset.
+ //
+ // However, reusing the "or" in the new index would give us
+ // (a | b) + 5
+ // which does not equal a | (b + 5).
+ //
+ // Replacing the "or" with "add" is fine, because
+ // a | (b + 5) = a + (b + 5) = (a + b) + 5
+ NewOp = Instruction::Add;
+ }
+
+ BinaryOperator *NewBO;
+ if (OpNo == 0) {
+ NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP);
+ } else {
+ NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP);
+ }
+ NewBO->takeName(BO);
+ return NewBO;
+}
+
+Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
+ User *&UserChainTail,
+ const DominatorTree *DT) {
+ ConstantOffsetExtractor Extractor(GEP, DT);
+ // Find a non-zero constant offset first.
+ APInt ConstantOffset =
+ Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+ GEP->isInBounds());
+ if (ConstantOffset == 0) {
+ UserChainTail = nullptr;
+ return nullptr;
+ }
+ // Separates the constant offset from the GEP index.
+ Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
+ UserChainTail = Extractor.UserChain.back();
+ return IdxWithoutConstOffset;
+}
+
+int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
+ const DominatorTree *DT) {
+ // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
+ return ConstantOffsetExtractor(GEP, DT)
+ .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+ GEP->isInBounds())
+ .getSExtValue();
+}
+
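+// For example, on a target with 64-bit pointers, an i32 array index is
+// sign-extended to i64 (named "idxprom") so that later splitting can treat
+// all sequential indices uniformly.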
+bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
+ GetElementPtrInst *GEP) {
+ bool Changed = false;
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
+ I != E; ++I, ++GTI) {
+ // Skip struct member indices which must be i32.
+ if (GTI.isSequential()) {
+ if ((*I)->getType() != IntPtrTy) {
+ *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+}
+
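+// For example, if the last index of a GEP into an array of i32 is (%j + 3),
+// the constant 3 found in that index contributes 3 * 4 = 12 bytes to the
+// accumulative byte offset, and splitGEP later rewrites that index to just %j.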
+int64_t
+SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
+ bool &NeedsExtraction) {
+ NeedsExtraction = false;
+ int64_t AccumulativeByteOffset = 0;
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ // Tries to extract a constant offset from this GEP index.
+ int64_t ConstantOffset =
+ ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
+ if (ConstantOffset != 0) {
+ NeedsExtraction = true;
+ // A GEP may have multiple indices. We accumulate the extracted
+ // constant offset to a byte offset, and later offset the remainder of
+ // the original GEP with this byte offset.
+ AccumulativeByteOffset +=
+ ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
+ }
+ } else if (LowerGEP) {
+ StructType *StTy = GTI.getStructType();
+ uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
+ // Skip field 0 as the offset is always 0.
+ if (Field != 0) {
+ NeedsExtraction = true;
+ AccumulativeByteOffset +=
+ DL->getStructLayout(StTy)->getElementOffset(Field);
+ }
+ }
+ }
+ return AccumulativeByteOffset;
+}
+
+void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
+ GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
+ IRBuilder<> Builder(Variadic);
+ Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+ Type *I8PtrTy =
+ Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
+ Value *ResultPtr = Variadic->getOperand(0);
+ Loop *L = LI->getLoopFor(Variadic->getParent());
+  // The base is a swap candidate only if it is loop invariant and not used
+  // more than once inside the loop.
+ bool isSwapCandidate =
+ L && L->isLoopInvariant(ResultPtr) &&
+ !hasMoreThanOneUseInLoop(ResultPtr, L);
+ Value *FirstResult = nullptr;
+
+ if (ResultPtr->getType() != I8PtrTy)
+ ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+
+ gep_type_iterator GTI = gep_type_begin(*Variadic);
+ // Create an ugly GEP for each sequential index. We don't create GEPs for
+ // structure indices, as they are accumulated in the constant offset index.
+ for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ Value *Idx = Variadic->getOperand(I);
+ // Skip zero indices.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
+ if (CI->isZero())
+ continue;
+
+ APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+ DL->getTypeAllocSize(GTI.getIndexedType()));
+ // Scale the index by element size.
+ if (ElementSize != 1) {
+ if (ElementSize.isPowerOf2()) {
+ Idx = Builder.CreateShl(
+ Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+ } else {
+ Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+ }
+ }
+ // Create an ugly GEP with a single index for each index.
+ ResultPtr =
+ Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
+ if (FirstResult == nullptr)
+ FirstResult = ResultPtr;
+ }
+ }
+
+ // Create a GEP with the constant offset index.
+ if (AccumulativeByteOffset != 0) {
+ Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
+ ResultPtr =
+ Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
+ } else
+ isSwapCandidate = false;
+
+ // If we created a GEP with constant index, and the base is loop invariant,
+ // then we swap the first one with it, so LICM can move constant GEP out
+ // later.
+ GetElementPtrInst *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult);
+ GetElementPtrInst *SecondGEP = dyn_cast_or_null<GetElementPtrInst>(ResultPtr);
+ if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
+ swapGEPOperand(FirstGEP, SecondGEP);
+
+ if (ResultPtr->getType() != Variadic->getType())
+ ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());
+
+ Variadic->replaceAllUsesWith(ResultPtr);
+ Variadic->eraseFromParent();
+}
+
+void
+SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset) {
+ IRBuilder<> Builder(Variadic);
+ Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+ Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
+ gep_type_iterator GTI = gep_type_begin(*Variadic);
+  // Create ADD/SHL/MUL arithmetic operations for each sequential index. We
+ // don't create arithmetics for structure indices, as they are accumulated
+ // in the constant offset index.
+ for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ Value *Idx = Variadic->getOperand(I);
+ // Skip zero indices.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
+ if (CI->isZero())
+ continue;
+
+ APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+ DL->getTypeAllocSize(GTI.getIndexedType()));
+ // Scale the index by element size.
+ if (ElementSize != 1) {
+ if (ElementSize.isPowerOf2()) {
+ Idx = Builder.CreateShl(
+ Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+ } else {
+ Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+ }
+ }
+ // Create an ADD for each index.
+ ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
+ }
+ }
+
+ // Create an ADD for the constant offset index.
+ if (AccumulativeByteOffset != 0) {
+ ResultPtr = Builder.CreateAdd(
+ ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
+ }
+
+ ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
+ Variadic->replaceAllUsesWith(ResultPtr);
+ Variadic->eraseFromParent();
+}
+
+bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
+ // Skip vector GEPs.
+ if (GEP->getType()->isVectorTy())
+ return false;
+
+ // The backend can already nicely handle the case where all indices are
+ // constant.
+ if (GEP->hasAllConstantIndices())
+ return false;
+
+ bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
+
+ bool NeedsExtraction;
+ int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
+
+ if (!NeedsExtraction)
+ return Changed;
+ // If LowerGEP is disabled, before really splitting the GEP, check whether the
+  // backend supports the addressing mode we are about to produce. If not, this
+  // splitting probably won't be beneficial.
+  // If LowerGEP is enabled, even if the extracted constant offset cannot match
+  // the addressing mode, we can still optimize the other lowered parts of the
+  // variable indices. Therefore, we don't check for addressing modes in that
+ // case.
+ if (!LowerGEP) {
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *GEP->getParent()->getParent());
+ unsigned AddrSpace = GEP->getPointerAddressSpace();
+ if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
+ /*BaseGV=*/nullptr, AccumulativeByteOffset,
+ /*HasBaseReg=*/true, /*Scale=*/0,
+ AddrSpace)) {
+ return Changed;
+ }
+ }
+
+ // Remove the constant offset in each sequential index. The resultant GEP
+ // computes the variadic base.
+ // Notice that we don't remove struct field indices here. If LowerGEP is
+ // disabled, a structure index is not accumulated and we still use the old
+ // one. If LowerGEP is enabled, a structure index is accumulated in the
+ // constant offset. LowerToSingleIndexGEPs or lowerToArithmetics will later
+ // handle the constant offset and won't need a new structure index.
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ // Splits this GEP index into a variadic part and a constant offset, and
+ // uses the variadic part as the new index.
+ Value *OldIdx = GEP->getOperand(I);
+ User *UserChainTail;
+ Value *NewIdx =
+ ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
+ if (NewIdx != nullptr) {
+ // Switches to the index with the constant offset removed.
+ GEP->setOperand(I, NewIdx);
+ // After switching to the new index, we can garbage-collect UserChain
+ // and the old index if they are not used.
+ RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
+ RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
+ }
+ }
+ }
+
+ // Clear the inbounds attribute because the new index may be off-bound.
+ // e.g.,
+ //
+ // b = add i64 a, 5
+ // addr = gep inbounds float, float* p, i64 b
+ //
+ // is transformed to:
+ //
+ // addr2 = gep float, float* p, i64 a ; inbounds removed
+ // addr = gep inbounds float, float* addr2, i64 5
+ //
+ // If a is -4, although the old index b is in bounds, the new index a is
+ // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
+ // inbounds keyword is not present, the offsets are added to the base
+ // address with silently-wrapping two's complement arithmetic".
+ // Therefore, the final code will be semantically equivalent.
+ //
+ // TODO(jingyue): do some range analysis to keep as many inbounds as
+ // possible. GEPs with inbounds are more friendly to alias analysis.
+ bool GEPWasInBounds = GEP->isInBounds();
+ GEP->setIsInBounds(false);
+
+ // Lowers a GEP to either GEPs with a single index or arithmetic operations.
+ if (LowerGEP) {
+ // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
+ // arithmetic operations if the target uses alias analysis in codegen.
+ if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA())
+ lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
+ else
+ lowerToArithmetics(GEP, AccumulativeByteOffset);
+ return true;
+ }
+
+ // No need to create another GEP if the accumulative byte offset is 0.
+ if (AccumulativeByteOffset == 0)
+ return true;
+
+ // Offsets the base with the accumulative byte offset.
+ //
+ // %gep ; the base
+ // ... %gep ...
+ //
+ // => add the offset
+ //
+ // %gep2 ; clone of %gep
+ // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+ // %gep ; will be removed
+ // ... %gep ...
+ //
+ // => replace all uses of %gep with %new.gep and remove %gep
+ //
+ // %gep2 ; clone of %gep
+ // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+ // ... %new.gep ...
+ //
+ // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
+ // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
+ // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
+ // type of %gep.
+ //
+ // %gep2 ; clone of %gep
+ // %0 = bitcast %gep2 to i8*
+ // %uglygep = gep %0, <offset>
+ // %new.gep = bitcast %uglygep to <type of %gep>
+ // ... %new.gep ...
+ Instruction *NewGEP = GEP->clone();
+ NewGEP->insertBefore(GEP);
+
+ // Per ANSI C standard, signed / unsigned = unsigned and signed % unsigned =
+ // unsigned. Therefore, we cast ElementTypeSizeOfGEP to signed because it is
+ // used with the signed AccumulativeByteOffset below, and an unsigned operand
+ // would make those operations unsigned.
+ int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
+ DL->getTypeAllocSize(GEP->getResultElementType()));
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
+ // Very likely. As long as %gep is naturally aligned, the byte offset we
+ // extracted should be a multiple of sizeof(*%gep).
+ int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
+ NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
+ ConstantInt::get(IntPtrTy, Index, true),
+ GEP->getName(), GEP);
+ NewGEP->copyMetadata(*GEP);
+ // Inherit the inbounds attribute of the original GEP.
+ cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
+ } else {
+ // Unlikely but possible. For example,
+ // #pragma pack(1)
+ // struct S {
+ // int a[3];
+ // int64 b[8];
+ // };
+ // #pragma pack()
+ //
+ // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
+ // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
+ // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
+ // sizeof(int64).
+ //
+ // Emit an uglygep in this case.
+ Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(),
+ GEP->getPointerAddressSpace());
+ NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
+ NewGEP = GetElementPtrInst::Create(
+ Type::getInt8Ty(GEP->getContext()), NewGEP,
+ ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
+ GEP);
+ NewGEP->copyMetadata(*GEP);
+ // Inherit the inbounds attribute of the original GEP.
+ cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
+ if (GEP->getType() != I8PtrTy)
+ NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
+ }
+
+ GEP->replaceAllUsesWith(NewGEP);
+ GEP->eraseFromParent();
+
+ return true;
+}
+
+bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ if (DisableSeparateConstOffsetFromGEP)
+ return false;
+
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ bool Changed = false;
+ for (BasicBlock &B : F) {
+ for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;)
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
+ Changed |= splitGEP(GEP);
+ // No need to split GEP ConstantExprs because all their indices are already
+ // constant.
+ }
+
+ Changed |= reuniteExts(F);
+
+ if (VerifyNoDeadCode)
+ verifyNoDeadCode(F);
+
+ return Changed;
+}
+
+Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
+ const SCEV *Key, Instruction *Dominatee) {
+ auto Pos = DominatingExprs.find(Key);
+ if (Pos == DominatingExprs.end())
+ return nullptr;
+
+ auto &Candidates = Pos->second;
+ // Because we process the basic blocks in pre-order of the dominator tree, a
+ // candidate that doesn't dominate the current instruction won't dominate any
+ // future instruction either. Therefore, we pop it out of the stack. This
+ // optimization makes the algorithm O(n).
+ while (!Candidates.empty()) {
+ Instruction *Candidate = Candidates.back();
+ if (DT->dominates(Candidate, Dominatee))
+ return Candidate;
+ Candidates.pop_back();
+ }
+ return nullptr;
+}
+
+bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
+ if (!SE->isSCEVable(I->getType()))
+ return false;
+
+ // Dom: LHS+RHS
+ // I: sext(LHS)+sext(RHS)
+ // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
+ // TODO: handle zext
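+ //
+ // For instance (hypothetical IR), given a dominating
+ //   %dom = add nsw i32 %a, %b      ; known not to sign-overflow
+ // the pattern
+ //   %as = sext i32 %a to i64
+ //   %bs = sext i32 %b to i64
+ //   %i  = add i64 %as, %bs
+ // can be rewritten to
+ //   %i  = sext i32 %dom to i64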
+ Value *LHS = nullptr, *RHS = nullptr;
+ if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) ||
+ match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
+ if (LHS->getType() == RHS->getType()) {
+ const SCEV *Key =
+ SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+ if (auto *Dom = findClosestMatchingDominator(Key, I)) {
+ Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
+ NewSExt->takeName(I);
+ I->replaceAllUsesWith(NewSExt);
+ RecursivelyDeleteTriviallyDeadInstructions(I);
+ return true;
+ }
+ }
+ }
+
+ // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
+ if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) ||
+ match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
+ if (programUndefinedIfFullPoison(I)) {
+ const SCEV *Key =
+ SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+ DominatingExprs[Key].push_back(I);
+ }
+ }
+ return false;
+}
+
+bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
+ bool Changed = false;
+ DominatingExprs.clear();
+ for (const auto Node : depth_first(DT)) {
+ BasicBlock *BB = Node->getBlock();
+ for (auto I = BB->begin(); I != BB->end(); ) {
+ Instruction *Cur = &*I++;
+ Changed |= reuniteExts(Cur);
+ }
+ }
+ return Changed;
+}
+
+void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
+ for (BasicBlock &B : F) {
+ for (Instruction &I : B) {
+ if (isInstructionTriviallyDead(&I)) {
+ std::string ErrMessage;
+ raw_string_ostream RSO(ErrMessage);
+ RSO << "Dead instruction detected!\n" << I << "\n";
+ llvm_unreachable(RSO.str().c_str());
+ }
+ }
+ }
+}
+
+bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
+ GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
+ if (!FirstGEP || !FirstGEP->hasOneUse())
+ return false;
+
+ if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
+ return false;
+
+ if (FirstGEP == SecondGEP)
+ return false;
+
+ unsigned FirstNum = FirstGEP->getNumOperands();
+ unsigned SecondNum = SecondGEP->getNumOperands();
+ // Give up if the number of operands is not 2.
+ if (FirstNum != SecondNum || FirstNum != 2)
+ return false;
+
+ Value *FirstBase = FirstGEP->getOperand(0);
+ Value *SecondBase = SecondGEP->getOperand(0);
+ Value *FirstOffset = FirstGEP->getOperand(1);
+ // Give up if the index of the first GEP is loop invariant.
+ if (CurLoop->isLoopInvariant(FirstOffset))
+ return false;
+
+ // Give up if the bases don't have the same type.
+ if (FirstBase->getType() != SecondBase->getType())
+ return false;
+
+ Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);
+
+ // Check if the second operand of the first GEP has a constant coefficient.
+ // For example, in the following code we won't gain anything by hoisting
+ // the second GEP out because the second GEP can be folded away.
+ // %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
+ // %67 = shl i64 %scevgep.sum.ur159, 2
+ // %uglygep160 = getelementptr i8* %65, i64 %67
+ // %uglygep161 = getelementptr i8* %uglygep160, i64 -1024
+
+ // Skip a constant shift instruction which may be generated by splitting GEPs.
+ if (FirstOffsetDef && FirstOffsetDef->isShift() &&
+ isa<ConstantInt>(FirstOffsetDef->getOperand(1)))
+ FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));
+
+ // Give up if FirstOffsetDef is an Add or Sub with a constant, because it
+ // may not be profitable at all due to constant folding.
+ if (FirstOffsetDef)
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
+ unsigned opc = BO->getOpcode();
+ if ((opc == Instruction::Add || opc == Instruction::Sub) &&
+ (isa<ConstantInt>(BO->getOperand(0)) ||
+ isa<ConstantInt>(BO->getOperand(1))))
+ return false;
+ }
+ return true;
+}
+
+bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
+ int UsesInLoop = 0;
+ for (User *U : V->users()) {
+ if (Instruction *User = dyn_cast<Instruction>(U))
+ if (L->contains(User))
+ if (++UsesInLoop > 1)
+ return true;
+ }
+ return false;
+}
+
+void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
+ GetElementPtrInst *Second) {
+ Value *Offset1 = First->getOperand(1);
+ Value *Offset2 = Second->getOperand(1);
+ First->setOperand(1, Offset2);
+ Second->setOperand(1, Offset1);
+
+ // We changed p+o+c to p+c+o; p+c may not be inbounds anymore.
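+ // For example (illustrative values), the offsets of
+ //   %g1 = getelementptr inbounds i8, i8* %p, i64 %o
+ //   %g2 = getelementptr inbounds i8, i8* %g1, i64 100
+ // are swapped so that %g1 adds 100 and %g2 adds %o. %p + 100 may step
+ // outside the underlying object, so the check below keeps inbounds only when
+ // the accumulated constant offset is known to fit within the object.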
+ const DataLayout &DAL = First->getModule()->getDataLayout();
+ APInt Offset(DAL.getPointerSizeInBits(
+ cast<PointerType>(First->getType())->getAddressSpace()),
+ 0);
+ Value *NewBase =
+ First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
+ uint64_t ObjectSize;
+ if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
+ Offset.ugt(ObjectSize)) {
+ First->setIsInBounds(false);
+ Second->setIsInBounds(false);
+ } else
+ First->setIsInBounds(true);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
new file mode 100644
index 000000000000..aba732bc413f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -0,0 +1,2171 @@
+//===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTree.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <numeric>
+#include <utility>
+
+#define DEBUG_TYPE "simple-loop-unswitch"
+
+using namespace llvm;
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumTrivial, "Number of unswitches that are trivial");
+
+static cl::opt<bool> EnableNonTrivialUnswitch(
+ "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
+ cl::desc("Forcibly enables non-trivial loop unswitching rather than "
+ "following the configuration passed into the pass."));
+
+static cl::opt<int>
+ UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
+ cl::desc("The cost threshold for unswitching a loop."));
+
+static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,
+ Constant &Replacement) {
+ assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
+
+ // Replace uses of LIC in the loop with the given constant.
+ for (auto UI = LIC.use_begin(), UE = LIC.use_end(); UI != UE;) {
+ // Grab the use and walk past it so we can clobber it in the use list.
+ Use *U = &*UI++;
+ Instruction *UserI = dyn_cast<Instruction>(U->getUser());
+ if (!UserI || !L.contains(UserI))
+ continue;
+
+ // Replace this use within the loop body.
+ *U = &Replacement;
+ }
+}
+
+/// Update the IDom for a basic block whose predecessor set has changed.
+///
+/// This routine is designed to work when the domtree update is relatively
+/// localized by leveraging a known common dominator, often a loop header.
+///
+/// FIXME: Should consider hand-rolling a slightly more efficient non-DFS
+/// approach here as we can do that easily by persisting the candidate IDom's
+/// dominating set between each predecessor.
+///
+/// FIXME: Longer term, many uses of this can be replaced by an incremental
+/// domtree update strategy that starts from a known dominating block and
+/// rebuilds that subtree.
+static bool updateIDomWithKnownCommonDominator(BasicBlock *BB,
+ BasicBlock *KnownDominatingBB,
+ DominatorTree &DT) {
+ assert(pred_begin(BB) != pred_end(BB) &&
+ "This routine does not handle unreachable blocks!");
+
+ BasicBlock *OrigIDom = DT[BB]->getIDom()->getBlock();
+
+ BasicBlock *IDom = *pred_begin(BB);
+ assert(DT.dominates(KnownDominatingBB, IDom) &&
+ "Bad known dominating block!");
+
+ // Walk all of the other predecessors finding the nearest common dominator
+ // until all predecessors are covered or we reach the loop header. The loop
+ // header necessarily dominates all loop exit blocks in loop simplified form
+ // so we can early-exit the moment we hit that block.
+ for (auto PI = std::next(pred_begin(BB)), PE = pred_end(BB);
+ PI != PE && IDom != KnownDominatingBB; ++PI) {
+ assert(DT.dominates(KnownDominatingBB, *PI) &&
+ "Bad known dominating block!");
+ IDom = DT.findNearestCommonDominator(IDom, *PI);
+ }
+
+ if (IDom == OrigIDom)
+ return false;
+
+ DT.changeImmediateDominator(BB, IDom);
+ return true;
+}
+
+// Note that we don't currently use the IDFCalculator here for two reasons:
+// 1) It computes dominator tree levels for the entire function on each run
+// of 'compute'. While this isn't terrible, given that we expect to update
+// relatively small subtrees of the domtree, it isn't necessarily the right
+// tradeoff.
+// 2) The interface doesn't fit this usage well. It doesn't operate in an
+// append-only fashion and builds several sets that we don't need.
+//
+// FIXME: Neither of these issues is a big deal, and both could be addressed with
+// some amount of refactoring of IDFCalculator. That would allow us to share
+// the core logic here (which is solving the same core problem).
+static void appendDomFrontier(DomTreeNode *Node,
+ SmallSetVector<BasicBlock *, 4> &Worklist,
+ SmallVectorImpl<DomTreeNode *> &DomNodes,
+ SmallPtrSetImpl<BasicBlock *> &DomSet) {
+ assert(DomNodes.empty() && "Must start with no dominator nodes.");
+ assert(DomSet.empty() && "Must start with an empty dominator set.");
+
+ // First flatten this subtree into a sequence of nodes by doing a pre-order
+ // walk.
+ DomNodes.push_back(Node);
+ // We intentionally re-evaluate the size as each node can add new children.
+ // Because this is a tree walk, this cannot add any duplicates.
+ for (int i = 0; i < (int)DomNodes.size(); ++i)
+ DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end());
+
+ // Now create a set of the basic blocks so we can quickly test for
+ // dominated successors. We could in theory use the DFS numbers of the
+ // dominator tree for this, but we want this to remain predictably fast
+ // even while we mutate the dominator tree in ways that would invalidate
+ // the DFS numbering.
+ for (DomTreeNode *InnerN : DomNodes)
+ DomSet.insert(InnerN->getBlock());
+
+ // Now re-walk the nodes, appending every successor of every node that isn't
+ // in the set. Note that we don't append the node itself, even though if it
+ // is a successor it does not strictly dominate itself and thus it would be
+ // part of the dominance frontier. The reason we don't append it is that
+ // the node passed in came *from* the worklist and so it has already been
+ // processed.
+ for (DomTreeNode *InnerN : DomNodes)
+ for (BasicBlock *SuccBB : successors(InnerN->getBlock()))
+ if (!DomSet.count(SuccBB))
+ Worklist.insert(SuccBB);
+
+ DomNodes.clear();
+ DomSet.clear();
+}
+
+/// Update the dominator tree after unswitching a particular former exit block.
+///
+/// This handles the full update of the dominator tree after hoisting a block
+/// that previously was an exit block (or split off of an exit block) up to be
+/// reached from the new immediate dominator of the preheader.
+///
+/// The common case is simple -- we just move the unswitched block to have an
+/// immediate dominator of the old preheader. But in complex cases, there may
+/// be other blocks reachable from the unswitched block that are immediately
+/// dominated by some node between the unswitched one and the old preheader.
+/// All of these also need to be hoisted in the dominator tree. We also want to
+/// minimize queries to the dominator tree because each step of this
+/// invalidates any DFS numbers that would make queries fast.
+static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH,
+ DominatorTree &DT) {
+ DomTreeNode *OldPHNode = DT[OldPH];
+ DomTreeNode *UnswitchedNode = DT[UnswitchedBB];
+ // If the dominator tree has already been updated for this unswitched node,
+ // we're done. This makes it easier to use this routine if there are multiple
+ // paths to the same unswitched destination.
+ if (UnswitchedNode->getIDom() == OldPHNode)
+ return;
+
+ // First collect the domtree nodes that we are hoisting over. These are the
+ // set of nodes which may have children that need to be hoisted as well.
+ SmallPtrSet<DomTreeNode *, 4> DomChain;
+ for (auto *IDom = UnswitchedNode->getIDom(); IDom != OldPHNode;
+ IDom = IDom->getIDom())
+ DomChain.insert(IDom);
+
+ // The unswitched block ends up immediately dominated by the old preheader --
+ // regardless of whether it is the loop exit block or split off of the loop
+ // exit block.
+ DT.changeImmediateDominator(UnswitchedNode, OldPHNode);
+
+ // For everything that moves up the dominator tree, we need to examine the
+ // dominator frontier to see if it additionally should move up the dominator
+ // tree. The appendDomFrontier helper adds the dominator frontier of a node
+ // to the worklist.
+ SmallSetVector<BasicBlock *, 4> Worklist;
+
+ // Scratch data structures reused by domfrontier finding.
+ SmallVector<DomTreeNode *, 4> DomNodes;
+ SmallPtrSet<BasicBlock *, 4> DomSet;
+
+ // Append the initial dom frontier nodes.
+ appendDomFrontier(UnswitchedNode, Worklist, DomNodes, DomSet);
+
+ // Walk the worklist. We grow the list in the loop and so must recompute size.
+ for (int i = 0; i < (int)Worklist.size(); ++i) {
+ auto *BB = Worklist[i];
+
+ DomTreeNode *Node = DT[BB];
+ assert(!DomChain.count(Node) &&
+ "Cannot be dominated by a block you can reach!");
+
+ // If this block had an immediate dominator somewhere in the chain
+ // we hoisted over, then its position in the domtree needs to move as it is
+ // reachable from a node hoisted over this chain.
+ if (!DomChain.count(Node->getIDom()))
+ continue;
+
+ DT.changeImmediateDominator(Node, OldPHNode);
+
+ // Now add this node's dominator frontier to the worklist as well.
+ appendDomFrontier(Node, Worklist, DomNodes, DomSet);
+ }
+}
+
+/// Check that all the LCSSA PHI nodes in the loop exit block have trivial
+/// incoming values along this edge.
+static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
+ BasicBlock &ExitBB) {
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ return true;
+
+ // If the incoming value for this edge isn't loop invariant the unswitch
+ // won't be trivial.
+ if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB)))
+ return false;
+ }
+ llvm_unreachable("Basic blocks should never be empty!");
+}
+
+/// Rewrite the PHI nodes in an unswitched loop exit basic block.
+///
+/// Requires that the loop exit and unswitched basic block are the same, and
+/// that the exiting block was a unique predecessor of that block. Rewrites the
+/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial
+/// PHI nodes from the old preheader that now contains the unswitched
+/// terminator.
+static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ for (PHINode &PN : UnswitchedBB.phis()) {
+ // When the loop exit is directly unswitched we just need to update the
+ // incoming basic block. We loop to handle weird cases with repeated
+ // incoming blocks, but expect to typically only have one operand here.
+ for (auto i : seq<int>(0, PN.getNumOperands())) {
+ assert(PN.getIncomingBlock(i) == &OldExitingBB &&
+ "Found incoming block different from unique predecessor!");
+ PN.setIncomingBlock(i, &OldPH);
+ }
+ }
+}
+
+/// Rewrite the PHI nodes in the loop exit basic block and the split off
+/// unswitched block.
+///
+/// Because the exit block remains an exit from the loop, this rewrites the
+/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI
+/// nodes into the unswitched basic block to select between the value in the
+/// old preheader and the loop exit.
+static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
+ BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ assert(&ExitBB != &UnswitchedBB &&
+ "Must have different loop exit and unswitched blocks!");
+ Instruction *InsertPt = &*UnswitchedBB.begin();
+ for (PHINode &PN : ExitBB.phis()) {
+ auto *NewPN = PHINode::Create(PN.getType(), /*NumReservedValues*/ 2,
+ PN.getName() + ".split", InsertPt);
+
+ // Walk backwards over the old PHI node's inputs to minimize the cost of
+ // removing each one. We have to do this weird loop manually so that we
+ // create the same number of new incoming edges in the new PHI as we expect
+ // each case-based edge to be included in the unswitched switch in some
+ // cases.
+ // FIXME: This is really, really gross. It would be much cleaner if LLVM
+ // allowed us to create a single entry for a predecessor block without
+ // having separate entries for each "edge" even though these edges are
+ // required to produce identical results.
+ for (int i = PN.getNumIncomingValues() - 1; i >= 0; --i) {
+ if (PN.getIncomingBlock(i) != &OldExitingBB)
+ continue;
+
+ Value *Incoming = PN.removeIncomingValue(i);
+ NewPN->addIncoming(Incoming, &OldPH);
+ }
+
+ // Now replace the old PHI with the new one and wire the old one in as an
+ // input to the new one.
+ PN.replaceAllUsesWith(NewPN);
+ NewPN->addIncoming(&PN, &ExitBB);
+ }
+}
+
+/// Unswitch a trivial branch if the condition is loop invariant.
+///
+/// This routine should only be called when loop code leading to the branch has
+/// been validated as trivial (no side effects). This routine checks if the
+/// condition is invariant and one of the successors is a loop exit. This
+/// allows us to unswitch without duplicating the loop, making it trivial.
+///
+/// If this routine fails to unswitch the branch it returns false.
+///
+/// If the branch can be unswitched, this routine splits the preheader and
+/// hoists the branch above that split. Preserves loop simplified form
+/// (splitting the exit block as necessary). It simplifies the branch within
+/// the loop to an unconditional branch but doesn't remove it entirely. Further
+/// cleanup can be done with some simplify-cfg like pass.
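+///
+/// As a rough sketch (block and value names are purely illustrative), a
+/// loop-invariant branch inside the loop such as
+///
+///     br i1 %invariant_cond, label %exit, label %continue
+///
+/// is hoisted so that the old preheader ends in this conditional branch
+/// (targeting the exit block, or a block split off of it, and a new loop
+/// preheader), the branch's old position becomes `br label %continue`, and
+/// in-loop uses of %invariant_cond are replaced with the constant value for
+/// which execution stays in the loop.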
+static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
+ LoopInfo &LI) {
+ assert(BI.isConditional() && "Can only unswitch a conditional branch!");
+ DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
+
+ Value *LoopCond = BI.getCondition();
+
+ // Need a trivial loop condition to unswitch.
+ if (!L.isLoopInvariant(LoopCond))
+ return false;
+
+ // FIXME: We should compute this once at the start and update it!
+ SmallVector<BasicBlock *, 16> ExitBlocks;
+ L.getExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+
+ // Check to see if a successor of the branch is guaranteed to
+ // exit through a unique exit block without having any
+ // side-effects. If so, determine the value of Cond that causes
+ // it to do this.
+ ConstantInt *CondVal = ConstantInt::getTrue(BI.getContext());
+ ConstantInt *Replacement = ConstantInt::getFalse(BI.getContext());
+ int LoopExitSuccIdx = 0;
+ auto *LoopExitBB = BI.getSuccessor(0);
+ if (!ExitBlockSet.count(LoopExitBB)) {
+ std::swap(CondVal, Replacement);
+ LoopExitSuccIdx = 1;
+ LoopExitBB = BI.getSuccessor(1);
+ if (!ExitBlockSet.count(LoopExitBB))
+ return false;
+ }
+ auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx);
+ assert(L.contains(ContinueBB) &&
+ "Cannot have both successors exit and still be in the loop!");
+
+ auto *ParentBB = BI.getParent();
+ if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
+ return false;
+
+ DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal
+ << " == " << LoopCond << "\n");
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the conditional branch. We will change the preheader to have a conditional
+ // branch on LoopCond.
+ BasicBlock *OldPH = L.getLoopPreheader();
+ BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI);
+
+ // Now that we have a place to insert the conditional branch, create a place
+ // to branch to: this is the exit block out of the loop that we are
+ // unswitching. We need to split this if there are other loop predecessors.
+ // Because the loop is in simplified form, *any* other predecessor is enough.
+ BasicBlock *UnswitchedBB;
+ if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) {
+ (void)PredBB;
+ assert(PredBB == BI.getParent() &&
+ "A branch's parent isn't a predecessor!");
+ UnswitchedBB = LoopExitBB;
+ } else {
+ UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI);
+ }
+
+ // Now splice the branch to gate reaching the new preheader and re-point its
+ // successors.
+ OldPH->getInstList().splice(std::prev(OldPH->end()),
+ BI.getParent()->getInstList(), BI);
+ OldPH->getTerminator()->eraseFromParent();
+ BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
+ BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
+
+ // Create a new unconditional branch that will continue the loop as a new
+ // terminator.
+ BranchInst::Create(ContinueBB, ParentBB);
+
+ // Rewrite the relevant PHI nodes.
+ if (UnswitchedBB == LoopExitBB)
+ rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
+ else
+ rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
+ *ParentBB, *OldPH);
+
+ // Now we need to update the dominator tree.
+ updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
+ // But if we split something off of the loop exit block then we also removed
+ // one of the predecessors for the loop exit block and may need to update its
+ // idom.
+ if (UnswitchedBB != LoopExitBB)
+ updateIDomWithKnownCommonDominator(LoopExitBB, L.getHeader(), DT);
+
+ // Since this is an i1 condition we can also trivially replace uses of it
+ // within the loop with a constant.
+ replaceLoopUsesWithConstant(L, *LoopCond, *Replacement);
+
+ ++NumTrivial;
+ ++NumBranches;
+ return true;
+}
+
+/// Unswitch a trivial switch if the condition is loop invariant.
+///
+/// This routine should only be called when loop code leading to the switch has
+/// been validated as trivial (no side effects). This routine checks if the
+/// condition is invariant and that at least one of the successors is a loop
+/// exit. This allows us to unswitch without duplicating the loop, making it
+/// trivial.
+///
+/// If this routine fails to unswitch the switch it returns false.
+///
+/// If the switch can be unswitched, this routine splits the preheader and
+/// copies the switch above that split. If the default case is one of the
+/// exiting cases, it copies the non-exiting cases and points them at the new
+/// preheader. If the default case is not exiting, it copies the exiting cases
+/// and points the default at the preheader. It preserves loop simplified form
+/// (splitting the exit blocks as necessary). It simplifies the switch within
+/// the loop by removing now-dead cases. If the default case is one of those
+/// unswitched, it replaces its destination with a new basic block containing
+/// only unreachable. Such basic blocks, while technically loop exits, are not
+/// considered for unswitching so this is a stable transform and the same
+/// switch will not be revisited. If after unswitching there is only a single
+/// in-loop successor, the switch is further simplified to an unconditional
+/// branch. Still more cleanup can be done with some simplify-cfg like pass.
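+///
+/// A rough sketch (case values and block names are illustrative): for
+///
+///     switch i32 %invariant, label %continue [ i32 0, label %exit ]
+///
+/// the old preheader receives a copy of the switch that sends case 0 to
+/// %exit (or a block split off of it) and everything else into the loop,
+/// while the in-loop switch loses the exiting case and, with only one
+/// remaining successor, is simplified to `br label %continue`.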
+static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
+ LoopInfo &LI) {
+ DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n");
+ Value *LoopCond = SI.getCondition();
+
+ // If this isn't switching on an invariant condition, we can't unswitch it.
+ if (!L.isLoopInvariant(LoopCond))
+ return false;
+
+ auto *ParentBB = SI.getParent();
+
+ // FIXME: We should compute this once at the start and update it!
+ SmallVector<BasicBlock *, 16> ExitBlocks;
+ L.getExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+
+ SmallVector<int, 4> ExitCaseIndices;
+ for (auto Case : SI.cases()) {
+ auto *SuccBB = Case.getCaseSuccessor();
+ if (ExitBlockSet.count(SuccBB) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB))
+ ExitCaseIndices.push_back(Case.getCaseIndex());
+ }
+ BasicBlock *DefaultExitBB = nullptr;
+ if (ExitBlockSet.count(SI.getDefaultDest()) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) &&
+ !isa<UnreachableInst>(SI.getDefaultDest()->getTerminator()))
+ DefaultExitBB = SI.getDefaultDest();
+ else if (ExitCaseIndices.empty())
+ return false;
+
+ DEBUG(dbgs() << " unswitching trivial cases...\n");
+
+ SmallVector<std::pair<ConstantInt *, BasicBlock *>, 4> ExitCases;
+ ExitCases.reserve(ExitCaseIndices.size());
+ // We walk the case indices backwards so that we remove the last case first
+ // and don't disrupt the earlier indices.
+ for (unsigned Index : reverse(ExitCaseIndices)) {
+ auto CaseI = SI.case_begin() + Index;
+ // Save the value of this case.
+ ExitCases.push_back({CaseI->getCaseValue(), CaseI->getCaseSuccessor()});
+ // Delete the unswitched cases.
+ SI.removeCase(CaseI);
+ }
+
+ // Check if after this all of the remaining cases point at the same
+ // successor.
+ BasicBlock *CommonSuccBB = nullptr;
+ if (SI.getNumCases() > 0 &&
+ std::all_of(std::next(SI.case_begin()), SI.case_end(),
+ [&SI](const SwitchInst::CaseHandle &Case) {
+ return Case.getCaseSuccessor() ==
+ SI.case_begin()->getCaseSuccessor();
+ }))
+ CommonSuccBB = SI.case_begin()->getCaseSuccessor();
+
+ if (DefaultExitBB) {
+ // We can't remove the default edge so replace it with an edge to either
+ // the single common remaining successor (if we have one) or an unreachable
+ // block.
+ if (CommonSuccBB) {
+ SI.setDefaultDest(CommonSuccBB);
+ } else {
+ BasicBlock *UnreachableBB = BasicBlock::Create(
+ ParentBB->getContext(),
+ Twine(ParentBB->getName()) + ".unreachable_default",
+ ParentBB->getParent());
+ new UnreachableInst(ParentBB->getContext(), UnreachableBB);
+ SI.setDefaultDest(UnreachableBB);
+ DT.addNewBlock(UnreachableBB, ParentBB);
+ }
+ } else {
+ // If we're not unswitching the default, it must match the remaining cases'
+ // successor for there to be a common successor; and if there are no cases
+ // left, the default itself is the common successor.
+ if (SI.getNumCases() == 0)
+ CommonSuccBB = SI.getDefaultDest();
+ else if (SI.getDefaultDest() != CommonSuccBB)
+ CommonSuccBB = nullptr;
+ }
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the switch.
+ BasicBlock *OldPH = L.getLoopPreheader();
+ BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI);
+ OldPH->getTerminator()->eraseFromParent();
+
+ // Now add the unswitched switch.
+ auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
+
+ // Rewrite the IR for the unswitched basic blocks. This requires two steps.
+ // First, we split any exit blocks with remaining in-loop predecessors. Then
+ // we update the PHIs in one of two ways depending on if there was a split.
+ // We walk in reverse so that we split in the same order as the cases
+ // appeared. This is purely for convenience of reading the resulting IR, but
+ // it doesn't cost anything really.
+ SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
+ SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
+ // Handle the default exit if necessary.
+ // FIXME: It'd be great if we could merge this with the loop below but LLVM's
+ // ranges aren't quite powerful enough yet.
+ if (DefaultExitBB) {
+ if (pred_empty(DefaultExitBB)) {
+ UnswitchedExitBBs.insert(DefaultExitBB);
+ rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
+ } else {
+ auto *SplitBB =
+ SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
+ *ParentBB, *OldPH);
+ updateIDomWithKnownCommonDominator(DefaultExitBB, L.getHeader(), DT);
+ DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+ }
+ }
+ // Note that we must use a reference in the for loop so that we update the
+ // container.
+ for (auto &CasePair : reverse(ExitCases)) {
+ // Grab a reference to the exit block in the pair so that we can update it.
+ BasicBlock *ExitBB = CasePair.second;
+
+ // If this case is the last edge into the exit block, we can simply reuse it
+ // as it will no longer be a loop exit. No mapping necessary.
+ if (pred_empty(ExitBB)) {
+ // Only rewrite once.
+ if (UnswitchedExitBBs.insert(ExitBB).second)
+ rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
+ continue;
+ }
+
+ // Otherwise we need to split the exit block so that we retain an exit
+ // block from the loop and a target for the unswitched condition.
+ BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB];
+ if (!SplitExitBB) {
+ // If this is the first time we see this, do the split and remember it.
+ SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
+ *ParentBB, *OldPH);
+ updateIDomWithKnownCommonDominator(ExitBB, L.getHeader(), DT);
+ }
+ // Update the case pair to point to the split block.
+ CasePair.second = SplitExitBB;
+ }
+
+ // Now add the unswitched cases. We do this in reverse order as we built them
+ // in reverse order.
+ for (auto CasePair : reverse(ExitCases)) {
+ ConstantInt *CaseVal = CasePair.first;
+ BasicBlock *UnswitchedBB = CasePair.second;
+
+ NewSI->addCase(CaseVal, UnswitchedBB);
+ updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
+ }
+
+ // If the default was unswitched, re-point it and add explicit cases for
+ // entering the loop.
+ if (DefaultExitBB) {
+ NewSI->setDefaultDest(DefaultExitBB);
+ updateDTAfterUnswitch(DefaultExitBB, OldPH, DT);
+
+ // We removed all the exit cases, so we just copy the cases to the
+ // unswitched switch.
+ for (auto Case : SI.cases())
+ NewSI->addCase(Case.getCaseValue(), NewPH);
+ }
+
+ // If we ended up with a common successor for every path through the switch
+ // after unswitching, rewrite it to an unconditional branch to make it easy
+ // to recognize. Otherwise we potentially have to recognize the default case
+ // pointing at unreachable and other complexity.
+ if (CommonSuccBB) {
+ BasicBlock *BB = SI.getParent();
+ SI.eraseFromParent();
+ BranchInst::Create(CommonSuccBB, BB);
+ }
+
+ DT.verifyDomTree();
+ ++NumTrivial;
+ ++NumSwitches;
+ return true;
+}
+
+/// This routine scans the loop to find a branch or switch which occurs before
+/// any side effects occur. These can potentially be unswitched without
+/// duplicating the loop. If a branch or switch is successfully unswitched the
+/// scanning continues to see if subsequent branches or switches have become
+/// trivial. Once all trivial candidates have been unswitched, this routine
+/// returns.
+///
+/// The return value indicates whether anything was unswitched (and therefore
+/// changed).
+static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
+ LoopInfo &LI) {
+ bool Changed = false;
+
+ // If the loop header has only one reachable successor, we should keep
+ // looking for trivial condition candidates in the successor as well. An
+ // alternative is to constant fold conditions and merge successors into the
+ // loop header (then we only need to check the header's terminator). The
+ // reason for not doing this in the LoopUnswitch pass is that it could
+ // potentially break LoopPassManager's invariants. Folding dead branches
+ // could either eliminate the current loop or make other loops unreachable.
+ // LCSSA form might also not be preserved after deleting branches. The
+ // following code keeps traversing the loop header's successors until it
+ // finds a trivial condition candidate (a condition that is not a constant).
+ // Since unswitching generates branches with constant conditions, this
+ // scenario could be very common in practice.
+ BasicBlock *CurrentBB = L.getHeader();
+ SmallPtrSet<BasicBlock *, 8> Visited;
+ Visited.insert(CurrentBB);
+ do {
+ // Check if there are any side-effecting instructions (e.g. stores, calls,
+ // volatile loads) in the part of the loop that the code *would* execute
+ // without unswitching.
+ if (llvm::any_of(*CurrentBB,
+ [](Instruction &I) { return I.mayHaveSideEffects(); }))
+ return Changed;
+
+ TerminatorInst *CurrentTerm = CurrentBB->getTerminator();
+
+ if (auto *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // Don't bother trying to unswitch past a switch with a constant
+ // condition. This should be removed prior to running this pass by
+ // simplify-cfg.
+ if (isa<Constant>(SI->getCondition()))
+ return Changed;
+
+ if (!unswitchTrivialSwitch(L, *SI, DT, LI))
+ // Couldn't unswitch this one, so we're done.
+ return Changed;
+
+ // Mark that we managed to unswitch something.
+ Changed = true;
+
+ // If unswitching turned the terminator into an unconditional branch then
+ // we can continue. The unswitching logic specifically works to fold any
+ // cases it can into an unconditional branch to make it easier to
+ // recognize here.
+ auto *BI = dyn_cast<BranchInst>(CurrentBB->getTerminator());
+ if (!BI || BI->isConditional())
+ return Changed;
+
+ CurrentBB = BI->getSuccessor(0);
+ continue;
+ }
+
+ auto *BI = dyn_cast<BranchInst>(CurrentTerm);
+ if (!BI)
+ // We do not understand other terminator instructions.
+ return Changed;
+
+ // Don't bother trying to unswitch past an unconditional branch or a branch
+ // with a constant value. These should be removed by simplify-cfg prior to
+ // running this pass.
+ if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
+ return Changed;
+
+ // Found a trivial condition candidate: non-foldable conditional branch. If
+ // we fail to unswitch this, we can't do anything else that is trivial.
+ if (!unswitchTrivialBranch(L, *BI, DT, LI))
+ return Changed;
+
+ // Mark that we managed to unswitch something.
+ Changed = true;
+
+ // We unswitched the branch. This should always leave us with an
+ // unconditional branch that we can follow now.
+ BI = cast<BranchInst>(CurrentBB->getTerminator());
+ assert(!BI->isConditional() &&
+ "Cannot form a conditional branch by unswitching1");
+ CurrentBB = BI->getSuccessor(0);
+
+ // When continuing, if we exit the loop or reach a previously visited block,
+ // then we cannot reach any trivial condition candidates (unfoldable
+ // branch instructions or switch instructions) and no unswitch can happen.
+ } while (L.contains(CurrentBB) && Visited.insert(CurrentBB).second);
+
+ return Changed;
+}
+
+/// Build the cloned blocks for an unswitched copy of the given loop.
+///
+/// The cloned blocks are inserted before the loop preheader (`LoopPH`) and
+/// after the split block (`SplitBB`) that will be used to select between the
+/// cloned and original loop.
+///
+/// This routine handles cloning all of the necessary loop blocks and exit
+/// blocks including rewriting their instructions and the relevant PHI nodes.
+/// It skips loop and exit blocks that are not necessary based on the provided
+/// set. It also correctly creates the unconditional branch in the cloned
+/// unswitched parent block to only point at the unswitched successor.
+///
+/// This does not handle most of the necessary updates to `LoopInfo`. Only exit
+/// block splitting is correctly reflected in `LoopInfo`; essentially all of
+/// the cloned blocks (and their loops) are left without full `LoopInfo`
+/// updates. This also doesn't fully update `DominatorTree`. It adds the cloned
+/// blocks to them but doesn't create the cloned `DominatorTree` structure and
+/// instead the caller must recompute an accurate DT. It *does* correctly
+/// update the `AssumptionCache` provided in `AC`.
+static BasicBlock *buildClonedLoopBlocks(
+ Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB,
+ ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB,
+ BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB,
+ const SmallPtrSetImpl<BasicBlock *> &SkippedLoopAndExitBlocks,
+ ValueToValueMapTy &VMap, AssumptionCache &AC, DominatorTree &DT,
+ LoopInfo &LI) {
+ SmallVector<BasicBlock *, 4> NewBlocks;
+ NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size());
+
+ // We will need to clone a bunch of blocks, wrap up the clone operation in
+ // a helper.
+ auto CloneBlock = [&](BasicBlock *OldBB) {
+ // Clone the basic block and insert it before the new preheader.
+ BasicBlock *NewBB = CloneBasicBlock(OldBB, VMap, ".us", OldBB->getParent());
+ NewBB->moveBefore(LoopPH);
+
+ // Record this block and the mapping.
+ NewBlocks.push_back(NewBB);
+ VMap[OldBB] = NewBB;
+
+ // Add the block to the domtree. We'll move it to the correct position
+ // below.
+ DT.addNewBlock(NewBB, SplitBB);
+
+ return NewBB;
+ };
+
+ // First, clone the preheader.
+ auto *ClonedPH = CloneBlock(LoopPH);
+
+ // Then clone all the loop blocks, skipping the ones that aren't necessary.
+ for (auto *LoopBB : L.blocks())
+ if (!SkippedLoopAndExitBlocks.count(LoopBB))
+ CloneBlock(LoopBB);
+
+ // Split all the loop exit edges so that when we clone the exit blocks, if
+ // any of the exit blocks are *also* a preheader for some other loop, we
+ // don't create multiple predecessors entering the loop header.
+ for (auto *ExitBB : ExitBlocks) {
+ if (SkippedLoopAndExitBlocks.count(ExitBB))
+ continue;
+
+ // When we are going to clone an exit, we don't need to clone all the
+ // instructions in the exit block and we want to ensure we have an easy
+ // place to merge the CFG, so split the exit first. This is always safe to
+ // do because there cannot be any non-loop predecessors of a loop exit in
+ // loop simplified form.
+ auto *MergeBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
+
+ // Rearrange the names to make it easier to write test cases by having the
+ // exit block carry the suffix rather than the merge block carrying the
+ // suffix.
+ MergeBB->takeName(ExitBB);
+ ExitBB->setName(Twine(MergeBB->getName()) + ".split");
+
+ // Now clone the original exit block.
+ auto *ClonedExitBB = CloneBlock(ExitBB);
+ assert(ClonedExitBB->getTerminator()->getNumSuccessors() == 1 &&
+ "Exit block should have been split to have one successor!");
+ assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB &&
+ "Cloned exit block has the wrong successor!");
+
+ // Move the merge block's idom to be the split point as one exit is
+ // dominated by one header, and the other by another, so we know the split
+ // point dominates both. While the dominator tree isn't fully accurate, we
+ // want sub-trees within the original loop to correctly reflect
+ // dominance within that original loop (at least) and that requires moving
+ // the merge block out of that subtree.
+ // FIXME: This is very brittle as we essentially have a partial contract on
+ // the dominator tree. We really need to instead update it and keep it
+ // valid or stop relying on it.
+ DT.changeImmediateDominator(MergeBB, SplitBB);
+
+ // Remap any cloned instructions and create a merge phi node for them.
+ for (auto ZippedInsts : llvm::zip_first(
+ llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())),
+ llvm::make_range(ClonedExitBB->begin(),
+ std::prev(ClonedExitBB->end())))) {
+ Instruction &I = std::get<0>(ZippedInsts);
+ Instruction &ClonedI = std::get<1>(ZippedInsts);
+
+ // The only instructions in the exit block should be PHI nodes and
+ // potentially a landing pad.
+ assert(
+ (isa<PHINode>(I) || isa<LandingPadInst>(I) || isa<CatchPadInst>(I)) &&
+ "Bad instruction in exit block!");
+ // We should have a value map between the instruction and its clone.
+ assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!");
+
+ auto *MergePN =
+ PHINode::Create(I.getType(), /*NumReservedValues*/ 2, ".us-phi",
+ &*MergeBB->getFirstInsertionPt());
+ I.replaceAllUsesWith(MergePN);
+ MergePN->addIncoming(&I, ExitBB);
+ MergePN->addIncoming(&ClonedI, ClonedExitBB);
+ }
+ }
+
+ // Rewrite the instructions in the cloned blocks to refer to the instructions
+ // in the cloned blocks. We have to do this as a second pass so that we have
+ // everything available. Also, we have inserted new instructions which may
+ // include assume intrinsics, so we update the assumption cache while
+ // processing this.
+ for (auto *ClonedBB : NewBlocks)
+ for (Instruction &I : *ClonedBB) {
+ RemapInstruction(&I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC.registerAssumption(II);
+ }
+
+ // Remove the cloned parent as a predecessor of the cloned continue successor
+ // if we did in fact clone it.
+ auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
+ if (auto *ClonedContinueSuccBB =
+ cast_or_null<BasicBlock>(VMap.lookup(ContinueSuccBB)))
+ ClonedContinueSuccBB->removePredecessor(ClonedParentBB,
+ /*DontDeleteUselessPHIs*/ true);
+ // Replace the cloned branch with an unconditional branch to the cloned
+ // unswitched successor.
+ auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
+ ClonedParentBB->getTerminator()->eraseFromParent();
+ BranchInst::Create(ClonedSuccBB, ClonedParentBB);
+
+ // Update any PHI nodes in the cloned successors of the skipped blocks to not
+ // have spurious incoming values.
+ for (auto *LoopBB : L.blocks())
+ if (SkippedLoopAndExitBlocks.count(LoopBB))
+ for (auto *SuccBB : successors(LoopBB))
+ if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)))
+ for (PHINode &PN : ClonedSuccBB->phis())
+ PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false);
+
+ return ClonedPH;
+}
+
+/// Recursively clone the specified loop and all of its children.
+///
+/// The target parent loop for the clone should be provided, or can be null if
+/// the clone is a top-level loop. While cloning, all the blocks are mapped
+/// with the provided value map. The entire original loop must be present in
+/// the value map. The cloned loop is returned.
+static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
+ const ValueToValueMapTy &VMap, LoopInfo &LI) {
+ auto AddClonedBlocksToLoop = [&](Loop &OrigL, Loop &ClonedL) {
+ assert(ClonedL.getBlocks().empty() && "Must start with an empty loop!");
+ ClonedL.reserveBlocks(OrigL.getNumBlocks());
+ for (auto *BB : OrigL.blocks()) {
+ auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB));
+ ClonedL.addBlockEntry(ClonedBB);
+ if (LI.getLoopFor(BB) == &OrigL) {
+ assert(!LI.getLoopFor(ClonedBB) &&
+ "Should not have an existing loop for this block!");
+ LI.changeLoopFor(ClonedBB, &ClonedL);
+ }
+ }
+ };
+
+ // We specially handle the first loop because it may get cloned into
+ // a different parent and because we most commonly are cloning leaf loops.
+ Loop *ClonedRootL = LI.AllocateLoop();
+ if (RootParentL)
+ RootParentL->addChildLoop(ClonedRootL);
+ else
+ LI.addTopLevelLoop(ClonedRootL);
+ AddClonedBlocksToLoop(OrigRootL, *ClonedRootL);
+
+ if (OrigRootL.empty())
+ return ClonedRootL;
+
+ // If we have a nest, we can quickly clone the entire loop nest using an
+ // iterative approach because it is a tree. We keep the cloned parent in the
+ // data structure to avoid repeatedly querying through a map to find it.
+ SmallVector<std::pair<Loop *, Loop *>, 16> LoopsToClone;
+ // Build up the loops to clone in reverse order as we'll clone them from the
+ // back.
+ for (Loop *ChildL : llvm::reverse(OrigRootL))
+ LoopsToClone.push_back({ClonedRootL, ChildL});
+ do {
+ Loop *ClonedParentL, *L;
+ std::tie(ClonedParentL, L) = LoopsToClone.pop_back_val();
+ Loop *ClonedL = LI.AllocateLoop();
+ ClonedParentL->addChildLoop(ClonedL);
+ AddClonedBlocksToLoop(*L, *ClonedL);
+ for (Loop *ChildL : llvm::reverse(*L))
+ LoopsToClone.push_back({ClonedL, ChildL});
+ } while (!LoopsToClone.empty());
+
+ return ClonedRootL;
+}
+
+/// Build the cloned loops of an original loop from unswitching.
+///
+/// Because unswitching simplifies the CFG of the loop, this isn't a trivial
+/// operation. We need to re-verify that there even is a loop (as the backedge
+/// may not have been cloned), and even if there are remaining backedges the
+/// backedge set may be different. However, we know that each child loop is
+/// undisturbed, we only need to find where to place each child loop within
+/// either any parent loop or within a cloned version of the original loop.
+///
+/// Because child loops may end up cloned outside of any cloned version of the
+/// original loop, multiple cloned sibling loops may be created. All of them
+/// are returned so that the newly introduced loop nest roots can be
+/// identified.
+static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
+ const ValueToValueMapTy &VMap, LoopInfo &LI,
+ SmallVectorImpl<Loop *> &NonChildClonedLoops) {
+ Loop *ClonedL = nullptr;
+
+ auto *OrigPH = OrigL.getLoopPreheader();
+ auto *OrigHeader = OrigL.getHeader();
+
+ auto *ClonedPH = cast<BasicBlock>(VMap.lookup(OrigPH));
+ auto *ClonedHeader = cast<BasicBlock>(VMap.lookup(OrigHeader));
+
+ // We need to know the loops of the cloned exit blocks to even compute the
+ // accurate parent loop. If we only clone exits to some parent of the
+ // original parent, we want to clone into that outer loop. We also keep track
+ // of the loops that our cloned exit blocks participate in.
+ Loop *ParentL = nullptr;
+ SmallVector<BasicBlock *, 4> ClonedExitsInLoops;
+ SmallDenseMap<BasicBlock *, Loop *, 16> ExitLoopMap;
+ ClonedExitsInLoops.reserve(ExitBlocks.size());
+ for (auto *ExitBB : ExitBlocks)
+ if (auto *ClonedExitBB = cast_or_null<BasicBlock>(VMap.lookup(ExitBB)))
+ if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
+ ExitLoopMap[ClonedExitBB] = ExitL;
+ ClonedExitsInLoops.push_back(ClonedExitBB);
+ if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
+ ParentL = ExitL;
+ }
+ assert((!ParentL || ParentL == OrigL.getParentLoop() ||
+ ParentL->contains(OrigL.getParentLoop())) &&
+ "The computed parent loop should always contain (or be) the parent of "
+ "the original loop.");
+
+ // We build the set of blocks dominated by the cloned header from the set of
+ // cloned blocks out of the original loop. While not all of these will
+ // necessarily be in the cloned loop, it is enough to establish that they
+ // aren't in unreachable cycles, etc.
+ SmallSetVector<BasicBlock *, 16> ClonedLoopBlocks;
+ for (auto *BB : OrigL.blocks())
+ if (auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB)))
+ ClonedLoopBlocks.insert(ClonedBB);
+
+ // Rebuild the set of blocks that will end up in the cloned loop. We may have
+ // skipped cloning some region of this loop which can in turn skip some of
+ // the backedges so we have to rebuild the blocks in the loop based on the
+ // backedges that remain after cloning.
+ SmallVector<BasicBlock *, 16> Worklist;
+ SmallPtrSet<BasicBlock *, 16> BlocksInClonedLoop;
+ for (auto *Pred : predecessors(ClonedHeader)) {
+ // The only possible non-loop header predecessor is the preheader because
+ // we know we cloned the loop in simplified form.
+ if (Pred == ClonedPH)
+ continue;
+
+ // Because the loop was in simplified form, the only non-loop predecessor
+ // should be the preheader.
+ assert(ClonedLoopBlocks.count(Pred) && "Found a predecessor of the loop "
+ "header other than the preheader "
+ "that is not part of the loop!");
+
+ // Insert this block into the loop set and on the first visit (and if it
+ // isn't the header we're currently walking) put it into the worklist to
+ // recurse through.
+ if (BlocksInClonedLoop.insert(Pred).second && Pred != ClonedHeader)
+ Worklist.push_back(Pred);
+ }
+
+ // If we had any backedges then there *is* a cloned loop. Put the header into
+ // the loop set and then walk the worklist backwards to find all the blocks
+ // that remain within the loop after cloning.
+ if (!BlocksInClonedLoop.empty()) {
+ BlocksInClonedLoop.insert(ClonedHeader);
+
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.pop_back_val();
+ assert(BlocksInClonedLoop.count(BB) &&
+ "Didn't put block into the loop set!");
+
+ // Insert any predecessors that are in the possible set into the cloned
+ // set, and if the insert is successful, add them to the worklist. Note
+ // that we filter on the blocks that are definitely reachable via the
+ // backedge to the loop header so we may prune out dead code within the
+ // cloned loop.
+ for (auto *Pred : predecessors(BB))
+ if (ClonedLoopBlocks.count(Pred) &&
+ BlocksInClonedLoop.insert(Pred).second)
+ Worklist.push_back(Pred);
+ }
+
+ ClonedL = LI.AllocateLoop();
+ if (ParentL) {
+ ParentL->addBasicBlockToLoop(ClonedPH, LI);
+ ParentL->addChildLoop(ClonedL);
+ } else {
+ LI.addTopLevelLoop(ClonedL);
+ }
+
+ ClonedL->reserveBlocks(BlocksInClonedLoop.size());
+ // We don't want to just add the cloned loop blocks based on how we
+ // discovered them. The original order of blocks was carefully built in
+ // a way that doesn't rely on predecessor ordering. Rather than re-invent
+ // that logic, we just re-walk the original blocks (and those of the child
+ // loops) and filter them as we add them into the cloned loop.
+ for (auto *BB : OrigL.blocks()) {
+ auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB));
+ if (!ClonedBB || !BlocksInClonedLoop.count(ClonedBB))
+ continue;
+
+ // Directly add the blocks that are only in this loop.
+ if (LI.getLoopFor(BB) == &OrigL) {
+ ClonedL->addBasicBlockToLoop(ClonedBB, LI);
+ continue;
+ }
+
+ // We want to manually add it to this loop and parents.
+ // Registering it with LoopInfo will happen when we clone the top
+ // loop for this block.
+ for (Loop *PL = ClonedL; PL; PL = PL->getParentLoop())
+ PL->addBlockEntry(ClonedBB);
+ }
+
+ // Now add each child loop whose header remains within the cloned loop. All
+ // of the blocks within the loop must satisfy the same constraints as the
+ // header so once we pass the header checks we can just clone the entire
+ // child loop nest.
+ for (Loop *ChildL : OrigL) {
+ auto *ClonedChildHeader =
+ cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
+ if (!ClonedChildHeader || !BlocksInClonedLoop.count(ClonedChildHeader))
+ continue;
+
+#ifndef NDEBUG
+ // We should never have a cloned child loop header but fail to have
+ // all of the blocks for that child loop.
+ for (auto *ChildLoopBB : ChildL->blocks())
+ assert(BlocksInClonedLoop.count(
+ cast<BasicBlock>(VMap.lookup(ChildLoopBB))) &&
+ "Child cloned loop has a header within the cloned outer "
+ "loop but not all of its blocks!");
+#endif
+
+ cloneLoopNest(*ChildL, ClonedL, VMap, LI);
+ }
+ }
+
+ // Now that we've handled all the components of the original loop that were
+ // cloned into a new loop, we still need to handle anything from the original
+ // loop that wasn't in a cloned loop.
+
+ // Figure out what blocks are left to place within any loop nest containing
+ // the unswitched loop. If we never formed a loop, the cloned PH is one of
+ // them.
+ SmallPtrSet<BasicBlock *, 16> UnloopedBlockSet;
+ if (BlocksInClonedLoop.empty())
+ UnloopedBlockSet.insert(ClonedPH);
+ for (auto *ClonedBB : ClonedLoopBlocks)
+ if (!BlocksInClonedLoop.count(ClonedBB))
+ UnloopedBlockSet.insert(ClonedBB);
+
+  // Copy the cloned exits and sort them in ascending loop depth; we'll work
+ // backwards across these to process them inside out. The order shouldn't
+ // matter as we're just trying to build up the map from inside-out; we use
+ // the map in a more stably ordered way below.
+ auto OrderedClonedExitsInLoops = ClonedExitsInLoops;
+ std::sort(OrderedClonedExitsInLoops.begin(), OrderedClonedExitsInLoops.end(),
+ [&](BasicBlock *LHS, BasicBlock *RHS) {
+ return ExitLoopMap.lookup(LHS)->getLoopDepth() <
+ ExitLoopMap.lookup(RHS)->getLoopDepth();
+ });
+
+ // Populate the existing ExitLoopMap with everything reachable from each
+  // exit, starting from the innermost exit.
+ while (!UnloopedBlockSet.empty() && !OrderedClonedExitsInLoops.empty()) {
+ assert(Worklist.empty() && "Didn't clear worklist!");
+
+ BasicBlock *ExitBB = OrderedClonedExitsInLoops.pop_back_val();
+ Loop *ExitL = ExitLoopMap.lookup(ExitBB);
+
+ // Walk the CFG back until we hit the cloned PH adding everything reachable
+ // and in the unlooped set to this exit block's loop.
+ Worklist.push_back(ExitBB);
+ do {
+ BasicBlock *BB = Worklist.pop_back_val();
+ // We can stop recursing at the cloned preheader (if we get there).
+ if (BB == ClonedPH)
+ continue;
+
+ for (BasicBlock *PredBB : predecessors(BB)) {
+ // If this pred has already been moved to our set or is part of some
+ // (inner) loop, no update needed.
+ if (!UnloopedBlockSet.erase(PredBB)) {
+ assert(
+ (BlocksInClonedLoop.count(PredBB) || ExitLoopMap.count(PredBB)) &&
+ "Predecessor not mapped to a loop!");
+ continue;
+ }
+
+ // We just insert into the loop set here. We'll add these blocks to the
+ // exit loop after we build up the set in an order that doesn't rely on
+ // predecessor order (which in turn relies on use list order).
+ bool Inserted = ExitLoopMap.insert({PredBB, ExitL}).second;
+ (void)Inserted;
+ assert(Inserted && "Should only visit an unlooped block once!");
+
+ // And recurse through to its predecessors.
+ Worklist.push_back(PredBB);
+ }
+ } while (!Worklist.empty());
+ }
+
+  // Now that the ExitLoopMap gives us a mapping for all the non-looping cloned
+ // blocks to their outer loops, walk the cloned blocks and the cloned exits
+ // in their original order adding them to the correct loop.
+
+  // We need a stable insertion order. We use the original loop's block order
+  // and map each block into the correct parent loop.
+ for (auto *BB : llvm::concat<BasicBlock *const>(
+ makeArrayRef(ClonedPH), ClonedLoopBlocks, ClonedExitsInLoops))
+ if (Loop *OuterL = ExitLoopMap.lookup(BB))
+ OuterL->addBasicBlockToLoop(BB, LI);
+
+#ifndef NDEBUG
+ for (auto &BBAndL : ExitLoopMap) {
+ auto *BB = BBAndL.first;
+ auto *OuterL = BBAndL.second;
+ assert(LI.getLoopFor(BB) == OuterL &&
+ "Failed to put all blocks into outer loops!");
+ }
+#endif
+
+ // Now that all the blocks are placed into the correct containing loop in the
+ // absence of child loops, find all the potentially cloned child loops and
+ // clone them into whatever outer loop we placed their header into.
+ for (Loop *ChildL : OrigL) {
+ auto *ClonedChildHeader =
+ cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
+ if (!ClonedChildHeader || BlocksInClonedLoop.count(ClonedChildHeader))
+ continue;
+
+#ifndef NDEBUG
+ for (auto *ChildLoopBB : ChildL->blocks())
+ assert(VMap.count(ChildLoopBB) &&
+             "Cloned a child loop header but not all of that loop's blocks!");
+#endif
+
+ NonChildClonedLoops.push_back(cloneLoopNest(
+ *ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI));
+ }
+
+ // Return the main cloned loop if any.
+ return ClonedL;
+}
+
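+/// Delete the blocks dominated by \p DeadSubtreeRoot that have become dead
+/// due to unswitching, removing them from \p L and its parent loops, from
+/// \p ExitBlocks, from the dominator tree, and from LoopInfo.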
+static void deleteDeadBlocksFromLoop(Loop &L, BasicBlock *DeadSubtreeRoot,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ DominatorTree &DT, LoopInfo &LI) {
+ // Walk the dominator tree to build up the set of blocks we will delete here.
+ // The order is designed to allow us to always delete bottom-up and avoid any
+ // dangling uses.
+ SmallSetVector<BasicBlock *, 16> DeadBlocks;
+ DeadBlocks.insert(DeadSubtreeRoot);
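+  // Note that we iterate by index because the set vector grows as we walk the
+  // dominator subtree below.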
+ for (int i = 0; i < (int)DeadBlocks.size(); ++i)
+ for (DomTreeNode *ChildN : *DT[DeadBlocks[i]]) {
+ // FIXME: This assert should pass and that means we don't change nearly
+ // as much below! Consider rewriting all of this to avoid deleting
+ // blocks. They are always cloned before being deleted, and so instead
+ // could just be moved.
+ // FIXME: This in turn means that we might actually be more able to
+ // update the domtree.
+ assert((L.contains(ChildN->getBlock()) ||
+ llvm::find(ExitBlocks, ChildN->getBlock()) != ExitBlocks.end()) &&
+ "Should never reach beyond the loop and exits when deleting!");
+ DeadBlocks.insert(ChildN->getBlock());
+ }
+
+ // Filter out the dead blocks from the exit blocks list so that it can be
+ // used in the caller.
+ llvm::erase_if(ExitBlocks,
+ [&](BasicBlock *BB) { return DeadBlocks.count(BB); });
+
+ // Remove these blocks from their successors.
+ for (auto *BB : DeadBlocks)
+ for (BasicBlock *SuccBB : successors(BB))
+ SuccBB->removePredecessor(BB, /*DontDeleteUselessPHIs*/ true);
+
+ // Walk from this loop up through its parents removing all of the dead blocks.
+ for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) {
+ for (auto *BB : DeadBlocks)
+ ParentL->getBlocksSet().erase(BB);
+ llvm::erase_if(ParentL->getBlocksVector(),
+ [&](BasicBlock *BB) { return DeadBlocks.count(BB); });
+ }
+
+ // Now delete the dead child loops. This raw delete will clear them
+ // recursively.
+ llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) {
+ if (!DeadBlocks.count(ChildL->getHeader()))
+ return false;
+
+ assert(llvm::all_of(ChildL->blocks(),
+ [&](BasicBlock *ChildBB) {
+ return DeadBlocks.count(ChildBB);
+ }) &&
+ "If the child loop header is dead all blocks in the child loop must "
+ "be dead as well!");
+ LI.destroy(ChildL);
+ return true;
+ });
+
+ // Remove the mappings for the dead blocks.
+ for (auto *BB : DeadBlocks)
+ LI.changeLoopFor(BB, nullptr);
+
+ // Drop all the references from these blocks to others to handle cyclic
+ // references as we start deleting the blocks themselves.
+ for (auto *BB : DeadBlocks)
+ BB->dropAllReferences();
+
+ for (auto *BB : llvm::reverse(DeadBlocks)) {
+ DT.eraseNode(BB);
+ BB->eraseFromParent();
+ }
+}
+
+/// Recompute the set of blocks in a loop after unswitching.
+///
+/// This walks from the original header's predecessors to rebuild the loop. We
+/// take advantage of the fact that new blocks can't have been added, and so we
+/// filter by the original loop's blocks. This also handles potentially
+/// unreachable code that we don't want to explore but might be found examining
+/// the predecessors of the header.
+///
+/// If the original loop is no longer a loop, this will return an empty set. If
+/// it remains a loop, all the blocks within it will be added to the set
+/// (including those blocks in inner loops).
+static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
+ LoopInfo &LI) {
+ SmallPtrSet<const BasicBlock *, 16> LoopBlockSet;
+
+ auto *PH = L.getLoopPreheader();
+ auto *Header = L.getHeader();
+
+ // A worklist to use while walking backwards from the header.
+ SmallVector<BasicBlock *, 16> Worklist;
+
+ // First walk the predecessors of the header to find the backedges. This will
+ // form the basis of our walk.
+ for (auto *Pred : predecessors(Header)) {
+ // Skip the preheader.
+ if (Pred == PH)
+ continue;
+
+ // Because the loop was in simplified form, the only non-loop predecessor
+ // is the preheader.
+ assert(L.contains(Pred) && "Found a predecessor of the loop header other "
+ "than the preheader that is not part of the "
+ "loop!");
+
+    // Insert this block into the loop set, and on the first visit (if it
+    // isn't the header we're currently walking) put it into the worklist to
+ // recurse through.
+ if (LoopBlockSet.insert(Pred).second && Pred != Header)
+ Worklist.push_back(Pred);
+ }
+
+ // If no backedges were found, we're done.
+ if (LoopBlockSet.empty())
+ return LoopBlockSet;
+
+ // Add the loop header to the set.
+ LoopBlockSet.insert(Header);
+
+ // We found backedges, recurse through them to identify the loop blocks.
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.pop_back_val();
+ assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!");
+
+ // Because we know the inner loop structure remains valid we can use the
+ // loop structure to jump immediately across the entire nested loop.
+ // Further, because it is in loop simplified form, we can directly jump
+ // to its preheader afterward.
+ if (Loop *InnerL = LI.getLoopFor(BB))
+ if (InnerL != &L) {
+ assert(L.contains(InnerL) &&
+ "Should not reach a loop *outside* this loop!");
+ // The preheader is the only possible predecessor of the loop so
+ // insert it into the set and check whether it was already handled.
+ auto *InnerPH = InnerL->getLoopPreheader();
+ assert(L.contains(InnerPH) && "Cannot contain an inner loop block "
+ "but not contain the inner loop "
+ "preheader!");
+ if (!LoopBlockSet.insert(InnerPH).second)
+ // The only way to reach the preheader is through the loop body
+ // itself so if it has been visited the loop is already handled.
+ continue;
+
+ // Insert all of the blocks (other than those already present) into
+ // the loop set. The only block we expect to already be in the set is
+ // the one we used to find this loop as we immediately handle the
+ // others the first time we encounter the loop.
+ for (auto *InnerBB : InnerL->blocks()) {
+ if (InnerBB == BB) {
+ assert(LoopBlockSet.count(InnerBB) &&
+ "Block should already be in the set!");
+ continue;
+ }
+
+ bool Inserted = LoopBlockSet.insert(InnerBB).second;
+ (void)Inserted;
+ assert(Inserted && "Should only insert an inner loop once!");
+ }
+
+ // Add the preheader to the worklist so we will continue past the
+ // loop body.
+ Worklist.push_back(InnerPH);
+ continue;
+ }
+
+ // Insert any predecessors that were in the original loop into the new
+ // set, and if the insert is successful, add them to the worklist.
+ for (auto *Pred : predecessors(BB))
+ if (L.contains(Pred) && LoopBlockSet.insert(Pred).second)
+ Worklist.push_back(Pred);
+ }
+
+ // We've found all the blocks participating in the loop, return our completed
+ // set.
+ return LoopBlockSet;
+}
+
+/// Rebuild a loop after unswitching removes some subset of blocks and edges.
+///
+/// The removal may have removed some child loops entirely but cannot have
+/// disturbed any remaining child loops. However, they may need to be hoisted
+/// to the parent loop (or to be top-level loops). The original loop may be
+/// completely removed.
+///
+/// The sibling loops resulting from this update are returned. If the original
+/// loop remains a valid loop, it will be the first entry in this list with all
+/// of the new sibling loops following it.
+///
+/// Returns true if the loop remains a loop after unswitching, and false if it
+/// is no longer a loop after unswitching (and should not continue to be
+/// referenced).
+static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
+ LoopInfo &LI,
+ SmallVectorImpl<Loop *> &HoistedLoops) {
+ auto *PH = L.getLoopPreheader();
+
+ // Compute the actual parent loop from the exit blocks. Because we may have
+ // pruned some exits the loop may be different from the original parent.
+ Loop *ParentL = nullptr;
+ SmallVector<Loop *, 4> ExitLoops;
+ SmallVector<BasicBlock *, 4> ExitsInLoops;
+ ExitsInLoops.reserve(ExitBlocks.size());
+ for (auto *ExitBB : ExitBlocks)
+ if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
+ ExitLoops.push_back(ExitL);
+ ExitsInLoops.push_back(ExitBB);
+ if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
+ ParentL = ExitL;
+ }
+
+ // Recompute the blocks participating in this loop. This may be empty if it
+ // is no longer a loop.
+ auto LoopBlockSet = recomputeLoopBlockSet(L, LI);
+
+  // If we still have a loop, we need to re-set the loop's parent, as changing
+  // the exit block set may have moved it within the loop nest. Note that this
+ // can only happen when this loop has a parent as it can only hoist the loop
+ // *up* the nest.
+ if (!LoopBlockSet.empty() && L.getParentLoop() != ParentL) {
+ // Remove this loop's (original) blocks from all of the intervening loops.
+ for (Loop *IL = L.getParentLoop(); IL != ParentL;
+ IL = IL->getParentLoop()) {
+ IL->getBlocksSet().erase(PH);
+ for (auto *BB : L.blocks())
+ IL->getBlocksSet().erase(BB);
+ llvm::erase_if(IL->getBlocksVector(), [&](BasicBlock *BB) {
+ return BB == PH || L.contains(BB);
+ });
+ }
+
+ LI.changeLoopFor(PH, ParentL);
+ L.getParentLoop()->removeChildLoop(&L);
+ if (ParentL)
+ ParentL->addChildLoop(&L);
+ else
+ LI.addTopLevelLoop(&L);
+ }
+
+ // Now we update all the blocks which are no longer within the loop.
+ auto &Blocks = L.getBlocksVector();
+ auto BlocksSplitI =
+ LoopBlockSet.empty()
+ ? Blocks.begin()
+ : std::stable_partition(
+ Blocks.begin(), Blocks.end(),
+ [&](BasicBlock *BB) { return LoopBlockSet.count(BB); });
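+  // After the partition, [Blocks.begin(), BlocksSplitI) holds the blocks that
+  // remain in the loop and [BlocksSplitI, Blocks.end()) holds those that
+  // don't.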
+
+ // Before we erase the list of unlooped blocks, build a set of them.
+ SmallPtrSet<BasicBlock *, 16> UnloopedBlocks(BlocksSplitI, Blocks.end());
+ if (LoopBlockSet.empty())
+ UnloopedBlocks.insert(PH);
+
+ // Now erase these blocks from the loop.
+ for (auto *BB : make_range(BlocksSplitI, Blocks.end()))
+ L.getBlocksSet().erase(BB);
+ Blocks.erase(BlocksSplitI, Blocks.end());
+
+ // Sort the exits in ascending loop depth, we'll work backwards across these
+ // to process them inside out.
+ std::stable_sort(ExitsInLoops.begin(), ExitsInLoops.end(),
+ [&](BasicBlock *LHS, BasicBlock *RHS) {
+ return LI.getLoopDepth(LHS) < LI.getLoopDepth(RHS);
+ });
+
+ // We'll build up a set for each exit loop.
+ SmallPtrSet<BasicBlock *, 16> NewExitLoopBlocks;
+ Loop *PrevExitL = L.getParentLoop(); // The deepest possible exit loop.
+
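+  // Helper to strip a set of now-unlooped blocks out of a loop's block vector
+  // and block set.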
+ auto RemoveUnloopedBlocksFromLoop =
+ [](Loop &L, SmallPtrSetImpl<BasicBlock *> &UnloopedBlocks) {
+ for (auto *BB : UnloopedBlocks)
+ L.getBlocksSet().erase(BB);
+ llvm::erase_if(L.getBlocksVector(), [&](BasicBlock *BB) {
+ return UnloopedBlocks.count(BB);
+ });
+ };
+
+ SmallVector<BasicBlock *, 16> Worklist;
+ while (!UnloopedBlocks.empty() && !ExitsInLoops.empty()) {
+ assert(Worklist.empty() && "Didn't clear worklist!");
+ assert(NewExitLoopBlocks.empty() && "Didn't clear loop set!");
+
+ // Grab the next exit block, in decreasing loop depth order.
+ BasicBlock *ExitBB = ExitsInLoops.pop_back_val();
+ Loop &ExitL = *LI.getLoopFor(ExitBB);
+ assert(ExitL.contains(&L) && "Exit loop must contain the inner loop!");
+
+ // Erase all of the unlooped blocks from the loops between the previous
+    // exit loop and this exit loop. This works because the ExitsInLoops list is
+ // sorted in increasing order of loop depth and thus we visit loops in
+ // decreasing order of loop depth.
+ for (; PrevExitL != &ExitL; PrevExitL = PrevExitL->getParentLoop())
+ RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
+
+    // Walk the CFG back until we hit the loop preheader, adding everything
+    // reachable and in the unlooped set to this exit block's loop.
+ Worklist.push_back(ExitBB);
+ do {
+ BasicBlock *BB = Worklist.pop_back_val();
+      // We can stop recursing at the loop preheader (if we get there).
+ if (BB == PH)
+ continue;
+
+ for (BasicBlock *PredBB : predecessors(BB)) {
+ // If this pred has already been moved to our set or is part of some
+ // (inner) loop, no update needed.
+ if (!UnloopedBlocks.erase(PredBB)) {
+ assert((NewExitLoopBlocks.count(PredBB) ||
+ ExitL.contains(LI.getLoopFor(PredBB))) &&
+ "Predecessor not in a nested loop (or already visited)!");
+ continue;
+ }
+
+ // We just insert into the loop set here. We'll add these blocks to the
+ // exit loop after we build up the set in a deterministic order rather
+ // than the predecessor-influenced visit order.
+ bool Inserted = NewExitLoopBlocks.insert(PredBB).second;
+ (void)Inserted;
+ assert(Inserted && "Should only visit an unlooped block once!");
+
+ // And recurse through to its predecessors.
+ Worklist.push_back(PredBB);
+ }
+ } while (!Worklist.empty());
+
+ // If blocks in this exit loop were directly part of the original loop (as
+ // opposed to a child loop) update the map to point to this exit loop. This
+ // just updates a map and so the fact that the order is unstable is fine.
+ for (auto *BB : NewExitLoopBlocks)
+ if (Loop *BBL = LI.getLoopFor(BB))
+ if (BBL == &L || !L.contains(BBL))
+ LI.changeLoopFor(BB, &ExitL);
+
+ // We will remove the remaining unlooped blocks from this loop in the next
+ // iteration or below.
+ NewExitLoopBlocks.clear();
+ }
+
+ // Any remaining unlooped blocks are no longer part of any loop unless they
+ // are part of some child loop.
+ for (; PrevExitL; PrevExitL = PrevExitL->getParentLoop())
+ RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
+ for (auto *BB : UnloopedBlocks)
+ if (Loop *BBL = LI.getLoopFor(BB))
+ if (BBL == &L || !L.contains(BBL))
+ LI.changeLoopFor(BB, nullptr);
+
+ // Sink all the child loops whose headers are no longer in the loop set to
+ // the parent (or to be top level loops). We reach into the loop and directly
+ // update its subloop vector to make this batch update efficient.
+ auto &SubLoops = L.getSubLoopsVector();
+ auto SubLoopsSplitI =
+ LoopBlockSet.empty()
+ ? SubLoops.begin()
+ : std::stable_partition(
+ SubLoops.begin(), SubLoops.end(), [&](Loop *SubL) {
+ return LoopBlockSet.count(SubL->getHeader());
+ });
+ for (auto *HoistedL : make_range(SubLoopsSplitI, SubLoops.end())) {
+ HoistedLoops.push_back(HoistedL);
+ HoistedL->setParentLoop(nullptr);
+
+ // To compute the new parent of this hoisted loop we look at where we
+ // placed the preheader above. We can't lookup the header itself because we
+ // retained the mapping from the header to the hoisted loop. But the
+ // preheader and header should have the exact same new parent computed
+    // based on the set of exit blocks from the original loop, as the preheader
+    // is a predecessor of the header and so is reached in the reverse walk. And
+ // because the loops were all in simplified form the preheader of the
+ // hoisted loop can't be part of some *other* loop.
+ if (auto *NewParentL = LI.getLoopFor(HoistedL->getLoopPreheader()))
+ NewParentL->addChildLoop(HoistedL);
+ else
+ LI.addTopLevelLoop(HoistedL);
+ }
+ SubLoops.erase(SubLoopsSplitI, SubLoops.end());
+
+ // Actually delete the loop if nothing remained within it.
+ if (Blocks.empty()) {
+ assert(SubLoops.empty() &&
+ "Failed to remove all subloops from the original loop!");
+ if (Loop *ParentL = L.getParentLoop())
+ ParentL->removeChildLoop(llvm::find(*ParentL, &L));
+ else
+ LI.removeLoop(llvm::find(LI, &L));
+ LI.destroy(&L);
+ return false;
+ }
+
+ return true;
+}
+
+/// Helper to visit a dominator subtree, invoking a callable on each node.
+///
+/// Returning false at any point will stop walking past that node of the tree.
+template <typename CallableT>
+void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
+ SmallVector<DomTreeNode *, 4> DomWorklist;
+ DomWorklist.push_back(DT[BB]);
+#ifndef NDEBUG
+ SmallPtrSet<DomTreeNode *, 4> Visited;
+ Visited.insert(DT[BB]);
+#endif
+ do {
+ DomTreeNode *N = DomWorklist.pop_back_val();
+
+ // Visit this node.
+ if (!Callable(N->getBlock()))
+ continue;
+
+ // Accumulate the child nodes.
+ for (DomTreeNode *ChildN : *N) {
+ assert(Visited.insert(ChildN).second &&
+ "Cannot visit a node twice when walking a tree!");
+ DomWorklist.push_back(ChildN);
+ }
+ } while (!DomWorklist.empty());
+}
+
+/// Take an invariant branch that has been determined to be safe and worthwhile
+/// to unswitch despite being non-trivial to do so and perform the unswitch.
+///
+/// This directly updates the CFG to hoist the predicate out of the loop, and
+/// clone the necessary parts of the loop to maintain behavior.
+///
+/// It also updates both dominator tree and loopinfo based on the unswitching.
+///
+/// Once unswitching has been performed it runs the provided callback to report
+/// the new loops and no-longer valid loops to the caller.
+static bool unswitchInvariantBranch(
+ Loop &L, BranchInst &BI, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC,
+ function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
+ assert(BI.isConditional() && "Can only unswitch a conditional branch!");
+ assert(L.isLoopInvariant(BI.getCondition()) &&
+ "Can only unswitch an invariant branch condition!");
+
+ // Constant and BBs tracking the cloned and continuing successor.
+ const int ClonedSucc = 0;
+ auto *ParentBB = BI.getParent();
+ auto *UnswitchedSuccBB = BI.getSuccessor(ClonedSucc);
+ auto *ContinueSuccBB = BI.getSuccessor(1 - ClonedSucc);
+
+ assert(UnswitchedSuccBB != ContinueSuccBB &&
+ "Should not unswitch a branch that always goes to the same place!");
+
+ // The branch should be in this exact loop. Any inner loop's invariant branch
+ // should be handled by unswitching that inner loop. The caller of this
+ // routine should filter out any candidates that remain (but were skipped for
+ // whatever reason).
+ assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!");
+
+ SmallVector<BasicBlock *, 4> ExitBlocks;
+ L.getUniqueExitBlocks(ExitBlocks);
+
+ // We cannot unswitch if exit blocks contain a cleanuppad instruction as we
+ // don't know how to split those exit blocks.
+ // FIXME: We should teach SplitBlock to handle this and remove this
+ // restriction.
+ for (auto *ExitBB : ExitBlocks)
+ if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI()))
+ return false;
+
+ SmallPtrSet<BasicBlock *, 4> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+
+ // Compute the parent loop now before we start hacking on things.
+ Loop *ParentL = L.getParentLoop();
+
+ // Compute the outer-most loop containing one of our exit blocks. This is the
+ // furthest up our loopnest which can be mutated, which we will use below to
+ // update things.
+ Loop *OuterExitL = &L;
+ for (auto *ExitBB : ExitBlocks) {
+ Loop *NewOuterExitL = LI.getLoopFor(ExitBB);
+ if (!NewOuterExitL) {
+ // We exited the entire nest with this block, so we're done.
+ OuterExitL = nullptr;
+ break;
+ }
+ if (NewOuterExitL != OuterExitL && NewOuterExitL->contains(OuterExitL))
+ OuterExitL = NewOuterExitL;
+ }
+
+ // If the edge we *aren't* cloning in the unswitch (the continuing edge)
+ // dominates its target, we can skip cloning the dominated region of the loop
+ // and its exits. We compute this as a set of nodes to be skipped.
+ SmallPtrSet<BasicBlock *, 4> SkippedLoopAndExitBlocks;
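+  // The continue edge dominates its target exactly when every other
+  // predecessor of the target is itself dominated by the target.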
+ if (ContinueSuccBB->getUniquePredecessor() ||
+ llvm::all_of(predecessors(ContinueSuccBB), [&](BasicBlock *PredBB) {
+ return PredBB == ParentBB || DT.dominates(ContinueSuccBB, PredBB);
+ })) {
+ visitDomSubTree(DT, ContinueSuccBB, [&](BasicBlock *BB) {
+ SkippedLoopAndExitBlocks.insert(BB);
+ return true;
+ });
+ }
+ // Similarly, if the edge we *are* cloning in the unswitch (the unswitched
+ // edge) dominates its target, we will end up with dead nodes in the original
+  // loop and its exits that will need to be deleted. Here, we just record
+  // whether the property holds and will compute the deleted set later.
+ bool DeleteUnswitchedSucc =
+ UnswitchedSuccBB->getUniquePredecessor() ||
+ llvm::all_of(predecessors(UnswitchedSuccBB), [&](BasicBlock *PredBB) {
+ return PredBB == ParentBB || DT.dominates(UnswitchedSuccBB, PredBB);
+ });
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the conditional branch. We will change the preheader to have a conditional
+ // branch on LoopCond. The original preheader will become the split point
+ // between the unswitched versions, and we will have a new preheader for the
+ // original loop.
+ BasicBlock *SplitBB = L.getLoopPreheader();
+ BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI);
+
+ // Keep a mapping for the cloned values.
+ ValueToValueMapTy VMap;
+
+ // Build the cloned blocks from the loop.
+ auto *ClonedPH = buildClonedLoopBlocks(
+ L, LoopPH, SplitBB, ExitBlocks, ParentBB, UnswitchedSuccBB,
+ ContinueSuccBB, SkippedLoopAndExitBlocks, VMap, AC, DT, LI);
+
+ // Build the cloned loop structure itself. This may be substantially
+ // different from the original structure due to the simplified CFG. This also
+ // handles inserting all the cloned blocks into the correct loops.
+ SmallVector<Loop *, 4> NonChildClonedLoops;
+ Loop *ClonedL =
+ buildClonedLoops(L, ExitBlocks, VMap, LI, NonChildClonedLoops);
+
+ // Remove the parent as a predecessor of the unswitched successor.
+ UnswitchedSuccBB->removePredecessor(ParentBB, /*DontDeleteUselessPHIs*/ true);
+
+ // Now splice the branch from the original loop and use it to select between
+ // the two loops.
+ SplitBB->getTerminator()->eraseFromParent();
+ SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), BI);
+ BI.setSuccessor(ClonedSucc, ClonedPH);
+ BI.setSuccessor(1 - ClonedSucc, LoopPH);
+
+ // Create a new unconditional branch to the continuing block (as opposed to
+ // the one cloned).
+ BranchInst::Create(ContinueSuccBB, ParentBB);
+
+ // Delete anything that was made dead in the original loop due to
+ // unswitching.
+ if (DeleteUnswitchedSucc)
+ deleteDeadBlocksFromLoop(L, UnswitchedSuccBB, ExitBlocks, DT, LI);
+
+ SmallVector<Loop *, 4> HoistedLoops;
+ bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops);
+
+ // This will have completely invalidated the dominator tree. We can't easily
+ // bound how much is invalid because in some cases we will refine the
+ // predecessor set of exit blocks of the loop which can move large unrelated
+ // regions of code into a new subtree.
+ //
+ // FIXME: Eventually, we should use an incremental update utility that
+ // leverages the existing information in the dominator tree (and potentially
+ // the nature of the change) to more efficiently update things.
+ DT.recalculate(*SplitBB->getParent());
+
+ // We can change which blocks are exit blocks of all the cloned sibling
+ // loops, the current loop, and any parent loops which shared exit blocks
+ // with the current loop. As a consequence, we need to re-form LCSSA for
+ // them. But we shouldn't need to re-form LCSSA for any child loops.
+ // FIXME: This could be made more efficient by tracking which exit blocks are
+ // new, and focusing on them, but that isn't likely to be necessary.
+ //
+ // In order to reasonably rebuild LCSSA we need to walk inside-out across the
+ // loop nest and update every loop that could have had its exits changed. We
+  // also need to cover any intervening loops. We build a list of these loops
+  // ordered from inside out so that each loop is updated before any loop that
+  // encloses it, without touching unnecessary loops.
+ auto UpdateLCSSA = [&](Loop &UpdateL) {
+#ifndef NDEBUG
+ for (Loop *ChildL : UpdateL)
+ assert(ChildL->isRecursivelyLCSSAForm(DT, LI) &&
+ "Perturbed a child loop's LCSSA form!");
+#endif
+ formLCSSA(UpdateL, DT, &LI, nullptr);
+ };
+
+ // For non-child cloned loops and hoisted loops, we just need to update LCSSA
+ // and we can do it in any order as they don't nest relative to each other.
+ for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
+ UpdateLCSSA(*UpdatedL);
+
+  // If the original loop had exit blocks, walk up through the outermost loop
+ // of those exit blocks to update LCSSA and form updated dedicated exits.
+ if (OuterExitL != &L) {
+ SmallVector<Loop *, 4> OuterLoops;
+ // We start with the cloned loop and the current loop if they are loops and
+ // move toward OuterExitL. Also, if either the cloned loop or the current
+    // loop has become a top-level loop, we need to walk all the way out.
+ if (ClonedL) {
+ OuterLoops.push_back(ClonedL);
+ if (!ClonedL->getParentLoop())
+ OuterExitL = nullptr;
+ }
+ if (IsStillLoop) {
+ OuterLoops.push_back(&L);
+ if (!L.getParentLoop())
+ OuterExitL = nullptr;
+ }
+ // Grab all of the enclosing loops now.
+ for (Loop *OuterL = ParentL; OuterL != OuterExitL;
+ OuterL = OuterL->getParentLoop())
+ OuterLoops.push_back(OuterL);
+
+ // Finally, update our list of outer loops. This is nicely ordered to work
+ // inside-out.
+ for (Loop *OuterL : OuterLoops) {
+ // First build LCSSA for this loop so that we can preserve it when
+ // forming dedicated exits. We don't want to perturb some other loop's
+ // LCSSA while doing that CFG edit.
+ UpdateLCSSA(*OuterL);
+
+      // For loops reached by this loop's original exit blocks we may have
+      // introduced new, non-dedicated exits. At least try to re-form dedicated
+ // exits for these loops. This may fail if they couldn't have dedicated
+ // exits to start with.
+ formDedicatedExitBlocks(OuterL, &DT, &LI, /*PreserveLCSSA*/ true);
+ }
+ }
+
+#ifndef NDEBUG
+ // Verify the entire loop structure to catch any incorrect updates before we
+ // progress in the pass pipeline.
+ LI.verify(DT);
+#endif
+
+ // Now that we've unswitched something, make callbacks to report the changes.
+ // For that we need to merge together the updated loops and the cloned loops
+ // and check whether the original loop survived.
+ SmallVector<Loop *, 4> SibLoops;
+ for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
+ if (UpdatedL->getParentLoop() == ParentL)
+ SibLoops.push_back(UpdatedL);
+ NonTrivialUnswitchCB(IsStillLoop, SibLoops);
+
+ ++NumBranches;
+ return true;
+}
+
+/// Recursively compute the cost of a dominator subtree based on the per-block
+/// cost map provided.
+///
+/// The recursive computation is memoized into the provided DT-indexed cost map
+/// to allow querying it for most nodes in the domtree without it becoming
+/// quadratic.
+static int
+computeDomSubtreeCost(DomTreeNode &N,
+ const SmallDenseMap<BasicBlock *, int, 4> &BBCostMap,
+ SmallDenseMap<DomTreeNode *, int, 4> &DTCostMap) {
+ // Don't accumulate cost (or recurse through) blocks not in our block cost
+ // map and thus not part of the duplication cost being considered.
+ auto BBCostIt = BBCostMap.find(N.getBlock());
+ if (BBCostIt == BBCostMap.end())
+ return 0;
+
+ // Lookup this node to see if we already computed its cost.
+ auto DTCostIt = DTCostMap.find(&N);
+ if (DTCostIt != DTCostMap.end())
+ return DTCostIt->second;
+
+  // If not, we have to compute it. We can't insert a placeholder above and
+  // update it afterward because computing the cost may insert more entries
+  // into the map.
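+  // Sum this block's own cost with the recursively computed cost of each
+  // child subtree in the dominator tree.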
+ int Cost = std::accumulate(
+ N.begin(), N.end(), BBCostIt->second, [&](int Sum, DomTreeNode *ChildN) {
+ return Sum + computeDomSubtreeCost(*ChildN, BBCostMap, DTCostMap);
+ });
+ bool Inserted = DTCostMap.insert({&N, Cost}).second;
+ (void)Inserted;
+ assert(Inserted && "Should not insert a node while visiting children!");
+ return Cost;
+}
+
+/// Unswitch control flow predicated on loop invariant conditions.
+///
+/// This first hoists all branches or switches which are trivial (i.e., do not
+/// require duplicating any part of the loop) out of the loop body. It then
+/// looks at other loop invariant control flows and tries to unswitch those as
+/// well by cloning the loop if the result is small enough.
+static bool
+unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
+ TargetTransformInfo &TTI, bool NonTrivial,
+ function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
+ assert(L.isRecursivelyLCSSAForm(DT, LI) &&
+ "Loops must be in LCSSA form before unswitching.");
+ bool Changed = false;
+
+ // Must be in loop simplified form: we need a preheader and dedicated exits.
+ if (!L.isLoopSimplifyForm())
+ return false;
+
+  // Try trivial unswitching first, before looping over the other basic blocks
+  // in the loop.
+ Changed |= unswitchAllTrivialConditions(L, DT, LI);
+
+ // If we're not doing non-trivial unswitching, we're done. We both accept
+  // a parameter and check a local flag that can be used for testing and
+  // debugging.
+ if (!NonTrivial && !EnableNonTrivialUnswitch)
+ return Changed;
+
+ // Collect all remaining invariant branch conditions within this loop (as
+ // opposed to an inner loop which would be handled when visiting that inner
+ // loop).
+ SmallVector<TerminatorInst *, 4> UnswitchCandidates;
+ for (auto *BB : L.blocks())
+ if (LI.getLoopFor(BB) == &L)
+ if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+ if (BI->isConditional() && L.isLoopInvariant(BI->getCondition()) &&
+ BI->getSuccessor(0) != BI->getSuccessor(1))
+ UnswitchCandidates.push_back(BI);
+
+ // If we didn't find any candidates, we're done.
+ if (UnswitchCandidates.empty())
+ return Changed;
+
+ DEBUG(dbgs() << "Considering " << UnswitchCandidates.size()
+ << " non-trivial loop invariant conditions for unswitching.\n");
+
+  // Unswitching these terminators will require duplicating parts of the loop,
+  // so we need to be able to model that cost. Compute the ephemeral
+ // values and set up a data structure to hold per-BB costs. We cache each
+ // block's cost so that we don't recompute this when considering different
+ // subsets of the loop for duplication during unswitching.
+ SmallPtrSet<const Value *, 4> EphValues;
+ CodeMetrics::collectEphemeralValues(&L, &AC, EphValues);
+ SmallDenseMap<BasicBlock *, int, 4> BBCostMap;
+
+ // Compute the cost of each block, as well as the total loop cost. Also, bail
+ // out if we see instructions which are incompatible with loop unswitching
+ // (convergent, noduplicate, or cross-basic-block tokens).
+ // FIXME: We might be able to safely handle some of these in non-duplicated
+ // regions.
+ int LoopCost = 0;
+ for (auto *BB : L.blocks()) {
+ int Cost = 0;
+ for (auto &I : *BB) {
+ if (EphValues.count(&I))
+ continue;
+
+ if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
+ return Changed;
+ if (auto CS = CallSite(&I))
+ if (CS.isConvergent() || CS.cannotDuplicate())
+ return Changed;
+
+ Cost += TTI.getUserCost(&I);
+ }
+ assert(Cost >= 0 && "Must not have negative costs!");
+ LoopCost += Cost;
+ assert(LoopCost >= 0 && "Must not have negative loop costs!");
+ BBCostMap[BB] = Cost;
+ }
+ DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n");
+
+ // Now we find the best candidate by searching for the one with the following
+ // properties in order:
+ //
+ // 1) An unswitching cost below the threshold
+ // 2) The smallest number of duplicated unswitch candidates (to avoid
+ // creating redundant subsequent unswitching)
+ // 3) The smallest cost after unswitching.
+ //
+ // We prioritize reducing fanout of unswitch candidates provided the cost
+ // remains below the threshold because this has a multiplicative effect.
+ //
+ // This requires memoizing each dominator subtree to avoid redundant work.
+ //
+ // FIXME: Need to actually do the number of candidates part above.
+ SmallDenseMap<DomTreeNode *, int, 4> DTCostMap;
+  // Given a terminator which might be unswitched, compute the cost of the
+  // extra loop copies that unswitching it would require.
+ auto ComputeUnswitchedCost = [&](TerminatorInst *TI) {
+ BasicBlock &BB = *TI->getParent();
+ SmallPtrSet<BasicBlock *, 4> Visited;
+
+ int Cost = LoopCost;
+ for (BasicBlock *SuccBB : successors(&BB)) {
+ // Don't count successors more than once.
+ if (!Visited.insert(SuccBB).second)
+ continue;
+
+ // This successor's domtree will not need to be duplicated after
+ // unswitching if the edge to the successor dominates it (and thus the
+ // entire tree). This essentially means there is no other path into this
+ // subtree and so it will end up live in only one clone of the loop.
+ if (SuccBB->getUniquePredecessor() ||
+ llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
+ return PredBB == &BB || DT.dominates(SuccBB, PredBB);
+ })) {
+ Cost -= computeDomSubtreeCost(*DT[SuccBB], BBCostMap, DTCostMap);
+ assert(Cost >= 0 &&
+ "Non-duplicated cost should never exceed total loop cost!");
+ }
+ }
+
+ // Now scale the cost by the number of unique successors minus one. We
+ // subtract one because there is already at least one copy of the entire
+ // loop. This is computing the new cost of unswitching a condition.
+ assert(Visited.size() > 1 &&
+ "Cannot unswitch a condition without multiple distinct successors!");
+ return Cost * (Visited.size() - 1);
+ };
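+  // Illustrative example (not from the original comments): with LoopCost =
+  // 100 and a branch whose two successors each exclusively dominate regions
+  // costing 30 and 20, ComputeUnswitchedCost returns (100 - 30 - 20) * (2 - 1)
+  // = 50.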
+ TerminatorInst *BestUnswitchTI = nullptr;
+  int BestUnswitchCost = 0;
+ for (TerminatorInst *CandidateTI : UnswitchCandidates) {
+ int CandidateCost = ComputeUnswitchedCost(CandidateTI);
+ DEBUG(dbgs() << " Computed cost of " << CandidateCost
+ << " for unswitch candidate: " << *CandidateTI << "\n");
+ if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) {
+ BestUnswitchTI = CandidateTI;
+ BestUnswitchCost = CandidateCost;
+ }
+ }
+
+ if (BestUnswitchCost < UnswitchThreshold) {
+ DEBUG(dbgs() << " Trying to unswitch non-trivial (cost = "
+ << BestUnswitchCost << ") branch: " << *BestUnswitchTI
+ << "\n");
+ Changed |= unswitchInvariantBranch(L, cast<BranchInst>(*BestUnswitchTI), DT,
+ LI, AC, NonTrivialUnswitchCB);
+ } else {
+ DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << BestUnswitchCost
+ << "\n");
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ Function &F = *L.getHeader()->getParent();
+ (void)F;
+
+ DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n");
+
+ // Save the current loop name in a variable so that we can report it even
+ // after it has been deleted.
+ std::string LoopName = L.getName();
+
+ auto NonTrivialUnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
+ ArrayRef<Loop *> NewLoops) {
+ // If we did a non-trivial unswitch, we have added new (cloned) loops.
+ U.addSiblingLoops(NewLoops);
+
+ // If the current loop remains valid, we should revisit it to catch any
+ // other unswitch opportunities. Otherwise, we need to mark it as deleted.
+ if (CurrentLoopValid)
+ U.revisitCurrentLoop();
+ else
+ U.markLoopAsDeleted(L, LoopName);
+ };
+
+ if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial,
+ NonTrivialUnswitchCB))
+ return PreservedAnalyses::all();
+
+#ifndef NDEBUG
+ // Historically this pass has had issues with the dominator tree so verify it
+ // in asserts builds.
+ AR.DT.verifyDomTree();
+#endif
+ return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+
+class SimpleLoopUnswitchLegacyPass : public LoopPass {
+ bool NonTrivial;
+
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ explicit SimpleLoopUnswitchLegacyPass(bool NonTrivial = false)
+ : LoopPass(ID), NonTrivial(NonTrivial) {
+ initializeSimpleLoopUnswitchLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+
+ Function &F = *L->getHeader()->getParent();
+
+ DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L << "\n");
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ auto NonTrivialUnswitchCB = [&L, &LPM](bool CurrentLoopValid,
+ ArrayRef<Loop *> NewLoops) {
+ // If we did a non-trivial unswitch, we have added new (cloned) loops.
+ for (auto *NewL : NewLoops)
+ LPM.addLoop(*NewL);
+
+ // If the current loop remains valid, re-add it to the queue. This is
+ // a little wasteful as we'll finish processing the current loop as well,
+ // but it is the best we can do in the old PM.
+ if (CurrentLoopValid)
+ LPM.addLoop(*L);
+ else
+ LPM.markLoopAsDeleted(*L);
+ };
+
+ bool Changed =
+ unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, NonTrivialUnswitchCB);
+
+ // If anything was unswitched, also clear any cached information about this
+ // loop.
+ LPM.deleteSimpleAnalysisLoop(L);
+
+#ifndef NDEBUG
+ // Historically this pass has had issues with the dominator tree so verify it
+ // in asserts builds.
+ DT.verifyDomTree();
+#endif
+ return Changed;
+}
+
+char SimpleLoopUnswitchLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
+ "Simple unswitch loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
+ "Simple unswitch loops", false, false)
+
+Pass *llvm::createSimpleLoopUnswitchLegacyPass(bool NonTrivial) {
+ return new SimpleLoopUnswitchLegacyPass(NonTrivial);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
new file mode 100644
index 000000000000..1522170dc3b9
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -0,0 +1,296 @@
+//===- SimplifyCFGPass.cpp - CFG Simplification Pass ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead code elimination and basic block merging, along
+// with a collection of other peephole control flow optimizations. For example:
+//
+// * Removes basic blocks with no predecessors.
+// * Merges a basic block into its predecessor if there is only one and the
+// predecessor only has one successor.
+// * Eliminates PHI nodes for basic blocks with a single predecessor.
+// * Eliminates a basic block that only contains an unconditional branch.
+// * Changes invoke instructions to nounwind functions to be calls.
+// * Change things like "if (x) if (y)" into "if (x&y)".
+// * etc..
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "simplifycfg"
+
+static cl::opt<unsigned> UserBonusInstThreshold(
+ "bonus-inst-threshold", cl::Hidden, cl::init(1),
+ cl::desc("Control the number of bonus instructions (default = 1)"));
+
+static cl::opt<bool> UserKeepLoops(
+ "keep-loops", cl::Hidden, cl::init(true),
+ cl::desc("Preserve canonical loop structure (default = true)"));
+
+static cl::opt<bool> UserSwitchToLookup(
+ "switch-to-lookup", cl::Hidden, cl::init(false),
+ cl::desc("Convert switches to lookup tables (default = false)"));
+
+static cl::opt<bool> UserForwardSwitchCond(
+ "forward-switch-cond", cl::Hidden, cl::init(false),
+ cl::desc("Forward switch condition to phi ops (default = false)"));
+
+static cl::opt<bool> UserSinkCommonInsts(
+ "sink-common-insts", cl::Hidden, cl::init(false),
+ cl::desc("Sink common instructions (default = false)"));
+
+
+STATISTIC(NumSimpl, "Number of blocks simplified");
+
+/// If we have more than one empty (other than phi node) return block,
+/// merge them together to promote recursive block merging.
+static bool mergeEmptyReturnBlocks(Function &F) {
+ bool Changed = false;
+
+ BasicBlock *RetBlock = nullptr;
+
+ // Scan all the blocks in the function, looking for empty return blocks.
+ for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) {
+ BasicBlock &BB = *BBI++;
+
+ // Only look at return blocks.
+ ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!Ret) continue;
+
+ // Only look at the block if it is empty or the only other thing in it is a
+ // single PHI node that is the operand to the return.
+ if (Ret != &BB.front()) {
+ // Check for something else in the block.
+ BasicBlock::iterator I(Ret);
+ --I;
+ // Skip over debug info.
+ while (isa<DbgInfoIntrinsic>(I) && I != BB.begin())
+ --I;
+ if (!isa<DbgInfoIntrinsic>(I) &&
+ (!isa<PHINode>(I) || I != BB.begin() || Ret->getNumOperands() == 0 ||
+ Ret->getOperand(0) != &*I))
+ continue;
+ }
+
+ // If this is the first returning block, remember it and keep going.
+ if (!RetBlock) {
+ RetBlock = &BB;
+ continue;
+ }
+
+ // Otherwise, we found a duplicate return block. Merge the two.
+ Changed = true;
+
+    // The case where the return has no operand or where the returned values
+    // agree is trivial. Note that they can't agree if there are phis in the
+ // blocks.
+ if (Ret->getNumOperands() == 0 ||
+ Ret->getOperand(0) ==
+ cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) {
+ BB.replaceAllUsesWith(RetBlock);
+ BB.eraseFromParent();
+ continue;
+ }
+
+ // If the canonical return block has no PHI node, create one now.
+ PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin());
+ if (!RetBlockPHI) {
+ Value *InVal = cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0);
+ pred_iterator PB = pred_begin(RetBlock), PE = pred_end(RetBlock);
+ RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(),
+ std::distance(PB, PE), "merge",
+ &RetBlock->front());
+
+ for (pred_iterator PI = PB; PI != PE; ++PI)
+ RetBlockPHI->addIncoming(InVal, *PI);
+ RetBlock->getTerminator()->setOperand(0, RetBlockPHI);
+ }
+
+ // Turn BB into a block that just unconditionally branches to the return
+ // block. This handles the case when the two return blocks have a common
+    // predecessor but return different values.
+ RetBlockPHI->addIncoming(Ret->getOperand(0), &BB);
+ BB.getTerminator()->eraseFromParent();
+ BranchInst::Create(RetBlock, &BB);
+ }
+
+ return Changed;
+}
+
+/// Call SimplifyCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
+ const SimplifyCFGOptions &Options) {
+ bool Changed = false;
+ bool LocalChange = true;
+
+ SmallVector<std::pair<const BasicBlock *, const BasicBlock *>, 32> Edges;
+ FindFunctionBackedges(F, Edges);
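+  // Record every backedge destination; these loop headers are passed to
+  // simplifyCFG below so it can avoid transforms that would destroy the
+  // canonical loop structure.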
+ SmallPtrSet<BasicBlock *, 16> LoopHeaders;
+ for (unsigned i = 0, e = Edges.size(); i != e; ++i)
+ LoopHeaders.insert(const_cast<BasicBlock *>(Edges[i].second));
+
+ while (LocalChange) {
+ LocalChange = false;
+
+ // Loop over all of the basic blocks and remove them if they are unneeded.
+ for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
+ if (simplifyCFG(&*BBIt++, TTI, Options, &LoopHeaders)) {
+ LocalChange = true;
+ ++NumSimpl;
+ }
+ }
+ Changed |= LocalChange;
+ }
+ return Changed;
+}
+
+static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
+ const SimplifyCFGOptions &Options) {
+ bool EverChanged = removeUnreachableBlocks(F);
+ EverChanged |= mergeEmptyReturnBlocks(F);
+ EverChanged |= iterativelySimplifyCFG(F, TTI, Options);
+
+ // If neither pass changed anything, we're done.
+ if (!EverChanged) return false;
+
+ // iterativelySimplifyCFG can (rarely) make some loops dead. If this happens,
+ // removeUnreachableBlocks is needed to nuke them, which means we should
+ // iterate between the two optimizations. We structure the code like this to
+ // avoid rerunning iterativelySimplifyCFG if the second pass of
+ // removeUnreachableBlocks doesn't do anything.
+ if (!removeUnreachableBlocks(F))
+ return true;
+
+ do {
+ EverChanged = iterativelySimplifyCFG(F, TTI, Options);
+ EverChanged |= removeUnreachableBlocks(F);
+ } while (EverChanged);
+
+ return true;
+}
+
+// Command-line settings override compile-time settings.
+SimplifyCFGPass::SimplifyCFGPass(const SimplifyCFGOptions &Opts) {
+ Options.BonusInstThreshold = UserBonusInstThreshold.getNumOccurrences()
+ ? UserBonusInstThreshold
+ : Opts.BonusInstThreshold;
+ Options.ForwardSwitchCondToPhi = UserForwardSwitchCond.getNumOccurrences()
+ ? UserForwardSwitchCond
+ : Opts.ForwardSwitchCondToPhi;
+ Options.ConvertSwitchToLookupTable = UserSwitchToLookup.getNumOccurrences()
+ ? UserSwitchToLookup
+ : Opts.ConvertSwitchToLookupTable;
+ Options.NeedCanonicalLoop = UserKeepLoops.getNumOccurrences()
+ ? UserKeepLoops
+ : Opts.NeedCanonicalLoop;
+ Options.SinkCommonInsts = UserSinkCommonInsts.getNumOccurrences()
+ ? UserSinkCommonInsts
+ : Opts.SinkCommonInsts;
+}
+
+PreservedAnalyses SimplifyCFGPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ Options.AC = &AM.getResult<AssumptionAnalysis>(F);
+ if (!simplifyFunctionCFG(F, TTI, Options))
+ return PreservedAnalyses::all();
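+  // The CFG changed; conservatively report that only GlobalsAA is preserved.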
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+namespace {
+struct CFGSimplifyPass : public FunctionPass {
+ static char ID;
+ SimplifyCFGOptions Options;
+ std::function<bool(const Function &)> PredicateFtor;
+
+ CFGSimplifyPass(unsigned Threshold = 1, bool ForwardSwitchCond = false,
+ bool ConvertSwitch = false, bool KeepLoops = true,
+ bool SinkCommon = false,
+ std::function<bool(const Function &)> Ftor = nullptr)
+ : FunctionPass(ID), PredicateFtor(std::move(Ftor)) {
+
+ initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+
+ // Check for command-line overrides of options for debug/customization.
+ Options.BonusInstThreshold = UserBonusInstThreshold.getNumOccurrences()
+ ? UserBonusInstThreshold
+ : Threshold;
+
+ Options.ForwardSwitchCondToPhi = UserForwardSwitchCond.getNumOccurrences()
+ ? UserForwardSwitchCond
+ : ForwardSwitchCond;
+
+ Options.ConvertSwitchToLookupTable = UserSwitchToLookup.getNumOccurrences()
+ ? UserSwitchToLookup
+ : ConvertSwitch;
+
+ Options.NeedCanonicalLoop =
+ UserKeepLoops.getNumOccurrences() ? UserKeepLoops : KeepLoops;
+
+ Options.SinkCommonInsts = UserSinkCommonInsts.getNumOccurrences()
+ ? UserSinkCommonInsts
+ : SinkCommon;
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F)))
+ return false;
+
+ Options.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return simplifyFunctionCFG(F, TTI, Options);
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+}
+
+char CFGSimplifyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+ false)
+
+// Public interface to the CFGSimplification pass
+FunctionPass *
+llvm::createCFGSimplificationPass(unsigned Threshold, bool ForwardSwitchCond,
+ bool ConvertSwitch, bool KeepLoops,
+ bool SinkCommon,
+ std::function<bool(const Function &)> Ftor) {
+ return new CFGSimplifyPass(Threshold, ForwardSwitchCond, ConvertSwitch,
+ KeepLoops, SinkCommon, std::move(Ftor));
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
new file mode 100644
index 000000000000..cfb8a062299f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -0,0 +1,306 @@
+//===-- Sink.cpp - Code Sinking -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass moves instructions into successor blocks, when possible, so that
+// they aren't executed on paths where their results aren't needed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/Sink.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "sink"
+
+STATISTIC(NumSunk, "Number of instructions sunk");
+STATISTIC(NumSinkIter, "Number of sinking iterations");
+
+/// AllUsesDominatedByBlock - Return true if all uses of the specified value
+/// occur in blocks dominated by the specified block.
+static bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB,
+ DominatorTree &DT) {
+ // Ignoring debug uses is necessary so debug info doesn't affect the code.
+ // This may leave a referencing dbg_value in the original block, before
+  // the definition of the value. The DWARF generator handles this, although
+  // the user might not get the right info at runtime.
+ for (Use &U : Inst->uses()) {
+ // Determine the block of the use.
+ Instruction *UseInst = cast<Instruction>(U.getUser());
+ BasicBlock *UseBlock = UseInst->getParent();
+ if (PHINode *PN = dyn_cast<PHINode>(UseInst)) {
+ // PHI nodes use the operand in the predecessor block, not the block with
+ // the PHI.
+ unsigned Num = PHINode::getIncomingValueNumForOperand(U.getOperandNo());
+ UseBlock = PN->getIncomingBlock(Num);
+ }
+ // Check that it dominates.
+ if (!DT.dominates(BB, UseBlock))
+ return false;
+ }
+ return true;
+}
+
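+/// Return true if it is safe to sink \p Inst past the memory-writing
+/// instructions collected so far in \p Stores. Instructions that may write to
+/// memory are recorded in \p Stores and are never sunk themselves.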
+static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &Stores) {
+
+ if (Inst->mayWriteToMemory()) {
+ Stores.insert(Inst);
+ return false;
+ }
+
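+  // A load can only be sunk if no store collected so far may modify the
+  // memory location it reads.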
+ if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
+ MemoryLocation Loc = MemoryLocation::get(L);
+ for (Instruction *S : Stores)
+ if (isModSet(AA.getModRefInfo(S, Loc)))
+ return false;
+ }
+
+ if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst) || Inst->isEHPad() ||
+ Inst->mayThrow())
+ return false;
+
+ if (auto CS = CallSite(Inst)) {
+ // Convergent operations cannot be made control-dependent on additional
+ // values.
+ if (CS.hasFnAttr(Attribute::Convergent))
+ return false;
+
+ for (Instruction *S : Stores)
+ if (isModSet(AA.getModRefInfo(S, CS)))
+ return false;
+ }
+
+ return true;
+}
+
+/// IsAcceptableTarget - Return true if it is possible to sink the instruction
+/// in the specified basic block.
+static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
+ DominatorTree &DT, LoopInfo &LI) {
+ assert(Inst && "Instruction to be sunk is null");
+ assert(SuccToSinkTo && "Candidate sink target is null");
+
+ // It is not possible to sink an instruction into its own block. This can
+ // happen with loops.
+ if (Inst->getParent() == SuccToSinkTo)
+ return false;
+
+ // It's never legal to sink an instruction into a block which terminates in an
+ // EH-pad.
+ if (SuccToSinkTo->getTerminator()->isExceptional())
+ return false;
+
+ // If the block has multiple predecessors, this would introduce computation
+ // on different code paths. We could split the critical edge, but for now we
+ // just punt.
+ // FIXME: Split critical edges if not backedges.
+ if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
+ // We cannot sink a load across a critical edge - there may be stores in
+ // other code paths.
+ if (isa<LoadInst>(Inst))
+ return false;
+
+ // We don't want to sink across a critical edge if we don't dominate the
+ // successor. We could be introducing calculations to new code paths.
+ if (!DT.dominates(Inst->getParent(), SuccToSinkTo))
+ return false;
+
+ // Don't sink instructions into a loop.
+ Loop *succ = LI.getLoopFor(SuccToSinkTo);
+ Loop *cur = LI.getLoopFor(Inst->getParent());
+ if (succ != nullptr && succ != cur)
+ return false;
+ }
+
+ // Finally, check that all the uses of the instruction are actually
+ // dominated by the candidate block.
+ return AllUsesDominatedByBlock(Inst, SuccToSinkTo, DT);
+}
+
+/// SinkInstruction - Determine whether it is safe to sink the specified
+/// instruction out of its current block into a successor, and sink it if so.
+static bool SinkInstruction(Instruction *Inst,
+ SmallPtrSetImpl<Instruction *> &Stores,
+ DominatorTree &DT, LoopInfo &LI, AAResults &AA) {
+
+ // Don't sink static alloca instructions. CodeGen assumes allocas outside the
+ // entry block are dynamically sized stack objects.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
+ if (AI->isStaticAlloca())
+ return false;
+
+ // Check if it's safe to move the instruction.
+ if (!isSafeToMove(Inst, AA, Stores))
+ return false;
+
+ // FIXME: This should include support for sinking instructions within the
+ // block they are currently in to shorten the live ranges. We often get
+ // instructions sunk into the top of a large block, but it would be better to
+ // also sink them down before their first use in the block. This xform has to
+ // be careful not to *increase* register pressure though, e.g. sinking
+ // "x = y + z" down if it kills y and z would increase the live ranges of y
+ // and z and only shrink the live range of x.
+
+ // SuccToSinkTo - This is the successor to sink this instruction to, once we
+ // decide.
+ BasicBlock *SuccToSinkTo = nullptr;
+
+ // Instructions can only be sunk if all their uses are in blocks
+ // dominated by one of the successors.
+ // Look at all the dominated blocks and see if we can sink it in one.
+ DomTreeNode *DTN = DT.getNode(Inst->getParent());
+ for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end();
+ I != E && SuccToSinkTo == nullptr; ++I) {
+ BasicBlock *Candidate = (*I)->getBlock();
+ // A node always immediately dominates its children in the dominator
+ // tree.
+ if (IsAcceptableTarget(Inst, Candidate, DT, LI))
+ SuccToSinkTo = Candidate;
+ }
+
+ // If no suitable dominated block was found, look at all the successors and
+ // decide which one we should sink to, if any.
+ for (succ_iterator I = succ_begin(Inst->getParent()),
+ E = succ_end(Inst->getParent()); I != E && !SuccToSinkTo; ++I) {
+ if (IsAcceptableTarget(Inst, *I, DT, LI))
+ SuccToSinkTo = *I;
+ }
+
+ // If we couldn't find a block to sink to, ignore this instruction.
+ if (!SuccToSinkTo)
+ return false;
+
+ DEBUG(dbgs() << "Sink" << *Inst << " (";
+ Inst->getParent()->printAsOperand(dbgs(), false);
+ dbgs() << " -> ";
+ SuccToSinkTo->printAsOperand(dbgs(), false);
+ dbgs() << ")\n");
+
+ // Move the instruction.
+ Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt());
+ return true;
+}
+
+static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI,
+ AAResults &AA) {
+ // Can't sink anything out of a block that has fewer than two successors.
+ if (BB.getTerminator()->getNumSuccessors() <= 1) return false;
+
+ // Don't bother sinking code out of unreachable blocks. In addition to being
+ // unprofitable, it can also lead to infinite looping, because in an
+ // unreachable loop there may be nowhere to stop.
+ if (!DT.isReachableFromEntry(&BB)) return false;
+
+ bool MadeChange = false;
+
+ // Walk the basic block bottom-up. Remember if we saw a store.
+ BasicBlock::iterator I = BB.end();
+ --I;
+ bool ProcessedBegin = false;
+ SmallPtrSet<Instruction *, 8> Stores;
+ do {
+ Instruction *Inst = &*I; // The instruction to sink.
+
+ // Predecrement I (if it's not begin) so that it isn't invalidated by
+ // sinking.
+ ProcessedBegin = I == BB.begin();
+ if (!ProcessedBegin)
+ --I;
+
+ if (isa<DbgInfoIntrinsic>(Inst))
+ continue;
+
+ if (SinkInstruction(Inst, Stores, DT, LI, AA)) {
+ ++NumSunk;
+ MadeChange = true;
+ }
+
+ // If we just processed the first instruction in the block, we're done.
+ } while (!ProcessedBegin);
+
+ return MadeChange;
+}
+
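+/// Run ProcessBlock over every block in \p F repeatedly until a fixed point is
+/// reached, i.e. an iteration in which nothing more could be sunk.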
+static bool iterativelySinkInstructions(Function &F, DominatorTree &DT,
+ LoopInfo &LI, AAResults &AA) {
+ bool MadeChange, EverMadeChange = false;
+
+ do {
+ MadeChange = false;
+ DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
+ // Process all basic blocks.
+ for (BasicBlock &I : F)
+ MadeChange |= ProcessBlock(I, DT, LI, AA);
+ EverMadeChange |= MadeChange;
+ NumSinkIter++;
+ } while (MadeChange);
+
+ return EverMadeChange;
+}
+
+PreservedAnalyses SinkingPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+
+ if (!iterativelySinkInstructions(F, DT, LI, AA))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+ class SinkingLegacyPass : public FunctionPass {
+ public:
+ static char ID; // Pass identification
+ SinkingLegacyPass() : FunctionPass(ID) {
+ initializeSinkingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ return iterativelySinkInstructions(F, DT, LI, AA);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ }
+ };
+} // end anonymous namespace
+
+char SinkingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SinkingLegacyPass, "sink", "Code sinking", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(SinkingLegacyPass, "sink", "Code sinking", false, false)
+
+FunctionPass *llvm::createSinkingPass() { return new SinkingLegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
new file mode 100644
index 000000000000..23156d5a4d83
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@@ -0,0 +1,811 @@
+//===- SpeculateAroundPHIs.cpp --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "spec-phis"
+
+STATISTIC(NumPHIsSpeculated, "Number of PHI nodes we speculated around");
+STATISTIC(NumEdgesSplit,
+ "Number of critical edges which were split for speculation");
+STATISTIC(NumSpeculatedInstructions,
+ "Number of instructions we speculated around the PHI nodes");
+STATISTIC(NumNewRedundantInstructions,
+ "Number of new, redundant instructions inserted");
+
+/// Check whether speculating the users of a PHI node around the PHI
+/// will be safe.
+///
+/// This checks both that all of the users are safe and also that all of their
+/// operands are either recursively safe or already available along an incoming
+/// edge to the PHI.
+///
+/// This routine caches both all the safe nodes explored in `PotentialSpecSet`
+/// and the chain of nodes that definitively reach any unsafe node in
+/// `UnsafeSet`. By preserving these between repeated calls to this routine for
+/// PHIs in the same basic block, the exploration here can be reused. However,
+/// these caches must not be reused for PHIs in a different basic block as they
+/// reflect what is available along incoming edges.
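+///
+/// As a minimal sketch (not part of the original comment), for
+///
+///   %p = phi i32 [ 0, %a ], [ 1, %b ]
+///   %s = add i32 %p, 4
+///   store i32 %p, i32* %q
+///
+/// the add is a speculatable user, while the store is memory dependent and
+/// makes speculation of the PHI's users unsafe.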
+static bool
+isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
+ SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ SmallPtrSetImpl<Instruction *> &UnsafeSet) {
+ auto *PhiBB = PN.getParent();
+ SmallPtrSet<Instruction *, 4> Visited;
+ SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
+
+ // Walk each user of the PHI node.
+ for (Use &U : PN.uses()) {
+ auto *UI = cast<Instruction>(U.getUser());
+
+ // Ensure the use post-dominates the PHI node. This ensures that, in the
+ // absence of unwinding, the use will actually be reached.
+ // FIXME: We use a blunt hammer of requiring them to be in the same basic
+ // block. We should consider using actual post-dominance here in the
+ // future.
+ if (UI->getParent() != PhiBB) {
+ DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
+ return false;
+ }
+
+ // FIXME: This check is much too conservative. We're not going to move these
+ // instructions onto new dynamic paths through the program unless there is
+ // a call instruction between the use and the PHI node. And memory isn't
+ // changing unless there is a store in that same sequence. We should
+ // probably change this to do at least a limited scan of the intervening
+ // instructions and allow handling stores in easily proven safe cases.
+ if (mayBeMemoryDependent(*UI)) {
+ DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
+ return false;
+ }
+
+ // Now do a depth-first search of everything these users depend on to make
+ // sure they are transitively safe. This is a depth-first search, but we
+ // check nodes in preorder to minimize the amount of checking.
+ Visited.insert(UI);
+ DFSStack.push_back({UI, UI->value_op_begin()});
+ do {
+ User::value_op_iterator OpIt;
+ std::tie(UI, OpIt) = DFSStack.pop_back_val();
+
+ while (OpIt != UI->value_op_end()) {
+ auto *OpI = dyn_cast<Instruction>(*OpIt);
+ // Increment to the next operand for whenever we continue.
+ ++OpIt;
+ // No need to visit non-instructions, which can't form dependencies.
+ if (!OpI)
+ continue;
+
+ // Now do the main pre-order checks that this operand is a viable
+ // dependency of something we want to speculate.
+
+ // First do a few checks for instructions that won't require
+ // speculation at all because they are trivially available on the
+ // incoming edge (either through dominance or through an incoming value
+ // to a PHI).
+ //
+ // The cases in the current block will be trivially dominated by the
+ // edge.
+ auto *ParentBB = OpI->getParent();
+ if (ParentBB == PhiBB) {
+ if (isa<PHINode>(OpI)) {
+ // We can trivially map through phi nodes in the same block.
+ continue;
+ }
+ } else if (DT.dominates(ParentBB, PhiBB)) {
+ // Instructions from dominating blocks are already available.
+ continue;
+ }
+
+ // Once we know that we're considering speculating the operand, check
+ // if we've already explored this subgraph and found it to be safe.
+ if (PotentialSpecSet.count(OpI))
+ continue;
+
+ // If we've already explored this subgraph and found it unsafe, bail.
+ // Likewise bail if directly testing this operand shows it is unsafe.
+ if (UnsafeSet.count(OpI) || ParentBB != PhiBB ||
+ mayBeMemoryDependent(*OpI)) {
+ DEBUG(dbgs() << " Unsafe: can't speculate transitive use: " << *OpI
+ << "\n");
+ // Record the stack of instructions which reach this node as unsafe
+ // so we prune subsequent searches.
+ UnsafeSet.insert(OpI);
+ for (auto &StackPair : DFSStack) {
+ Instruction *I = StackPair.first;
+ UnsafeSet.insert(I);
+ }
+ return false;
+ }
+
+ // Skip any operands we're already recursively checking.
+ if (!Visited.insert(OpI).second)
+ continue;
+
+ // Push onto the stack and descend. We can directly continue this
+ // loop when ascending.
+ DFSStack.push_back({UI, OpIt});
+ UI = OpI;
+ OpIt = OpI->value_op_begin();
+ }
+
+ // This node and all its operands are safe. Go ahead and cache that for
+ // reuse later.
+ PotentialSpecSet.insert(UI);
+
+ // Continue with the next node on the stack.
+ } while (!DFSStack.empty());
+ }
+
+#ifndef NDEBUG
+ // Every visited operand should have been marked as safe for speculation at
+ // this point. Verify this and return success.
+ for (auto *I : Visited)
+ assert(PotentialSpecSet.count(I) &&
+ "Failed to mark a visited instruction as safe!");
+#endif
+ return true;
+}
+
+/// Check whether, in isolation, a given PHI node is both safe and profitable
+/// to speculate users around.
+///
+/// This handles checking whether there are any constant operands to a PHI
+/// which could represent a useful speculation candidate, whether the users of
+/// the PHI are safe to speculate including all their transitive dependencies,
+/// and whether after speculation there will be some cost savings (profit) to
+/// folding the operands into the users of the PHI node. Returns true if both
+/// safe and profitable with relevant cost savings updated in the map and with
+/// an update to the `PotentialSpecSet`. Returns false if either safety or
+/// profitability are absent. Some new entries may be made to the
+/// `PotentialSpecSet` even when this routine returns false, but they remain
+/// conservatively correct.
+///
+/// The profitability check here is a local one, but it checks this in an
+/// interesting way. Beyond checking that the total cost of materializing the
+/// constants will be less than the cost of folding them into their users, it
+/// also checks that no one incoming constant will have a higher cost when
+/// folded into its users rather than materialized. This higher cost could
+/// result in a dynamic *path* that is more expensive even when the total cost
+/// is lower. Currently, all of the interesting cases where this optimization
+/// should fire are ones where it is a no-loss operation in this sense. If we
+/// ever want to be more aggressive here, we would need to balance the
+/// different incoming edges' cost by looking at their respective
+/// probabilities.
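+///
+/// A small illustration (not part of the original comment): given
+///
+///   %p = phi i32 [ 65536, %a ], [ 131072, %b ]
+///   %r = and i32 %p, %x
+///
+/// speculating the 'and' into the %a and %b edges lets each incoming constant
+/// fold directly into the cloned 'and' instead of being materialized for the
+/// PHI; whether that is a win is decided by comparing TTI's materialization
+/// cost against its folded-immediate cost.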
+static bool isSafeAndProfitableToSpeculateAroundPHI(
+ PHINode &PN, SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
+ SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ SmallPtrSetImpl<Instruction *> &UnsafeSet, DominatorTree &DT,
+ TargetTransformInfo &TTI) {
+ // First see whether there is any cost savings to speculating around this
+ // PHI, and build up a map of the constant inputs to how many times they
+ // occur.
+ bool NonFreeMat = false;
+ struct CostsAndCount {
+ int MatCost = TargetTransformInfo::TCC_Free;
+ int FoldedCost = TargetTransformInfo::TCC_Free;
+ int Count = 0;
+ };
+ SmallDenseMap<ConstantInt *, CostsAndCount, 16> CostsAndCounts;
+ SmallPtrSet<BasicBlock *, 16> IncomingConstantBlocks;
+ for (int i : llvm::seq<int>(0, PN.getNumIncomingValues())) {
+ auto *IncomingC = dyn_cast<ConstantInt>(PN.getIncomingValue(i));
+ if (!IncomingC)
+ continue;
+
+ // Only visit each incoming edge with a constant input once.
+ if (!IncomingConstantBlocks.insert(PN.getIncomingBlock(i)).second)
+ continue;
+
+ auto InsertResult = CostsAndCounts.insert({IncomingC, {}});
+ // Count how many edges share a given incoming constant.
+ ++InsertResult.first->second.Count;
+ // Only compute the cost the first time we see a particular constant.
+ if (!InsertResult.second)
+ continue;
+
+ int &MatCost = InsertResult.first->second.MatCost;
+ MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType());
+ NonFreeMat |= MatCost != TTI.TCC_Free;
+ }
+ if (!NonFreeMat) {
+ DEBUG(dbgs() << " Free: " << PN << "\n");
+ // No profit in free materialization.
+ return false;
+ }
+
+ // Now check that the uses of this PHI can actually be speculated,
+ // otherwise we'll still have to materialize the PHI value.
+ if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
+ DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
+ return false;
+ }
+
+ // Compute how much (if any) savings are available by speculating around this
+ // PHI.
+ for (Use &U : PN.uses()) {
+ auto *UserI = cast<Instruction>(U.getUser());
+ // Now check whether there is any savings to folding the incoming constants
+ // into this use.
+ unsigned Idx = U.getOperandNo();
+
+ // If we have a binary operator that is commutative, an actual constant
+ // operand would end up on the RHS, so pretend the use of the PHI is on the
+ // RHS.
+ //
+ // Technically, this is a bit weird if *both* operands are PHIs we're
+ // speculating. But if that is the case, giving an "optimistic" cost isn't
+ // a bad thing because after speculation it will constant fold. And
+ // moreover, such cases should likely have been constant folded already by
+ // some other pass, so we shouldn't worry about "modeling" them terribly
+ // accurately here. Similarly, if the other operand is a constant, it still
+ // seems fine to be "optimistic" in our cost modeling, because when the
+ // incoming operand from the PHI node is also a constant, we will end up
+ // constant folding.
+ if (UserI->isBinaryOp() && UserI->isCommutative() && Idx != 1)
+ // Assume we will commute the constant to the RHS to be canonical.
+ Idx = 1;
+
+ // Get the intrinsic ID if this user is an intrinsic.
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ if (auto *UserII = dyn_cast<IntrinsicInst>(UserI))
+ IID = UserII->getIntrinsicID();
+
+ for (auto &IncomingConstantAndCostsAndCount : CostsAndCounts) {
+ ConstantInt *IncomingC = IncomingConstantAndCostsAndCount.first;
+ int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
+ int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
+ if (IID)
+ FoldedCost += TTI.getIntImmCost(IID, Idx, IncomingC->getValue(),
+ IncomingC->getType());
+ else
+ FoldedCost +=
+ TTI.getIntImmCost(UserI->getOpcode(), Idx, IncomingC->getValue(),
+ IncomingC->getType());
+
+ // If we accumulate more folded cost for this incoming constant than
+ // materialized cost, then we'll regress any edge with this constant so
+ // just bail. We're only interested in cases where folding the incoming
+ // constants is at least break-even on all paths.
+ if (FoldedCost > MatCost) {
+ DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC << "\n"
+ " Materializing cost: " << MatCost << "\n"
+ " Accumulated folded cost: " << FoldedCost << "\n");
+ return false;
+ }
+ }
+ }
+
+ // Compute the total cost savings afforded by this PHI node.
+ int TotalMatCost = TTI.TCC_Free, TotalFoldedCost = TTI.TCC_Free;
+ for (auto IncomingConstantAndCostsAndCount : CostsAndCounts) {
+ int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
+ int FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
+ int Count = IncomingConstantAndCostsAndCount.second.Count;
+
+ TotalMatCost += MatCost * Count;
+ TotalFoldedCost += FoldedCost * Count;
+ }
+ assert(TotalFoldedCost <= TotalMatCost && "If each constant's folded cost is "
+ "less that its materialized cost, "
+ "the sum must be as well.");
+
+ DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
+ << ": " << PN << "\n");
+ CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
+ return true;
+}
+
+/// Simple helper to walk all the users of a list of phis depth first, and call
+/// a visit function on each one in post-order.
+///
+/// All of the PHIs should be in the same basic block, and this is primarily
+/// used to make a single depth-first walk across their collective users
+/// without revisiting any subgraphs. Callers should provide a fast, idempotent
+/// callable to test whether a node has been visited and the more important
+/// callable to actually visit a particular node.
+///
+/// Depth-first and postorder here refer to the *operand* graph -- we start
+/// from a collection of users of PHI nodes and walk "up" the operands
+/// depth-first.
+template <typename IsVisitedT, typename VisitT>
+static void visitPHIUsersAndDepsInPostOrder(ArrayRef<PHINode *> PNs,
+ IsVisitedT IsVisited,
+ VisitT Visit) {
+ SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
+ for (auto *PN : PNs)
+ for (Use &U : PN->uses()) {
+ auto *UI = cast<Instruction>(U.getUser());
+ if (IsVisited(UI))
+ // Already visited this user, continue across the roots.
+ continue;
+
+ // Otherwise, walk the operand graph depth-first and visit each
+ // dependency in postorder.
+ DFSStack.push_back({UI, UI->value_op_begin()});
+ do {
+ User::value_op_iterator OpIt;
+ std::tie(UI, OpIt) = DFSStack.pop_back_val();
+ while (OpIt != UI->value_op_end()) {
+ auto *OpI = dyn_cast<Instruction>(*OpIt);
+ // Increment to the next operand for whenever we continue.
+ ++OpIt;
+ // No need to visit non-instructions, which can't form dependencies,
+ // or instructions outside of our potential dependency set that we
+ // were given. Finally, if we've already visited the node, continue
+ // to the next.
+ if (!OpI || IsVisited(OpI))
+ continue;
+
+ // Push onto the stack and descend. We can directly continue this
+ // loop when ascending.
+ DFSStack.push_back({UI, OpIt});
+ UI = OpI;
+ OpIt = OpI->value_op_begin();
+ }
+
+ // Finished visiting children, visit this node.
+ assert(!IsVisited(UI) && "Should not have already visited a node!");
+ Visit(UI);
+ } while (!DFSStack.empty());
+ }
+}
+
+/// Find profitable PHIs to speculate.
+///
+/// For a PHI node to be profitable, we need the cost of speculating its users
+/// (and their dependencies) to not exceed the savings of folding the PHI's
+/// constant operands into the speculated users.
+///
+/// Computing this is surprisingly challenging. Because users of two different
+/// PHI nodes can depend on each other or on common other instructions, it may
+/// be profitable to speculate two PHI nodes together even though neither one
+/// in isolation is profitable. The straightforward way to find all the
+/// profitable PHIs would be to check each combination of PHIs' cost, but this
+/// is exponential in complexity.
+///
+/// Even if we assume that we only care about cases where we can consider each
+/// PHI node in isolation (rather than considering cases where none are
+/// profitable in isolation but some subset are profitable as a set), we still
+/// have a challenge. The obvious way to find all individually profitable PHIs
+/// is to iterate until reaching a fixed point, but this will be quadratic in
+/// complexity. =/
+///
+/// This code currently uses a linear-to-compute order for a greedy approach.
+/// It won't find cases where a set of PHIs must be considered together, but it
+/// handles most cases of order dependence without quadratic iteration. The
+/// specific order used is the post-order across the operand DAG. When the last
+/// user of a PHI is visited in this postorder walk, we check it for
+/// profitability.
+///
+/// There is an orthogonal extra complexity to all of this: computing the cost
+/// itself can easily become a linear computation making everything again (at
+/// best) quadratic. Using a postorder over the operand graph makes it
+/// particularly easy to avoid this through dynamic programming. As we do the
+/// postorder walk, we build the transitive cost of that subgraph. It is also
+/// straightforward to then update these costs when we mark a PHI for
+/// speculation so that subsequent PHIs don't re-pay the cost of already
+/// speculated instructions.
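+///
+/// As a tiny worked example of the accounting (illustrative only): with two
+/// incoming edges (NumPreds == 2), a PHI whose users have a combined
+/// speculation cost of 3 is charged SpecCost = 3 * (NumPreds - 1) = 3, so it
+/// is marked for speculation only if its recorded constant-folding savings
+/// are at least 3.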
+static SmallVector<PHINode *, 16>
+findProfitablePHIs(ArrayRef<PHINode *> PNs,
+ const SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
+ const SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ int NumPreds, DominatorTree &DT, TargetTransformInfo &TTI) {
+ SmallVector<PHINode *, 16> SpecPNs;
+
+ // First, establish a reverse mapping from immediate users of the PHI nodes
+ // to the nodes themselves, and count how many users each PHI node has in
+ // a way we can update while processing them.
+ SmallDenseMap<Instruction *, TinyPtrVector<PHINode *>, 16> UserToPNMap;
+ SmallDenseMap<PHINode *, int, 16> PNUserCountMap;
+ SmallPtrSet<Instruction *, 16> UserSet;
+ for (auto *PN : PNs) {
+ assert(UserSet.empty() && "Must start with an empty user set!");
+ for (Use &U : PN->uses())
+ UserSet.insert(cast<Instruction>(U.getUser()));
+ PNUserCountMap[PN] = UserSet.size();
+ for (auto *UI : UserSet)
+ UserToPNMap.insert({UI, {}}).first->second.push_back(PN);
+ UserSet.clear();
+ }
+
+ // Now do a DFS across the operand graph of the users, computing cost as we
+ // go and when all costs for a given PHI are known, checking that PHI for
+ // profitability.
+ SmallDenseMap<Instruction *, int, 16> SpecCostMap;
+ visitPHIUsersAndDepsInPostOrder(
+ PNs,
+ /*IsVisited*/
+ [&](Instruction *I) {
+ // We consider anything that isn't potentially speculated to be
+ // "visited" as it is already handled. Similarly, anything that *is*
+ // potentially speculated but for which we have an entry in our cost
+ // map, we're done.
+ return !PotentialSpecSet.count(I) || SpecCostMap.count(I);
+ },
+ /*Visit*/
+ [&](Instruction *I) {
+ // We've fully visited the operands, so sum their cost with this node
+ // and update the cost map.
+ int Cost = TTI.TCC_Free;
+ for (Value *OpV : I->operand_values())
+ if (auto *OpI = dyn_cast<Instruction>(OpV)) {
+ auto CostMapIt = SpecCostMap.find(OpI);
+ if (CostMapIt != SpecCostMap.end())
+ Cost += CostMapIt->second;
+ }
+ Cost += TTI.getUserCost(I);
+ bool Inserted = SpecCostMap.insert({I, Cost}).second;
+ (void)Inserted;
+ assert(Inserted && "Must not re-insert a cost during the DFS!");
+
+ // Now check if this node had a corresponding PHI node using it. If so,
+ // we need to decrement the outstanding user count for it.
+ auto UserPNsIt = UserToPNMap.find(I);
+ if (UserPNsIt == UserToPNMap.end())
+ return;
+ auto &UserPNs = UserPNsIt->second;
+ auto UserPNsSplitIt = std::stable_partition(
+ UserPNs.begin(), UserPNs.end(), [&](PHINode *UserPN) {
+ int &PNUserCount = PNUserCountMap.find(UserPN)->second;
+ assert(
+ PNUserCount > 0 &&
+ "Should never re-visit a PN after its user count hits zero!");
+ --PNUserCount;
+ return PNUserCount != 0;
+ });
+
+ // FIXME: Rather than one at a time, we should sum the savings as the
+ // cost will be completely shared.
+ SmallVector<Instruction *, 16> SpecWorklist;
+ for (auto *PN : llvm::make_range(UserPNsSplitIt, UserPNs.end())) {
+ int SpecCost = TTI.TCC_Free;
+ for (Use &U : PN->uses())
+ SpecCost +=
+ SpecCostMap.find(cast<Instruction>(U.getUser()))->second;
+ SpecCost *= (NumPreds - 1);
+ // When the user count of a PHI node hits zero, we should check its
+ // profitability. If profitable, we should mark it for speculation
+ // and zero out the cost of everything it depends on.
+ int CostSavings = CostSavingsMap.find(PN)->second;
+ if (SpecCost > CostSavings) {
+ DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN << "\n"
+ " Cost savings: " << CostSavings << "\n"
+ " Speculation cost: " << SpecCost << "\n");
+ continue;
+ }
+
+ // We're going to speculate this user-associated PHI. Copy it out and
+ // add its users to the worklist to update their cost.
+ SpecPNs.push_back(PN);
+ for (Use &U : PN->uses()) {
+ auto *UI = cast<Instruction>(U.getUser());
+ auto CostMapIt = SpecCostMap.find(UI);
+ if (CostMapIt->second == 0)
+ continue;
+ // Zero out this cost entry to avoid duplicates.
+ CostMapIt->second = 0;
+ SpecWorklist.push_back(UI);
+ }
+ }
+
+ // Now walk all the operands of the users in the worklist transitively
+ // to zero out all the memoized costs.
+ while (!SpecWorklist.empty()) {
+ Instruction *SpecI = SpecWorklist.pop_back_val();
+ assert(SpecCostMap.find(SpecI)->second == 0 &&
+ "Didn't zero out a cost!");
+
+ // Walk the operands recursively to zero out their cost as well.
+ for (auto *OpV : SpecI->operand_values()) {
+ auto *OpI = dyn_cast<Instruction>(OpV);
+ if (!OpI)
+ continue;
+ auto CostMapIt = SpecCostMap.find(OpI);
+ if (CostMapIt == SpecCostMap.end() || CostMapIt->second == 0)
+ continue;
+ CostMapIt->second = 0;
+ SpecWorklist.push_back(OpI);
+ }
+ }
+ });
+
+ return SpecPNs;
+}
+
+/// Speculate users around a set of PHI nodes.
+///
+/// This routine does the actual speculation around a set of PHI nodes where we
+/// have determined this to be both safe and profitable.
+///
+/// This routine handles any splitting of critical edges necessary to create
+/// a safe block to speculate into as well as cloning the instructions and
+/// rewriting all uses.
+static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
+ SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ SmallSetVector<BasicBlock *, 16> &PredSet,
+ DominatorTree &DT) {
+ DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
+ NumPHIsSpeculated += SpecPNs.size();
+
+ // Split any critical edges so that we have a block to hoist into.
+ auto *ParentBB = SpecPNs[0]->getParent();
+ SmallVector<BasicBlock *, 16> SpecPreds;
+ SpecPreds.reserve(PredSet.size());
+ for (auto *PredBB : PredSet) {
+ auto *NewPredBB = SplitCriticalEdge(
+ PredBB, ParentBB,
+ CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
+ if (NewPredBB) {
+ ++NumEdgesSplit;
+ DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
+ << "\n");
+ SpecPreds.push_back(NewPredBB);
+ } else {
+ assert(PredBB->getSingleSuccessor() == ParentBB &&
+ "We need a non-critical predecessor to speculate into.");
+ assert(!isa<InvokeInst>(PredBB->getTerminator()) &&
+ "Cannot have a non-critical invoke!");
+
+ // Already non-critical, use existing pred.
+ SpecPreds.push_back(PredBB);
+ }
+ }
+
+ SmallPtrSet<Instruction *, 16> SpecSet;
+ SmallVector<Instruction *, 16> SpecList;
+ visitPHIUsersAndDepsInPostOrder(SpecPNs,
+ /*IsVisited*/
+ [&](Instruction *I) {
+ // This is visited if we don't need to
+ // speculate it or we already have
+ // speculated it.
+ return !PotentialSpecSet.count(I) ||
+ SpecSet.count(I);
+ },
+ /*Visit*/
+ [&](Instruction *I) {
+ // All operands scheduled, schedule this
+ // node.
+ SpecSet.insert(I);
+ SpecList.push_back(I);
+ });
+
+ int NumSpecInsts = SpecList.size() * SpecPreds.size();
+ int NumRedundantInsts = NumSpecInsts - SpecList.size();
+ DEBUG(dbgs() << " Inserting " << NumSpecInsts << " speculated instructions, "
+ << NumRedundantInsts << " redundancies\n");
+ NumSpeculatedInstructions += NumSpecInsts;
+ NumNewRedundantInstructions += NumRedundantInsts;
+
+ // Each predecessor is numbered by its index in `SpecPreds`, so for each
+ // instruction we speculate, the speculated instruction is stored in that
+ // index of the vector associated with the original instruction. We also
+ // store the incoming values for each predecessor from any PHIs used.
+ SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;
+
+ // Inject the synthetic mappings to rewrite PHIs to the appropriate incoming
+ // value. This handles both the PHIs we are speculating around and any other
+ // PHIs that happen to be used.
+ for (auto *OrigI : SpecList)
+ for (auto *OpV : OrigI->operand_values()) {
+ auto *OpPN = dyn_cast<PHINode>(OpV);
+ if (!OpPN || OpPN->getParent() != ParentBB)
+ continue;
+
+ auto InsertResult = SpeculatedValueMap.insert({OpPN, {}});
+ if (!InsertResult.second)
+ continue;
+
+ auto &SpeculatedVals = InsertResult.first->second;
+
+ // Populating our structure for mapping is particularly annoying because
+ // finding an incoming value for a particular predecessor block in a PHI
+ // node is a linear time operation! To avoid quadratic behavior, we build
+ // a map for this PHI node's incoming values and then translate it into
+ // the more compact representation used below.
+ SmallDenseMap<BasicBlock *, Value *, 16> IncomingValueMap;
+ for (int i : llvm::seq<int>(0, OpPN->getNumIncomingValues()))
+ IncomingValueMap[OpPN->getIncomingBlock(i)] = OpPN->getIncomingValue(i);
+
+ for (auto *PredBB : SpecPreds)
+ SpeculatedVals.push_back(IncomingValueMap.find(PredBB)->second);
+ }
+
+ // Speculate into each predecessor.
+ for (int PredIdx : llvm::seq<int>(0, SpecPreds.size())) {
+ auto *PredBB = SpecPreds[PredIdx];
+ assert(PredBB->getSingleSuccessor() == ParentBB &&
+ "We need a non-critical predecessor to speculate into.");
+
+ for (auto *OrigI : SpecList) {
+ auto *NewI = OrigI->clone();
+ NewI->setName(Twine(OrigI->getName()) + "." + Twine(PredIdx));
+ NewI->insertBefore(PredBB->getTerminator());
+
+ // Rewrite all the operands to the previously speculated instructions.
+ // Because we're walking in-order, the defs must precede the uses and we
+ // should already have these mappings.
+ for (Use &U : NewI->operands()) {
+ auto *OpI = dyn_cast<Instruction>(U.get());
+ if (!OpI)
+ continue;
+ auto MapIt = SpeculatedValueMap.find(OpI);
+ if (MapIt == SpeculatedValueMap.end())
+ continue;
+ const auto &SpeculatedVals = MapIt->second;
+ assert(SpeculatedVals[PredIdx] &&
+ "Must have a speculated value for this predecessor!");
+ assert(SpeculatedVals[PredIdx]->getType() == OpI->getType() &&
+ "Speculated value has the wrong type!");
+
+ // Rewrite the use to this predecessor's speculated instruction.
+ U.set(SpeculatedVals[PredIdx]);
+ }
+
+ // Commute instructions which now have a constant in the LHS but not the
+ // RHS.
+ if (NewI->isBinaryOp() && NewI->isCommutative() &&
+ isa<Constant>(NewI->getOperand(0)) &&
+ !isa<Constant>(NewI->getOperand(1)))
+ NewI->getOperandUse(0).swap(NewI->getOperandUse(1));
+
+ SpeculatedValueMap[OrigI].push_back(NewI);
+ assert(SpeculatedValueMap[OrigI][PredIdx] == NewI &&
+ "Mismatched speculated instruction index!");
+ }
+ }
+
+ // Walk the speculated instruction list and if they have uses, insert a PHI
+ // for them from the speculated versions, and replace the uses with the PHI.
+ // Then erase the instructions as they have been fully speculated. The walk
+ // needs to be in reverse so that we don't count uses coming from instructions
+ // that we will ourselves erase later in this same walk.
+ IRBuilder<> IRB(SpecPNs[0]);
+ for (auto *OrigI : llvm::reverse(SpecList)) {
+ // Check if we need a PHI for any remaining users and if so, insert it.
+ if (!OrigI->use_empty()) {
+ auto *SpecIPN = IRB.CreatePHI(OrigI->getType(), SpecPreds.size(),
+ Twine(OrigI->getName()) + ".phi");
+ // Add the incoming values we speculated.
+ auto &SpeculatedVals = SpeculatedValueMap.find(OrigI)->second;
+ for (int PredIdx : llvm::seq<int>(0, SpecPreds.size()))
+ SpecIPN->addIncoming(SpeculatedVals[PredIdx], SpecPreds[PredIdx]);
+
+ // And replace the uses with the PHI node.
+ OrigI->replaceAllUsesWith(SpecIPN);
+ }
+
+ // It is important to immediately erase this so that it stops using other
+ // instructions. This avoids inserting needless PHIs of them.
+ OrigI->eraseFromParent();
+ }
+
+ // All of the uses of the speculated phi nodes should be removed at this
+ // point, so erase them.
+ for (auto *SpecPN : SpecPNs) {
+ assert(SpecPN->use_empty() && "All users should have been speculated!");
+ SpecPN->eraseFromParent();
+ }
+}
+
+/// Try to speculate around a series of PHIs from a single basic block.
+///
+/// This routine checks whether any of these PHIs are profitable to speculate
+/// users around. If safe and profitable, it does the speculation. It returns
+/// true when at least some speculation occurs.
+static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
+ DominatorTree &DT, TargetTransformInfo &TTI) {
+ DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
+
+ // Savings in cost from speculating around a PHI node.
+ SmallDenseMap<PHINode *, int, 16> CostSavingsMap;
+
+ // Remember the set of instructions that are candidates for speculation so
+ // that we can quickly walk things within that space. This prunes out
+ // instructions already available along edges, etc.
+ SmallPtrSet<Instruction *, 16> PotentialSpecSet;
+
+ // Remember the set of instructions that are (transitively) unsafe to
+ // speculate into the incoming edges of this basic block. This avoids
+ // recomputing them for each PHI node we check. This set is specific to this
+ // block though as things are pruned out of it based on what is available
+ // along incoming edges.
+ SmallPtrSet<Instruction *, 16> UnsafeSet;
+
+ // For each PHI node in this block, check whether there are immediate folding
+ // opportunities from speculation, and whether that speculation will be
+ // valid. This determines the set of safe PHIs to speculate.
+ PNs.erase(llvm::remove_if(PNs,
+ [&](PHINode *PN) {
+ return !isSafeAndProfitableToSpeculateAroundPHI(
+ *PN, CostSavingsMap, PotentialSpecSet,
+ UnsafeSet, DT, TTI);
+ }),
+ PNs.end());
+ // If no PHIs were profitable, skip.
+ if (PNs.empty()) {
+ DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
+ return false;
+ }
+
+ // We need to know how much speculation will cost, which is determined by how
+ // many incoming edges will need a copy of each speculated instruction.
+ SmallSetVector<BasicBlock *, 16> PredSet;
+ for (auto *PredBB : PNs[0]->blocks()) {
+ if (!PredSet.insert(PredBB))
+ continue;
+
+ // We cannot speculate when a predecessor is an indirect branch.
+ // FIXME: We also can't reliably create a non-critical edge block for
+ // speculation if the predecessor is an invoke. This doesn't seem
+ // fundamental and we should probably be splitting critical edges
+ // differently.
+ if (isa<IndirectBrInst>(PredBB->getTerminator()) ||
+ isa<InvokeInst>(PredBB->getTerminator())) {
+ DEBUG(dbgs() << " Invalid: predecessor terminator: " << PredBB->getName()
+ << "\n");
+ return false;
+ }
+ }
+ if (PredSet.size() < 2) {
+ DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
+ return false;
+ }
+
+ SmallVector<PHINode *, 16> SpecPNs = findProfitablePHIs(
+ PNs, CostSavingsMap, PotentialSpecSet, PredSet.size(), DT, TTI);
+ if (SpecPNs.empty())
+ // Nothing to do.
+ return false;
+
+ speculatePHIs(SpecPNs, PotentialSpecSet, PredSet, DT);
+ return true;
+}
+
+PreservedAnalyses SpeculateAroundPHIsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+
+ bool Changed = false;
+ for (auto *BB : ReversePostOrderTraversal<Function *>(&F)) {
+ SmallVector<PHINode *, 16> PNs;
+ auto BBI = BB->begin();
+ while (auto *PN = dyn_cast<PHINode>(&*BBI)) {
+ PNs.push_back(PN);
+ ++BBI;
+ }
+
+ if (PNs.empty())
+ continue;
+
+ Changed |= tryToSpeculatePHIs(PNs, DT, TTI);
+ }
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
new file mode 100644
index 000000000000..a7c308b59877
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -0,0 +1,319 @@
+//===- SpeculativeExecution.cpp ---------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists instructions to enable speculative execution on
+// targets where branches are expensive. This is aimed at GPUs. It
+// currently works on simple if-then and if-then-else
+// patterns.
+//
+// Removing branches is not the only motivation for this
+// pass. E.g. consider this code and assume that there is no
+// addressing mode for multiplying by sizeof(*a):
+//
+// if (b > 0)
+// c = a[i + 1]
+// if (d > 0)
+// e = a[i + 2]
+//
+// turns into
+//
+// p = &a[i + 1];
+// if (b > 0)
+// c = *p;
+// q = &a[i + 2];
+// if (d > 0)
+// e = *q;
+//
+// which could later be optimized to
+//
+// r = &a[i];
+// if (b > 0)
+// c = r[1];
+// if (d > 0)
+// e = r[2];
+//
+// Later passes sink back much of the speculated code that did not enable
+// further optimization.
+//
+// This pass is more aggressive than the function SpeculativelyExecuteBB in
+// SimplifyCFG. SimplifyCFG will not speculate if no selects are introduced and
+// it will speculate at most one instruction. It also will not speculate if
+// there is a value defined in the if-block that is only used in the then-block.
+// These restrictions make sense since the speculation in SimplifyCFG seems
+// aimed at introducing cheap selects, while this pass is intended to do more
+// aggressive speculation while counting on later passes to either capitalize on
+// that or clean it up.
+//
+// If the pass was created by calling
+// createSpeculativeExecutionIfHasBranchDivergencePass or the
+// -spec-exec-only-if-divergent-target option is present, this pass only has an
+// effect on targets where TargetTransformInfo::hasBranchDivergence() is true;
+// on other targets, it is a nop.
+//
+// This lets you include this pass unconditionally in the IR pass pipeline, but
+// only enable it for relevant targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "speculative-execution"
+
+// The risk that speculation will not pay off increases with the
+// number of instructions speculated, so we put a limit on that.
+static cl::opt<unsigned> SpecExecMaxSpeculationCost(
+ "spec-exec-max-speculation-cost", cl::init(7), cl::Hidden,
+ cl::desc("Speculative execution is not applied to basic blocks where "
+ "the cost of the instructions to speculatively execute "
+ "exceeds this limit."));
+
+// Speculating just a few instructions from a larger block tends not
+// to be profitable and this limit prevents that. A reason for that is
+// that small basic blocks are more likely to be candidates for
+// further optimization.
+static cl::opt<unsigned> SpecExecMaxNotHoisted(
+ "spec-exec-max-not-hoisted", cl::init(5), cl::Hidden,
+ cl::desc("Speculative execution is not applied to basic blocks where the "
+ "number of instructions that would not be speculatively executed "
+ "exceeds this limit."));
+
+static cl::opt<bool> SpecExecOnlyIfDivergentTarget(
+ "spec-exec-only-if-divergent-target", cl::init(false), cl::Hidden,
+ cl::desc("Speculative execution is applied only to targets with divergent "
+ "branches, even if the pass was configured to apply only to all "
+ "targets."));
+
+namespace {
+
+class SpeculativeExecutionLegacyPass : public FunctionPass {
+public:
+ static char ID;
+ explicit SpeculativeExecutionLegacyPass(bool OnlyIfDivergentTarget = false)
+ : FunctionPass(ID), OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
+ SpecExecOnlyIfDivergentTarget),
+ Impl(OnlyIfDivergentTarget) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ if (OnlyIfDivergentTarget)
+ return "Speculatively execute instructions if target has divergent "
+ "branches";
+ return "Speculatively execute instructions";
+ }
+
+private:
+ // Variable preserved purely for correct name printing.
+ const bool OnlyIfDivergentTarget;
+
+ SpeculativeExecutionPass Impl;
+};
+} // namespace
+
+char SpeculativeExecutionLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SpeculativeExecutionLegacyPass, "speculative-execution",
+ "Speculatively execute instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(SpeculativeExecutionLegacyPass, "speculative-execution",
+ "Speculatively execute instructions", false, false)
+
+void SpeculativeExecutionLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+}
+
+bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return Impl.runImpl(F, TTI);
+}
+
+namespace llvm {
+
+bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
+ if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence()) {
+ DEBUG(dbgs() << "Not running SpeculativeExecution because "
+ "TTI->hasBranchDivergence() is false.\n");
+ return false;
+ }
+
+ this->TTI = TTI;
+ bool Changed = false;
+ for (auto& B : F) {
+ Changed |= runOnBasicBlock(B);
+ }
+ return Changed;
+}
+
+bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) {
+ BranchInst *BI = dyn_cast<BranchInst>(B.getTerminator());
+ if (BI == nullptr)
+ return false;
+
+ if (BI->getNumSuccessors() != 2)
+ return false;
+ BasicBlock &Succ0 = *BI->getSuccessor(0);
+ BasicBlock &Succ1 = *BI->getSuccessor(1);
+
+ if (&B == &Succ0 || &B == &Succ1 || &Succ0 == &Succ1) {
+ return false;
+ }
+
+ // Hoist from if-then (triangle).
+ if (Succ0.getSinglePredecessor() != nullptr &&
+ Succ0.getSingleSuccessor() == &Succ1) {
+ return considerHoistingFromTo(Succ0, B);
+ }
+
+ // Hoist from if-else (triangle).
+ if (Succ1.getSinglePredecessor() != nullptr &&
+ Succ1.getSingleSuccessor() == &Succ0) {
+ return considerHoistingFromTo(Succ1, B);
+ }
+
+ // Hoist from if-then-else (diamond), but only if it is equivalent to
+ // an if-else or if-then due to one of the branches doing nothing.
+ if (Succ0.getSinglePredecessor() != nullptr &&
+ Succ1.getSinglePredecessor() != nullptr &&
+ Succ1.getSingleSuccessor() != nullptr &&
+ Succ1.getSingleSuccessor() != &B &&
+ Succ1.getSingleSuccessor() == Succ0.getSingleSuccessor()) {
+ // If a block has only one instruction, that instruction is its terminator,
+ // so the block does nothing. This does happen.
+ if (Succ1.size() == 1) // equivalent to if-then
+ return considerHoistingFromTo(Succ0, B);
+ if (Succ0.size() == 1) // equivalent to if-else
+ return considerHoistingFromTo(Succ1, B);
+ }
+
+ return false;
+}
+
+static unsigned ComputeSpeculationCost(const Instruction *I,
+ const TargetTransformInfo &TTI) {
+ switch (Operator::getOpcode(I)) {
+ case Instruction::GetElementPtr:
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Select:
+ case Instruction::Shl:
+ case Instruction::Sub:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Xor:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::Call:
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::AddrSpaceCast:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPExt:
+ case Instruction::FPTrunc:
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return TTI.getUserCost(I);
+
+ default:
+ return UINT_MAX; // Disallow anything not whitelisted.
+ }
+}
+
+bool SpeculativeExecutionPass::considerHoistingFromTo(
+ BasicBlock &FromBlock, BasicBlock &ToBlock) {
+ SmallSet<const Instruction *, 8> NotHoisted;
+ const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](User *U) {
+ for (Value* V : U->operand_values()) {
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (NotHoisted.count(I) > 0)
+ return false;
+ }
+ }
+ return true;
+ };
+
+ unsigned TotalSpeculationCost = 0;
+ for (auto& I : FromBlock) {
+ const unsigned Cost = ComputeSpeculationCost(&I, *TTI);
+ if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) &&
+ AllPrecedingUsesFromBlockHoisted(&I)) {
+ TotalSpeculationCost += Cost;
+ if (TotalSpeculationCost > SpecExecMaxSpeculationCost)
+ return false; // too much to hoist
+ } else {
+ NotHoisted.insert(&I);
+ if (NotHoisted.size() > SpecExecMaxNotHoisted)
+ return false; // too much left behind
+ }
+ }
+
+ if (TotalSpeculationCost == 0)
+ return false; // nothing to hoist
+
+ for (auto I = FromBlock.begin(); I != FromBlock.end();) {
+ // We have to increment I before moving Current as moving Current
+ // changes the list that I is iterating through.
+ auto Current = I;
+ ++I;
+ if (!NotHoisted.count(&*Current)) {
+ Current->moveBefore(ToBlock.getTerminator());
+ }
+ }
+ return true;
+}
+
+FunctionPass *createSpeculativeExecutionPass() {
+ return new SpeculativeExecutionLegacyPass();
+}
+
+FunctionPass *createSpeculativeExecutionIfHasBranchDivergencePass() {
+ return new SpeculativeExecutionLegacyPass(/* OnlyIfDivergentTarget = */ true);
+}
+
+SpeculativeExecutionPass::SpeculativeExecutionPass(bool OnlyIfDivergentTarget)
+ : OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
+ SpecExecOnlyIfDivergentTarget) {}
+
+PreservedAnalyses SpeculativeExecutionPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ bool Changed = runImpl(F, TTI);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+} // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
new file mode 100644
index 000000000000..ce40af1223f6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -0,0 +1,737 @@
+//===- StraightLineStrengthReduce.cpp - -----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements straight-line strength reduction (SLSR). Unlike loop
+// strength reduction, this algorithm is designed to reduce arithmetic
+// redundancy in straight-line code instead of loops. It has proven to be
+// effective in simplifying arithmetic statements derived from an unrolled loop.
+// It can also simplify the logic of SeparateConstOffsetFromGEP.
+//
+// There are many optimizations we can perform in the domain of SLSR. This file
+// for now contains only an initial step. Specifically, we look for strength
+// reduction candidates in the following forms:
+//
+// Form 1: B + i * S
+// Form 2: (B + i) * S
+// Form 3: &B[i * S]
+//
+// where S is an integer variable, and i is a constant integer. If we find two
+// candidates S1 and S2 in the same form and S1 dominates S2, we may rewrite S2
+// in a simpler way with respect to S1. For example,
+//
+// S1: X = B + i * S
+// S2: Y = B + i' * S => X + (i' - i) * S
+//
+// S1: X = (B + i) * S
+// S2: Y = (B + i') * S => X + (i' - i) * S
+//
+// S1: X = &B[i * S]
+// S2: Y = &B[i' * S] => &X[(i' - i) * S]
+//
+// Note: (i' - i) * S is folded to the extent possible.
+//
+// This rewriting is in general a good idea. The code patterns we focus on
+// usually come from loop unrolling, so (i' - i) * S is likely the same
+// across iterations and can be reused. When that happens, the optimized form
+// takes only one add starting from the second iteration.
+//
+// When such rewriting is possible, we call S1 a "basis" of S2. When S2 has
+// multiple bases, we choose to rewrite S2 with respect to its "immediate"
+// basis, the basis that is the closest ancestor in the dominator tree.
+//
+// TODO:
+//
+// - Floating point arithmetic when fast math is enabled.
+//
+// - SLSR may decrease ILP at the architecture level. Targets that are very
+// sensitive to ILP may want to disable it. Having SLSR consider ILP is
+// left as future work.
+//
+// - When (i' - i) is constant but i and i' are not, we could still perform
+// SLSR.
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <list>
+#include <vector>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+static const unsigned UnknownAddressSpace =
+ std::numeric_limits<unsigned>::max();
+
+namespace {
+
+class StraightLineStrengthReduce : public FunctionPass {
+public:
+ // SLSR candidate. Such a candidate must be in one of the forms described in
+ // the header comments.
+ struct Candidate {
+ enum Kind {
+ Invalid, // reserved for the default constructor
+ Add, // B + i * S
+ Mul, // (B + i) * S
+ GEP, // &B[..][i * S][..]
+ };
+
+ Candidate() = default;
+ Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+ Instruction *I)
+ : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I) {}
+
+ Kind CandidateKind = Invalid;
+
+ const SCEV *Base = nullptr;
+
+ // Note that Index and Stride of a GEP candidate do not necessarily have the
+ // same integer type. In that case, during rewriting, Stride will be
+ // sign-extended or truncated to Index's type.
+ ConstantInt *Index = nullptr;
+
+ Value *Stride = nullptr;
+
+ // The instruction this candidate corresponds to. It helps us to rewrite a
+ // candidate with respect to its immediate basis. Note that one instruction
+ // can correspond to multiple candidates depending on how you associate the
+ // expression. For instance,
+ //
+ // (a + 1) * (b + 2)
+ //
+ // can be treated as
+ //
+ // <Base: a, Index: 1, Stride: b + 2>
+ //
+ // or
+ //
+ // <Base: b, Index: 2, Stride: a + 1>
+ Instruction *Ins = nullptr;
+
+ // Points to the immediate basis of this candidate, or nullptr if we cannot
+ // find any basis for this candidate.
+ Candidate *Basis = nullptr;
+ };
+
+ static char ID;
+
+ StraightLineStrengthReduce() : FunctionPass(ID) {
+ initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ // We do not modify the shape of the CFG.
+ AU.setPreservesCFG();
+ }
+
+ bool doInitialization(Module &M) override {
+ DL = &M.getDataLayout();
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
+ // share the same base and stride.
+ bool isBasisFor(const Candidate &Basis, const Candidate &C);
+
+ // Returns whether the candidate can be folded into an addressing mode.
+ bool isFoldable(const Candidate &C, TargetTransformInfo *TTI,
+ const DataLayout *DL);
+
+ // Returns true if C is already in its simplest form and not worth being
+ // rewritten.
+ bool isSimplestForm(const Candidate &C);
+
+ // Checks whether I is in a candidate form. If so, adds all the matching forms
+ // to Candidates, and tries to find the immediate basis for each of them.
+ void allocateCandidatesAndFindBasis(Instruction *I);
+
+ // Allocate candidates and find bases for Add instructions.
+ void allocateCandidatesAndFindBasisForAdd(Instruction *I);
+
+ // Given I = LHS + RHS, factors RHS into i * S and makes (LHS + i * S) a
+ // candidate.
+ void allocateCandidatesAndFindBasisForAdd(Value *LHS, Value *RHS,
+ Instruction *I);
+ // Allocate candidates and find bases for Mul instructions.
+ void allocateCandidatesAndFindBasisForMul(Instruction *I);
+
+ // Splits LHS into Base + Index and, if that succeeds, calls
+ // allocateCandidatesAndFindBasis.
+ void allocateCandidatesAndFindBasisForMul(Value *LHS, Value *RHS,
+ Instruction *I);
+
+ // Allocate candidates and find bases for GetElementPtr instructions.
+ void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP);
+
+ // A helper function that scales Idx with ElementSize before invoking
+ // allocateCandidatesAndFindBasis.
+ void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
+ Value *S, uint64_t ElementSize,
+ Instruction *I);
+
+ // Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
+ // basis.
+ void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B,
+ ConstantInt *Idx, Value *S,
+ Instruction *I);
+
+ // Rewrites candidate C with respect to Basis.
+ void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+
+ // A helper function that factors ArrayIdx to a product of a stride and a
+ // constant index, and invokes allocateCandidatesAndFindBasis with the
+ // factorings.
+ void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
+ GetElementPtrInst *GEP);
+
+ // Emit code that computes the "bump" from Basis to C. If the candidate is a
+ // GEP and the bump is not divisible by the element size of the GEP, this
+ // function sets the BumpWithUglyGEP flag to notify its caller to bump the
+ // basis using an ugly GEP.
+ static Value *emitBump(const Candidate &Basis, const Candidate &C,
+ IRBuilder<> &Builder, const DataLayout *DL,
+ bool &BumpWithUglyGEP);
+
+ const DataLayout *DL = nullptr;
+ DominatorTree *DT = nullptr;
+ ScalarEvolution *SE = nullptr;
+ TargetTransformInfo *TTI = nullptr;
+ std::list<Candidate> Candidates;
+
+ // Temporarily holds all instructions that are unlinked (but not deleted) by
+ // rewriteCandidateWithBasis. These instructions will actually be removed
+ // after all rewriting finishes.
+ std::vector<Instruction *> UnlinkedInstructions;
+};
+
+} // end anonymous namespace
+
+char StraightLineStrengthReduce::ID = 0;
+
+INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr",
+ "Straight line strength reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr",
+ "Straight line strength reduction", false, false)
+
+FunctionPass *llvm::createStraightLineStrengthReducePass() {
+ return new StraightLineStrengthReduce();
+}
+
+bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
+ const Candidate &C) {
+ return (Basis.Ins != C.Ins && // skip the same instruction
+ // They must have the same type too. Basis.Base == C.Base doesn't
+ // guarantee their types are the same (PR23975).
+ Basis.Ins->getType() == C.Ins->getType() &&
+ // Basis must dominate C in order to rewrite C with respect to Basis.
+ DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
+ // They share the same base, stride, and candidate kind.
+ Basis.Base == C.Base && Basis.Stride == C.Stride &&
+ Basis.CandidateKind == C.CandidateKind);
+}
+
+static bool isGEPFoldable(GetElementPtrInst *GEP,
+ const TargetTransformInfo *TTI) {
+ SmallVector<const Value*, 4> Indices;
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
+ Indices.push_back(*I);
+ return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
+ Indices) == TargetTransformInfo::TCC_Free;
+}
+
+// Returns whether (Base + Index * Stride) can be folded to an addressing mode.
+static bool isAddFoldable(const SCEV *Base, ConstantInt *Index, Value *Stride,
+ TargetTransformInfo *TTI) {
+ // Index->getSExtValue() may crash if Index is wider than 64-bit.
+ return Index->getBitWidth() <= 64 &&
+ TTI->isLegalAddressingMode(Base->getType(), nullptr, 0, true,
+ Index->getSExtValue(), UnknownAddressSpace);
+}
+
+bool StraightLineStrengthReduce::isFoldable(const Candidate &C,
+ TargetTransformInfo *TTI,
+ const DataLayout *DL) {
+ if (C.CandidateKind == Candidate::Add)
+ return isAddFoldable(C.Base, C.Index, C.Stride, TTI);
+ if (C.CandidateKind == Candidate::GEP)
+ return isGEPFoldable(cast<GetElementPtrInst>(C.Ins), TTI);
+ return false;
+}
+
+// Returns true if GEP has zero or one non-zero index.
+static bool hasOnlyOneNonZeroIndex(GetElementPtrInst *GEP) {
+ unsigned NumNonZeroIndices = 0;
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I) {
+ ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I);
+ if (ConstIdx == nullptr || !ConstIdx->isZero())
+ ++NumNonZeroIndices;
+ }
+ return NumNonZeroIndices <= 1;
+}
+
+bool StraightLineStrengthReduce::isSimplestForm(const Candidate &C) {
+ if (C.CandidateKind == Candidate::Add) {
+ // B + 1 * S or B + (-1) * S
+ return C.Index->isOne() || C.Index->isMinusOne();
+ }
+ if (C.CandidateKind == Candidate::Mul) {
+ // (B + 0) * S
+ return C.Index->isZero();
+ }
+ if (C.CandidateKind == Candidate::GEP) {
+ // (char*)B + S or (char*)B - S
+ return ((C.Index->isOne() || C.Index->isMinusOne()) &&
+ hasOnlyOneNonZeroIndex(cast<GetElementPtrInst>(C.Ins)));
+ }
+ return false;
+}
+
+// TODO: We currently implement an algorithm whose time complexity is linear in
+// the number of existing candidates. However, we could do better by using
+// ScopedHashTable. Specifically, while traversing the dominator tree, we could
+// maintain all the candidates that dominate the basic block being traversed in
+// a ScopedHashTable. This hash table is indexed by the base and the stride of
+// a candidate. Therefore, finding the immediate basis of a candidate boils down
+// to one hash-table look up.
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
+ Candidate::Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+ Instruction *I) {
+ Candidate C(CT, B, Idx, S, I);
+ // SLSR can complicate an instruction in two cases:
+ //
+ // 1. If we can fold I into an addressing mode, computing I is likely free or
+ // takes only one instruction.
+ //
+ // 2. I is already in a simplest form. For example, when
+ // X = B + 8 * S
+ // Y = B + S,
+ // rewriting Y to X - 7 * S is probably a bad idea.
+ //
+ // In the above cases, we still add I to the candidate list so that I can be
+ // the basis of other candidates, but we leave I's basis blank so that I
+ // won't be rewritten.
+ if (!isFoldable(C, TTI, DL) && !isSimplestForm(C)) {
+ // Try to compute the immediate basis of C.
+ unsigned NumIterations = 0;
+ // Limit the scan radius to avoid running in quadratic time.
+ static const unsigned MaxNumIterations = 50;
+ for (auto Basis = Candidates.rbegin();
+ Basis != Candidates.rend() && NumIterations < MaxNumIterations;
+ ++Basis, ++NumIterations) {
+ if (isBasisFor(*Basis, C)) {
+ C.Basis = &(*Basis);
+ break;
+ }
+ }
+ }
+ // Regardless of whether we find a basis for C, we need to push C to the
+ // candidate list so that it can be the basis of other candidates.
+ Candidates.push_back(C);
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
+ Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ allocateCandidatesAndFindBasisForAdd(I);
+ break;
+ case Instruction::Mul:
+ allocateCandidatesAndFindBasisForMul(I);
+ break;
+ case Instruction::GetElementPtr:
+ allocateCandidatesAndFindBasisForGEP(cast<GetElementPtrInst>(I));
+ break;
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
+ Instruction *I) {
+ // Try matching B + i * S.
+ if (!isa<IntegerType>(I->getType()))
+ return;
+
+ assert(I->getNumOperands() == 2 && "isn't I an add?");
+ Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ allocateCandidatesAndFindBasisForAdd(LHS, RHS, I);
+ if (LHS != RHS)
+ allocateCandidatesAndFindBasisForAdd(RHS, LHS, I);
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
+ Value *LHS, Value *RHS, Instruction *I) {
+ Value *S = nullptr;
+ ConstantInt *Idx = nullptr;
+ if (match(RHS, m_Mul(m_Value(S), m_ConstantInt(Idx)))) {
+ // I = LHS + RHS = LHS + Idx * S
+ allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I);
+ } else if (match(RHS, m_Shl(m_Value(S), m_ConstantInt(Idx)))) {
+ // I = LHS + RHS = LHS + (S << Idx) = LHS + S * (1 << Idx)
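+ // For instance (hypothetical), Idx = 4 yields the candidate
+ // <Add, Base: LHS, Index: 16, Stride: S>.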
+ APInt One(Idx->getBitWidth(), 1);
+ Idx = ConstantInt::get(Idx->getContext(), One << Idx->getValue());
+ allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I);
+ } else {
+ // At least, I = LHS + 1 * RHS
+ ConstantInt *One = ConstantInt::get(cast<IntegerType>(I->getType()), 1);
+ allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), One, RHS,
+ I);
+ }
+}
+
+// Returns true if A matches B + C where C is constant.
+static bool matchesAdd(Value *A, Value *&B, ConstantInt *&C) {
+ return (match(A, m_Add(m_Value(B), m_ConstantInt(C))) ||
+ match(A, m_Add(m_ConstantInt(C), m_Value(B))));
+}
+
+// Returns true if A matches B | C where C is constant.
+static bool matchesOr(Value *A, Value *&B, ConstantInt *&C) {
+ return (match(A, m_Or(m_Value(B), m_ConstantInt(C))) ||
+ match(A, m_Or(m_ConstantInt(C), m_Value(B))));
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
+ Value *LHS, Value *RHS, Instruction *I) {
+ Value *B = nullptr;
+ ConstantInt *Idx = nullptr;
+ if (matchesAdd(LHS, B, Idx)) {
+ // If LHS is in the form of "Base + Index", then I is in the form of
+ // "(Base + Index) * RHS".
+ allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
+ } else if (matchesOr(LHS, B, Idx) && haveNoCommonBitsSet(B, Idx, *DL)) {
+ // If LHS is in the form of "Base | Index" and Base and Index have no common
+ // bits set, then
+ // Base | Index = Base + Index
+ // and I is thus in the form of "(Base + Index) * RHS".
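+ // For instance (hypothetical), LHS = (X << 4) | 3 qualifies: the low four
+ // bits of X << 4 are known zero, so the OR is equivalent to an add and we
+ // get <Mul, Base: the SCEV of X << 4, Index: 3, Stride: RHS>.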
+ allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
+ } else {
+ // Otherwise, at least try the form (LHS + 0) * RHS.
+ ConstantInt *Zero = ConstantInt::get(cast<IntegerType>(I->getType()), 0);
+ allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(LHS), Zero, RHS,
+ I);
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
+ Instruction *I) {
+ // Try matching (B + i) * S.
+ // TODO: we could extend SLSR to float and vector types.
+ if (!isa<IntegerType>(I->getType()))
+ return;
+
+ assert(I->getNumOperands() == 2 && "isn't I a mul?");
+ Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ allocateCandidatesAndFindBasisForMul(LHS, RHS, I);
+ if (LHS != RHS) {
+ // Symmetrically, try to split RHS to Base + Index.
+ allocateCandidatesAndFindBasisForMul(RHS, LHS, I);
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
+ const SCEV *B, ConstantInt *Idx, Value *S, uint64_t ElementSize,
+ Instruction *I) {
+ // I = B + sext(Idx *nsw S) * ElementSize
+ // = B + (sext(Idx) * sext(S)) * ElementSize
+ // = B + (sext(Idx) * ElementSize) * sext(S)
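+ // For instance (hypothetical), Idx = 3 over 8-byte elements gives a scaled
+ // index of 24.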
+ // Casting to IntegerType is safe because we skipped vector GEPs.
+ IntegerType *IntPtrTy = cast<IntegerType>(DL->getIntPtrType(I->getType()));
+ ConstantInt *ScaledIdx = ConstantInt::get(
+ IntPtrTy, Idx->getSExtValue() * (int64_t)ElementSize, true);
+ allocateCandidatesAndFindBasis(Candidate::GEP, B, ScaledIdx, S, I);
+}
+
+void StraightLineStrengthReduce::factorArrayIndex(Value *ArrayIdx,
+ const SCEV *Base,
+ uint64_t ElementSize,
+ GetElementPtrInst *GEP) {
+ // At least, ArrayIdx = ArrayIdx *nsw 1.
+ allocateCandidatesAndFindBasisForGEP(
+ Base, ConstantInt::get(cast<IntegerType>(ArrayIdx->getType()), 1),
+ ArrayIdx, ElementSize, GEP);
+ Value *LHS = nullptr;
+ ConstantInt *RHS = nullptr;
+ // One alternative is matching the SCEV of ArrayIdx instead of ArrayIdx
+ // itself. This would allow us to handle the shl case for free. However,
+ // matching SCEVs has two issues:
+ //
+ // 1. this would complicate rewriting because the rewriting procedure
+ // would have to translate SCEVs back to IR instructions. This translation
+ // is difficult when LHS is further evaluated to a composite SCEV.
+ //
+ // 2. ScalarEvolution is designed to be control-flow oblivious. It tends
+ // to strip nsw/nuw flags which are critical for SLSR to trace into
+ // sext'ed multiplication.
+ if (match(ArrayIdx, m_NSWMul(m_Value(LHS), m_ConstantInt(RHS)))) {
+ // SLSR is currently unsafe if i * S may overflow.
+ // GEP = Base + sext(LHS *nsw RHS) * ElementSize
+ allocateCandidatesAndFindBasisForGEP(Base, RHS, LHS, ElementSize, GEP);
+ } else if (match(ArrayIdx, m_NSWShl(m_Value(LHS), m_ConstantInt(RHS)))) {
+ // GEP = Base + sext(LHS <<nsw RHS) * ElementSize
+ // = Base + sext(LHS *nsw (1 << RHS)) * ElementSize
+ APInt One(RHS->getBitWidth(), 1);
+ ConstantInt *PowerOf2 =
+ ConstantInt::get(RHS->getContext(), One << RHS->getValue());
+ allocateCandidatesAndFindBasisForGEP(Base, PowerOf2, LHS, ElementSize, GEP);
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
+ GetElementPtrInst *GEP) {
+ // TODO: handle vector GEPs
+ if (GEP->getType()->isVectorTy())
+ return;
+
+ SmallVector<const SCEV *, 4> IndexExprs;
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
+ IndexExprs.push_back(SE->getSCEV(*I));
+
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isStruct())
+ continue;
+
+ const SCEV *OrigIndexExpr = IndexExprs[I - 1];
+ IndexExprs[I - 1] = SE->getZero(OrigIndexExpr->getType());
+
+ // The base of this candidate is GEP's base plus the offsets of all
+ // indices except this current one.
+ const SCEV *BaseExpr = SE->getGEPExpr(cast<GEPOperator>(GEP), IndexExprs);
+ Value *ArrayIdx = GEP->getOperand(I);
+ uint64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
+ if (ArrayIdx->getType()->getIntegerBitWidth() <=
+ DL->getPointerSizeInBits(GEP->getAddressSpace())) {
+ // Only factor ArrayIdx if it is no wider than the pointer size; a wider
+ // index would be implicitly truncated to the pointer size.
+ factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP);
+ }
+ // When ArrayIdx is the sext of a value, we try to factor that value as
+ // well. Handling this case is important because array indices are
+ // typically sign-extended to the pointer size.
+ Value *TruncatedArrayIdx = nullptr;
+ if (match(ArrayIdx, m_SExt(m_Value(TruncatedArrayIdx))) &&
+ TruncatedArrayIdx->getType()->getIntegerBitWidth() <=
+ DL->getPointerSizeInBits(GEP->getAddressSpace())) {
+ // Only factor TruncatedArrayIdx if it is no wider than the pointer size;
+ // a wider value would be implicitly truncated to the pointer size.
+ factorArrayIndex(TruncatedArrayIdx, BaseExpr, ElementSize, GEP);
+ }
+
+ IndexExprs[I - 1] = OrigIndexExpr;
+ }
+}
+
+// A helper function that unifies the bitwidth of A and B.
+static void unifyBitWidth(APInt &A, APInt &B) {
+ if (A.getBitWidth() < B.getBitWidth())
+ A = A.sext(B.getBitWidth());
+ else if (A.getBitWidth() > B.getBitWidth())
+ B = B.sext(A.getBitWidth());
+}
+
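+// A worked example (hypothetical): for two Add candidates Basis = B + 2 * S
+// and C = B + 10 * S, IndexOffset is 8, which is neither 1 nor -1 but is a
+// power of two, so the bump below becomes (sext/trunc S) << 3 rather than a
+// multiply.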
+Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
+ const Candidate &C,
+ IRBuilder<> &Builder,
+ const DataLayout *DL,
+ bool &BumpWithUglyGEP) {
+ APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue();
+ unifyBitWidth(Idx, BasisIdx);
+ APInt IndexOffset = Idx - BasisIdx;
+
+ BumpWithUglyGEP = false;
+ if (Basis.CandidateKind == Candidate::GEP) {
+ APInt ElementSize(
+ IndexOffset.getBitWidth(),
+ DL->getTypeAllocSize(
+ cast<GetElementPtrInst>(Basis.Ins)->getResultElementType()));
+ APInt Q, R;
+ APInt::sdivrem(IndexOffset, ElementSize, Q, R);
+ if (R == 0)
+ IndexOffset = Q;
+ else
+ BumpWithUglyGEP = true;
+ }
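+ // For example (hypothetical), a basis GEP over 8-byte elements with a byte
+ // delta of 12 leaves a remainder, so the caller is told to bump via an i8
+ // ("ugly") GEP instead.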
+
+ // Compute Bump = C - Basis = (i' - i) * S.
+ // Common case 1: if (i' - i) is 1, Bump = S.
+ if (IndexOffset == 1)
+ return C.Stride;
+ // Common case 2: if (i' - i) is -1, Bump = -S.
+ if (IndexOffset.isAllOnesValue())
+ return Builder.CreateNeg(C.Stride);
+
+ // Otherwise, Bump = (i' - i) * sext/trunc(S). Note that (i' - i) and S may
+ // have different bit widths.
+ IntegerType *DeltaType =
+ IntegerType::get(Basis.Ins->getContext(), IndexOffset.getBitWidth());
+ Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType);
+ if (IndexOffset.isPowerOf2()) {
+ // If (i' - i) is a power of 2, Bump = sext/trunc(S) << log(i' - i).
+ ConstantInt *Exponent = ConstantInt::get(DeltaType, IndexOffset.logBase2());
+ return Builder.CreateShl(ExtendedStride, Exponent);
+ }
+ if ((-IndexOffset).isPowerOf2()) {
+ // If (i - i') is a power of 2, Bump = -(sext/trunc(S) << log(i - i')).
+ ConstantInt *Exponent =
+ ConstantInt::get(DeltaType, (-IndexOffset).logBase2());
+ return Builder.CreateNeg(Builder.CreateShl(ExtendedStride, Exponent));
+ }
+ Constant *Delta = ConstantInt::get(DeltaType, IndexOffset);
+ return Builder.CreateMul(ExtendedStride, Delta);
+}
+
+void StraightLineStrengthReduce::rewriteCandidateWithBasis(
+ const Candidate &C, const Candidate &Basis) {
+ assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base &&
+ C.Stride == Basis.Stride);
+ // We run rewriteCandidateWithBasis on all candidates in a post-order, so the
+ // basis of a candidate cannot be unlinked before the candidate.
+ assert(Basis.Ins->getParent() != nullptr && "the basis is unlinked");
+
+ // An instruction can correspond to multiple candidates. Therefore, instead of
+ // simply deleting an instruction when we rewrite it, we mark its parent as
+ // nullptr (i.e. unlink it) so that we can skip the candidates whose
+ // instruction is already rewritten.
+ if (!C.Ins->getParent())
+ return;
+
+ IRBuilder<> Builder(C.Ins);
+ bool BumpWithUglyGEP;
+ Value *Bump = emitBump(Basis, C, Builder, DL, BumpWithUglyGEP);
+ Value *Reduced = nullptr; // equivalent to but weaker than C.Ins
+ switch (C.CandidateKind) {
+ case Candidate::Add:
+ case Candidate::Mul:
+ // C = Basis + Bump
+ if (BinaryOperator::isNeg(Bump)) {
+ // If Bump is a neg instruction, emit C = Basis - (-Bump).
+ Reduced =
+ Builder.CreateSub(Basis.Ins, BinaryOperator::getNegArgument(Bump));
+ // We only use the negative argument of Bump, and Bump itself may be
+ // trivially dead.
+ RecursivelyDeleteTriviallyDeadInstructions(Bump);
+ } else {
+ // It's tempting to preserve nsw on Bump and/or Reduced. However, it's
+ // usually unsound, e.g.,
+ //
+ // X = (-2 +nsw 1) *nsw INT_MAX
+ // Y = (-2 +nsw 3) *nsw INT_MAX
+ // =>
+ // Y = X + 2 * INT_MAX
+ //
+ // Neither + nor * in the resultant expression is nsw.
+ Reduced = Builder.CreateAdd(Basis.Ins, Bump);
+ }
+ break;
+ case Candidate::GEP:
+ {
+ Type *IntPtrTy = DL->getIntPtrType(C.Ins->getType());
+ bool InBounds = cast<GetElementPtrInst>(C.Ins)->isInBounds();
+ if (BumpWithUglyGEP) {
+ // C = (char *)Basis + Bump
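+ // Roughly (hypothetical IR), for an i32* basis this emits:
+ //   %0 = bitcast i32* %basis to i8*
+ //   %1 = getelementptr i8, i8* %0, i64 %bump
+ //   %2 = bitcast i8* %1 to i32*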
+ unsigned AS = Basis.Ins->getType()->getPointerAddressSpace();
+ Type *CharTy = Type::getInt8PtrTy(Basis.Ins->getContext(), AS);
+ Reduced = Builder.CreateBitCast(Basis.Ins, CharTy);
+ if (InBounds)
+ Reduced =
+ Builder.CreateInBoundsGEP(Builder.getInt8Ty(), Reduced, Bump);
+ else
+ Reduced = Builder.CreateGEP(Builder.getInt8Ty(), Reduced, Bump);
+ Reduced = Builder.CreateBitCast(Reduced, C.Ins->getType());
+ } else {
+ // C = gep Basis, Bump
+ // Canonicalize bump to pointer size.
+ Bump = Builder.CreateSExtOrTrunc(Bump, IntPtrTy);
+ if (InBounds)
+ Reduced = Builder.CreateInBoundsGEP(nullptr, Basis.Ins, Bump);
+ else
+ Reduced = Builder.CreateGEP(nullptr, Basis.Ins, Bump);
+ }
+ break;
+ }
+ default:
+ llvm_unreachable("C.CandidateKind is invalid");
+ };
+ Reduced->takeName(C.Ins);
+ C.Ins->replaceAllUsesWith(Reduced);
+ // Unlink C.Ins so that we can skip other candidates also corresponding to
+ // C.Ins. The actual deletion is postponed to the end of runOnFunction.
+ C.Ins->removeFromParent();
+ UnlinkedInstructions.push_back(C.Ins);
+}
+
+bool StraightLineStrengthReduce::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ // Traverse the dominator tree in depth-first order. This order makes sure
+ // all bases of a candidate are in Candidates when we process it.
+ for (const auto Node : depth_first(DT))
+ for (auto &I : *(Node->getBlock()))
+ allocateCandidatesAndFindBasis(&I);
+
+ // Rewrite candidates in reverse depth-first order. This order makes sure
+ // a candidate being rewritten is not a basis for any other candidate.
+ while (!Candidates.empty()) {
+ const Candidate &C = Candidates.back();
+ if (C.Basis != nullptr) {
+ rewriteCandidateWithBasis(C, *C.Basis);
+ }
+ Candidates.pop_back();
+ }
+
+ // Delete all unlinked instructions.
+ for (auto *UnlinkedInst : UnlinkedInstructions) {
+ for (unsigned I = 0, E = UnlinkedInst->getNumOperands(); I != E; ++I) {
+ Value *Op = UnlinkedInst->getOperand(I);
+ UnlinkedInst->setOperand(I, nullptr);
+ RecursivelyDeleteTriviallyDeadInstructions(Op);
+ }
+ UnlinkedInst->deleteValue();
+ }
+ bool Ret = !UnlinkedInstructions.empty();
+ UnlinkedInstructions.clear();
+ return Ret;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
new file mode 100644
index 000000000000..b8fb80b6cc26
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -0,0 +1,953 @@
+//===- StructurizeCFG.cpp -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Analysis/RegionPass.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "structurizecfg"
+
+// The name for newly created blocks.
+static const char *const FlowBlockName = "Flow";
+
+namespace {
+
+// Definition of the complex types used in this pass.
+
+using BBValuePair = std::pair<BasicBlock *, Value *>;
+
+using RNVector = SmallVector<RegionNode *, 8>;
+using BBVector = SmallVector<BasicBlock *, 8>;
+using BranchVector = SmallVector<BranchInst *, 8>;
+using BBValueVector = SmallVector<BBValuePair, 2>;
+
+using BBSet = SmallPtrSet<BasicBlock *, 8>;
+
+using PhiMap = MapVector<PHINode *, BBValueVector>;
+using BB2BBVecMap = MapVector<BasicBlock *, BBVector>;
+
+using BBPhiMap = DenseMap<BasicBlock *, PhiMap>;
+using BBPredicates = DenseMap<BasicBlock *, Value *>;
+using PredMap = DenseMap<BasicBlock *, BBPredicates>;
+using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
+
+/// Finds the nearest common dominator of a set of BasicBlocks.
+///
+/// For every BB you add to the set, you can specify whether we "remember" the
+/// block. When you get the common dominator, you can also ask whether it's one
+/// of the blocks we remembered.
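+///
+/// For example (hypothetical), after addAndRememberBlock(A) and addBlock(B),
+/// result() is the nearest common dominator of A and B, and
+/// resultIsRememberedBlock() is true only if that dominator is A itself.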
+class NearestCommonDominator {
+ DominatorTree *DT;
+ BasicBlock *Result = nullptr;
+ bool ResultIsRemembered = false;
+
+ /// Add BB to the resulting dominator.
+ void addBlock(BasicBlock *BB, bool Remember) {
+ if (!Result) {
+ Result = BB;
+ ResultIsRemembered = Remember;
+ return;
+ }
+
+ BasicBlock *NewResult = DT->findNearestCommonDominator(Result, BB);
+ if (NewResult != Result)
+ ResultIsRemembered = false;
+ if (NewResult == BB)
+ ResultIsRemembered |= Remember;
+ Result = NewResult;
+ }
+
+public:
+ explicit NearestCommonDominator(DominatorTree *DomTree) : DT(DomTree) {}
+
+ void addBlock(BasicBlock *BB) {
+ addBlock(BB, /* Remember = */ false);
+ }
+
+ void addAndRememberBlock(BasicBlock *BB) {
+ addBlock(BB, /* Remember = */ true);
+ }
+
+ /// Get the nearest common dominator of all the BBs added via addBlock() and
+ /// addAndRememberBlock().
+ BasicBlock *result() { return Result; }
+
+ /// Is the BB returned by getResult() one of the blocks we added to the set
+ /// with addAndRememberBlock()?
+ bool resultIsRememberedBlock() { return ResultIsRemembered; }
+};
+
+/// \brief Transforms the control flow graph on one single entry/exit region
+/// at a time.
+///
+/// After the transform all "If"/"Then"/"Else" style control flow looks like
+/// this:
+///
+/// \verbatim
+/// 1
+/// ||
+/// | |
+/// 2 |
+/// | /
+/// |/
+/// 3
+/// || Where:
+/// | | 1 = "If" block, calculates the condition
+/// 4 | 2 = "Then" subregion, runs if the condition is true
+/// | / 3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow
+/// |/ 4 = "Else" optional subregion, runs if the condition is false
+/// 5 5 = "End" block, also rejoins the control flow
+/// \endverbatim
+///
+/// Control flow is expressed as a branch where the true exit goes into the
+/// "Then"/"Else" region, while the false exit skips the region
+/// The condition for the optional "Else" region is expressed as a PHI node.
+/// The incoming values of the PHI node are true for the "If" edge and false
+/// for the "Then" edge.
+///
+/// In addition, even complicated loops look like this:
+///
+/// \verbatim
+/// 1
+/// ||
+/// | |
+/// 2 ^ Where:
+/// | / 1 = "Entry" block
+/// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block
+/// 3 3 = "Flow" block, with back edge to entry block
+/// |
+/// \endverbatim
+///
+/// The back edge of the "Flow" block is always on the false side of the branch
+/// while the true side continues the general flow. So the loop condition
+/// consists of a network of PHI nodes where the true incoming values express
+/// breaks and the false values express continue states.
+class StructurizeCFG : public RegionPass {
+ bool SkipUniformRegions;
+
+ Type *Boolean;
+ ConstantInt *BoolTrue;
+ ConstantInt *BoolFalse;
+ UndefValue *BoolUndef;
+
+ Function *Func;
+ Region *ParentRegion;
+
+ DominatorTree *DT;
+ LoopInfo *LI;
+
+ SmallVector<RegionNode *, 8> Order;
+ BBSet Visited;
+
+ BBPhiMap DeletedPhis;
+ BB2BBVecMap AddedPhis;
+
+ PredMap Predicates;
+ BranchVector Conditions;
+
+ BB2BBMap Loops;
+ PredMap LoopPreds;
+ BranchVector LoopConds;
+
+ RegionNode *PrevNode;
+
+ void orderNodes();
+
+ void analyzeLoops(RegionNode *N);
+
+ Value *invert(Value *Condition);
+
+ Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
+
+ void gatherPredicates(RegionNode *N);
+
+ void collectInfos();
+
+ void insertConditions(bool Loops);
+
+ void delPhiValues(BasicBlock *From, BasicBlock *To);
+
+ void addPhiValues(BasicBlock *From, BasicBlock *To);
+
+ void setPhiValues();
+
+ void killTerminator(BasicBlock *BB);
+
+ void changeExit(RegionNode *Node, BasicBlock *NewExit,
+ bool IncludeDominator);
+
+ BasicBlock *getNextFlow(BasicBlock *Dominator);
+
+ BasicBlock *needPrefix(bool NeedEmpty);
+
+ BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed);
+
+ void setPrevNode(BasicBlock *BB);
+
+ bool dominatesPredicates(BasicBlock *BB, RegionNode *Node);
+
+ bool isPredictableTrue(RegionNode *Node);
+
+ void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd);
+
+ void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd);
+
+ void createFlow();
+
+ void rebuildSSA();
+
+public:
+ static char ID;
+
+ explicit StructurizeCFG(bool SkipUniformRegions = false)
+ : RegionPass(ID), SkipUniformRegions(SkipUniformRegions) {
+ initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Region *R, RGPassManager &RGM) override;
+
+ bool runOnRegion(Region *R, RGPassManager &RGM) override;
+
+ StringRef getPassName() const override { return "Structurize control flow"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ if (SkipUniformRegions)
+ AU.addRequired<DivergenceAnalysis>();
+ AU.addRequiredID(LowerSwitchID);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ RegionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char StructurizeCFG::ID = 0;
+
+INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
+INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
+ false, false)
+
+/// \brief Initialize the types and constants used in the pass
+bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
+ LLVMContext &Context = R->getEntry()->getContext();
+
+ Boolean = Type::getInt1Ty(Context);
+ BoolTrue = ConstantInt::getTrue(Context);
+ BoolFalse = ConstantInt::getFalse(Context);
+ BoolUndef = UndefValue::get(Boolean);
+
+ return false;
+}
+
+/// \brief Build up the general order of nodes
+void StructurizeCFG::orderNodes() {
+ ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
+ SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
+
+ // The reverse post-order traversal of the list gives us an ordering close
+ // to what we want. The only problem with it is that sometimes backedges
+ // for outer loops will be visited before backedges for inner loops.
+ for (RegionNode *RN : RPOT) {
+ BasicBlock *BB = RN->getEntry();
+ Loop *Loop = LI->getLoopFor(BB);
+ ++LoopBlocks[Loop];
+ }
+
+ unsigned CurrentLoopDepth = 0;
+ Loop *CurrentLoop = nullptr;
+ for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
+ BasicBlock *BB = (*I)->getEntry();
+ unsigned LoopDepth = LI->getLoopDepth(BB);
+
+ if (is_contained(Order, *I))
+ continue;
+
+ if (LoopDepth < CurrentLoopDepth) {
+ // Make sure we have visited all blocks in this loop before moving back to
+ // the outer loop.
+
+ auto LoopI = I;
+ while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
+ LoopI++;
+ BasicBlock *LoopBB = (*LoopI)->getEntry();
+ if (LI->getLoopFor(LoopBB) == CurrentLoop) {
+ --BlockCount;
+ Order.push_back(*LoopI);
+ }
+ }
+ }
+
+ CurrentLoop = LI->getLoopFor(BB);
+ if (CurrentLoop)
+ LoopBlocks[CurrentLoop]--;
+
+ CurrentLoopDepth = LoopDepth;
+ Order.push_back(*I);
+ }
+
+ // This pass originally used a post-order traversal and then operated on
+ // the list in reverse. Now that we use a reverse post-order traversal,
+ // rather than reworking the whole pass to operate on the list in order,
+ // we simply reverse the list and continue to operate on it in reverse.
+ std::reverse(Order.begin(), Order.end());
+}
+
+/// \brief Determine the end of the loops
+void StructurizeCFG::analyzeLoops(RegionNode *N) {
+ if (N->isSubRegion()) {
+ // Test for exit as back edge
+ BasicBlock *Exit = N->getNodeAs<Region>()->getExit();
+ if (Visited.count(Exit))
+ Loops[Exit] = N->getEntry();
+
+ } else {
+ // Test for successors as back edge
+ BasicBlock *BB = N->getNodeAs<BasicBlock>();
+ BranchInst *Term = cast<BranchInst>(BB->getTerminator());
+
+ for (BasicBlock *Succ : Term->successors())
+ if (Visited.count(Succ))
+ Loops[Succ] = BB;
+ }
+}
+
+/// \brief Invert the given condition
+Value *StructurizeCFG::invert(Value *Condition) {
+ // First: Check if it's a constant
+ if (Constant *C = dyn_cast<Constant>(Condition))
+ return ConstantExpr::getNot(C);
+
+ // Second: If the condition is already inverted, return the original value
+ if (match(Condition, m_Not(m_Value(Condition))))
+ return Condition;
+
+ if (Instruction *Inst = dyn_cast<Instruction>(Condition)) {
+ // Third: Check all the users for an invert
+ BasicBlock *Parent = Inst->getParent();
+ for (User *U : Condition->users())
+ if (Instruction *I = dyn_cast<Instruction>(U))
+ if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition))))
+ return I;
+
+ // Last option: Create a new instruction
+ return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator());
+ }
+
+ if (Argument *Arg = dyn_cast<Argument>(Condition)) {
+ BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock();
+ return BinaryOperator::CreateNot(Condition,
+ Arg->getName() + ".inv",
+ EntryBlock.getTerminator());
+ }
+
+ llvm_unreachable("Unhandled condition to invert");
+}
+
+/// \brief Build the condition for one edge
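+///
+/// For example (hypothetical), on a forward edge (Invert == false) of a
+/// conditional branch, the false successor (Idx == 1) receives the inverted
+/// branch condition, since that is the condition under which the edge is
+/// taken.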
+Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
+ bool Invert) {
+ Value *Cond = Invert ? BoolFalse : BoolTrue;
+ if (Term->isConditional()) {
+ Cond = Term->getCondition();
+
+ if (Idx != (unsigned)Invert)
+ Cond = invert(Cond);
+ }
+ return Cond;
+}
+
+/// \brief Analyze the predecessors of each block and build up predicates
+void StructurizeCFG::gatherPredicates(RegionNode *N) {
+ RegionInfo *RI = ParentRegion->getRegionInfo();
+ BasicBlock *BB = N->getEntry();
+ BBPredicates &Pred = Predicates[BB];
+ BBPredicates &LPred = LoopPreds[BB];
+
+ for (BasicBlock *P : predecessors(BB)) {
+ // Ignore it if it's a branch from outside into our region entry
+ if (!ParentRegion->contains(P))
+ continue;
+
+ Region *R = RI->getRegionFor(P);
+ if (R == ParentRegion) {
+ // It's a top level block in our region
+ BranchInst *Term = cast<BranchInst>(P->getTerminator());
+ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *Succ = Term->getSuccessor(i);
+ if (Succ != BB)
+ continue;
+
+ if (Visited.count(P)) {
+ // Normal forward edge
+ if (Term->isConditional()) {
+ // Try to treat it like an ELSE block
+ BasicBlock *Other = Term->getSuccessor(!i);
+ if (Visited.count(Other) && !Loops.count(Other) &&
+ !Pred.count(Other) && !Pred.count(P)) {
+
+ Pred[Other] = BoolFalse;
+ Pred[P] = BoolTrue;
+ continue;
+ }
+ }
+ Pred[P] = buildCondition(Term, i, false);
+ } else {
+ // Back edge
+ LPred[P] = buildCondition(Term, i, true);
+ }
+ }
+ } else {
+ // It's an exit from a sub region
+ while (R->getParent() != ParentRegion)
+ R = R->getParent();
+
+ // Edge from inside a subregion to its entry, ignore it
+ if (*R == *N)
+ continue;
+
+ BasicBlock *Entry = R->getEntry();
+ if (Visited.count(Entry))
+ Pred[Entry] = BoolTrue;
+ else
+ LPred[Entry] = BoolFalse;
+ }
+ }
+}
+
+/// \brief Collect various loop and predicate infos
+void StructurizeCFG::collectInfos() {
+ // Reset predicate
+ Predicates.clear();
+
+ // and loop infos
+ Loops.clear();
+ LoopPreds.clear();
+
+ // Reset the visited nodes
+ Visited.clear();
+
+ for (RegionNode *RN : reverse(Order)) {
+ DEBUG(dbgs() << "Visiting: "
+ << (RN->isSubRegion() ? "SubRegion with entry: " : "")
+ << RN->getEntry()->getName() << " Loop Depth: "
+ << LI->getLoopDepth(RN->getEntry()) << "\n");
+
+ // Analyze all the conditions leading to a node
+ gatherPredicates(RN);
+
+ // Remember that we've seen this node
+ Visited.insert(RN->getEntry());
+
+ // Find the last back edges
+ analyzeLoops(RN);
+ }
+}
+
+/// \brief Insert the missing branch conditions
+void StructurizeCFG::insertConditions(bool Loops) {
+ BranchVector &Conds = Loops ? LoopConds : Conditions;
+ Value *Default = Loops ? BoolTrue : BoolFalse;
+ SSAUpdater PhiInserter;
+
+ for (BranchInst *Term : Conds) {
+ assert(Term->isConditional());
+
+ BasicBlock *Parent = Term->getParent();
+ BasicBlock *SuccTrue = Term->getSuccessor(0);
+ BasicBlock *SuccFalse = Term->getSuccessor(1);
+
+ PhiInserter.Initialize(Boolean, "");
+ PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default);
+ PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default);
+
+ BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue];
+
+ NearestCommonDominator Dominator(DT);
+ Dominator.addBlock(Parent);
+
+ Value *ParentValue = nullptr;
+ for (std::pair<BasicBlock *, Value *> BBAndPred : Preds) {
+ BasicBlock *BB = BBAndPred.first;
+ Value *Pred = BBAndPred.second;
+
+ if (BB == Parent) {
+ ParentValue = Pred;
+ break;
+ }
+ PhiInserter.AddAvailableValue(BB, Pred);
+ Dominator.addAndRememberBlock(BB);
+ }
+
+ if (ParentValue) {
+ Term->setCondition(ParentValue);
+ } else {
+ if (!Dominator.resultIsRememberedBlock())
+ PhiInserter.AddAvailableValue(Dominator.result(), Default);
+
+ Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent));
+ }
+ }
+}
+
+/// \brief Remove all PHI values coming from "From" into "To" and remember
+/// them in DeletedPhis
+void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
+ PhiMap &Map = DeletedPhis[To];
+ for (PHINode &Phi : To->phis()) {
+ while (Phi.getBasicBlockIndex(From) != -1) {
+ Value *Deleted = Phi.removeIncomingValue(From, false);
+ Map[&Phi].push_back(std::make_pair(From, Deleted));
+ }
+ }
+}
+
+/// \brief Add a dummy PHI value as soon as we know the new predecessor
+void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
+ for (PHINode &Phi : To->phis()) {
+ Value *Undef = UndefValue::get(Phi.getType());
+ Phi.addIncoming(Undef, From);
+ }
+ AddedPhis[To].push_back(From);
+}
+
+/// \brief Add the real PHI value as soon as everything is set up
+void StructurizeCFG::setPhiValues() {
+ SSAUpdater Updater;
+ for (const auto &AddedPhi : AddedPhis) {
+ BasicBlock *To = AddedPhi.first;
+ const BBVector &From = AddedPhi.second;
+
+ if (!DeletedPhis.count(To))
+ continue;
+
+ PhiMap &Map = DeletedPhis[To];
+ for (const auto &PI : Map) {
+ PHINode *Phi = PI.first;
+ Value *Undef = UndefValue::get(Phi->getType());
+ Updater.Initialize(Phi->getType(), "");
+ Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
+ Updater.AddAvailableValue(To, Undef);
+
+ NearestCommonDominator Dominator(DT);
+ Dominator.addBlock(To);
+ for (const auto &VI : PI.second) {
+ Updater.AddAvailableValue(VI.first, VI.second);
+ Dominator.addAndRememberBlock(VI.first);
+ }
+
+ if (!Dominator.resultIsRememberedBlock())
+ Updater.AddAvailableValue(Dominator.result(), Undef);
+
+ for (BasicBlock *FI : From) {
+ int Idx = Phi->getBasicBlockIndex(FI);
+ assert(Idx != -1);
+ Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(FI));
+ }
+ }
+
+ DeletedPhis.erase(To);
+ }
+ assert(DeletedPhis.empty());
+}
+
+/// \brief Remove phi values from all successors and then remove the terminator.
+void StructurizeCFG::killTerminator(BasicBlock *BB) {
+ TerminatorInst *Term = BB->getTerminator();
+ if (!Term)
+ return;
+
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
+ SI != SE; ++SI)
+ delPhiValues(BB, *SI);
+
+ Term->eraseFromParent();
+}
+
+/// \brief Let node exit(s) point to NewExit
+void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
+ bool IncludeDominator) {
+ if (Node->isSubRegion()) {
+ Region *SubRegion = Node->getNodeAs<Region>();
+ BasicBlock *OldExit = SubRegion->getExit();
+ BasicBlock *Dominator = nullptr;
+
+ // Find all the edges from the sub region to the exit
+ for (auto BBI = pred_begin(OldExit), E = pred_end(OldExit); BBI != E;) {
+ // Increment BBI before mucking with BB's terminator.
+ BasicBlock *BB = *BBI++;
+
+ if (!SubRegion->contains(BB))
+ continue;
+
+ // Modify the edges to point to the new exit
+ delPhiValues(BB, OldExit);
+ BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit);
+ addPhiValues(BB, NewExit);
+
+ // Find the new dominator (if requested)
+ if (IncludeDominator) {
+ if (!Dominator)
+ Dominator = BB;
+ else
+ Dominator = DT->findNearestCommonDominator(Dominator, BB);
+ }
+ }
+
+ // Change the dominator (if requested)
+ if (Dominator)
+ DT->changeImmediateDominator(NewExit, Dominator);
+
+ // Update the region info
+ SubRegion->replaceExit(NewExit);
+ } else {
+ BasicBlock *BB = Node->getNodeAs<BasicBlock>();
+ killTerminator(BB);
+ BranchInst::Create(NewExit, BB);
+ addPhiValues(BB, NewExit);
+ if (IncludeDominator)
+ DT->changeImmediateDominator(NewExit, BB);
+ }
+}
+
+/// \brief Create a new flow node and update dominator tree and region info
+BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
+ LLVMContext &Context = Func->getContext();
+ BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
+ Order.back()->getEntry();
+ BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
+ Func, Insert);
+ DT->addNewBlock(Flow, Dominator);
+ ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
+ return Flow;
+}
+
+/// \brief Create a new flow node, or reuse the previous node as the flow node
+BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
+ BasicBlock *Entry = PrevNode->getEntry();
+
+ if (!PrevNode->isSubRegion()) {
+ killTerminator(Entry);
+ if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end())
+ return Entry;
+ }
+
+ // create a new flow node
+ BasicBlock *Flow = getNextFlow(Entry);
+
+ // and wire it up
+ changeExit(PrevNode, Flow, true);
+ PrevNode = ParentRegion->getBBNode(Flow);
+ return Flow;
+}
+
+/// \brief Returns the region exit if possible, otherwise just a new flow node
+BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
+ bool ExitUseAllowed) {
+ if (!Order.empty() || !ExitUseAllowed)
+ return getNextFlow(Flow);
+
+ BasicBlock *Exit = ParentRegion->getExit();
+ DT->changeImmediateDominator(Exit, Flow);
+ addPhiValues(Flow, Exit);
+ return Exit;
+}
+
+/// \brief Set the previous node
+void StructurizeCFG::setPrevNode(BasicBlock *BB) {
+ PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB)
+ : nullptr;
+}
+
+/// \brief Does BB dominate all the predicates of Node?
+bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
+ BBPredicates &Preds = Predicates[Node->getEntry()];
+ return llvm::all_of(Preds, [&](std::pair<BasicBlock *, Value *> Pred) {
+ return DT->dominates(BB, Pred.first);
+ });
+}
+
+/// \brief Can we predict that this node will always be called?
+bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
+ BBPredicates &Preds = Predicates[Node->getEntry()];
+ bool Dominated = false;
+
+ // The region entry is always true
+ if (!PrevNode)
+ return true;
+
+ for (std::pair<BasicBlock*, Value*> Pred : Preds) {
+ BasicBlock *BB = Pred.first;
+ Value *V = Pred.second;
+
+ if (V != BoolTrue)
+ return false;
+
+ if (!Dominated && DT->dominates(BB, PrevNode->getEntry()))
+ Dominated = true;
+ }
+
+ // TODO: The dominator check is too strict
+ return Dominated;
+}
+
+/// Take one node from the order vector and wire it up
+void StructurizeCFG::wireFlow(bool ExitUseAllowed,
+ BasicBlock *LoopEnd) {
+ RegionNode *Node = Order.pop_back_val();
+ Visited.insert(Node->getEntry());
+
+ if (isPredictableTrue(Node)) {
+ // Just a linear flow
+ if (PrevNode) {
+ changeExit(PrevNode, Node->getEntry(), true);
+ }
+ PrevNode = Node;
+ } else {
+ // Insert extra prefix node (or reuse last one)
+ BasicBlock *Flow = needPrefix(false);
+
+ // Insert extra postfix node (or use exit instead)
+ BasicBlock *Entry = Node->getEntry();
+ BasicBlock *Next = needPostfix(Flow, ExitUseAllowed);
+
+ // let it point to entry and next block
+ Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow));
+ addPhiValues(Flow, Entry);
+ DT->changeImmediateDominator(Entry, Flow);
+
+ PrevNode = Node;
+ while (!Order.empty() && !Visited.count(LoopEnd) &&
+ dominatesPredicates(Entry, Order.back())) {
+ handleLoops(false, LoopEnd);
+ }
+
+ changeExit(PrevNode, Next, false);
+ setPrevNode(Next);
+ }
+}
+
+void StructurizeCFG::handleLoops(bool ExitUseAllowed,
+ BasicBlock *LoopEnd) {
+ RegionNode *Node = Order.back();
+ BasicBlock *LoopStart = Node->getEntry();
+
+ if (!Loops.count(LoopStart)) {
+ wireFlow(ExitUseAllowed, LoopEnd);
+ return;
+ }
+
+ if (!isPredictableTrue(Node))
+ LoopStart = needPrefix(true);
+
+ LoopEnd = Loops[Node->getEntry()];
+ wireFlow(false, LoopEnd);
+ while (!Visited.count(LoopEnd)) {
+ handleLoops(false, LoopEnd);
+ }
+
+ // If the start of the loop is the entry block, we can't branch to it so
+ // insert a new dummy entry block.
+ Function *LoopFunc = LoopStart->getParent();
+ if (LoopStart == &LoopFunc->getEntryBlock()) {
+ LoopStart->setName("entry.orig");
+
+ BasicBlock *NewEntry =
+ BasicBlock::Create(LoopStart->getContext(),
+ "entry",
+ LoopFunc,
+ LoopStart);
+ BranchInst::Create(LoopStart, NewEntry);
+ DT->setNewRoot(NewEntry);
+ }
+
+ // Create an extra loop end node
+ LoopEnd = needPrefix(false);
+ BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
+ LoopConds.push_back(BranchInst::Create(Next, LoopStart,
+ BoolUndef, LoopEnd));
+ addPhiValues(LoopEnd, LoopStart);
+ setPrevNode(Next);
+}
+
+/// After this function runs, the control flow looks the way it should, but
+/// branches and PHI nodes only have undefined conditions.
+void StructurizeCFG::createFlow() {
+ BasicBlock *Exit = ParentRegion->getExit();
+ bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit);
+
+ DeletedPhis.clear();
+ AddedPhis.clear();
+ Conditions.clear();
+ LoopConds.clear();
+
+ PrevNode = nullptr;
+ Visited.clear();
+
+ while (!Order.empty()) {
+ handleLoops(EntryDominatesExit, nullptr);
+ }
+
+ if (PrevNode)
+ changeExit(PrevNode, Exit, EntryDominatesExit);
+ else
+ assert(EntryDominatesExit);
+}
+
+/// Handle a rare case where the disintegrated nodes' instructions
+/// no longer dominate all their uses. Not sure if this is really necessary.
+void StructurizeCFG::rebuildSSA() {
+ SSAUpdater Updater;
+ for (BasicBlock *BB : ParentRegion->blocks())
+ for (Instruction &I : *BB) {
+ bool Initialized = false;
+ // We may modify the use list as we iterate over it, so be careful to
+ // compute the next element in the use list at the top of the loop.
+ for (auto UI = I.use_begin(), E = I.use_end(); UI != E;) {
+ Use &U = *UI++;
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (User->getParent() == BB) {
+ continue;
+ } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+ if (UserPN->getIncomingBlock(U) == BB)
+ continue;
+ }
+
+ if (DT->dominates(&I, User))
+ continue;
+
+ if (!Initialized) {
+ Value *Undef = UndefValue::get(I.getType());
+ Updater.Initialize(I.getType(), "");
+ Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
+ Updater.AddAvailableValue(BB, &I);
+ Initialized = true;
+ }
+ Updater.RewriteUseAfterInsertions(U);
+ }
+ }
+}
+
+static bool hasOnlyUniformBranches(const Region *R,
+ const DivergenceAnalysis &DA) {
+ for (const BasicBlock *BB : R->blocks()) {
+ const BranchInst *Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br || !Br->isConditional())
+ continue;
+
+ if (!DA.isUniform(Br->getCondition()))
+ return false;
+ DEBUG(dbgs() << "BB: " << BB->getName() << " has uniform terminator\n");
+ }
+ return true;
+}
+
+/// \brief Run the transformation for each region found
+bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
+ if (R->isTopLevelRegion())
+ return false;
+
+ if (SkipUniformRegions) {
+ // TODO: We could probably be smarter here with how we handle sub-regions.
+ auto &DA = getAnalysis<DivergenceAnalysis>();
+ if (hasOnlyUniformBranches(R, DA)) {
+ DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R << '\n');
+
+ // Mark all direct child block terminators as having been treated as
+ // uniform. To account for a possible future in which non-uniform
+ // sub-regions are treated more cleverly, indirect children are not
+ // marked as uniform.
+ MDNode *MD = MDNode::get(R->getEntry()->getParent()->getContext(), {});
+ for (RegionNode *E : R->elements()) {
+ if (E->isSubRegion())
+ continue;
+
+ if (Instruction *Term = E->getEntry()->getTerminator())
+ Term->setMetadata("structurizecfg.uniform", MD);
+ }
+
+ return false;
+ }
+ }
+
+ Func = R->getEntry()->getParent();
+ ParentRegion = R;
+
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+ orderNodes();
+ collectInfos();
+ createFlow();
+ insertConditions(false);
+ insertConditions(true);
+ setPhiValues();
+ rebuildSSA();
+
+ // Cleanup
+ Order.clear();
+ Visited.clear();
+ DeletedPhis.clear();
+ AddedPhis.clear();
+ Predicates.clear();
+ Conditions.clear();
+ Loops.clear();
+ LoopPreds.clear();
+ LoopConds.clear();
+
+ return true;
+}
+
+Pass *llvm::createStructurizeCFGPass(bool SkipUniformRegions) {
+ return new StructurizeCFG(SkipUniformRegions);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
new file mode 100644
index 000000000000..2a1106b41de2
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -0,0 +1,853 @@
+//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file transforms calls of the current function (self recursion) followed
+// by a return instruction with a branch to the entry of the function, creating
+// a loop. This pass also implements the following extensions to the basic
+// algorithm:
+//
+// 1. Trivial instructions between the call and return do not prevent the
+// transformation from taking place, though currently the analysis cannot
+// support moving any really useful instructions (only dead ones).
+// 2. This pass transforms functions that are prevented from being tail
+// recursive by an associative and commutative expression to use an
+// accumulator variable, thus compiling the typical naive factorial or
+// 'fib' implementation into efficient code (see the sketch below).
+// 3. TRE is performed if the function returns void, if the return
+// returns the result returned by the call, or if the function returns a
+// run-time constant on all exits from the function. It is possible, though
+// unlikely, that the return returns something else (like constant 0), and
+// can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in
+// the function return the exact same value.
+// 4. If it can prove that callees do not access their caller stack frame,
+// they are marked as eligible for tail call elimination (by the code
+// generator).
+//
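+// As an illustrative sketch of extension 2 (hypothetical C, not code taken
+// from this pass), the naive
+//
+//   int fac(int n) { return n <= 1 ? 1 : n * fac(n - 1); }
+//
+// is compiled roughly as if it had been written with an accumulator:
+//
+//   int fac(int n) {
+//     int acc = 1;              // accumulator introduced by TRE
+//     for (; n > 1; --n)
+//       acc *= n;               // legal because '*' is associative
+//     return acc;
+//   }
+//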
+// There are several improvements that could be made:
+//
+// 1. If the function has any alloca instructions, these instructions will be
+// moved out of the entry block of the function, causing them to be
+// evaluated each time through the tail recursion. Safely keeping allocas
+// in the entry block requires analysis to prove that the tail-called
+// function does not read or write the stack object.
+// 2. Tail recursion is only performed if the call immediately precedes the
+// return instruction. It's possible that there could be a jump between
+// the call and the return.
+// 3. There can be intervening operations between the call and the return that
+// prevent the TRE from occurring. For example, there could be GEP's and
+// stores to memory that will not be read or written by the call. This
+// requires some substantial analysis (such as with DSA) to prove safe to
+// move ahead of the call, but doing so could allow many more TREs to be
+// performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
+// 4. The algorithm we use to detect if callees access their caller stack
+// frames is very primitive.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "tailcallelim"
+
+STATISTIC(NumEliminated, "Number of tail calls removed");
+STATISTIC(NumRetDuped, "Number of returns duplicated");
+STATISTIC(NumAccumAdded, "Number of accumulators introduced");
+
+/// \brief Scan the specified function for alloca instructions.
+/// If it contains any dynamic allocas, returns false.
+static bool canTRE(Function &F) {
+ // Because of PR962, we don't TRE dynamic allocas.
+ return llvm::all_of(instructions(F), [](Instruction &I) {
+ auto *AI = dyn_cast<AllocaInst>(&I);
+ return !AI || AI->isStaticAlloca();
+ });
+}
+
+namespace {
+struct AllocaDerivedValueTracker {
+ // Start at a root value and walk its use-def chain, recording calls that use
+ // the value (or a value derived from it) in AllocaUsers, and places where it
+ // may escape in EscapePoints.
+ void walk(Value *Root) {
+ SmallVector<Use *, 32> Worklist;
+ SmallPtrSet<Use *, 32> Visited;
+
+ auto AddUsesToWorklist = [&](Value *V) {
+ for (auto &U : V->uses()) {
+ if (!Visited.insert(&U).second)
+ continue;
+ Worklist.push_back(&U);
+ }
+ };
+
+ AddUsesToWorklist(Root);
+
+ while (!Worklist.empty()) {
+ Use *U = Worklist.pop_back_val();
+ Instruction *I = cast<Instruction>(U->getUser());
+
+ switch (I->getOpcode()) {
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ CallSite CS(I);
+ bool IsNocapture =
+ CS.isDataOperand(U) && CS.doesNotCapture(CS.getDataOperandNo(U));
+ callUsesLocalStack(CS, IsNocapture);
+ if (IsNocapture) {
+ // If the alloca-derived argument is passed in as nocapture, then it
+ // can't propagate to the call's return. That would be capturing.
+ continue;
+ }
+ break;
+ }
+ case Instruction::Load: {
+ // The result of a load is not alloca-derived (unless an alloca has
+ // otherwise escaped, but this is a local analysis).
+ continue;
+ }
+ case Instruction::Store: {
+ if (U->getOperandNo() == 0)
+ EscapePoints.insert(I);
+ continue; // Stores have no users to analyze.
+ }
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::PHI:
+ case Instruction::Select:
+ case Instruction::AddrSpaceCast:
+ break;
+ default:
+ EscapePoints.insert(I);
+ break;
+ }
+
+ AddUsesToWorklist(I);
+ }
+ }
+
+ void callUsesLocalStack(CallSite CS, bool IsNocapture) {
+ // Add it to the list of alloca users.
+ AllocaUsers.insert(CS.getInstruction());
+
+ // If it's nocapture then it can't capture this alloca.
+ if (IsNocapture)
+ return;
+
+ // If it can write to memory, it can leak the alloca value.
+ if (!CS.onlyReadsMemory())
+ EscapePoints.insert(CS.getInstruction());
+ }
+
+ SmallPtrSet<Instruction *, 32> AllocaUsers;
+ SmallPtrSet<Instruction *, 32> EscapePoints;
+};
+} // end anonymous namespace
+
+static bool markTails(Function &F, bool &AllCallsAreTailCalls,
+ OptimizationRemarkEmitter *ORE) {
+ if (F.callsFunctionThatReturnsTwice())
+ return false;
+ AllCallsAreTailCalls = true;
+
+ // The local stack holds all alloca instructions and all byval arguments.
+ AllocaDerivedValueTracker Tracker;
+ for (Argument &Arg : F.args()) {
+ if (Arg.hasByValAttr())
+ Tracker.walk(&Arg);
+ }
+ for (auto &BB : F) {
+ for (auto &I : BB)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+ Tracker.walk(AI);
+ }
+
+ bool Modified = false;
+
+ // Track whether a block is reachable after an alloca has escaped. Blocks that
+ // contain the escaping instruction will be marked as being visited without an
+ // escaped alloca, since that is how the block began.
+ enum VisitType {
+ UNVISITED,
+ UNESCAPED,
+ ESCAPED
+ };
+ DenseMap<BasicBlock *, VisitType> Visited;
+
+ // We propagate the fact that an alloca has escaped from block to successor.
+ // Visit the blocks that are propagating the escapedness first. To do this, we
+ // maintain two worklists.
+ SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;
+
+ // We may enter a block and visit it thinking that no alloca has escaped yet,
+ // then see an escape point and go back around a loop edge and come back to
+ // the same block twice. Because of this, we defer setting tail on calls when
+ // we first encounter them in a block. No entry in this list statically uses
+ // an alloca via use-def chain analysis, but an entry may still reach an
+ // alloca through other means if its block turns out to be reachable after an
+ // escape point.
+ SmallVector<CallInst *, 32> DeferredTails;
+
+ BasicBlock *BB = &F.getEntryBlock();
+ VisitType Escaped = UNESCAPED;
+ do {
+ for (auto &I : *BB) {
+ if (Tracker.EscapePoints.count(&I))
+ Escaped = ESCAPED;
+
+ CallInst *CI = dyn_cast<CallInst>(&I);
+ if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I))
+ continue;
+
+ bool IsNoTail = CI->isNoTailCall() || CI->hasOperandBundles();
+
+ if (!IsNoTail && CI->doesNotAccessMemory()) {
+ // A call to a readnone function whose arguments are all things computed
+ // outside this function can be marked tail. Even if you stored the
+ // alloca address into a global, a readnone function can't load the
+ // global anyhow.
+ //
+ // Note that this runs whether we know an alloca has escaped or not. If
+ // it has, then we can't trust Tracker.AllocaUsers to be accurate.
+ bool SafeToTail = true;
+ for (auto &Arg : CI->arg_operands()) {
+ // Check the argument value itself rather than its user (which is just CI).
+ if (isa<Constant>(Arg))
+ continue;
+ if (Argument *A = dyn_cast<Argument>(Arg))
+ if (!A->hasByValAttr())
+ continue;
+ SafeToTail = false;
+ break;
+ }
+ if (SafeToTail) {
+ using namespace ore;
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "tailcall-readnone", CI)
+ << "marked as tail call candidate (readnone)";
+ });
+ CI->setTailCall();
+ Modified = true;
+ continue;
+ }
+ }
+
+ if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) {
+ DeferredTails.push_back(CI);
+ } else {
+ AllCallsAreTailCalls = false;
+ }
+ }
+
+ for (auto *SuccBB : make_range(succ_begin(BB), succ_end(BB))) {
+ auto &State = Visited[SuccBB];
+ if (State < Escaped) {
+ State = Escaped;
+ if (State == ESCAPED)
+ WorklistEscaped.push_back(SuccBB);
+ else
+ WorklistUnescaped.push_back(SuccBB);
+ }
+ }
+
+ if (!WorklistEscaped.empty()) {
+ BB = WorklistEscaped.pop_back_val();
+ Escaped = ESCAPED;
+ } else {
+ BB = nullptr;
+ while (!WorklistUnescaped.empty()) {
+ auto *NextBB = WorklistUnescaped.pop_back_val();
+ if (Visited[NextBB] == UNESCAPED) {
+ BB = NextBB;
+ Escaped = UNESCAPED;
+ break;
+ }
+ }
+ }
+ } while (BB);
+
+ for (CallInst *CI : DeferredTails) {
+ if (Visited[CI->getParent()] != ESCAPED) {
+ // If the escape point was part way through the block, calls after the
+ // escape point wouldn't have been put into DeferredTails.
+ DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
+ CI->setTailCall();
+ Modified = true;
+ } else {
+ AllCallsAreTailCalls = false;
+ }
+ }
+
+ return Modified;
+}
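+
+// Hypothetical scenario (not from the original source) showing why escapes are
+// propagated block-to-block: the escape happens in the entry block, so the
+// call in %loop is reachable only after it and is conservatively never given
+// the 'tail' marker here (assuming @self is not readnone):
+//
+//   entry:
+//     %a = alloca i32
+//     store i32* %a, i32** @g          ; escape point
+//     br label %loop
+//   loop:
+//     call void @self()
+//     br label %loop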
+
+/// Return true if it is safe to move the specified
+/// instruction from after the call to before the call, assuming that all
+/// instructions between the call and this instruction are movable.
+///
+static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
+ // FIXME: We can move load/store/call/free instructions above the call if the
+ // call does not mod/ref the memory location being processed.
+ if (I->mayHaveSideEffects()) // This also handles volatile loads.
+ return false;
+
+ if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+ // Loads may always be moved above calls that have no side effects.
+ if (CI->mayHaveSideEffects()) {
+ // Non-volatile loads may be moved above a call with side effects if it
+ // does not write to memory and the load provably won't trap.
+ // Writes to memory only matter if they may alias the pointer
+ // being loaded from.
+ const DataLayout &DL = L->getModule()->getDataLayout();
+ if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) ||
+ !isSafeToLoadUnconditionally(L->getPointerOperand(),
+ L->getAlignment(), DL, L))
+ return false;
+ }
+ }
+
+ // Otherwise, if this is a side-effect free instruction, check to make sure
+ // that it does not use the return value of the call. If it doesn't use the
+ // return value of the call, it must only use things that are defined before
+ // the call, or movable instructions between the call and the instruction
+ // itself.
+ return !is_contained(I->operands(), CI);
+}
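+
+// Example of the intent (hypothetical values): in
+//
+//   %v = call i32 @f(i32 %x)
+//   %y = add i32 %a, %b
+//   ret i32 %v
+//
+// the add has no side effects and does not use %v, so it may be hoisted above
+// the call; '%y = add i32 %v, 1' could not be, because it uses the call's
+// result.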
+
+/// Return true if the specified value is the same when the return would exit
+/// as it was when the initial iteration of the recursive function was executed.
+///
+/// We currently handle static constants and arguments that are not modified as
+/// part of the recursion.
+static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) {
+ if (isa<Constant>(V)) return true; // Static constants are always dyn consts
+
+ // Check to see if this is an immutable argument; if so, the value
+ // will be available to initialize the accumulator.
+ if (Argument *Arg = dyn_cast<Argument>(V)) {
+ // Figure out which argument number this is...
+ unsigned ArgNo = 0;
+ Function *F = CI->getParent()->getParent();
+ for (Function::arg_iterator AI = F->arg_begin(); &*AI != Arg; ++AI)
+ ++ArgNo;
+
+ // If we are passing this argument into the call as the corresponding
+ // argument operand, then the argument is dynamically constant.
+ // Otherwise, we cannot transform this function safely.
+ if (CI->getArgOperand(ArgNo) == Arg)
+ return true;
+ }
+
+ // Switch cases are always constant integers. If the value is being switched
+ // on and the return is only reachable from one of its cases, it's
+ // effectively constant.
+ if (BasicBlock *UniquePred = RI->getParent()->getUniquePredecessor())
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(UniquePred->getTerminator()))
+ if (SI->getCondition() == V)
+ return SI->getDefaultDest() != RI->getParent();
+
+ // Not a constant or immutable argument, we can't safely transform.
+ return false;
+}
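+
+// Hypothetical illustration (not from the original source): in
+//
+//   int f(int x, int n) { return n == 0 ? x : f(x, n - 1); }
+//
+// 'x' is dynamically constant because every recursive call passes it through
+// unchanged in the same argument position, while 'n' is not, since the call
+// passes 'n - 1' instead of the argument itself.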
+
+/// Check to see if the function containing the specified tail call consistently
+/// returns the same runtime-constant value at all exit points except for
+/// IgnoreRI. If so, return the returned value.
+static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) {
+ Function *F = CI->getParent()->getParent();
+ Value *ReturnedValue = nullptr;
+
+ for (BasicBlock &BBI : *F) {
+ ReturnInst *RI = dyn_cast<ReturnInst>(BBI.getTerminator());
+ if (RI == nullptr || RI == IgnoreRI) continue;
+
+ // We can only perform this transformation if the value returned is
+ // evaluatable at the start of the initial invocation of the function,
+ // instead of at the end of the evaluation.
+ //
+ Value *RetOp = RI->getOperand(0);
+ if (!isDynamicConstant(RetOp, CI, RI))
+ return nullptr;
+
+ if (ReturnedValue && RetOp != ReturnedValue)
+ return nullptr; // Cannot transform if differing values are returned.
+ ReturnedValue = RetOp;
+ }
+ return ReturnedValue;
+}
+
+/// If the specified instruction can be transformed using accumulator recursion
+/// elimination, return the constant which is the start of the accumulator
+/// value. Otherwise return null.
+static Value *canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
+ if (!I->isAssociative() || !I->isCommutative()) return nullptr;
+ assert(I->getNumOperands() == 2 &&
+ "Associative/commutative operations should have 2 args!");
+
+ // Exactly one operand should be the result of the call instruction.
+ if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
+ (I->getOperand(0) != CI && I->getOperand(1) != CI))
+ return nullptr;
+
+ // The only user of this instruction we allow is a single return instruction.
+ if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back()))
+ return nullptr;
+
+ // Ok, now we have to check all of the other return instructions in this
+ // function. If they return non-constants or differing values, then we cannot
+ // transform the function safely.
+ return getCommonReturnValue(cast<ReturnInst>(I->user_back()), CI);
+}
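+
+// Classic accumulator-recursion illustration (hypothetical source, not part of
+// this file): in
+//
+//   int fact(int n) { return n <= 1 ? 1 : n * fact(n - 1); }
+//
+// the multiply after the recursive call is associative and commutative and is
+// only used by the return, and (assuming the base case ends up as its own
+// return block) every other exit returns the constant 1, so that constant
+// becomes the accumulator's initial value. The rewritten function behaves
+// roughly like:
+//
+//   int fact(int n) {
+//     int accumulator = 1;
+//     while (n > 1) { accumulator *= n; n = n - 1; }
+//     return accumulator;
+//   }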
+
+static Instruction *firstNonDbg(BasicBlock::iterator I) {
+ while (isa<DbgInfoIntrinsic>(I))
+ ++I;
+ return &*I;
+}
+
+static CallInst *findTRECandidate(Instruction *TI,
+ bool CannotTailCallElimCallsMarkedTail,
+ const TargetTransformInfo *TTI) {
+ BasicBlock *BB = TI->getParent();
+ Function *F = BB->getParent();
+
+ if (&BB->front() == TI) // Make sure there is something before the terminator.
+ return nullptr;
+
+ // Scan backwards from the return, checking to see if there is a tail call in
+ // this block. If so, set CI to it.
+ CallInst *CI = nullptr;
+ BasicBlock::iterator BBI(TI);
+ while (true) {
+ CI = dyn_cast<CallInst>(BBI);
+ if (CI && CI->getCalledFunction() == F)
+ break;
+
+ if (BBI == BB->begin())
+ return nullptr; // Didn't find a potential tail call.
+ --BBI;
+ }
+
+ // If this call is marked as a tail call, and if there are dynamic allocas in
+ // the function, we cannot perform this optimization.
+ if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
+ return nullptr;
+
+ // As a special case, detect code like this:
+ // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
+ // and disable this xform in this case, because the code generator will
+ // lower the call to fabs into inline code.
+ if (BB == &F->getEntryBlock() &&
+ firstNonDbg(BB->front().getIterator()) == CI &&
+ firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
+ !TTI->isLoweredToCall(CI->getCalledFunction())) {
+ // A single-block function with just a call and a return. Check that
+ // the arguments match.
+ CallSite::arg_iterator I = CallSite(CI).arg_begin(),
+ E = CallSite(CI).arg_end();
+ Function::arg_iterator FI = F->arg_begin(),
+ FE = F->arg_end();
+ for (; I != E && FI != FE; ++I, ++FI)
+ if (*I != &*FI) break;
+ if (I == E && FI == FE)
+ return nullptr;
+ }
+
+ return CI;
+}
+
+static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
+ BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail,
+ SmallVectorImpl<PHINode *> &ArgumentPHIs,
+ AliasAnalysis *AA,
+ OptimizationRemarkEmitter *ORE) {
+ // If we are introducing accumulator recursion to eliminate operations after
+ // the call instruction that are both associative and commutative, the initial
+ // value for the accumulator is placed in this variable. If this value is set
+ // then we actually perform accumulator recursion elimination instead of
+ // simple tail recursion elimination. If the operation is an LLVM instruction
+ // (e.g. "add") then it is recorded in AccumulatorRecursionInstr. If not, then
+ // we are handling the case where the return instruction returns a constant C
+ // which is different from the constant returned by other return instructions
+ // (which is recorded in AccumulatorRecursionEliminationInitVal). This is a
+ // special case of accumulator recursion, the operation being "return C".
+ Value *AccumulatorRecursionEliminationInitVal = nullptr;
+ Instruction *AccumulatorRecursionInstr = nullptr;
+
+ // Ok, we found a potential tail call. We can currently only transform the
+ // tail call if all of the instructions between the call and the return are
+ // movable to above the call itself, leaving the call next to the return.
+ // Check that this is the case now.
+ BasicBlock::iterator BBI(CI);
+ for (++BBI; &*BBI != Ret; ++BBI) {
+ if (canMoveAboveCall(&*BBI, CI, AA))
+ continue;
+
+ // If we can't move the instruction above the call, it might be because it
+ // is an associative and commutative operation that could be transformed
+ // using accumulator recursion elimination. Check to see if this is the
+ // case, and if so, remember the initial accumulator value for later.
+ if ((AccumulatorRecursionEliminationInitVal =
+ canTransformAccumulatorRecursion(&*BBI, CI))) {
+ // Yes, this is accumulator recursion. Remember which instruction
+ // accumulates.
+ AccumulatorRecursionInstr = &*BBI;
+ } else {
+ return false; // Otherwise, we cannot eliminate the tail recursion!
+ }
+ }
+
+ // We can only transform call/return pairs that either ignore the return value
+ // of the call and return void, ignore the value of the call and return a
+ // constant, return the value returned by the tail call, or that are being
+ // accumulator recursion variable eliminated.
+ if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI &&
+ !isa<UndefValue>(Ret->getReturnValue()) &&
+ AccumulatorRecursionEliminationInitVal == nullptr &&
+ !getCommonReturnValue(nullptr, CI)) {
+ // One case remains that we are able to handle: the current return
+ // instruction returns a constant, and all other return instructions
+ // return a different constant.
+ if (!isDynamicConstant(Ret->getReturnValue(), CI, Ret))
+ return false; // Current return instruction does not return a constant.
+ // Check that all other return instructions return a common constant. If
+ // so, record it in AccumulatorRecursionEliminationInitVal.
+ AccumulatorRecursionEliminationInitVal = getCommonReturnValue(Ret, CI);
+ if (!AccumulatorRecursionEliminationInitVal)
+ return false;
+ }
+
+ BasicBlock *BB = Ret->getParent();
+ Function *F = BB->getParent();
+
+ using namespace ore;
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "tailcall-recursion", CI)
+ << "transforming tail recursion into loop";
+ });
+
+ // OK! We can transform this tail call. If this is the first one found,
+ // create the new entry block, allowing us to branch back to the old entry.
+ if (!OldEntry) {
+ OldEntry = &F->getEntryBlock();
+ BasicBlock *NewEntry = BasicBlock::Create(F->getContext(), "", F, OldEntry);
+ NewEntry->takeName(OldEntry);
+ OldEntry->setName("tailrecurse");
+ BranchInst::Create(OldEntry, NewEntry);
+
+ // If this tail call is marked 'tail' and if there are any allocas in the
+ // entry block, move them up to the new entry block.
+ TailCallsAreMarkedTail = CI->isTailCall();
+ if (TailCallsAreMarkedTail)
+ // Move all fixed sized allocas from OldEntry to NewEntry.
+ for (BasicBlock::iterator OEBI = OldEntry->begin(), E = OldEntry->end(),
+ NEBI = NewEntry->begin(); OEBI != E; )
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
+ if (isa<ConstantInt>(AI->getArraySize()))
+ AI->moveBefore(&*NEBI);
+
+ // Now that we have created a new block, which jumps to the entry
+ // block, insert a PHI node for each argument of the function.
+ // For now, we initialize each PHI to only have the real arguments
+ // which are passed in.
+ Instruction *InsertPos = &OldEntry->front();
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I) {
+ PHINode *PN = PHINode::Create(I->getType(), 2,
+ I->getName() + ".tr", InsertPos);
+ I->replaceAllUsesWith(PN); // Everyone uses the PHI node now!
+ PN->addIncoming(&*I, NewEntry);
+ ArgumentPHIs.push_back(PN);
+ }
+ }
+
+ // If this function has self recursive calls in the tail position where some
+ // are marked tail and some are not, only transform one flavor or another. We
+ // have to choose whether we move allocas in the entry block to the new entry
+ // block or not, so we can't make a good choice for both. NOTE: We could do
+ // slightly better here in the case that the function has no entry block
+ // allocas.
+ if (TailCallsAreMarkedTail && !CI->isTailCall())
+ return false;
+
+ // Ok, now that we know we have a pseudo-entry block WITH all of the
+ // required PHI nodes, add entries into the PHI node for the actual
+ // parameters passed into the tail-recursive call.
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+ ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB);
+
+ // If we are introducing an accumulator variable to eliminate the recursion,
+ // do so now. Note that we _know_ that no subsequent tail recursion
+ // eliminations will happen on this function because of the way the
+ // accumulator recursion predicate is set up.
+ //
+ if (AccumulatorRecursionEliminationInitVal) {
+ Instruction *AccRecInstr = AccumulatorRecursionInstr;
+ // Start by inserting a new PHI node for the accumulator.
+ pred_iterator PB = pred_begin(OldEntry), PE = pred_end(OldEntry);
+ PHINode *AccPN = PHINode::Create(
+ AccumulatorRecursionEliminationInitVal->getType(),
+ std::distance(PB, PE) + 1, "accumulator.tr", &OldEntry->front());
+
+ // Loop over all of the predecessors of the tail recursion block. For the
+ // real entry into the function we seed the PHI with the initial value,
+ // computed earlier. For any other existing branches to this block (due to
+ // other tail recursions eliminated) the accumulator is not modified.
+ // Because we haven't added the branch in the current block to OldEntry yet,
+ // it will not show up as a predecessor.
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if (P == &F->getEntryBlock())
+ AccPN->addIncoming(AccumulatorRecursionEliminationInitVal, P);
+ else
+ AccPN->addIncoming(AccPN, P);
+ }
+
+ if (AccRecInstr) {
+ // Add an incoming argument for the current block, which is computed by
+ // our associative and commutative accumulator instruction.
+ AccPN->addIncoming(AccRecInstr, BB);
+
+ // Next, rewrite the accumulator recursion instruction so that it no longer
+ // uses the result of the call; instead, it uses the PHI node we just
+ // inserted.
+ AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
+ } else {
+ // Add an incoming argument for the current block, which is just the
+ // constant returned by the current return instruction.
+ AccPN->addIncoming(Ret->getReturnValue(), BB);
+ }
+
+ // Finally, rewrite any return instructions in the program to return the PHI
+ // node instead of the "initval" that they do currently. This loop will
+ // actually rewrite the return value we are destroying, but that's ok.
+ for (BasicBlock &BBI : *F)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI.getTerminator()))
+ RI->setOperand(0, AccPN);
+ ++NumAccumAdded;
+ }
+
+ // Now that all of the PHI nodes are in place, remove the call and
+ // ret instructions, replacing them with an unconditional branch.
+ BranchInst *NewBI = BranchInst::Create(OldEntry, Ret);
+ NewBI->setDebugLoc(CI->getDebugLoc());
+
+ BB->getInstList().erase(Ret); // Remove return.
+ BB->getInstList().erase(CI); // Remove call.
+ ++NumEliminated;
+ return true;
+}
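+
+// Sketch of the resulting CFG for the simplest single-block case (hypothetical
+// IR; the block and PHI names follow the ones created above):
+//
+//   entry:                                  ; the new entry block
+//     br label %tailrecurse
+//   tailrecurse:                            ; the old entry block
+//     %n.tr = phi i32 [ %n, %entry ], [ %n.next, %tailrecurse ]
+//     ...
+//     br label %tailrecurse                 ; replaces the call + ret pair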
+
+static bool foldReturnAndProcessPred(
+ BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs,
+ bool CannotTailCallElimCallsMarkedTail, const TargetTransformInfo *TTI,
+ AliasAnalysis *AA, OptimizationRemarkEmitter *ORE) {
+ bool Change = false;
+
+ // Make sure this block is a trivial return block.
+ assert(BB->getFirstNonPHIOrDbg() == Ret &&
+ "Trying to fold non-trivial return block");
+
+ // If the return block contains nothing but the return and PHI's,
+ // there might be an opportunity to duplicate the return in its
+ // predecessors and perform TRE there. Look for predecessors that end
+ // in unconditional branch and recursive call(s).
+ SmallVector<BranchInst*, 8> UncondBranchPreds;
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *Pred = *PI;
+ TerminatorInst *PTI = Pred->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(PTI))
+ if (BI->isUnconditional())
+ UncondBranchPreds.push_back(BI);
+ }
+
+ while (!UncondBranchPreds.empty()) {
+ BranchInst *BI = UncondBranchPreds.pop_back_val();
+ BasicBlock *Pred = BI->getParent();
+ if (CallInst *CI = findTRECandidate(BI, CannotTailCallElimCallsMarkedTail, TTI)) {
+ DEBUG(dbgs() << "FOLDING: " << *BB
+ << "INTO UNCOND BRANCH PRED: " << *Pred);
+ ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
+
+ // Cleanup: if all predecessors of BB have been eliminated by
+ // FoldReturnIntoUncondBranch, delete it. It is important to empty it,
+ // because the ret instruction in there is still using a value which
+ // eliminateRecursiveTailCall will attempt to remove.
+ if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
+ BB->eraseFromParent();
+
+ eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
+ ArgumentPHIs, AA, ORE);
+ ++NumRetDuped;
+ Change = true;
+ }
+ }
+
+ return Change;
+}
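+
+// Illustration (hypothetical IR, not from this file): a predecessor ending in
+// a recursive call plus an unconditional branch to a trivial return block,
+//
+//   then:
+//     %r = call i32 @f(i32 %m)
+//     br label %exit
+//   exit:
+//     %p = phi i32 [ %r, %then ], [ 0, %else ]
+//     ret i32 %p
+//
+// gets its own copy of the return ('ret i32 %r' at the end of %then) via
+// FoldReturnIntoUncondBranch, which eliminateRecursiveTailCall() can then turn
+// into a loop back edge.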
+
+static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail,
+ SmallVectorImpl<PHINode *> &ArgumentPHIs,
+ bool CannotTailCallElimCallsMarkedTail,
+ const TargetTransformInfo *TTI,
+ AliasAnalysis *AA,
+ OptimizationRemarkEmitter *ORE) {
+ CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI);
+ if (!CI)
+ return false;
+
+ return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail,
+ ArgumentPHIs, AA, ORE);
+}
+
+static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
+ AliasAnalysis *AA,
+ OptimizationRemarkEmitter *ORE) {
+ if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
+ return false;
+
+ bool MadeChange = false;
+ bool AllCallsAreTailCalls = false;
+ MadeChange |= markTails(F, AllCallsAreTailCalls, ORE);
+ if (!AllCallsAreTailCalls)
+ return MadeChange;
+
+ // If this function is a varargs function, we won't be able to PHI the args
+ // right, so don't even try to convert it...
+ if (F.getFunctionType()->isVarArg())
+ return false;
+
+ BasicBlock *OldEntry = nullptr;
+ bool TailCallsAreMarkedTail = false;
+ SmallVector<PHINode*, 8> ArgumentPHIs;
+
+ // If false, we cannot perform TRE on tail calls marked with the 'tail'
+ // attribute, because doing so would cause the stack size to increase (a real
+ // tail call would deallocate variable sized allocas before recursing; the
+ // loop produced by TRE does not).
+ bool CanTRETailMarkedCall = canTRE(F);
+
+ // Change any tail recursive calls to loops.
+ //
+ // FIXME: The code generator produces really bad code when an 'escaping
+ // alloca' is changed from being a static alloca to being a dynamic alloca.
+ // Until this is resolved, disable this transformation if that would ever
+ // happen. This bug is PR962.
+ for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) {
+ BasicBlock *BB = &*BBI++; // foldReturnAndProcessPred may delete BB.
+ if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
+ bool Change = processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
+ ArgumentPHIs, !CanTRETailMarkedCall,
+ TTI, AA, ORE);
+ if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
+ Change = foldReturnAndProcessPred(BB, Ret, OldEntry,
+ TailCallsAreMarkedTail, ArgumentPHIs,
+ !CanTRETailMarkedCall, TTI, AA, ORE);
+ MadeChange |= Change;
+ }
+ }
+
+ // If we eliminated any tail recursions, it's possible that we inserted some
+ // silly PHI nodes which just merge an initial value (the incoming operand)
+ // with themselves. Check to see if we did and clean up our mess if so. This
+ // occurs when a function passes an argument straight through to its tail
+ // call.
+ for (PHINode *PN : ArgumentPHIs) {
+ // If the PHI node simplifies to a single value, replace it with that value.
+ if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
+ PN->replaceAllUsesWith(PNV);
+ PN->eraseFromParent();
+ }
+ }
+
+ return MadeChange;
+}
+
+namespace {
+struct TailCallElim : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ TailCallElim() : FunctionPass(ID) {
+ initializeTailCallElimPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ return eliminateTailRecursion(
+ F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE());
+ }
+};
+}
+
+char TailCallElim::ID = 0;
+INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
+ false, false)
+
+// Public interface to the TailCallElimination pass
+FunctionPass *llvm::createTailCallEliminationPass() {
+ return new TailCallElim();
+}
+
+PreservedAnalyses TailCallElimPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+
+ TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ bool Changed = eliminateTailRecursion(F, &TTI, &AA, &ORE);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}